Spaces:

H-Liu1997
/

EMAGE

Running on Zero

App Files Files Community

H-Liu1997 committed on Jan 18

Commit

35ee6b1

1 Parent(s): 93735c1

Clean rewrite: Python 3.10 venv + script-based inference

Browse files

Files changed (1) hide show

EMAGE_Colab_Demo.ipynb +151 -330

EMAGE_Colab_Demo.ipynb CHANGED Viewed

@@ -11,424 +11,244 @@
         "- **DisCo**: Upper body gesture generation with diffusion\n",
         "- **EMAGE**: Full body + face gesture generation\n",
         "\n",
-        "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/#fileId=https%3A//huggingface.co/spaces/H-Liu1997/EMAGE/resolve/main/EMAGE_Colab_Demo.ipynb)\n",
-        "\n",
-        "[Project Page](https://pantomatrix.github.io/EMAGE/) | [GitHub](https://github.com/PantoMatrix/PantoMatrix) | [Paper](https://arxiv.org/abs/2401.00374)"
       ]
     },
     {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "## 1. Setup Environment"
       ]
     },
     {
       "cell_type": "code",
       "metadata": {},
       "source": [
-        "# Step 1: Install dependencies (using Colab's Python 3.11)\n",
-        "import sys\n",
-        "print(f\"Python version: {sys.version}\")\n",
         "\n",
-        "# Install PyTorch with CUDA 12.1\n",
-        "!pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121\n",
-        "!pip install -q numpy librosa soundfile transformers huggingface_hub\n",
-        "!pip install -q smplx trimesh scipy easydict omegaconf\n",
         "\n",
-        "# Verify installation\n",
-        "import torch\n",
-        "print(f\"PyTorch: {torch.__version__}\")\n",
-        "print(f\"CUDA available: {torch.cuda.is_available()}\")"
-      ],
-      "execution_count": null,
-      "outputs": []
     },
     {
       "cell_type": "code",
       "metadata": {},
       "source": [
-        "# Step 2: Clone code repositories\n",
         "!apt-get install -y git-lfs > /dev/null 2>&1\n",
         "!git lfs install\n",
         "\n",
-        "# Clone PantoMatrix from GitHub\n",
-        "!git clone https://github.com/PantoMatrix/PantoMatrix.git\n",
-        "\n",
-        "# Clone evaluation tools (contains SMPLX models)\n",
-        "!git clone https://huggingface.co/H-Liu1997/emage_evaltools PantoMatrix/emage_evaltools\n",
-        "%cd PantoMatrix/emage_evaltools\n",
         "!git lfs pull\n",
-        "%cd /content/PantoMatrix\n",
         "\n",
-        "print(\"Code cloned successfully!\")"
-      ],
-      "execution_count": null,
-      "outputs": []
     },
     {
       "cell_type": "code",
       "metadata": {},
       "source": [
-        "# Step 3: Import libraries\n",
-        "import sys\n",
-        "import os\n",
-        "import torch\n",
-        "import numpy as np\n",
-        "import librosa\n",
-        "import soundfile as sf\n",
-        "from IPython.display import Video, Audio, display\n",
         "\n",
         "from models.camn_audio import CamnAudioModel\n",
         "from models.disco_audio import DiscoAudioModel\n",
         "from models.emage_audio import EmageAudioModel, EmageVQVAEConv, EmageVAEConv, EmageVQModel\n",
         "from emage_utils.motion_io import beat_format_save\n",
         "from emage_utils.npz2pose import render2d\n",
-        "from torchvision.io import write_video\n",
-        "import torch.nn.functional as F\n",
-        "\n",
-        "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
-        "print(f\"Python: {sys.version}\")\n",
-        "print(f\"PyTorch: {torch.__version__}\")\n",
-        "print(f\"NumPy: {np.__version__}\")\n",
-        "print(f\"CUDA available: {torch.cuda.is_available()}\")\n",
-        "print(f\"Using device: {device}\")\n",
         "\n",
-        "os.makedirs(\"./outputs\", exist_ok=True)\n",
-        "\n",
-        "# Check if evaluation tools are available\n",
-        "EVAL_AVAILABLE = os.path.exists(\"./emage_evaltools/metric.py\")\n",
-        "print(f\"Evaluation tools available: {EVAL_AVAILABLE}\")"
-      ],
-      "execution_count": null,
-      "outputs": []
     },
     {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "## 2. Upload Your Audio\n",
         "\n",
-        "Upload a `.wav` file or use the example audio."
       ]
     },
     {
       "cell_type": "code",
       "metadata": {},
       "source": [
-        "# Option 1: Use example audio\n",
-        "audio_path = \"./examples/audio/2_scott_0_103_103_10s.wav\"\n",
         "\n",
-        "# Option 2: Upload your own audio\n",
         "# from google.colab import files\n",
         "# uploaded = files.upload()\n",
-        "# audio_path = list(uploaded.keys())[0]\n",
         "\n",
         "display(Audio(audio_path))"
-      ],
-      "execution_count": null,
-      "outputs": []
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {},
-      "source": [
-        "## 3. CaMN Model (Upper Body)\n",
-        "\n",
-        "CaMN generates upper body gestures from speech audio."
       ]
     },
     {
       "cell_type": "code",
-      "metadata": {},
-      "source": [
-        "# Load CaMN model\n",
-        "model_camn = CamnAudioModel.from_pretrained(\"H-Liu1997/camn_audio\").to(device).eval()\n",
-        "print(\"CaMN model loaded!\")"
-      ],
-      "execution_count": null,
-      "outputs": []
-    },
-    {
-      "cell_type": "code",
-      "metadata": {},
-      "source": [
-        "# CaMN Inference\n",
-        "sr_model = model_camn.cfg.audio_sr\n",
-        "pose_fps = model_camn.cfg.pose_fps\n",
-        "seed_frames = model_camn.cfg.seed_frames\n",
-        "\n",
-        "audio_loaded, _ = librosa.load(audio_path, sr=sr_model)\n",
-        "audio_t = torch.from_numpy(audio_loaded).float().unsqueeze(0).to(device)\n",
-        "sid = torch.zeros(1, 1).long().to(device)\n",
-        "\n",
-        "with torch.no_grad():\n",
-        "    motion_pred = model_camn(audio_t, sid, seed_frames=seed_frames)[\"motion_axis_angle\"]\n",
-        "\n",
-        "t = motion_pred.shape[1]\n",
-        "motion_pred_np = motion_pred.cpu().numpy().reshape(t, -1)\n",
-        "\n",
-        "# Save motion\n",
-        "camn_npz_path = \"./outputs/camn_output.npz\"\n",
-        "beat_format_save(camn_npz_path, motion_pred_np, upsample=30 // pose_fps)\n",
-        "print(f\"CaMN motion saved to {camn_npz_path}\")"
-      ],
-      "execution_count": null,
-      "outputs": []
-    },
-    {
-      "cell_type": "code",
-      "metadata": {},
-      "source": [
-        "# Visualize CaMN result\n",
-        "motion_dict = np.load(camn_npz_path, allow_pickle=True)\n",
-        "v2d = render2d(motion_dict, (720, 480), face_only=False, remove_global=True)\n",
-        "camn_video_path = \"./outputs/camn_output.mp4\"\n",
-        "write_video(camn_video_path, v2d.permute(0, 2, 3, 1), fps=30)\n",
-        "print(\"CaMN visualization:\")\n",
-        "display(Video(camn_video_path, embed=True, width=480))"
-      ],
       "execution_count": null,
-      "outputs": []
-    },
-    {
-      "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "## 4. DisCo Model (Upper Body with Diffusion)\n",
         "\n",
-        "DisCo uses diffusion for more diverse upper body gesture generation."
       ]
     },
     {
       "cell_type": "code",
-      "metadata": {},
-      "source": [
-        "# Load DisCo model\n",
-        "model_disco = DiscoAudioModel.from_pretrained(\"H-Liu1997/disco_audio\").to(device).eval()\n",
-        "print(\"DisCo model loaded!\")"
-      ],
       "execution_count": null,
-      "outputs": []
-    },
-    {
-      "cell_type": "code",
       "metadata": {},
       "source": [
-        "# DisCo Inference\n",
-        "sr_model = model_disco.cfg.audio_sr\n",
-        "pose_fps = model_disco.cfg.pose_fps\n",
-        "seed_frames = model_disco.cfg.seed_frames\n",
-        "\n",
-        "audio_loaded, _ = librosa.load(audio_path, sr=sr_model)\n",
-        "audio_t = torch.from_numpy(audio_loaded).float().unsqueeze(0).to(device)\n",
-        "sid = torch.zeros(1, 1).long().to(device)\n",
-        "\n",
-        "with torch.no_grad():\n",
-        "    motion_pred = model_disco(audio_t, sid, seed_frames=seed_frames, seed_motion=None)[\"motion_axis_angle\"]\n",
-        "\n",
-        "t = motion_pred.shape[1]\n",
-        "motion_pred_np = motion_pred.cpu().numpy().reshape(t, -1)\n",
         "\n",
-        "# Save motion\n",
-        "disco_npz_path = \"./outputs/disco_output.npz\"\n",
-        "beat_format_save(disco_npz_path, motion_pred_np, upsample=30 // pose_fps)\n",
-        "print(f\"DisCo motion saved to {disco_npz_path}\")"
-      ],
-      "execution_count": null,
-      "outputs": []
-    },
-    {
-      "cell_type": "code",
-      "metadata": {},
-      "source": [
-        "# Visualize DisCo result\n",
-        "motion_dict = np.load(disco_npz_path, allow_pickle=True)\n",
-        "v2d = render2d(motion_dict, (720, 480), face_only=False, remove_global=True)\n",
-        "disco_video_path = \"./outputs/disco_output.mp4\"\n",
-        "write_video(disco_video_path, v2d.permute(0, 2, 3, 1), fps=30)\n",
-        "print(\"DisCo visualization:\")\n",
-        "display(Video(disco_video_path, embed=True, width=480))"
-      ],
-      "execution_count": null,
-      "outputs": []
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {},
-      "source": [
-        "## 5. EMAGE Model (Full Body + Face)\n",
-        "\n",
-        "EMAGE generates full body gestures including face expressions."
       ]
     },
     {
       "cell_type": "code",
-      "metadata": {},
-      "source": [
-        "# Load EMAGE model and VQ components\n",
-        "face_motion_vq = EmageVQVAEConv.from_pretrained(\"H-Liu1997/emage_audio\", subfolder=\"emage_vq/face\").to(device).eval()\n",
-        "upper_motion_vq = EmageVQVAEConv.from_pretrained(\"H-Liu1997/emage_audio\", subfolder=\"emage_vq/upper\").to(device).eval()\n",
-        "lower_motion_vq = EmageVQVAEConv.from_pretrained(\"H-Liu1997/emage_audio\", subfolder=\"emage_vq/lower\").to(device).eval()\n",
-        "hands_motion_vq = EmageVQVAEConv.from_pretrained(\"H-Liu1997/emage_audio\", subfolder=\"emage_vq/hands\").to(device).eval()\n",
-        "global_motion_ae = EmageVAEConv.from_pretrained(\"H-Liu1997/emage_audio\", subfolder=\"emage_vq/global\").to(device).eval()\n",
-        "\n",
-        "emage_vq_model = EmageVQModel(\n",
-        "    face_model=face_motion_vq, \n",
-        "    upper_model=upper_motion_vq,\n",
-        "    lower_model=lower_motion_vq, \n",
-        "    hands_model=hands_motion_vq,\n",
-        "    global_model=global_motion_ae\n",
-        ").to(device).eval()\n",
-        "\n",
-        "model_emage = EmageAudioModel.from_pretrained(\"H-Liu1997/emage_audio\").to(device).eval()\n",
-        "print(\"EMAGE model loaded!\")"
-      ],
-      "execution_count": null,
-      "outputs": []
-    },
-    {
-      "cell_type": "code",
-      "metadata": {},
-      "source": [
-        "# EMAGE Inference\n",
-        "sr_model = model_emage.cfg.audio_sr\n",
-        "pose_fps = model_emage.cfg.pose_fps\n",
-        "\n",
-        "audio_loaded, _ = librosa.load(audio_path, sr=sr_model)\n",
-        "audio_t = torch.from_numpy(audio_loaded).float().unsqueeze(0).to(device)\n",
-        "sid = torch.zeros(1, 1).long().to(device)\n",
-        "\n",
-        "with torch.no_grad():\n",
-        "    latent_dict = model_emage.inference(audio_t, sid, emage_vq_model, masked_motion=None, mask=None)\n",
-        "    \n",
-        "    face_latent = latent_dict[\"rec_face\"] if model_emage.cfg.lf > 0 and model_emage.cfg.cf == 0 else None\n",
-        "    upper_latent = latent_dict[\"rec_upper\"] if model_emage.cfg.lu > 0 and model_emage.cfg.cu == 0 else None\n",
-        "    hands_latent = latent_dict[\"rec_hands\"] if model_emage.cfg.lh > 0 and model_emage.cfg.ch == 0 else None\n",
-        "    lower_latent = latent_dict[\"rec_lower\"] if model_emage.cfg.ll > 0 and model_emage.cfg.cl == 0 else None\n",
-        "\n",
-        "    face_index = torch.max(F.log_softmax(latent_dict[\"cls_face\"], dim=2), dim=2)[1] if model_emage.cfg.cf > 0 else None\n",
-        "    upper_index = torch.max(F.log_softmax(latent_dict[\"cls_upper\"], dim=2), dim=2)[1] if model_emage.cfg.cu > 0 else None\n",
-        "    hands_index = torch.max(F.log_softmax(latent_dict[\"cls_hands\"], dim=2), dim=2)[1] if model_emage.cfg.ch > 0 else None\n",
-        "    lower_index = torch.max(F.log_softmax(latent_dict[\"cls_lower\"], dim=2), dim=2)[1] if model_emage.cfg.cl > 0 else None\n",
-        "\n",
-        "    ref_trans = torch.zeros(1, 1, 3).to(device)\n",
-        "    all_pred = emage_vq_model.decode(\n",
-        "        face_latent=face_latent, \n",
-        "        upper_latent=upper_latent, \n",
-        "        lower_latent=lower_latent, \n",
-        "        hands_latent=hands_latent,\n",
-        "        face_index=face_index, \n",
-        "        upper_index=upper_index, \n",
-        "        lower_index=lower_index, \n",
-        "        hands_index=hands_index,\n",
-        "        get_global_motion=True, \n",
-        "        ref_trans=ref_trans[:, 0]\n",
-        "    )\n",
-        "\n",
-        "motion_pred = all_pred[\"motion_axis_angle\"]\n",
-        "t = motion_pred.shape[1]\n",
-        "motion_pred_np = motion_pred.cpu().numpy().reshape(t, -1)\n",
-        "face_pred = all_pred[\"expression\"].cpu().numpy().reshape(t, -1)\n",
-        "trans_pred = all_pred[\"trans\"].cpu().numpy().reshape(t, -1)\n",
-        "\n",
-        "# Save motion\n",
-        "emage_npz_path = \"./outputs/emage_output.npz\"\n",
-        "beat_format_save(emage_npz_path, motion_pred_np, upsample=30 // pose_fps, expressions=face_pred, trans=trans_pred)\n",
-        "print(f\"EMAGE motion saved to {emage_npz_path}\")"
-      ],
-      "execution_count": null,
-      "outputs": []
-    },
-    {
-      "cell_type": "code",
-      "metadata": {},
-      "source": [
-        "# Visualize EMAGE body result\n",
-        "motion_dict = np.load(emage_npz_path, allow_pickle=True)\n",
-        "v2d_body = render2d(motion_dict, (720, 480), face_only=False, remove_global=True)\n",
-        "emage_body_path = \"./outputs/emage_body.mp4\"\n",
-        "write_video(emage_body_path, v2d_body.permute(0, 2, 3, 1), fps=30)\n",
-        "print(\"EMAGE body visualization:\")\n",
-        "display(Video(emage_body_path, embed=True, width=480))"
-      ],
       "execution_count": null,
-      "outputs": []
-    },
-    {
-      "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "## 7. Evaluation (Optional)\n",
         "\n",
-        "Compute metrics like FGD (Frechet Gesture Distance), BC (Beat Consistency), L1 Diversity."
       ]
     },
-    {
-      "cell_type": "code",
-      "metadata": {},
-      "source": [
-        "# Evaluation requires ground truth motion data\n",
-        "# This is a demo showing how to use the evaluation API\n",
-        "\n",
-        "if EVAL_AVAILABLE:\n",
-        "    from emage_evaltools.metric import FGD, BC, L1Div\n",
-        "    \n",
-        "    # Initialize evaluators\n",
-        "    fgd_evaluator = FGD(download_path=\"./emage_evaltools/\")\n",
-        "    bc_evaluator = BC(download_path=\"./emage_evaltools/\", sigma=0.3, order=7)\n",
-        "    l1div_evaluator = L1Div()\n",
-        "    \n",
-        "    print(\"Evaluation tools loaded!\")\n",
-        "    print(\"Note: Full evaluation requires ground truth motion data from BEAT2 dataset\")\n",
-        "    print(\"Download BEAT2: git clone https://huggingface.co/datasets/H-Liu1997/BEAT2\")\n",
-        "else:\n",
-        "    print(\"Evaluation tools not available. Clone from GitHub to enable.\")"
-      ],
-      "execution_count": null,
-      "outputs": []
-    },
-    {
-      "cell_type": "code",
-      "metadata": {},
-      "source": [
-        "# Visualize EMAGE face result\n",
-        "v2d_face = render2d(motion_dict, (720, 480), face_only=True, remove_global=True)\n",
-        "emage_face_path = \"./outputs/emage_face.mp4\"\n",
-        "write_video(emage_face_path, v2d_face.permute(0, 2, 3, 1), fps=30)\n",
-        "print(\"EMAGE face visualization:\")\n",
-        "display(Video(emage_face_path, embed=True, width=480))"
-      ],
-      "execution_count": null,
-      "outputs": []
-    },
     {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "## 6. Download Results\n",
         "\n",
-        "Download the generated motion files (`.npz`) for use with Blender."
       ]
     },
     {
       "cell_type": "code",
       "metadata": {},
       "source": [
-        "from google.colab import files\n",
-        "\n",
         "print(\"Generated files:\")\n",
-        "print(f\"  - CaMN: {camn_npz_path}\")\n",
-        "print(f\"  - DisCo: {disco_npz_path}\")\n",
-        "print(f\"  - EMAGE: {emage_npz_path}\")\n",
         "\n",
-        "# Uncomment to download\n",
-        "# files.download(camn_npz_path)\n",
-        "# files.download(disco_npz_path)\n",
-        "# files.download(emage_npz_path)"
-      ],
-      "execution_count": null,
-      "outputs": []
     },
     {
       "cell_type": "markdown",
@@ -436,9 +256,10 @@
       "source": [
         "## Notes\n",
         "\n",
-        "- **Motion Format**: The `.npz` files contain SMPL-X format motion data\n",
-        "- **Blender Visualization**: Use the [Blender Add-on](https://huggingface.co/datasets/H-Liu1997/BEAT2_Tools/blob/main/smplx_blender_addon_20230921.zip) for high-quality rendering\n",
-        "- **HuggingFace Space**: Try the [interactive demo](https://huggingface.co/spaces/H-Liu1997/EMAGE) for quick testing"
       ]
     }
   ],
@@ -455,4 +276,4 @@
   },
   "nbformat": 4,
   "nbformat_minor": 4
-}

         "- **DisCo**: Upper body gesture generation with diffusion\n",
         "- **EMAGE**: Full body + face gesture generation\n",
         "\n",
+        "[Project Page](https://pantomatrix.github.io/EMAGE/) | [GitHub](https://github.com/PantoMatrix/PantoMatrix) | [Paper](https://arxiv.org/abs/2401.00374) | [HF Space](https://huggingface.co/spaces/H-Liu1997/EMAGE)"
       ]
     },
     {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
+        "## 1. Setup Environment\n",
+        "\n",
+        "Install Python 3.10, create a virtual environment, and install dependencies."
       ]
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {},
+      "outputs": [],
       "source": [
+        "# Install Python 3.10 from the deadsnakes PPA and create a dedicated virtual environment.\n",
+        "# Inference runs in this venv through PYTHON/PIP, not in the notebook kernel.\n",
+        "!sudo add-apt-repository -y ppa:deadsnakes/ppa > /dev/null 2>&1\n",
+        "!sudo apt-get update -qq\n",
+        "!sudo apt-get install -y python3.10 python3.10-venv python3.10-dev > /dev/null 2>&1\n",
         "\n",
+        "ENV_PATH = \"/content/py310_env\"\n",
+        "!python3.10 -m venv {ENV_PATH}\n",
         "\n",
+        "PYTHON = f\"{ENV_PATH}/bin/python\"\n",
+        "PIP = f\"{ENV_PATH}/bin/pip\"\n",
+        "\n",
+        "# Install dependencies.\n",
+        "# torchvision/torchaudio are pinned to the releases built against torch 2.1.2 so pip\n",
+        "# does not have to backtrack through newer, incompatible wheels.\n",
+        "!{PIP} install -q --upgrade pip\n",
+        "!{PIP} install -q torch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 --index-url https://download.pytorch.org/whl/cu121\n",
+        "!{PIP} install -q numpy==1.23.0 librosa soundfile transformers huggingface_hub\n",
+        "!{PIP} install -q smplx trimesh scipy easydict omegaconf\n",
+        "# NOTE(review): run_inference.py calls torchvision.io.write_video, which needs PyAV (`av`).\n",
+        "# Nothing above installs it explicitly -- confirm it is pulled in, or add a pinned `av`.\n",
+        "\n",
+        "# Verify\n",
+        "!{PYTHON} -c \"import torch; print(f'Python 3.10 + PyTorch {torch.__version__} + CUDA: {torch.cuda.is_available()}')\""
+      ]
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {},
+      "outputs": [],
       "source": [
+        "# Clone the PantoMatrix code and the evaluation tools (Git LFS repository).\n",
         "!apt-get install -y git-lfs > /dev/null 2>&1\n",
         "!git lfs install\n",
         "\n",
+        "# Skip a clone whose checkout already exists so this cell can be re-run without git errors.\n",
+        "![ -d /content/PantoMatrix/.git ] || git clone https://github.com/PantoMatrix/PantoMatrix.git /content/PantoMatrix\n",
+        "![ -d /content/PantoMatrix/emage_evaltools/.git ] || git clone https://huggingface.co/H-Liu1997/emage_evaltools /content/PantoMatrix/emage_evaltools\n",
+        "%cd /content/PantoMatrix/emage_evaltools\n",
         "!git lfs pull\n",
+        "%cd /content\n",
         "\n",
+        "print(\"Code ready!\")"
+      ]
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {},
+      "outputs": [],
       "source": [
+        "%%writefile /content/run_inference.py\n",
+        "\"\"\"Generate gestures from one audio file with CaMN, DisCo, or EMAGE.\n",
+        "\n",
+        "Writes <model>_output.npz (motion) and <model>_output_2d.mp4 (2D preview)\n",
+        "into --output_dir.\n",
+        "\"\"\"\n",
+        "import sys, os\n",
+        "# Make the cloned repository importable and run from inside it.\n",
+        "sys.path.insert(0, '/content/PantoMatrix')\n",
+        "os.chdir('/content/PantoMatrix')\n",
         "\n",
+        "import torch, numpy as np, librosa, argparse\n",
+        "import torch.nn.functional as F\n",
+        "from torchvision.io import write_video\n",
         "from models.camn_audio import CamnAudioModel\n",
         "from models.disco_audio import DiscoAudioModel\n",
         "from models.emage_audio import EmageAudioModel, EmageVQVAEConv, EmageVAEConv, EmageVQModel\n",
         "from emage_utils.motion_io import beat_format_save\n",
         "from emage_utils.npz2pose import render2d\n",
         "\n",
+        "# Body parts that have their own EMAGE VQ model; cfg flags are keyed by the first letter.\n",
+        "BODY_PARTS = ('face', 'upper', 'lower', 'hands')\n",
+        "\n",
+        "\n",
+        "def _load_audio(path, sr, device):\n",
+        "    \"\"\"Load `path` resampled to `sr`; return a float tensor with a leading batch axis on `device`.\"\"\"\n",
+        "    audio, _ = librosa.load(path, sr=sr)\n",
+        "    return torch.from_numpy(audio).float().unsqueeze(0).to(device)\n",
+        "\n",
+        "\n",
+        "def _speaker_id(device):\n",
+        "    \"\"\"Return the all-zero (1, 1) long tensor passed to every model as its second argument.\"\"\"\n",
+        "    return torch.zeros(1, 1).long().to(device)\n",
+        "\n",
+        "\n",
+        "def _flatten(tensor, n_frames):\n",
+        "    \"\"\"Move `tensor` to the CPU and reshape it to an (n_frames, -1) NumPy array.\"\"\"\n",
+        "    return tensor.cpu().numpy().reshape(n_frames, -1)\n",
+        "\n",
+        "\n",
+        "def _run_seeded(model, extra_kwargs, audio_path, npz_path, device):\n",
+        "    \"\"\"Run a CaMN/DisCo model on `audio_path` and save its 'motion_axis_angle' output to `npz_path`.\"\"\"\n",
+        "    cfg = model.cfg\n",
+        "    audio_t = _load_audio(audio_path, cfg.audio_sr, device)\n",
+        "    with torch.no_grad():\n",
+        "        motion = model(audio_t, _speaker_id(device), seed_frames=cfg.seed_frames, **extra_kwargs)['motion_axis_angle']\n",
+        "    beat_format_save(npz_path, _flatten(motion, motion.shape[1]), upsample=30 // cfg.pose_fps)\n",
+        "\n",
+        "\n",
+        "def _run_emage(audio_path, npz_path, device):\n",
+        "    \"\"\"Run EMAGE on `audio_path` and save motion, expressions and translation to `npz_path`.\"\"\"\n",
+        "    repo = 'H-Liu1997/emage_audio'\n",
+        "    vq_models = {part: EmageVQVAEConv.from_pretrained(repo, subfolder=f'emage_vq/{part}').to(device).eval()\n",
+        "                 for part in BODY_PARTS}\n",
+        "    global_ae = EmageVAEConv.from_pretrained(repo, subfolder='emage_vq/global').to(device).eval()\n",
+        "    vq = EmageVQModel(face_model=vq_models['face'], upper_model=vq_models['upper'],\n",
+        "                      lower_model=vq_models['lower'], hands_model=vq_models['hands'],\n",
+        "                      global_model=global_ae).to(device).eval()\n",
+        "    model = EmageAudioModel.from_pretrained(repo).to(device).eval()\n",
+        "    cfg = model.cfg\n",
+        "    audio_t = _load_audio(audio_path, cfg.audio_sr, device)\n",
+        "    with torch.no_grad():\n",
+        "        lat = model.inference(audio_t, _speaker_id(device), vq, masked_motion=None, mask=None)\n",
+        "        decode_kwargs = {}\n",
+        "        for part in BODY_PARTS:\n",
+        "            # cfg.l<x> / cfg.c<x> (x = first letter of the part) choose between the\n",
+        "            # reconstructed latent and the classified codebook index for that part.\n",
+        "            latent_flag = getattr(cfg, f'l{part[0]}')\n",
+        "            cls_flag = getattr(cfg, f'c{part[0]}')\n",
+        "            decode_kwargs[f'{part}_latent'] = lat[f'rec_{part}'] if latent_flag > 0 and cls_flag == 0 else None\n",
+        "            decode_kwargs[f'{part}_index'] = (torch.max(F.log_softmax(lat[f'cls_{part}'], dim=2), dim=2)[1]\n",
+        "                                              if cls_flag > 0 else None)\n",
+        "        pred = vq.decode(get_global_motion=True, ref_trans=torch.zeros(1, 3).to(device), **decode_kwargs)\n",
+        "    motion = pred['motion_axis_angle']\n",
+        "    n_frames = motion.shape[1]\n",
+        "    beat_format_save(npz_path, _flatten(motion, n_frames), upsample=30 // cfg.pose_fps,\n",
+        "                     expressions=_flatten(pred['expression'], n_frames),\n",
+        "                     trans=_flatten(pred['trans'], n_frames))\n",
+        "\n",
+        "\n",
+        "def main():\n",
+        "    \"\"\"Parse the CLI, run the selected model, then render a 2D preview video of the result.\"\"\"\n",
+        "    parser = argparse.ArgumentParser()\n",
+        "    parser.add_argument('--audio', type=str, required=True)\n",
+        "    parser.add_argument('--model', type=str, default='camn', choices=['camn', 'disco', 'emage'])\n",
+        "    parser.add_argument('--output_dir', type=str, default='/content/outputs')\n",
+        "    args = parser.parse_args()\n",
+        "\n",
+        "    os.makedirs(args.output_dir, exist_ok=True)\n",
+        "    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
+        "    print(f'Using device: {device}')\n",
+        "\n",
+        "    # `choices` restricts args.model, so this is camn_/disco_/emage_output.npz.\n",
+        "    npz_path = os.path.join(args.output_dir, f'{args.model}_output.npz')\n",
+        "    if args.model == 'camn':\n",
+        "        model = CamnAudioModel.from_pretrained('H-Liu1997/camn_audio').to(device).eval()\n",
+        "        _run_seeded(model, {}, args.audio, npz_path, device)\n",
+        "    elif args.model == 'disco':\n",
+        "        model = DiscoAudioModel.from_pretrained('H-Liu1997/disco_audio').to(device).eval()\n",
+        "        _run_seeded(model, {'seed_motion': None}, args.audio, npz_path, device)\n",
+        "    else:  # emage\n",
+        "        _run_emage(args.audio, npz_path, device)\n",
+        "\n",
+        "    # Render 2D visualization\n",
+        "    motion_dict = np.load(npz_path, allow_pickle=True)\n",
+        "    v2d = render2d(motion_dict, (720, 480), face_only=False, remove_global=True)\n",
+        "    # splitext only touches the extension; str.replace would also rewrite a '.npz' inside --output_dir.\n",
+        "    video_path = os.path.splitext(npz_path)[0] + '_2d.mp4'\n",
+        "    write_video(video_path, v2d.permute(0, 2, 3, 1), fps=30)\n",
+        "    print(f'Saved: {npz_path}')\n",
+        "    print(f'Video: {video_path}')\n",
+        "\n",
+        "\n",
+        "if __name__ == '__main__':\n",
+        "    main()"
+      ]
     },
     {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
+        "## 2. Run Inference\n",
         "\n",
+        "Choose your audio and model, then run inference."
       ]
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {},
+      "outputs": [],
       "source": [
+        "from IPython.display import Audio\n",
+        "\n",
+        "# Default input: the example clip that ships with the PantoMatrix repository.\n",
+        "audio_path = \"/content/PantoMatrix/examples/audio/2_scott_0_103_103_10s.wav\"\n",
         "\n",
+        "# To use your own recording instead, uncomment the three lines below:\n",
         "# from google.colab import files\n",
         "# uploaded = files.upload()\n",
+        "# audio_path = \"/content/\" + list(uploaded.keys())[0]\n",
         "\n",
         "display(Audio(audio_path))"
       ]
     },
     {
       "cell_type": "code",
       "execution_count": null,
       "metadata": {},
+      "outputs": [],
       "source": [
+        "# Run CaMN (Upper Body) inside the Python 3.10 venv, then show the rendered preview.\n",
+        "PYTHON = \"/content/py310_env/bin/python\"\n",
+        "# Quote the path so a filename containing spaces reaches the script as one argument.\n",
+        "!{PYTHON} /content/run_inference.py --audio \"{audio_path}\" --model camn\n",
         "\n",
+        "from IPython.display import Video\n",
+        "display(Video(\"/content/outputs/camn_output_2d.mp4\", embed=True, width=600))"
       ]
     },
     {
       "cell_type": "code",
       "execution_count": null,
       "metadata": {},
+      "outputs": [],
       "source": [
+        "# Run DisCo (Upper Body with Diffusion) inside the Python 3.10 venv, then show the rendered preview.\n",
+        "PYTHON = \"/content/py310_env/bin/python\"\n",
+        "# Quote the path so a filename containing spaces reaches the script as one argument.\n",
+        "!{PYTHON} /content/run_inference.py --audio \"{audio_path}\" --model disco\n",
         "\n",
+        "from IPython.display import Video\n",
+        "display(Video(\"/content/outputs/disco_output_2d.mp4\", embed=True, width=600))"
       ]
     },
     {
       "cell_type": "code",
       "execution_count": null,
       "metadata": {},
+      "outputs": [],
       "source": [
+        "# Run EMAGE (Full Body + Face) inside the Python 3.10 venv, then show the rendered preview.\n",
+        "PYTHON = \"/content/py310_env/bin/python\"\n",
+        "# Quote the path so a filename containing spaces reaches the script as one argument.\n",
+        "!{PYTHON} /content/run_inference.py --audio \"{audio_path}\" --model emage\n",
         "\n",
+        "from IPython.display import Video\n",
+        "display(Video(\"/content/outputs/emage_output_2d.mp4\", embed=True, width=600))"
       ]
     },
     {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
+        "## 3. Download Results\n",
         "\n",
+        "Download motion files (`.npz`) for use with Blender."
       ]
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {},
+      "outputs": [],
       "source": [
+        "import os\n",
+        "\n",
+        "OUTPUT_DIR = \"/content/outputs\"\n",
+        "\n",
         "print(\"Generated files:\")\n",
+        "# The directory is created by run_inference.py, so it is missing until a model has been run;\n",
+        "# list nothing in that case instead of raising FileNotFoundError.\n",
+        "for f in (sorted(os.listdir(OUTPUT_DIR)) if os.path.isdir(OUTPUT_DIR) else []):\n",
+        "    print(f\"  {OUTPUT_DIR}/{f}\")\n",
         "\n",
+        "# Uncomment to download:\n",
+        "# from google.colab import files\n",
+        "# files.download(\"/content/outputs/camn_output.npz\")\n",
+        "# files.download(\"/content/outputs/disco_output.npz\")\n",
+        "# files.download(\"/content/outputs/emage_output.npz\")"
+      ]
     },
     {
       "cell_type": "markdown",
       "source": [
         "## Notes\n",
         "\n",
+        "- **Environment**: Python 3.10.x + PyTorch 2.1.2 + CUDA 12.1\n",
+        "- **Motion Format**: `.npz` files contain SMPL-X format motion data\n",
+        "- **Visualization**: Use the [Blender Add-on](https://huggingface.co/datasets/H-Liu1997/BEAT2_Tools/blob/main/smplx_blender_addon_20230921.zip) for high-quality rendering\n",
+        "- **Interactive Demo**: [HuggingFace Space](https://huggingface.co/spaces/H-Liu1997/EMAGE)"
       ]
     }
   ],
   },
   "nbformat": 4,
   "nbformat_minor": 4
+}