Spaces:

MLAdaptiveIntelligence
/

PRIMA-demo

Running

App Files Files Community

HF Space deploy committed 19 days ago

Commit

2ba375b

0 Parent(s):

Deploy snapshot (LFS for demo images per .gitattributes)

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +5 -0
.github/workflows/check-headers.yml +36 -0
.github/workflows/codespell.yml +21 -0
.github/workflows/release-pypi.yml +48 -0
.gitignore +175 -0
README.md +220 -0
app.py +561 -0
chumpy/__init__.py +16 -0
chumpy/ch.py +37 -0
configs_hydra/experiment/default.yaml +28 -0
configs_hydra/experiment/default_val.yaml +34 -0
configs_hydra/experiment/primaStage1.yaml +83 -0
configs_hydra/experiment/primaStage2.yaml +113 -0
configs_hydra/extras/default.yaml +8 -0
configs_hydra/hydra/default.yaml +26 -0
configs_hydra/launcher/local.yaml +13 -0
configs_hydra/launcher/slurm.yaml +22 -0
configs_hydra/paths/default.yaml +18 -0
configs_hydra/train.yaml +46 -0
configs_hydra/trainer/cpu.yaml +6 -0
configs_hydra/trainer/ddp.yaml +14 -0
configs_hydra/trainer/default.yaml +10 -0
configs_hydra/trainer/default_amr.yaml +9 -0
configs_hydra/trainer/gpu.yaml +6 -0
configs_hydra/trainer/mps.yaml +6 -0
demo.py +144 -0
demo_data/000000015956_horse.png +3 -0
demo_data/000000315905_zebra.jpg +3 -0
demo_data/beagle.jpg +3 -0
demo_data/n02101388_1188.png +3 -0
demo_data/n02412080_12159.png +3 -0
demo_data/shepherd_hati.jpg +3 -0
demo_tta.py +340 -0
eval.py +102 -0
images/teaser.png +3 -0
packages.txt +4 -0
prima/__init__.py +25 -0
prima/configs/__init__.py +99 -0
prima/models/__init__.py +54 -0
prima/models/backbones/__init__.py +19 -0
prima/models/backbones/vit.py +375 -0
prima/models/bioclip_embedding.py +70 -0
prima/models/components/__init__.py +0 -0
prima/models/components/model_utils.py +160 -0
prima/models/components/pose_transformer.py +366 -0
prima/models/components/position_encoding.py +84 -0
prima/models/components/t_cond_mlp.py +204 -0
prima/models/components/transformer.py +400 -0
prima/models/discriminator.py +129 -0
prima/models/heads/__init__.py +1 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,5 @@

+# Hugging Face Hub stores these via Git LFS / Xet (plain PNG/JPG in git are rejected on push).
+demo_data/*.png filter=lfs diff=lfs merge=lfs -text
+demo_data/*.jpg filter=lfs diff=lfs merge=lfs -text
+demo_data/*.jpeg filter=lfs diff=lfs merge=lfs -text
+images/*.png filter=lfs diff=lfs merge=lfs -text

.github/workflows/check-headers.yml ADDED Viewed

	@@ -0,0 +1,36 @@

+---
+    name: Check File Headers
+    on:
+      push:
+        branches: [main]
+      pull_request:
+        branches: [main]
+    jobs:
+      check-headers:
+        name: Check Python file headers
+        runs-on: ubuntu-latest
+        permissions:
+          contents: read
+        steps:
+          - name: Checkout code
+            uses: actions/checkout@v3
+          - name: Set up Python
+            uses: actions/setup-python@v4
+            with:
+              python-version: "3.10"
+          - name: Check headers
+            run: |
+              python scripts/update_headers.py --check
+            continue-on-error: false
+          - name: Provide fix instructions
+            if: failure()
+            run: |
+              echo "::error::Some files are missing proper headers."
+              echo "To fix this, run: python scripts/update_headers.py"
+              echo "Then commit the changes."

.github/workflows/codespell.yml ADDED Viewed

	@@ -0,0 +1,21 @@

+---
+    name: Codespell
+    on:
+      push:
+        branches: [main]
+      pull_request:
+        branches: [main]
+    jobs:
+      codespell:
+        name: Check for spelling errors
+        runs-on: ubuntu-latest
+        steps:
+          - name: Checkout
+            uses: actions/checkout@v3
+          - name: Codespell
+            uses: codespell-project/actions-codespell@v1
+            with:
+               ignore_words_list: prima-animal, mpjpe, uvd, xyz, hm36, cpn, dbb

.github/workflows/release-pypi.yml ADDED Viewed

	@@ -0,0 +1,48 @@

+name: Update pypi release
+on:
+  push:
+    tags:
+      - 'v*.*.*'
+  pull_request:
+    branches:
+      - main
+    types:
+      - labeled
+      - opened
+      - edited
+      - synchronize
+      - reopened
+jobs:
+  release:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Cache dependencies
+        id: pip-cache
+        uses: actions/cache@v4
+        with:
+          path: ~/.cache/pip
+          key: ${{ runner.os }}-pip
+      - name: Install dependencies
+        run: |
+          pip install --upgrade pip
+          pip install wheel
+          # NOTE(stes) see https://github.com/pypa/twine/issues/1216#issuecomment-2629069669
+          pip install "packaging>=24.2"
+      - name: Checkout code
+        uses: actions/checkout@v3
+      - name: Build and publish to PyPI
+        if: ${{ github.event_name == 'push' }}
+        env:
+          TWINE_USERNAME: __token__
+          TWINE_PASSWORD: ${{ secrets.TWINE_API_KEY }}
+        run: |
+          pip install build twine
+          python3 -m build
+          ls dist/
+          python3 -m twine upload --verbose dist/*

.gitignore ADDED Viewed

	@@ -0,0 +1,175 @@

+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+.pdm.toml
+.pdm-python
+.pdm-build/
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+# Vscode
+.vscode/
+# Directory
+.gradio/
+demo_out/
+demo_out*/
+data/PRIMA*/
+data/backbone.pth
+logs/
+*.pth
+*.pkl
+datasets/

README.md ADDED Viewed

	@@ -0,0 +1,220 @@

+---
+title: PRIMA Demo
+emoji: 🦮
+colorFrom: blue
+colorTo: green
+sdk: gradio
+python_version: "3.10"
+app_file: app.py
+startup_duration_timeout: 60m
+---
+# PRIMA: Boosting Animal Mesh Recovery with Biological Priors and Test-Time Adaptation
+This is the official implementation of the approach described in the preprint:
+PRIMA: Boosting Animal Mesh Recovery with Biological Priors and Test-Time Adaptation \
+Xiaohang Yu, Ti Wang, Mackenzie Weygandt Mathis
+![PRIMA teaser](images/teaser.png)
+---
+## 🚀 TL;DR
+PRIMA creates a 3D quadruped mesh from a single 2D image. It leverages BioCLIP-based biological priors for robust cross-species shape understanding, then applies test-time adaptation with 2D reprojection and auxiliary keypoint guidance to refine SMAL pose and shape predictions.
+It can also be used to build Quadruped3D, a large-scale pseudo-3D dataset with diverse species and poses.
+PRIMA achieves state-of-the-art results on Animal3D, CtrlAni3D, Quadruped2D, and Animal Kingdom datasets.
+## Installation
+### Install from PyPI
+> Recommended: Python 3.10 and a CUDA-enabled PyTorch installation.
+```bash
+conda create -n prima python=3.10 -y
+conda activate prima
+# Install PyTorch matching your CUDA (example: CUDA 11.8)
+pip install --index-url https://download.pytorch.org/whl/cu118 \
+    "torch==2.2.1" "torchvision==0.17.1" "torchaudio==2.2.1"
+# Install chumpy and PyTorch3D
+python -m pip install --no-build-isolation \
+      "git+https://github.com/mattloper/chumpy.git"
+python -m pip install --no-build-isolation \
+      "git+https://github.com/facebookresearch/pytorch3d.git"
+# Install PRIMA from PyPI
+pip install prima-animal
+```
+`prima-animal` includes demo runtime dependencies used by `demo.py`, `demo_tta.py`, and `app.py` (including Detectron2 and DeepLabCut).
+---
+## Demo
+### Checkpoints and data
+We provide an automated demo-download script for models hosted on Hugging Face.
+Use the helper script to download and place all demo assets automatically in `data/`:
+```bash
+python scripts/setup_demo_data.py --hf-repo-id MLAdaptiveIntelligence/PRIMA
+```
+The total download volume from Hugging Face is approximately 24 GB
+(`s1ckpt.ckpt` ~10.2 GB + `s3ckpt.ckpt` ~10.2 GB + `amr_vitbb.pth` ~2.5 GB + SMAL files).
+Expected time is roughly:
+- 100 Mbps: ~35-45 minutes
+- 300 Mbps: ~12-18 minutes
+- 1 Gbps: ~4-8 minutes
+To avoid re-downloading completed assets, rerun without `--force`. The script now
+re-downloads only missing or invalid checkpoints.
+Expected files in that Hugging Face repo root:
+- `my_smpl_00781_4_all.pkl`
+- `my_smpl_data_00781_4_all.pkl`
+- `walking_toy_symmetric_pose_prior_with_cov_35parts.pkl`
+- `amr_vitbb.pth`
+- `config_s1_HYDRA.yaml`
+- `config_s3_HYDRA.yaml`
+- `s1ckpt.ckpt`
+- `s3ckpt.ckpt`
+### Demo (without TTA)
+Run animal detection + PRIMA 3D pose/shape inference:
+```bash
+python demo.py \
+  --checkpoint data/PRIMAS1/checkpoints/s1ckpt.ckpt \
+  --img_folder demo_data/ \
+  --out_folder demo_out/
+```
+Outputs are written to `demo_out/`.
+---
+### Demo (with TTA)
+Run the `demo_tta.py` pipeline, specifying the TTA learning rate and the number of TTA iterations.
+Example:
+```bash
+python demo_tta.py \
+  --checkpoint data/PRIMAS1/checkpoints/s1ckpt.ckpt \
+  --img_folder demo_data/ \
+  --out_folder demo_out_tta/ \
+  --tta_lr 1e-6 \
+  --tta_num_iters 30
+```
+Outputs are written to `demo_out_tta/` (before/after TTA renders, keypoints, and optional meshes).
+---
+### Gradio demo
+We also provide a simple Gradio-based web demo for interactive testing in the
+browser:
+```bash
+python app.py \
+  --checkpoint data/PRIMAS1/checkpoints/s1ckpt.ckpt \
+  --out_folder demo_out_tta_gradio/
+```
+This starts a local Gradio app (by default on http://127.0.0.1:7860), where
+you can upload images and visualize PRIMA predictions and adaptation results.
+#### Hugging Face Space (maintainers)
+Demo images under `demo_data/` and `images/teaser.png` are tracked with **Git LFS**
+(see `.gitattributes`) so they can be pushed to a Hugging Face Space under the Hub’s
+LFS / **Xet** bridge. Install tooling once:
+```bash
+brew install git-lfs git-xet
+git xet install
+git lfs install
+```
+Then from a clean checkout with LFS files present, deploy the Space repo:
+```bash
+./scripts/deploy_hf_space.sh
+```
+The script rsyncs the working tree (not `git archive`) so image files are materialized
+before `git add` turns them into LFS blobs.
+---
+## Training and Evaluation
+### Dataset Setup
+Download datasets from [Animal3D](https://xujiacong.github.io/Animal3D/), [CtrlAni3D](https://github.com/luoxue-star/AniMer?tab=readme-ov-file#training), Quadruped2D, and [Animal Kingdom](https://drive.google.com/file/d/1dk2a0qB0fbVZ4X6eAgP6VJVXj0rxVfsJ/view?usp=drive_link). For Quadruped2D, download the images from [SuperAnimal-Quadruped80K](https://zenodo.org/records/14016777) and our processed annotations from [here](https://drive.google.com/drive/folders/1eBNboxVwl_eGPoC93zxf-U3hmE6e2f-f?usp=sharing). Put all the datasets under `datasets/`.
+### Training
+Two-stage training script:
+```bash
+bash train.sh
+```
+Training outputs are written to `logs/train/runs/<exp_name>/`.
+### Evaluation
+```bash
+python eval.py \
+  --config data/PRIMAS1/.hydra/config.yaml \
+  --checkpoint data/PRIMAS1/checkpoints/s1ckpt.ckpt
+```
+The values used for `--dataset` are configured in:
+- `configs_hydra/experiment/default_val.yaml`
+---
+## Acknowledgements
+This release builds on several open-source projects, including:
+- [Detectron2](https://github.com/facebookresearch/detectron2)
+- [BioCLIP](https://github.com/Imageomics/BioCLIP)
+- [AniMer](https://github.com/luoxue-star/AniMer)
+- [DeepLabCut](https://github.com/DeepLabCut/DeepLabCut)
+---
+## Citation
+If you use this code in your research, please cite our PRIMA paper.
+```bibtex
+@misc{yu_prima,
+  title={PRIMA: Boosting Animal Mesh Recovery with Biological Priors and Test-Time Adaptation},
+  author={Xiaohang Yu and Ti Wang and Mackenzie Weygandt Mathis},
+}
+```
+---
+## Contact
+For issues, please open a GitHub issue in this repository.

app.py ADDED Viewed

	@@ -0,0 +1,561 @@

+"""
+PRIMA: Boosting Animal Mesh Recovery with Biological Priors and Test-Time Adaptation
+Official implementation of the paper:
+"PRIMA: Boosting Animal Mesh Recovery with Biological Priors and Test-Time Adaptation"
+by Xiaohang Yu, Ti Wang, and Mackenzie Weygandt Mathis
+Licensed under a modified MIT license
+"""
+"""Gradio demo for PRIMA + SuperAnimal + TTA.
+This script wraps the ``demo_tta.py`` pipeline into an interactive
+Gradio interface. The overall logic follows:
+1. Given an input image, run Detectron2 to detect animals.
+2. For each detected animal, run PRIMA for 3D pose/shape estimation.
+3. Run DeepLabCut SuperAnimal to obtain 2D keypoints.
+4. Map SuperAnimal 39 keypoints to the 26 PRIMA keypoints.
+5. Run test-time adaptation (TTA) with user-specified lr and iters.
+6. Render and save before/after TTA results and keypoint visualizations.
+"""
+import argparse
+import os
+import sys
+import tempfile
+import traceback
+from types import SimpleNamespace
+from typing import List, Tuple
+from pathlib import Path
+import cv2
+import gradio as gr
+import numpy as np
+import torch
+import torch.utils.data
+# Repo-local minimal ``chumpy`` shim (see ``chumpy/__init__.py``) so SMAL pickles load
+# without installing the full chumpy package in Space builds.
+_REPO_ROOT = Path(__file__).resolve().parent
+if str(_REPO_ROOT) not in sys.path:
+    sys.path.insert(0, str(_REPO_ROOT))
+# Default checkpoint path following README instructions
+DEFAULT_CHECKPOINT = "data/PRIMAS1/checkpoints/s1ckpt.ckpt"
+DEFAULT_HF_ASSET_REPO = "MLAdaptiveIntelligence/PRIMA"
+# Output folder for rendered images/meshes and keypoints
+DEFAULT_OUT_FOLDER = "demo_out_tta_gradio"
+def _is_truthy_env(var_name: str) -> bool:
+    return os.environ.get(var_name, "").strip().lower() in {"1", "true", "yes", "on"}
+def _running_on_space() -> bool:
+    return bool(os.environ.get("SPACE_ID") or os.environ.get("HF_SPACE_ID"))
+def _gradio_examples_for_interface() -> List[List]:
+    """Gradio prefetches example media at startup.
+    Demo images are tracked with Git LFS / Xet (see ``.gitattributes``) so they can live
+    in the Hugging Face Space repo. Use absolute paths only when files exist beside ``app.py``.
+    """
+    if _is_truthy_env("PRIMA_DISABLE_GRADIO_EXAMPLES"):
+        return []
+    rows: List[List] = []
+    template: List[Tuple[str, float, int, float, float, bool, bool]] = [
+        ("demo_data/000000015956_horse.png", 1e-6, 30, 0.7, 0.1, False, True),
+        ("demo_data/n02412080_12159.png", 1e-6, 30, 0.7, 0.1, False, True),
+        ("demo_data/000000315905_zebra.jpg", 1e-6, 30, 0.7, 0.1, False, True),
+        ("demo_data/beagle.jpg", 1e-6, 0, 0.7, 0.1, False, True),
+        ("demo_data/shepherd_hati.jpg", 1e-6, 0, 0.7, 0.1, False, True),
+    ]
+    for rel, *rest in template:
+        p = _REPO_ROOT / rel
+        if p.is_file():
+            rows.append([str(p), *rest])
+    return rows
+def _should_preload_assets() -> bool:
+    """Default to preload on Spaces; configurable via PRIMA_PRELOAD_ASSETS."""
+    preload_env = os.environ.get("PRIMA_PRELOAD_ASSETS")
+    if preload_env is not None:
+        return _is_truthy_env("PRIMA_PRELOAD_ASSETS")
+    return _running_on_space()
+def _ensure_demo_assets(checkpoint_path: str) -> None:
+    """Download required demo assets when running in a clean environment."""
+    from scripts.setup_demo_data import (
+        maybe_download_smal,
+        maybe_download_backbone,
+        maybe_download_stage,
+    )
+    checkpoint = Path(checkpoint_path)
+    data_dir = checkpoint.parents[2]
+    hf_repo_id = os.environ.get("PRIMA_HF_REPO_ID", DEFAULT_HF_ASSET_REPO)
+    maybe_download_smal(data_dir, force=False, hf_repo_id=hf_repo_id)
+    maybe_download_backbone(data_dir, force=False, hf_repo_id=hf_repo_id)
+    maybe_download_stage(
+        "PRIMAS1",
+        "config_s1_HYDRA.yaml",
+        "s1ckpt.ckpt",
+        "s1ckpt.ckpt",
+        data_dir,
+        force=False,
+        hf_repo_id=hf_repo_id,
+    )
+def _preload_assets_once(checkpoint_path: str) -> None:
+    checkpoint = Path(checkpoint_path)
+    cfg_path = checkpoint.parent.parent / ".hydra" / "config.yaml"
+    if checkpoint.exists() and cfg_path.exists():
+        print("[startup] Assets already present; skipping preload.")
+        return
+    print("[startup] Preloading demo assets from Hugging Face Hub...")
+    _ensure_demo_assets(checkpoint_path)
+    print("[startup] Asset preload complete.")
+def _load_prima_model(checkpoint_path: str = DEFAULT_CHECKPOINT):
+    """Load PRIMA model and renderer once for the Gradio app."""
+    from prima.models import load_prima
+    from prima.utils.renderer import Renderer
+    checkpoint = Path(checkpoint_path)
+    cfg_path = checkpoint.parent.parent / ".hydra" / "config.yaml"
+    if not checkpoint.exists() or not cfg_path.exists():
+        _ensure_demo_assets(checkpoint_path)
+    if not checkpoint.exists():
+        raise FileNotFoundError(
+            f"Missing checkpoint: {checkpoint}. Download demo checkpoints/data as described in README."
+        )
+    if not cfg_path.exists():
+        raise FileNotFoundError(
+            f"Missing model config: {cfg_path}. Ensure the full checkpoint folder layout from README is present."
+        )
+    model, model_cfg = load_prima(checkpoint_path)
+    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+    model = model.to(device)
+    model.eval()
+    renderer = Renderer(model_cfg, faces=model.smal.faces)
+    return model, model_cfg, renderer, device
+def _build_detector():
+    """Build Detectron2 animal detector (same config as demo_tta/demo.py)."""
+    try:
+        import detectron2.config
+        import detectron2.engine
+        from detectron2 import model_zoo
+    except Exception as e:
+        print(f"[warn] Detectron2 unavailable ({type(e).__name__}: {e}); using full-image fallback bbox.")
+        return None
+    cfg = detectron2.config.get_cfg()
+    cfg.merge_from_file(
+        model_zoo.get_config_file("COCO-Detection/faster_rcnn_X_101_32x8d_FPN_3x.yaml")
+    )
+    cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.5
+    cfg.MODEL.WEIGHTS = (
+        "https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/"
+        "faster_rcnn_X_101_32x8d_FPN_3x/139173657/model_final_68b088.pkl"
+    )
+    cfg.MODEL.DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+    detector = detectron2.engine.DefaultPredictor(cfg)
+    return detector
+# SuperAnimal defaults (same as in demo_tta parser).
+# This namespace is passed to ``run_superanimal_on_patch`` for every detected animal;
+# it presumably stands in for the parsed demo_tta CLI arguments -- confirm in demo_tta.py.
+SUPER_ANIMAL_ARGS = SimpleNamespace(
+    superanimal_name="superanimal_quadruped",
+    superanimal_model_name="hrnet_w32",
+    superanimal_detector_name="fasterrcnn_resnet50_fpn_v2",
+    superanimal_max_individuals=1,
+)
+def _collect_animal_results(
+    model,
+    model_cfg,
+    renderer,
+    device,
+    detector,
+    out_folder: str,
+    img_rgb: np.ndarray,
+    tta_lr: float,
+    tta_num_iters: int,
+    det_thresh: float,
+    kp_conf_thresh: float,
+    side_view: bool,
+    save_mesh: bool,
+) -> Tuple[List[np.ndarray], List[np.ndarray], List[np.ndarray], str | None, str | None]:
+    """Run detection + PRIMA + SuperAnimal + TTA on a single RGB image.
+    Args:
+        model, model_cfg, renderer, device: objects returned by ``_load_prima_model``.
+        detector: Detectron2 predictor, or ``None`` to treat the full image as one box.
+        out_folder: directory where renders, keypoint files and meshes are written
+            and then read back from.
+        img_rgb: input image in RGB channel order.
+        tta_lr: learning rate forwarded to ``tta_optimize``.
+        tta_num_iters: number of TTA iterations; a value <= 0 skips SuperAnimal and
+            TTA and re-renders the initial prediction as the "after" result.
+        det_thresh: detections are kept only when their score is strictly greater.
+        kp_conf_thresh: keypoints with confidence below this get confidence 0.
+        side_view, save_mesh: forwarded to ``render_and_save``.
+    Returns:
+        before_imgs: list of HxWx3 RGB images (before TTA) for all animals
+        after_imgs: list of HxWx3 RGB images (after TTA) for all animals
+        kpt_imgs: list of HxWx3 RGB keypoint visualizations
+        first_before_mesh: path to first animal's before-TTA mesh (.obj) or None
+        first_after_mesh: path to first animal's after-TTA mesh (.obj) or None
+        When a detector is given and no detection passes the class/score filter,
+        three empty lists and two ``None`` values are returned.
+    """
+    from prima.utils import recursive_to
+    from prima.datasets.vitdet_dataset import ViTDetDataset
+    from demo_tta import (
+        ANIMAL_COCO_IDS,
+        denorm_patch_to_rgb,
+        map_superanimal_to_prima,
+        run_superanimal_on_patch,
+        save_keypoint_vis,
+        tta_optimize,
+    )
+    # Detect animals
+    img_bgr = cv2.cvtColor(img_rgb, cv2.COLOR_RGB2BGR)
+    if detector is None:
+        # Fallback for environments where Detectron2 is unavailable: process full image as one crop.
+        h, w = img_bgr.shape[:2]
+        boxes = np.array([[0.0, 0.0, float(max(1, w - 1)), float(max(1, h - 1))]], dtype=np.float32)
+    else:
+        det_out = detector(img_bgr)
+        det_instances = det_out["instances"]
+        # Keep only detections whose class is an animal class and whose score beats det_thresh.
+        valid_idx = [
+            i
+            for i, (c, s) in enumerate(zip(det_instances.pred_classes, det_instances.scores))
+            if (int(c) in ANIMAL_COCO_IDS) and (float(s) > float(det_thresh))
+        ]
+        if len(valid_idx) == 0:
+            return [], [], [], None, None
+        boxes = det_instances.pred_boxes.tensor[valid_idx].cpu().numpy()
+    # One dataset item per box; batch_size=1 so each loop iteration handles one animal.
+    dataset = ViTDetDataset(model_cfg, img_bgr, boxes)
+    dataloader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=False, num_workers=0)
+    before_imgs: List[np.ndarray] = []
+    after_imgs: List[np.ndarray] = []
+    kpt_imgs: List[np.ndarray] = []
+    before_mesh_paths: List[str] = []
+    after_mesh_paths: List[str] = []
+    # Random per-call filename prefix so output files from different requests do not collide.
+    # NOTE(review): ``tempfile._get_candidate_names`` is a private stdlib API; consider a
+    # public alternative such as ``uuid.uuid4().hex``.
+    img_token = next(tempfile._get_candidate_names())
+    for batch in dataloader:
+        batch = recursive_to(batch, device)
+        with torch.no_grad():
+            out_before = model(batch)
+        animal_id = int(batch["animalid"][0])
+        # Save/render before TTA
+        img_fn = f"{img_token}"
+        from demo_tta import render_and_save  # imported lazily to avoid circular issues
+        render_and_save(
+            renderer,
+            out_before,
+            batch,
+            img_fn,
+            animal_id,
+            out_folder,
+            suffix="before_tta",
+            side_view=side_view,
+            save_mesh=save_mesh,
+        )
+        # The render is read back from disk; a missing or unreadable file is silently skipped.
+        before_png_path = os.path.join(out_folder, f"{img_fn}_{animal_id}_before_tta.png")
+        if os.path.exists(before_png_path):
+            before_bgr = cv2.imread(before_png_path)
+            if before_bgr is not None:
+                before_imgs.append(cv2.cvtColor(before_bgr, cv2.COLOR_BGR2RGB))
+        if save_mesh:
+            before_obj_path = os.path.join(out_folder, f"{img_fn}_{animal_id}_before_tta.obj")
+            if os.path.exists(before_obj_path):
+                before_mesh_paths.append(before_obj_path)
+        if int(tta_num_iters) <= 0:
+            # TTA disabled: render the unchanged initial prediction under the "after_tta"
+            # suffix so both outputs exist, then move on to the next animal.
+            render_and_save(
+                renderer,
+                out_before,
+                batch,
+                img_fn,
+                animal_id,
+                out_folder,
+                suffix="after_tta",
+                side_view=side_view,
+                save_mesh=save_mesh,
+            )
+            after_png_path = os.path.join(out_folder, f"{img_fn}_{animal_id}_after_tta.png")
+            if os.path.exists(after_png_path):
+                after_bgr = cv2.imread(after_png_path)
+                if after_bgr is not None:
+                    after_imgs.append(cv2.cvtColor(after_bgr, cv2.COLOR_BGR2RGB))
+            if save_mesh:
+                after_obj_path = os.path.join(out_folder, f"{img_fn}_{animal_id}_after_tta.obj")
+                if os.path.exists(after_obj_path):
+                    after_mesh_paths.append(after_obj_path)
+            continue
+        # Prepare patch for SuperAnimal
+        patch_rgb = denorm_patch_to_rgb(batch["img"][0])
+        # The scratch directory is removed when the ``with`` block exits; only the
+        # returned keypoint array is used afterwards.
+        with tempfile.TemporaryDirectory(prefix=f"dlc_{img_fn}_{animal_id}_") as tmp_dir:
+            bodyparts_xyc = run_superanimal_on_patch(patch_rgb, SUPER_ANIMAL_ARGS, tmp_dir)
+        if bodyparts_xyc is None:
+            # No keypoints => skip TTA for this animal
+            # (no "after" image is appended, so after_imgs may be shorter than before_imgs).
+            continue
+        mapped_xyc = map_superanimal_to_prima(bodyparts_xyc)
+        # Column 2 holds the confidence; zero it for keypoints below the threshold.
+        mapped_xyc[mapped_xyc[:, 2] < float(kp_conf_thresh), 2] = 0.0
+        # Save keypoint visualization and npy
+        kpt_png_path = os.path.join(out_folder, f"{img_fn}_{animal_id}_prima26_kpts.png")
+        save_keypoint_vis(patch_rgb, mapped_xyc, kpt_png_path)
+        npy_path = os.path.join(out_folder, f"{img_fn}_{animal_id}_prima26_kpts.npy")
+        np.save(npy_path, mapped_xyc)
+        if os.path.exists(kpt_png_path):
+            kpt_bgr = cv2.imread(kpt_png_path)
+            if kpt_bgr is not None:
+                kpt_imgs.append(cv2.cvtColor(kpt_bgr, cv2.COLOR_BGR2RGB))
+        # Normalize keypoints to [-0.5, 0.5] as in demo_tta
+        patch_h, patch_w = patch_rgb.shape[:2]
+        mapped_norm = mapped_xyc.copy()
+        mapped_norm[:, 0] = mapped_norm[:, 0] / float(patch_w) - 0.5
+        mapped_norm[:, 1] = mapped_norm[:, 1] / float(patch_h) - 0.5
+        # Add a leading batch axis and match the device/dtype of the input image tensor.
+        gt_kpts_norm = torch.from_numpy(mapped_norm[None]).to(device=device, dtype=batch["img"].dtype)
+        # Run TTA
+        out_after = tta_optimize(
+            model,
+            batch,
+            gt_kpts_norm,
+            num_iters=int(tta_num_iters),
+            lr=float(tta_lr),
+        )
+        render_and_save(
+            renderer,
+            out_after,
+            batch,
+            img_fn,
+            animal_id,
+            out_folder,
+            suffix="after_tta",
+            side_view=side_view,
+            save_mesh=save_mesh,
+        )
+        after_png_path = os.path.join(out_folder, f"{img_fn}_{animal_id}_after_tta.png")
+        if os.path.exists(after_png_path):
+            after_bgr = cv2.imread(after_png_path)
+            if after_bgr is not None:
+                after_imgs.append(cv2.cvtColor(after_bgr, cv2.COLOR_BGR2RGB))
+        if save_mesh:
+            after_obj_path = os.path.join(out_folder, f"{img_fn}_{animal_id}_after_tta.obj")
+            if os.path.exists(after_obj_path):
+                after_mesh_paths.append(after_obj_path)
+    # Only the first collected mesh of each kind is exposed to the caller.
+    first_before_mesh = before_mesh_paths[0] if before_mesh_paths else None
+    first_after_mesh = after_mesh_paths[0] if after_mesh_paths else None
+    return before_imgs, after_imgs, kpt_imgs, first_before_mesh, first_after_mesh
+def build_demo(checkpoint_path: str = DEFAULT_CHECKPOINT, out_folder: str = DEFAULT_OUT_FOLDER) -> gr.Interface:
+    os.makedirs(out_folder, exist_ok=True)
+    runtime_cache = {
+        "model": None,
+        "model_cfg": None,
+        "renderer": None,
+        "device": None,
+        "detector": None,
+    }
+    def gradio_inference(
+        image: np.ndarray,
+        tta_lr: float,
+        tta_num_iters: int,
+        det_thresh: float,
+        kp_conf_thresh: float,
+        side_view: bool,
+        save_mesh: bool,
+    ):
+        """Wrapper for Gradio. ``image`` is an RGB numpy array.
+        Yields intermediate status so long first-run (Hub downloads + model load)
+        does not hit silent client/proxy timeouts.
+        Yields:
+            4-tuples ``(before_image, after_image, keypoint_image, status_text)``.
+            Progress and error updates carry ``None`` for the three images; the final
+            successful update carries the first animal's images and the status "OK".
+        """
+        if image is None:
+            yield None, None, None, "No image provided."
+            return
+        # Non-uint8 input is clipped to [0, 255] and cast; no rescaling is applied.
+        if image.dtype != np.uint8:
+            img_rgb = np.clip(image, 0, 255).astype(np.uint8)
+        else:
+            img_rgb = image
+        yield None, None, None, "Queued; preparing run…"
+        # Lazy one-time initialization; loaded objects live in the enclosing runtime_cache.
+        if runtime_cache["model"] is None:
+            yield (
+                None,
+                None,
+                None,
+                "First run: downloading demo assets from Hugging Face (large checkpoint) "
+                "and loading the model. This can take many minutes; status updates here "
+                "mean the session is still alive.",
+            )
+            try:
+                model, model_cfg, renderer, device = _load_prima_model(checkpoint_path)
+                detector = _build_detector()
+            except Exception:
+                # Surface the traceback in the status box instead of raising.
+                yield None, None, None, f"Model initialization failed:\n{traceback.format_exc()}"
+                return
+            # The cache is filled only after both model and detector were built, so a
+            # failed initialization is retried on the next request.
+            runtime_cache["model"] = model
+            runtime_cache["model_cfg"] = model_cfg
+            runtime_cache["renderer"] = renderer
+            runtime_cache["device"] = device
+            runtime_cache["detector"] = detector
+            yield None, None, None, "Model loaded. Running detection and inference…"
+        try:
+            before_imgs, after_imgs, kpt_imgs, mesh_before, mesh_after = _collect_animal_results(
+                runtime_cache["model"],
+                runtime_cache["model_cfg"],
+                runtime_cache["renderer"],
+                runtime_cache["device"],
+                runtime_cache["detector"],
+                out_folder,
+                img_rgb,
+                tta_lr=tta_lr,
+                tta_num_iters=tta_num_iters,
+                det_thresh=det_thresh,
+                kp_conf_thresh=kp_conf_thresh,
+                side_view=side_view,
+                save_mesh=save_mesh,
+            )
+        except Exception:
+            yield None, None, None, f"Inference failed:\n{traceback.format_exc()}"
+            return
+        # Only the first animal's results are shown; mesh paths are not returned here.
+        first_before = before_imgs[0] if before_imgs else None
+        first_after = after_imgs[0] if after_imgs else None
+        first_kpts = kpt_imgs[0] if kpt_imgs else None
+        if first_before is None and first_after is None:
+            yield (
+                None,
+                None,
+                None,
+                "No output generated. Try an image with a clearly visible quadruped.",
+            )
+            return
+        yield first_before, first_after, first_kpts, "OK"
+    _gradio_examples = _gradio_examples_for_interface()
+    _iface_kw = dict(
+        fn=gradio_inference,
+        analytics_enabled=False,
+        cache_examples=False,
+        inputs=[
+            gr.Image(
+                label="Input image",
+                type="numpy",
+                sources=["upload", "clipboard"],
+            ),
+            gr.Slider(
+                label="TTA learning rate",
+                minimum=1e-7,
+                maximum=1e-4,
+                value=1e-6,
+                step=1e-7,
+            ),
+            gr.Slider(
+                label="TTA iterations",
+                minimum=0,
+                maximum=100,
+                value=30,
+                step=1,
+                info="Set to 0 to disable TTA and reuse the initial PRIMA prediction.",
+            ),
+            gr.Slider(
+                label="Detection threshold",
+                minimum=0.3,
+                maximum=0.9,
+                value=0.7,
+                step=0.05,
+            ),
+            gr.Slider(
+                label="Keypoint confidence threshold",
+                minimum=0.0,
+                maximum=1.0,
+                value=0.1,
+                step=0.05,
+            ),
+            gr.Checkbox(label="Render side view", value=False),
+            gr.Checkbox(label="Save meshes (.obj)", value=True),
+        ],
+        outputs=[
+            gr.Image(label="Before TTA"),
+            gr.Image(label="After TTA"),
+            gr.Image(label="PRIMA 26 keypoints"),
+            gr.Textbox(label="Status / Traceback", lines=12),
+        ],
+        title="PRIMA: Boosting Animal Mesh Recovery with Biological Priors and Test-Time Adaptation",
+        description=(
+            "Upload an animal image. The demo runs Detectron2 for animal detection, "
+            "PRIMA for 3D pose/shape, DeepLabCut SuperAnimal for 2D keypoints, and "
+            "test-time adaptation (TTA) with configurable learning rate and iterations. "
+            "Set TTA iterations to 0 to disable adaptation.\n\n"
+            "Results (PNG/OBJ and 26-keypoint visualizations) are saved under "
+            f"'{out_folder}'."
+        ),
+    )
+    if _gradio_examples:
+        _iface_kw["examples"] = _gradio_examples
+    demo = gr.Interface(**_iface_kw)
+    demo.queue(max_size=8, default_concurrency_limit=1)
+    return demo
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description="Gradio demo for PRIMA + SuperAnimal + TTA")
+    parser.add_argument(
+        "--checkpoint",
+        type=str,
+        default=DEFAULT_CHECKPOINT,
+        help="Path to the pretrained PRIMA checkpoint",
+    )
+    parser.add_argument(
+        "--out_folder",
+        type=str,
+        default=DEFAULT_OUT_FOLDER,
+        help="Folder used to save rendered outputs and meshes",
+    )
+    return parser.parse_args()
+if __name__ == "__main__":
+    # Script entry point: read the CLI options, optionally warm up, then serve the UI.
+    args = parse_args()
+    # Optional warm-up step, gated by _should_preload_assets() (defined elsewhere in this file).
+    if _should_preload_assets():
+        _preload_assets_once(args.checkpoint)
+    # Build the Gradio interface for the chosen checkpoint/output folder and start serving it.
+    demo = build_demo(checkpoint_path=args.checkpoint, out_folder=args.out_folder)
+    demo.launch()

chumpy/__init__.py ADDED Viewed

	@@ -0,0 +1,16 @@

+"""
+PRIMA: Boosting Animal Mesh Recovery with Biological Priors and Test-Time Adaptation
+Official implementation of the paper:
+"PRIMA: Boosting Animal Mesh Recovery with Biological Priors and Test-Time Adaptation"
+by Xiaohang Yu, Ti Wang, and Mackenzie Weygandt Mathis
+Licensed under a modified MIT license
+"""
+"""Minimal ``chumpy`` compatibility for unpickling legacy SMAL model configs."""
+from __future__ import annotations
+from .ch import Ch, ChArray
+__all__ = ["Ch", "ChArray"]

chumpy/ch.py ADDED Viewed

	@@ -0,0 +1,37 @@

+"""
+PRIMA: Boosting Animal Mesh Recovery with Biological Priors and Test-Time Adaptation
+Official implementation of the paper:
+"PRIMA: Boosting Animal Mesh Recovery with Biological Priors and Test-Time Adaptation"
+by Xiaohang Yu, Ti Wang, and Mackenzie Weygandt Mathis
+Licensed under a modified MIT license
+"""
+"""``chumpy.ch`` namespace expected by legacy SMAL pickles."""
+from __future__ import annotations
+import numpy as np
+class Ch:
+    """Minimal stand-in for ``chumpy.ch.Ch`` (unpickling only)."""
+    def __init__(self, *args, **kwargs):
+        self._data = None
+        if args:
+            self._data = np.asarray(args[0])
+    def r(self):
+        if self._data is None:
+            return np.zeros((), dtype=np.float32)
+        return np.asarray(self._data)
+class ChArray(np.ndarray):
+    """Minimal stand-in for ``chumpy.ch.ChArray``."""
+    pass
+__all__ = ["Ch", "ChArray"]

configs_hydra/experiment/default.yaml ADDED Viewed

	@@ -0,0 +1,28 @@

+# @package _global_
+SMAL:
+  DATA_DIR: data/smal
+  MODEL_PATH: data/smal/my_smpl_00781_4_all.pkl
+  SHAPE_PRIOR_PATH: data/smal/my_smpl_data_00781_4_all.pkl
+  POSE_PRIOR_PATH: data/smal/walking_toy_symmetric_pose_prior_with_cov_35parts.pkl
+  NUM_JOINTS: 34
+EXTRA:
+  FOCAL_LENGTH: 1000
+  NUM_LOG_IMAGES: 4
+  NUM_LOG_SAMPLES_PER_IMAGE: 4
+  PELVIS_IND: 0
+DATASETS:
+  CONFIG:
+    SCALE_FACTOR: 0.3
+    ROT_FACTOR: 30
+    TRANS_FACTOR: 0.02
+    COLOR_SCALE: 0.2
+    ROT_AUG_RATE: 0.6
+    TRANS_AUG_RATE: 0.5
+    DO_FLIP: False
+    FLIP_AUG_RATE: 0.0
+    EXTREME_CROP_AUG_RATE: 0.0
+    EXTREME_CROP_AUG_LEVEL: 1

configs_hydra/experiment/default_val.yaml ADDED Viewed

	@@ -0,0 +1,34 @@

+# @package _global_
+DATASETS:
+  ANIMAL3D:
+    ROOT_IMAGE: ./datasets/animal3d/
+    JSON_FILE:
+      TEST: ./datasets/animal3d/test.json
+  CONTROL_ANIMAL3D:
+    ROOT_IMAGE: ./datasets/control_animal3dlatest/
+    JSON_FILE:
+      TEST: ./datasets/control_animal3dlatest/test.json
+  QUADRUPED2D:
+    ROOT_IMAGE: ./datasets/quadruped2d/
+    JSON_FILE:
+      TEST: ./datasets/quadruped2d/test.json
+  ANIMAL_KINGDOM:
+    ROOT_IMAGE: ./datasets/Animal_Kingdom_test/
+    JSON_FILE:
+      TEST: ./datasets/Animal_Kingdom_test/test.json
+  CONFIG:
+    SCALE_FACTOR: 0.0
+    ROT_FACTOR: 0
+    TRANS_FACTOR: 0.0
+    COLOR_SCALE: 0.0
+    ROT_AUG_RATE: 0.0
+    TRANS_AUG_RATE: 0.0
+    DO_FLIP: False
+    FLIP_AUG_RATE: 0.0
+    EXTREME_CROP_AUG_RATE: 0.0
+    EXTREME_CROP_AUG_LEVEL: 1
+METRIC:
+  PCK_THRESHOLD: [0.10, 0.15]

configs_hydra/experiment/primaStage1.yaml ADDED Viewed

	@@ -0,0 +1,83 @@

+# @package _global_
+defaults:
+  - default.yaml
+GENERAL:
+  TOTAL_STEPS: 63_000
+  LOG_STEPS: 63
+  VAL_STEPS: 63
+  VAL_EPOCHS: 1
+  CHECKPOINT_EPOCHS: 1
+  CHECKPOINT_SAVE_TOP_K: 2
+  NUM_WORKERS: 8
+  PREFETCH_FACTOR: 2
+LOSS_WEIGHTS:
+  KEYPOINTS_3D: 0.05
+  KEYPOINTS_2D: 0.01
+  INTERMEDIATE_KP2D: 0.001
+  INTERMEDIATE_KP3D: 0.001
+  GLOBAL_ORIENT: 0.005
+  POSE: 0.001
+  BETAS: 0.0005
+  TRANSL: 0.0005
+  ADVERSARIAL: 0.0005
+  SUPCON: 0.0005
+TRAIN:
+  LR: 3.75e-6
+  WEIGHT_DECAY: 1e-4
+  BATCH_SIZE: 48
+  LOSS_REDUCTION: mean
+  NUM_TRAIN_SAMPLES: 2
+  NUM_TEST_SAMPLES: 64
+  POSE_2D_NOISE_RATIO: 0.01
+  SMPL_PARAM_NOISE_RATIO: 0.005
+MODEL:
+  IMAGE_SIZE: 256
+  IMAGE_MEAN: [0.485, 0.456, 0.406]
+  IMAGE_STD: [0.229, 0.224, 0.225]
+  BACKBONE:
+    TYPE: vith
+    PRETRAINED_WEIGHTS: ./data/amr_vitbb.pth
+    FREEZE: False
+  # Enable BioClip embedding
+  USE_BIOCLIP_EMBEDDING: True
+  BIOCLIP_EMBEDDING:
+    EMBED_DIM: 1280  # Match the ViT backbone output dimension for token-wise concatenation
+    TYPE: bioclip1
+  # Toggle for the 2D keypoint embedding used for initialization (disabled here); NewBioGuidedSMALPoseDecoder updates it dynamically
+  USE_KEYPOINT_EMBEDDING: False
+  SMAL_HEAD:
+    TYPE: new_bio_pose_transformer_decoder    # Use the newer version with SAM3D-style hierarchical updates
+    IN_CHANNELS: 1280
+    IEF_ITERS: 3
+    # Pose Transformer Decoder configuration
+    DECODER_DIM: 1280
+    NUM_DECODER_LAYERS: 6
+    NUM_HEADS: 8
+    MLP_RATIO: 4.0
+    # Keypoint token configuration specific to NewBioGuidedSMALPoseDecoder
+    USE_KEYPOINT_2D_TOKENS: True            # Enable 2D keypoint tokens with SAM3D-style dynamic updates
+    USE_KEYPOINT_3D_TOKENS: True            # Enable 3D keypoint tokens with pelvis normalization
+    KEYPOINT_TOKEN_UPDATE: True             # Enable hierarchical keypoint prediction and token updates
+    KP2D_INJECT_IMAGE_FEAT: True            # Key setting: inject image features via grid_sample
+DATASETS:
+  ANIMAL3D:
+    ROOT_IMAGE: ./datasets/animal3d/
+    JSON_FILE:
+      TRAIN: ./datasets/animal3d/train.json
+      TEST: ./datasets/animal3d/test.json
+    WEIGHT: 1.0

configs_hydra/experiment/primaStage2.yaml ADDED Viewed

	@@ -0,0 +1,113 @@

+# @package _global_
+defaults:
+  - default.yaml
+GENERAL:
+  TOTAL_STEPS: 450_000
+  LOG_STEPS: 533
+  VAL_STEPS: 533
+  VAL_EPOCHS: 1
+  CHECKPOINT_EPOCHS: 1
+  CHECKPOINT_SAVE_TOP_K: 2
+  NUM_WORKERS: 2
+  PREFETCH_FACTOR: 2
+LOSS_WEIGHTS:
+  KEYPOINTS_3D: 0.05
+  KEYPOINTS_2D: 0.01
+  INTERMEDIATE_KP2D: 0.001
+  INTERMEDIATE_KP3D: 0.001
+  GLOBAL_ORIENT: 0.005
+  POSE: 0.001
+  BETAS: 0.0005
+  TRANSL: 0.0005
+  ADVERSARIAL: 0.0
+  SUPCON: 0.0005
+TRAIN:
+  LR: 3.75e-6
+  WEIGHT_DECAY: 1e-4
+  BATCH_SIZE: 48
+  LOSS_REDUCTION: mean
+  NUM_TRAIN_SAMPLES: 2
+  NUM_TEST_SAMPLES: 64
+  POSE_2D_NOISE_RATIO: 0.01
+  SMPL_PARAM_NOISE_RATIO: 0.005
+MODEL:
+  IMAGE_SIZE: 256
+  IMAGE_MEAN: [0.485, 0.456, 0.406]
+  IMAGE_STD: [0.229, 0.224, 0.225]
+  BACKBONE:
+    TYPE: vith
+    PRETRAINED_WEIGHTS: ./data/amr_vitbb.pth
+    FREEZE: False
+  # Enable BioClip embedding
+  USE_BIOCLIP_EMBEDDING: True
+  BIOCLIP_EMBEDDING:
+    EMBED_DIM: 1280  # Match vit output dimension for token-wise concatenation
+    TYPE: bioclip1
+  # Toggle for the 2D keypoint embedding (disabled here)
+  USE_KEYPOINT_EMBEDDING: False
+  KEYPOINT_EMBEDDING:
+    NUM_KEYPOINTS: 26        # Number of SMAL keypoints
+    KEYPOINT_DIM: 2          # 2D coordinates (x, y)
+    EMBED_DIM: 1280          # Match vit output dimension
+    HIDDEN_DIM: 512          # Hidden layer dimension in MLP
+    TYPE: 'token'            # Use token-based embedding (recommended)
+  SMAL_HEAD:
+    TYPE: new_bio_pose_transformer_decoder    # Use the newer version with SAM3D-style hierarchical updates
+    IN_CHANNELS: 1280
+    IEF_ITERS: 1
+    # Pose Transformer Decoder configuration
+    DECODER_DIM: 1280
+    NUM_DECODER_LAYERS: 6
+    NUM_HEADS: 8
+    MLP_RATIO: 4.0
+    # Keypoint token configuration specific to NewBioGuidedSMALPoseDecoder
+    USE_KEYPOINT_2D_TOKENS: True            # Enable 2D keypoint tokens with SAM3D-style dynamic updates
+    USE_KEYPOINT_3D_TOKENS: True            # Enable 3D keypoint tokens with pelvis normalization
+    KEYPOINT_TOKEN_UPDATE: True             # Enable hierarchical keypoint prediction and token updates
+    KP2D_INJECT_IMAGE_FEAT: True            # Key setting: inject image features via grid_sample
+    # Legacy transformer config (kept for compatibility)
+    TRANSFORMER_DECODER:
+      depth: 6
+      heads: 8
+      mlp_dim: 1024
+      dim_head: 64
+      dropout: 0.0
+      emb_dropout: 0.0
+      norm: layer
+      context_dim: 1280
+DATASETS:
+  ANIMAL3D:
+    ROOT_IMAGE: ./datasets/animal3d/
+    JSON_FILE:
+      TRAIN: ./datasets/animal3d/train.json
+      TEST: ./datasets/animal3d/test.json
+    WEIGHT: 1.0
+  CONTROL_ANIMAL3D:
+    ROOT_IMAGE: ./datasets/control_animal3dlatest/
+    JSON_FILE:
+      TRAIN: ./datasets/control_animal3dlatest/train.json
+      TEST: ./datasets/control_animal3dlatest/test.json
+    WEIGHT: 0.5
+  QUADRUPED2D:
+    ROOT_IMAGE: ./datasets/quadruped2d/
+    JSON_FILE:
+      TRAIN: ./datasets/quadruped2d/train.json
+      TEST: ./datasets/quadruped2d/test.json
+    WEIGHT: 0.15

configs_hydra/extras/default.yaml ADDED Viewed

	@@ -0,0 +1,8 @@

+# disable python warnings if they annoy you
+ignore_warnings: False
+# ask user for tags if none are provided in the config
+enforce_tags: True
+# pretty print config tree at the start of the run using Rich library
+print_config: True

configs_hydra/hydra/default.yaml ADDED Viewed

	@@ -0,0 +1,26 @@

+# @package _global_
+# https://hydra.cc/docs/configure_hydra/intro/
+# enable color logging
+defaults:
+  - override /hydra/hydra_logging: colorlog
+  - override /hydra/job_logging: colorlog
+# exp_name: ovrd_${hydra:job.override_dirname}
+exp_name: ${now:%Y-%m-%d}_${now:%H-%M-%S}
+hydra:
+  run:
+    dir: ${paths.log_dir}/${task_name}/runs/${exp_name}
+  sweep:
+    dir: ${paths.log_dir}/${task_name}/multiruns/${exp_name}
+    subdir: ${hydra.job.num}
+  job:
+    config:
+      override_dirname:
+        exclude_keys:
+          - trainer
+          - trainer.devices
+          - trainer.num_nodes
+          - callbacks
+          - debug

configs_hydra/launcher/local.yaml ADDED Viewed

	@@ -0,0 +1,13 @@

+# @package _global_
+defaults:
+  - override /hydra/launcher: submitit_local
+hydra:
+  launcher:
+    timeout_min: 10_080   # 7 days
+    nodes: 1
+    tasks_per_node: ${trainer.devices}
+    cpus_per_task: 8
+    gpus_per_node: ${trainer.devices}
+    name: amr

configs_hydra/launcher/slurm.yaml ADDED Viewed

	@@ -0,0 +1,22 @@

+# @package _global_
+defaults:
+  - override /hydra/launcher: submitit_slurm
+hydra:
+  launcher:
+    timeout_min: 10_080   # 7 days
+    max_num_timeout: 3
+    partition: g40
+    qos: idle
+    nodes: 1
+    tasks_per_node: ${trainer.devices}
+    gpus_per_task: null
+    cpus_per_task: 12
+    gpus_per_node: ${trainer.devices}
+    cpus_per_gpu: null
+    comment: prima
+    name: prima
+    setup:
+      - module load cuda openmpi libfabric-aws
+      - export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7

configs_hydra/paths/default.yaml ADDED Viewed

	@@ -0,0 +1,18 @@

+# path to root directory
+# this requires PROJECT_ROOT environment variable to exist
+# PROJECT_ROOT is inferred and set by pyrootutils package in `train.py` and `eval.py`
+root_dir: ${oc.env:PROJECT_ROOT}
+# path to data directory
+data_dir: ${paths.root_dir}/data/
+# path to logging directory
+log_dir: logs/
+# path to output directory, created dynamically by hydra
+# path generation pattern is specified in `configs_hydra/hydra/default.yaml`
+# use it to store all files generated during the run, like ckpts and metrics
+output_dir: ${hydra:runtime.output_dir}
+# path to working directory
+work_dir: ${hydra:runtime.cwd}

configs_hydra/train.yaml ADDED Viewed

	@@ -0,0 +1,46 @@

+# @package _global_
+# specify here default configuration
+# order of defaults determines the order in which configs override each other
+defaults:
+  - _self_
+  - trainer: ddp.yaml
+  - paths: default.yaml
+  - extras: default.yaml
+  - hydra: default.yaml
+  # experiment configs allow for version control of specific hyperparameters
+  # e.g. best hyperparameters for given model and datamodule
+  - experiment: null
+  - texture_exp: null
+  # optional local config for machine/user specific settings
+  # it's optional since it doesn't need to exist and is excluded from version control
+  - optional launcher: local.yaml
+  # - optional launcher: slurm.yaml
+  # debugging config (enable through command line, e.g. `python train.py debug=default`)
+  - debug: null
+# task name, determines output directory path
+task_name: "train"
+# tags to help you identify your experiments
+# you can overwrite this in experiment configs
+# overwrite from command line with `python train.py tags="[first_tag, second_tag]"`
+# appending lists from command line is currently not supported :(
+# https://github.com/facebookresearch/hydra/issues/1547
+tags: ["dev"]
+# set False to skip model training
+train: True
+# evaluate on test set, using best model weights achieved during training
+# lightning chooses best weights based on the metric specified in checkpoint callback
+test: False
+# simply provide checkpoint path to resume training
+ckpt_path: True
+# seed for random number generators in pytorch, numpy and python.random
+seed: null

configs_hydra/trainer/cpu.yaml ADDED Viewed

	@@ -0,0 +1,6 @@

+defaults:
+  - default.yaml
+  - default_amr.yaml
+accelerator: cpu
+devices: 1

configs_hydra/trainer/ddp.yaml ADDED Viewed

	@@ -0,0 +1,14 @@

+defaults:
+  - default.yaml
+  - default_amr.yaml
+# use "ddp_spawn" instead of "ddp",
+# it's slower but normal "ddp" currently doesn't work ideally with hydra
+# https://github.com/facebookresearch/hydra/issues/2070
+# https://pytorch-lightning.readthedocs.io/en/latest/accelerators/gpu_intermediate.html#distributed-data-parallel-spawn
+strategy: ddp_spawn
+accelerator: gpu
+devices: 2
+num_nodes: 1
+sync_batchnorm: True

configs_hydra/trainer/default.yaml ADDED Viewed

	@@ -0,0 +1,10 @@

+_target_: pytorch_lightning.Trainer
+default_root_dir: ${paths.output_dir}
+accelerator: gpu
+devices: 1
+# set True to ensure deterministic results
+# makes training slower but gives more reproducibility than just setting seeds
+deterministic: False

configs_hydra/trainer/default_amr.yaml ADDED Viewed

	@@ -0,0 +1,9 @@

+num_sanity_val_steps: 0
+log_every_n_steps: ${GENERAL.LOG_STEPS}
+val_check_interval: ${GENERAL.VAL_STEPS}  # How often within one training epoch to check the validation set.
+check_val_every_n_epoch: ${GENERAL.VAL_EPOCHS}  # Check val every n train epochs.
+precision: 16-mixed  # 16-mixed, 32
+max_steps: ${GENERAL.TOTAL_STEPS}
+# move_metrics_to_cpu: True
+limit_val_batches: 80  # How much of validation dataset to check.
+# track_grad_norm: -1

configs_hydra/trainer/gpu.yaml ADDED Viewed

	@@ -0,0 +1,6 @@

+defaults:
+  - default.yaml
+  - default_amr.yaml
+accelerator: gpu
+devices: 1

configs_hydra/trainer/mps.yaml ADDED Viewed

	@@ -0,0 +1,6 @@

+defaults:
+  - default.yaml
+  - default_amr.yaml
+accelerator: mps
+devices: 1

demo.py ADDED Viewed

	@@ -0,0 +1,144 @@

+"""
+PRIMA: Boosting Animal Mesh Recovery with Biological Priors and Test-Time Adaptation
+Official implementation of the paper:
+"PRIMA: Boosting Animal Mesh Recovery with Biological Priors and Test-Time Adaptation"
+by Xiaohang Yu, Ti Wang, and Mackenzie Weygandt Mathis
+Licensed under a modified MIT license
+"""
+from pathlib import Path
+import detectron2.config
+import detectron2.engine
+import torch
+import argparse
+import os
+import cv2
+import numpy as np
+from tqdm import tqdm
+import torch.utils
+import torch.utils.data
+from prima.models import load_prima
+from prima.utils import recursive_to
+from prima.datasets.vitdet_dataset import ViTDetDataset, DEFAULT_MEAN, DEFAULT_STD
+from prima.utils.renderer import Renderer, cam_crop_to_full
+import detectron2
+from detectron2 import model_zoo
+import warnings
+warnings.filterwarnings("ignore")
+LIGHT_BLUE = (0.65098039, 0.74117647, 0.85882353)
+GREEN = (0.65, 0.86, 0.74)
+def main():
+    parser = argparse.ArgumentParser(description='prima demo code')
+    parser.add_argument('--checkpoint', type=str,
+                        help='Path to pretrained model checkpoint')
+    parser.add_argument('--img_folder', type=str, default='demo_data/', help='Folder with input images')
+    parser.add_argument('--out_folder', type=str, default='demo_out', help='Output folder to save rendered results')
+    parser.add_argument('--side_view', dest='side_view', action='store_true', default=False,
+                        help='If set, render side view also')
+    parser.add_argument('--save_mesh', dest='save_mesh', action='store_true', default=False,
+                        help='If set, save meshes to disk also')
+    parser.add_argument('--batch_size', type=int, default=1, help='Batch size for inference/fitting')
+    parser.add_argument('--file_type', nargs='+', default=['*.jpg', '*.png', '*.jpeg', '*.JPEG'],
+                        help='List of file extensions to consider')
+    args = parser.parse_args()
+    model, model_cfg = load_prima(args.checkpoint)
+    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+    model = model.to(device)
+    model.eval()
+    # Setup the renderer
+    renderer = Renderer(model_cfg, faces=model.smal.faces)
+    # Make output directory if it does not exist
+    os.makedirs(args.out_folder, exist_ok=True)
+    # Load detector
+    cfg = detectron2.config.get_cfg()
+    cfg.merge_from_file(model_zoo.get_config_file("COCO-Detection/faster_rcnn_X_101_32x8d_FPN_3x.yaml"))
+    cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.5
+    cfg.MODEL.WEIGHTS = "https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_X_101_32x8d_FPN_3x/139173657/model_final_68b088.pkl"
+    detector = detectron2.engine.DefaultPredictor(cfg)
+    img_paths = sorted([img for end in args.file_type for img in Path(args.img_folder).glob(end)])
+    for img_path in img_paths:
+        img_bgr = cv2.imread(str(img_path))
+        if img_bgr is None:
+            print(f"[WARN] Cannot read image: {img_path}")
+            continue
+        # Detect animals in image
+        det_out = detector(img_bgr)
+        det_instances = det_out['instances']
+        valid_idx = [i for i, (c, s) in enumerate(zip(det_instances.pred_classes, det_instances.scores)) if ((c in [15, 16, 17, 18, 19, 21, 22]) & (s > 0.7))]
+        boxes = det_instances.pred_boxes.tensor[valid_idx].cpu().numpy()
+        # Run PRIMA on detected animals
+        dataset = ViTDetDataset(model_cfg, img_bgr, boxes)
+        dataloader = torch.utils.data.DataLoader(dataset, batch_size=args.batch_size, shuffle=False, num_workers=0)
+        for batch in tqdm(dataloader):
+            batch = recursive_to(batch, device)
+            with torch.no_grad():
+                out = model(batch)
+            pred_cam = out['pred_cam']
+            box_center = batch["box_center"].float()
+            box_size = batch["box_size"].float()
+            img_size = batch["img_size"].float()
+            scaled_focal_length = model_cfg.EXTRA.FOCAL_LENGTH / model_cfg.MODEL.IMAGE_SIZE * img_size.max()
+            pred_cam_t_full = cam_crop_to_full(pred_cam, box_center, box_size, img_size,
+                                               scaled_focal_length).detach().cpu().numpy()
+            # Render the result
+            batch_size = batch['img'].shape[0]
+            for n in range(batch_size):
+                # Get filename from path img_path
+                img_fn, _ = os.path.splitext(os.path.basename(img_path))
+                animal_id = int(batch['animalid'][n])
+                white_img = (torch.ones_like(batch['img'][n]).cpu() - DEFAULT_MEAN[:, None, None] / 255) / (
+                            DEFAULT_STD[:, None, None] / 255)
+                input_patch = (batch['img'][n].cpu() * (DEFAULT_STD[:, None, None]) + (
+                            DEFAULT_MEAN[:, None, None])) / 255.
+                input_patch = input_patch.permute(1, 2, 0).numpy()
+                regression_img = renderer(out['pred_vertices'][n].detach().cpu().numpy(),
+                                        out['pred_cam_t'][n].detach().cpu().numpy(),
+                                        batch['img'][n],
+                                        mesh_base_color=GREEN,
+                                        scene_bg_color=(1, 1, 1),
+                                            )
+                final_img = np.concatenate([input_patch, regression_img], axis=1)
+                if args.side_view:
+                    side_img = renderer(out['pred_vertices'][n].detach().cpu().numpy(),
+                                        out['pred_cam_t'][n].detach().cpu().numpy(),
+                                        white_img,
+                                        mesh_base_color=GREEN,
+                                        scene_bg_color=(1, 1, 1),
+                                        side_view=True)
+                    final_img = np.concatenate([final_img, side_img], axis=1)
+                cv2.imwrite(os.path.join(args.out_folder, f'{img_fn}_{animal_id}.png'),
+                            cv2.cvtColor((255 * final_img).astype(np.uint8), cv2.COLOR_RGB2BGR))
+                # Add all verts and cams to list
+                verts = out['pred_vertices'][n].detach().cpu().numpy()
+                cam_t = pred_cam_t_full[n]
+                # Save all meshes to disk
+                if args.save_mesh:
+                    camera_translation = cam_t.copy()
+                    tmesh = renderer.vertices_to_trimesh(verts, camera_translation, LIGHT_BLUE)
+                    tmesh.export(os.path.join(args.out_folder, f'{img_fn}_{animal_id}.obj'))
+# Run the demo only when this file is executed as a script, not when it is imported.
+if __name__ == '__main__':
+    main()

demo_data/000000015956_horse.png ADDED Viewed

Git LFS Details

SHA256: 2a2398ba7df40a47c636afefa28be17b55f4b7bc2c378e053aeea507580ad2cb
Pointer size: 131 Bytes
Size of remote file: 620 kB

demo_data/000000315905_zebra.jpg ADDED Viewed

Git LFS Details

SHA256: e0a17e1f1650820b020a9025144015c1e27f0f1ab435859f0bde3a0047d8f689
Pointer size: 131 Bytes
Size of remote file: 257 kB

demo_data/beagle.jpg ADDED Viewed

Git LFS Details

SHA256: ac29e6ea6086831dd9806a8cd3fd608e264ac1af567f6fcfc8797c5bd3d5d560
Pointer size: 131 Bytes
Size of remote file: 350 kB

demo_data/n02101388_1188.png ADDED Viewed

Git LFS Details

SHA256: e45ff508fb8c6437cce22fcb59b4f1b6fe37ddfab1d4cf68d97629f9caa939f4
Pointer size: 131 Bytes
Size of remote file: 319 kB

demo_data/n02412080_12159.png ADDED Viewed

Git LFS Details

SHA256: 03273c57e8b25b258d3eb96af7b4f77b43b5c40be90da83c21875f3322b487f1
Pointer size: 131 Bytes
Size of remote file: 347 kB

demo_data/shepherd_hati.jpg ADDED Viewed

Git LFS Details

SHA256: 65c5878203bc3165dda9011ebfce77cc7d930daed0a215396d8036509d1963c1
Pointer size: 131 Bytes
Size of remote file: 210 kB

demo_tta.py ADDED Viewed

	@@ -0,0 +1,340 @@

+"""
+PRIMA: Boosting Animal Mesh Recovery with Biological Priors and Test-Time Adaptation
+Official implementation of the paper:
+"PRIMA: Boosting Animal Mesh Recovery with Biological Priors and Test-Time Adaptation"
+by Xiaohang Yu, Ti Wang, and Mackenzie Weygandt Mathis
+Licensed under a modified MIT license
+"""
+"""
+demo_tta.py: PRIMA inference with DeepLabCut SuperAnimal TTA
+Pipeline:
+1. Run Detectron2 to detect animals in the input image.
+2. Run PRIMA on each detected animal to obtain 3D pose/shape estimation.
+3. Run DeepLabCut SuperAnimal to obtain 2D keypoint estimation.
+4. Map the 39 SuperAnimal keypoints to the 26 PRIMA keypoints.
+5. Run test-time adaptation (TTA) with user-specified lr and num_iters
+   to further optimize the 3D pose and shape estimation.
+6. Render and save before/after TTA results (PNG + OBJ) and the
+   26-keypoint visualization (PNG).
+Reference code:
+- Test-time adaptation: prima/../eval_with_tta.py
+- DeepLabCut: https://github.com/AdaptiveMotorControlLab/FMPose3D/blob/main/animals/demo/vis_animals.py
+- Keypoint mapping (SuperAnimal 39 → PRIMA 26):
+    keypoint_mapping = {"quadruped80k":[10, 5, -1, 26, 29, 30, 35, 22, 24, 27, 31, 32, -1, -1,
+                                     25, 28, 33, 34, 15, 23, 11, 6, 4, 3, 0, -1]}
+"""
+from pathlib import Path
+import argparse
+import copy
+import os
+import tempfile
+import warnings
+import cv2
+import numpy as np
+import torch
+import torch.nn.functional as F
+import torch.utils.data
+from tqdm import tqdm
+from prima.models import load_prima
+from prima.utils import recursive_to
+from prima.datasets.vitdet_dataset import ViTDetDataset, DEFAULT_MEAN, DEFAULT_STD
+from prima.utils.renderer import Renderer, cam_crop_to_full
+warnings.filterwarnings("ignore")
+LIGHT_BLUE = (0.65098039, 0.74117647, 0.85882353)
+GREEN = (0.65, 0.86, 0.74)
+ANIMAL_COCO_IDS = [15, 16, 17, 18, 19, 21, 22]
+keypoint_mapping = {
+    "quadruped80k": [10, 5, -1, 26, 29, 30, 35, 22, 24, 27, 31, 32, -1, -1, 25, 28, 33, 34, 15, 23, 11, 6, 4, 3, 0, -1]
+}
+def denorm_patch_to_rgb(img_tensor: torch.Tensor) -> np.ndarray:
+    patch = (img_tensor.detach().cpu() * (DEFAULT_STD[:, None, None]) + DEFAULT_MEAN[:, None, None]) / 255.0
+    patch = patch.permute(1, 2, 0).numpy()
+    return np.clip(patch, 0.0, 1.0)
+def map_superanimal_to_prima(bodyparts_xyc: np.ndarray) -> np.ndarray:
+    mapping = keypoint_mapping["quadruped80k"]
+    num_src = bodyparts_xyc.shape[0]
+    mapped = np.zeros((len(mapping), 3), dtype=np.float32)
+    for tgt_i, src_i in enumerate(mapping):
+        if src_i >= 0 and src_i < num_src:
+            mapped[tgt_i] = bodyparts_xyc[src_i]
+    return mapped
+def save_keypoint_vis(patch_rgb: np.ndarray, kpts_xyc: np.ndarray, save_path: str) -> None:
+    vis = cv2.cvtColor((patch_rgb * 255).astype(np.uint8), cv2.COLOR_RGB2BGR).copy()
+    num_kpts = len(kpts_xyc)
+    for i, (x, y, c) in enumerate(kpts_xyc):
+        if c <= 0:
+            continue
+        # Use distinct color for each keypoint (OpenCV uses BGR)
+        hue = int(179 * i / max(1, num_kpts - 1))
+        color_bgr = cv2.cvtColor(np.uint8([[[hue, 255, 255]]]), cv2.COLOR_HSV2BGR)[0, 0]
+        color_bgr = (int(color_bgr[0]), int(color_bgr[1]), int(color_bgr[2]))
+        cx, cy = int(round(float(x))), int(round(float(y)))
+        cv2.circle(vis, (cx, cy), 3, color_bgr, -1)
+        cv2.putText(vis, str(i), (cx + 3, cy - 3), cv2.FONT_HERSHEY_SIMPLEX, 0.35, (255, 255, 255), 1, cv2.LINE_AA)
+    cv2.imwrite(save_path, vis)
+def run_superanimal_on_patch(patch_rgb: np.ndarray, args, tmp_dir: str):
+    try:
+        from deeplabcut.pose_estimation_pytorch.apis import superanimal_analyze_images
+    except Exception as e:
+        raise RuntimeError(
+            "Cannot import DeepLabCut SuperAnimal API. Please install deeplabcut with pose_estimation_pytorch support."
+        ) from e
+    patch_path = os.path.join(tmp_dir, "patch.png")
+    cv2.imwrite(patch_path, cv2.cvtColor((patch_rgb * 255).astype(np.uint8), cv2.COLOR_RGB2BGR))
+    preds = superanimal_analyze_images(
+        args.superanimal_name,
+        args.superanimal_model_name,
+        args.superanimal_detector_name,
+        patch_path,
+        args.superanimal_max_individuals,
+        out_folder=tmp_dir,
+    )
+    payload = preds.get(patch_path, None)
+    if payload is None:
+        return None
+    bodyparts = payload.get("bodyparts", None)
+    if bodyparts is None or len(bodyparts) == 0:
+        return None
+    best_idx = int(np.argmax(bodyparts[..., 2].mean(axis=1)))
+    return bodyparts[best_idx]
+def render_and_save(renderer, out, batch, img_fn, animal_id, out_folder, suffix, side_view, save_mesh):
+    pred_cam = out['pred_cam']
+    box_center = batch['box_center'].float()
+    box_size = batch['box_size'].float()
+    img_size = batch['img_size'].float()
+    scaled_focal_length = batch['focal_length'][0, 0] / batch['img'].shape[-1] * img_size.max()
+    pred_cam_t_full = cam_crop_to_full(pred_cam, box_center, box_size, img_size, scaled_focal_length)
+    white_img = (torch.ones_like(batch['img'][0]).cpu() - DEFAULT_MEAN[:, None, None] / 255) / (
+        DEFAULT_STD[:, None, None] / 255
+    )
+    input_patch = denorm_patch_to_rgb(batch['img'][0])
+    regression_img = renderer(
+        out['pred_vertices'][0].detach().cpu().numpy(),
+        out['pred_cam_t'][0].detach().cpu().numpy(),
+        batch['img'][0],
+        mesh_base_color=GREEN,
+        scene_bg_color=(1, 1, 1),
+    )
+    final_img = np.concatenate([input_patch, regression_img], axis=1)
+    if side_view:
+        side_img = renderer(
+            out['pred_vertices'][0].detach().cpu().numpy(),
+            out['pred_cam_t'][0].detach().cpu().numpy(),
+            white_img,
+            mesh_base_color=GREEN,
+            scene_bg_color=(1, 1, 1),
+            side_view=True,
+        )
+        final_img = np.concatenate([final_img, side_img], axis=1)
+    cv2.imwrite(
+        os.path.join(out_folder, f'{img_fn}_{animal_id}_{suffix}.png'),
+        cv2.cvtColor((255 * final_img).astype(np.uint8), cv2.COLOR_RGB2BGR),
+    )
+    if save_mesh:
+        verts = out['pred_vertices'][0].detach().cpu().numpy()
+        cam_t = pred_cam_t_full[0].detach().cpu().numpy()
+        tmesh = renderer.vertices_to_trimesh(verts, cam_t.copy(), LIGHT_BLUE)
+        tmesh.export(os.path.join(out_folder, f'{img_fn}_{animal_id}_{suffix}.obj'))
+def tta_optimize(model, batch, gt_kpts_norm, num_iters, lr):
+    model.eval()
+    if hasattr(model, 'backbone'):
+        for p in model.backbone.parameters():
+            p.requires_grad = False
+    orig_smal_head_state = copy.deepcopy(model.smal_head.state_dict())
+    model.smal_head.freeze_except_regression_heads()
+    tta_params = model.smal_head.get_tta_parameters(mode='all')
+    optimizer = torch.optim.Adam(tta_params, lr=lr)
+    valid_mask = (gt_kpts_norm[..., 2] > 0).float().unsqueeze(-1)
+    gt_xy = gt_kpts_norm[..., :2]
+    for _ in range(num_iters):
+        optimizer.zero_grad()
+        out = model(batch)
+        pred_xy = out['pred_keypoints_2d']
+        loss = F.mse_loss(pred_xy * valid_mask, gt_xy * valid_mask, reduction='sum') / (valid_mask.sum() + 1e-6)
+        loss.backward()
+        optimizer.step()
+    with torch.no_grad():
+        out_after = model(batch)
+    model.smal_head.load_state_dict(orig_smal_head_state)
+    model.smal_head.unfreeze_all()
+    return out_after
+def main():
+    parser = argparse.ArgumentParser(description='PRIMA + SuperAnimal + TTA demo')
+    parser.add_argument('--checkpoint', type=str, required=True, help='Path to pretrained PRIMA checkpoint')
+    parser.add_argument('--img_path', type=str, default=None, help='Single image path')
+    parser.add_argument('--img_folder', type=str, default='demo_data/', help='Folder with input images')
+    parser.add_argument('--out_folder', type=str, default='demo_out_tta', help='Output folder')
+    parser.add_argument('--side_view', dest='side_view', action='store_true', default=False, help='Render side view')
+    parser.add_argument('--save_mesh', dest='save_mesh', action='store_true', default=False, help='Save meshes')
+    parser.add_argument('--file_type', nargs='+', default=['*.jpg', '*.png', '*.jpeg', '*.JPEG'], help='Image globs')
+    parser.add_argument('--det_thresh', type=float, default=0.7, help='Detectron2 score threshold for animals')
+    parser.add_argument('--tta_lr', type=float, default=1e-6, help='TTA learning rate')
+    parser.add_argument('--tta_num_iters', type=int, default=30, help='TTA iterations')
+    parser.add_argument('--kp_conf_thresh', type=float, default=0.1, help='Keypoint confidence threshold')
+    parser.add_argument('--superanimal_name', type=str, default='superanimal_quadruped')
+    parser.add_argument('--superanimal_model_name', type=str, default='hrnet_w32')
+    parser.add_argument('--superanimal_detector_name', type=str, default='fasterrcnn_resnet50_fpn_v2')
+    parser.add_argument('--superanimal_max_individuals', type=int, default=1)
+    args = parser.parse_args()
+    model, model_cfg = load_prima(args.checkpoint)
+    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+    model = model.to(device)
+    model.eval()
+    renderer = Renderer(model_cfg, faces=model.smal.faces)
+    os.makedirs(args.out_folder, exist_ok=True)
+    import detectron2.config
+    import detectron2.engine
+    from detectron2 import model_zoo
+    cfg = detectron2.config.get_cfg()
+    cfg.merge_from_file(model_zoo.get_config_file("COCO-Detection/faster_rcnn_X_101_32x8d_FPN_3x.yaml"))
+    cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.5
+    cfg.MODEL.WEIGHTS = "https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_X_101_32x8d_FPN_3x/139173657/model_final_68b088.pkl"
+    detector = detectron2.engine.DefaultPredictor(cfg)
+    if args.img_path is not None:
+        img_paths = [Path(args.img_path)]
+    else:
+        img_paths = sorted([img for end in args.file_type for img in Path(args.img_folder).glob(end)])
+    for img_path in img_paths:
+        img_bgr = cv2.imread(str(img_path))
+        if img_bgr is None:
+            print(f"[WARN] Cannot read image: {img_path}")
+            continue
+        det_out = detector(img_bgr)
+        det_instances = det_out['instances']
+        valid_idx = [
+            i for i, (c, s) in enumerate(zip(det_instances.pred_classes, det_instances.scores))
+            if (int(c) in ANIMAL_COCO_IDS) and (float(s) > args.det_thresh)
+        ]
+        boxes = det_instances.pred_boxes.tensor[valid_idx].cpu().numpy()
+        if len(boxes) == 0:
+            print(f"[INFO] No animal detected in {img_path}")
+            continue
+        dataset = ViTDetDataset(model_cfg, img_bgr, boxes)
+        dataloader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=False, num_workers=0)
+        for batch in tqdm(dataloader, desc=f"{img_path.name}"):
+            batch = recursive_to(batch, device)
+            with torch.no_grad():
+                out_before = model(batch)
+            img_fn = img_path.stem
+            animal_id = int(batch['animalid'][0])
+            render_and_save(
+                renderer,
+                out_before,
+                batch,
+                img_fn,
+                animal_id,
+                args.out_folder,
+                suffix='before_tta',
+                side_view=args.side_view,
+                save_mesh=args.save_mesh,
+            )
+            patch_rgb = denorm_patch_to_rgb(batch['img'][0])
+            with tempfile.TemporaryDirectory(prefix=f"dlc_{img_fn}_{animal_id}_") as tmp_dir:
+                bodyparts_xyc = run_superanimal_on_patch(patch_rgb, args, tmp_dir)
+            if bodyparts_xyc is None:
+                print(f"[WARN] No SuperAnimal keypoints for {img_fn}_{animal_id}, skip TTA")
+                continue
+            mapped_xyc = map_superanimal_to_prima(bodyparts_xyc)
+            mapped_xyc[mapped_xyc[:, 2] < args.kp_conf_thresh, 2] = 0.0
+            save_keypoint_vis(
+                patch_rgb,
+                mapped_xyc,
+                os.path.join(args.out_folder, f"{img_fn}_{animal_id}_prima26_kpts.png"),
+            )
+            np.save(os.path.join(args.out_folder, f"{img_fn}_{animal_id}_prima26_kpts.npy"), mapped_xyc)
+            patch_h, patch_w = patch_rgb.shape[:2]
+            mapped_norm = mapped_xyc.copy()
+            mapped_norm[:, 0] = mapped_norm[:, 0] / float(patch_w) - 0.5
+            mapped_norm[:, 1] = mapped_norm[:, 1] / float(patch_h) - 0.5
+            gt_kpts_norm = torch.from_numpy(mapped_norm[None]).to(device=device, dtype=batch['img'].dtype)
+            out_after = tta_optimize(
+                model,
+                batch,
+                gt_kpts_norm,
+                num_iters=args.tta_num_iters,
+                lr=args.tta_lr,
+            )
+            render_and_save(
+                renderer,
+                out_after,
+                batch,
+                img_fn,
+                animal_id,
+                args.out_folder,
+                suffix='after_tta',
+                side_view=args.side_view,
+                save_mesh=args.save_mesh,
+            )
+# Run the demo only when this file is executed as a script.
+if __name__ == '__main__':
+    main()

eval.py ADDED Viewed

	@@ -0,0 +1,102 @@

+"""
+PRIMA: Boosting Animal Mesh Recovery with Biological Priors and Test-Time Adaptation
+Official implementation of the paper:
+"PRIMA: Boosting Animal Mesh Recovery with Biological Priors and Test-Time Adaptation"
+by Xiaohang Yu, Ti Wang, and Mackenzie Weygandt Mathis
+Licensed under a modified MIT license
+"""
+import numpy as np
+from tqdm import tqdm
+import torch
+from prima.utils import recursive_to
+from prima.utils.evaluate_metric import Evaluator
+from prima.datasets.datasets import EvaluationDataset
+import argparse
+from torch.utils.data import DataLoader
+from prima.models.prima import PRIMA
+from prima.configs import get_config
+torch.multiprocessing.set_sharing_strategy('file_system')
+def main(args):
+    cfg = get_config(args.config)
+    default_cfg = get_config(args.default_eval_config)
+    model = PRIMA.load_from_checkpoint(args.checkpoint, cfg=cfg, strict=False)
+    model.eval()
+    smal_evaluator = Evaluator(smal_model=model.smal, image_size=cfg.MODEL.IMAGE_SIZE)
+    cfg_eval_dataset = dict(default_cfg.DATASETS)
+    aug_cfg = cfg_eval_dataset.pop("CONFIG", None)  # augmentation config is not used in evaluation
+    if args.dataset.upper() == "ALL":
+        for key in cfg_eval_dataset.keys():
+            print(f"-------- Evaluate {key} dataset ------------")
+            eval_one_dataset(cfg_eval_dataset[key], default_cfg, cfg, model,
+                             evaluator=smal_evaluator,
+                             aug_cfg=aug_cfg,
+                             key=key,
+                             device=args.device)
+            print(f"-------{key} Dataset evaluate finish ------")
+    else:
+        print(f"-------- Evaluate {args.dataset} dataset ------------")
+        eval_one_dataset(cfg_eval_dataset[args.dataset], default_cfg, cfg, model,
+                         evaluator=smal_evaluator,
+                         aug_cfg=aug_cfg,
+                         key=args.dataset,
+                         device=args.device)
+        print(f"-------{args.dataset} Dataset evaluate finish ------")
+def eval_one_dataset(dataset_cfg, default_cfg, cfg, model, evaluator, aug_cfg, key, device='cuda'):
+    dataset = EvaluationDataset(root_image=dataset_cfg['ROOT_IMAGE'],
+                                json_file=dataset_cfg['JSON_FILE']['TEST'],
+                                augm_config=aug_cfg, focal_length=cfg.SMAL.get("FOCAL_LENGTH", 1000),
+                                image_size=cfg.MODEL.IMAGE_SIZE,
+                                )
+    dataloader = DataLoader(dataset, batch_size=1, num_workers=cfg.GENERAL.NUM_WORKERS)
+    bar = tqdm(dataloader)
+    pa_mpjpe_list, pck_list, auc_list, pa_mpvpe_list = [], [], [], []
+    for i, batch in enumerate(bar):
+        batch = recursive_to(batch, device)
+        with torch.no_grad():
+            output = model(batch)
+        if key in ["ANIMAL3D", "CONTROL_ANIMAL3D"]:
+            pa_mpjpe, pa_mpvpe = evaluator.eval_3d(output, batch)
+        else:
+            pa_mpjpe, pa_mpvpe = 0., 0.
+        pck, auc = evaluator.eval_2d(output, batch, pck_threshold=default_cfg.METRIC.PCK_THRESHOLD)
+        pa_mpjpe_list.append(pa_mpjpe)
+        pa_mpvpe_list.append(pa_mpvpe)
+        auc_list.append(auc)
+        pck_list.append(pck)
+        bar.set_postfix(PA_MPJPE=pa_mpjpe,
+                        PA_MPVPE=pa_mpvpe,
+                        AUC=auc,
+                        pck=pck,)
+    print("---------------- 3D metric -----------------")
+    print(f"Avg PA-MPJPE: {np.mean(pa_mpjpe_list)}")
+    print(f"Avg PA-MPVPE: {np.mean(pa_mpvpe_list)}")
+    print("--------------- 2D metric ------------------")
+    print(f"AUC: {np.mean(auc_list)}")
+    pck_list = np.array(pck_list)
+    for _, th in enumerate(default_cfg.METRIC.PCK_THRESHOLD):
+        print(f"PCK@{th}: {np.mean(pck_list[:, _])}")
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--config", type=str, help="Path to config file", required=True)
+    parser.add_argument("--checkpoint", type=str, help="Path to checkpoint file", required=True)
+    parser.add_argument("--default_eval_config", type=str, default="./configs_hydra/experiment/default_val.yaml")
+    parser.add_argument("--dataset", type=str, default="ALL")
+    parser.add_argument("--device", type=str, default="cuda", help="Device to use for evaluation")
+    args = parser.parse_args()
+    main(args)

images/teaser.png ADDED Viewed

Git LFS Details

SHA256: a617ca4fd37de03e2db4ccf397ce9841ed32c3fe18c766c4832d41af574ad746
Pointer size: 132 Bytes
Size of remote file: 4.29 MB

packages.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+libosmesa6
+libgl1
+libegl1
+libgles2

prima/__init__.py ADDED Viewed

	@@ -0,0 +1,25 @@

+"""
+PRIMA: Boosting Animal Mesh Recovery with Biological Priors and Test-Time Adaptation
+Official implementation of the paper:
+"PRIMA: Boosting Animal Mesh Recovery with Biological Priors and Test-Time Adaptation"
+by Xiaohang Yu, Ti Wang, and Mackenzie Weygandt Mathis
+Licensed under a modified MIT license
+"""
+"""Top-level package for PRIMA.
+This package contains models, datasets and utilities for
+3D animal pose and shape estimation.
+"""
+from importlib.metadata import PackageNotFoundError, version
+try:  # pragma: no cover - best effort during development
+	__version__ = version("prima-animal")
+except PackageNotFoundError:  # pragma: no cover
+	__version__ = "0.0.0"
+__all__ = ["__version__"]

prima/configs/__init__.py ADDED Viewed

	@@ -0,0 +1,99 @@

+"""
+PRIMA: Boosting Animal Mesh Recovery with Biological Priors and Test-Time Adaptation
+Official implementation of the paper:
+"PRIMA: Boosting Animal Mesh Recovery with Biological Priors and Test-Time Adaptation"
+by Xiaohang Yu, Ti Wang, and Mackenzie Weygandt Mathis
+Licensed under a modified MIT license
+"""
+from typing import Dict
+from yacs.config import CfgNode as CN
+def to_lower(x: Dict) -> Dict:
+    """
+    Convert all dictionary keys to lowercase
+    Args:
+      x (dict): Input dictionary
+    Returns:
+      dict: Output dictionary with all keys converted to lowercase
+    """
+    return {k.lower(): v for k, v in x.items()}
+# Default configuration tree. Nodes are created with new_allowed=True so that
+# merged config files may introduce keys that are not declared below.
+_C = CN(new_allowed=True)
+# GENERAL section defaults.
+_C.GENERAL = CN(new_allowed=True)
+_C.GENERAL.RESUME = True
+_C.GENERAL.TIME_TO_RUN = 3300
+_C.GENERAL.VAL_STEPS = 100
+_C.GENERAL.LOG_STEPS = 100
+_C.GENERAL.CHECKPOINT_STEPS = 20000
+_C.GENERAL.CHECKPOINT_DIR = "checkpoints"
+_C.GENERAL.SUMMARY_DIR = "tensorboard"
+_C.GENERAL.NUM_GPUS = 1
+_C.GENERAL.NUM_WORKERS = 4
+_C.GENERAL.MIXED_PRECISION = True
+_C.GENERAL.ALLOW_CUDA = True
+_C.GENERAL.PIN_MEMORY = False
+_C.GENERAL.DISTRIBUTED = False
+_C.GENERAL.LOCAL_RANK = 0
+_C.GENERAL.USE_SYNCBN = False
+_C.GENERAL.WORLD_SIZE = 1
+_C.GENERAL.PREFETCH_FACTOR = 2
+# TRAIN section defaults.
+_C.TRAIN = CN(new_allowed=True)
+_C.TRAIN.NUM_EPOCHS = 100
+_C.TRAIN.SHUFFLE = True
+_C.TRAIN.WARMUP = False
+_C.TRAIN.NORMALIZE_PER_IMAGE = False
+_C.TRAIN.CLIP_GRAD = False
+_C.TRAIN.CLIP_GRAD_VALUE = 1.0
+# Sections that are filled (mostly) by the merged config file.
+_C.LOSS_WEIGHTS = CN(new_allowed=True)
+_C.DATASETS = CN(new_allowed=True)
+_C.MODEL = CN(new_allowed=True)
+_C.MODEL.IMAGE_SIZE = 224
+_C.EXTRA = CN(new_allowed=True)
+_C.EXTRA.FOCAL_LENGTH = 5000
+# DATASETS.CONFIG defaults; the key names suggest data-augmentation settings.
+_C.DATASETS.CONFIG = CN(new_allowed=True)
+_C.DATASETS.CONFIG.SCALE_FACTOR = 0.3
+_C.DATASETS.CONFIG.ROT_FACTOR = 30
+_C.DATASETS.CONFIG.TRANS_FACTOR = 0.02
+_C.DATASETS.CONFIG.COLOR_SCALE = 0.2
+_C.DATASETS.CONFIG.ROT_AUG_RATE = 0.6
+_C.DATASETS.CONFIG.TRANS_AUG_RATE = 0.5
+_C.DATASETS.CONFIG.DO_FLIP = False
+_C.DATASETS.CONFIG.FLIP_AUG_RATE = 0.5
+_C.DATASETS.CONFIG.EXTREME_CROP_AUG_RATE = 0.10
+def default_config() -> CN:
+    """
+    Get a yacs CfgNode object with the default config values.
+    """
+    # Return a clone so that the defaults will not be altered
+    # This is for the "local variable" use pattern
+    return _C.clone()
+def get_config(config_file: str, merge: bool = True) -> CN:
+    """
+    Read a config file and optionally merge it with the default config file.
+    Args:
+      config_file (str): Path to config file.
+      merge (bool): Whether to merge with the default config or not.
+    Returns:
+      CfgNode: Config as a yacs CfgNode object.
+    """
+    if merge:
+        cfg = default_config()
+    else:
+        cfg = CN(new_allowed=True)
+    cfg.merge_from_file(config_file)
+    cfg.freeze()
+    return cfg

prima/models/__init__.py ADDED Viewed

	@@ -0,0 +1,54 @@

+"""
+PRIMA: Boosting Animal Mesh Recovery with Biological Priors and Test-Time Adaptation
+Official implementation of the paper:
+"PRIMA: Boosting Animal Mesh Recovery with Biological Priors and Test-Time Adaptation"
+by Xiaohang Yu, Ti Wang, and Mackenzie Weygandt Mathis
+Licensed under a modified MIT license
+"""
+from .prima import PRIMA
+def load_prima(checkpoint_path):
+    from pathlib import Path
+    from ..configs import get_config
+    model_cfg = str(Path(checkpoint_path).parent.parent / '.hydra/config.yaml')
+    model_cfg = get_config(model_cfg)
+    # Override some config values, to crop bbox correctly
+    if (model_cfg.MODEL.BACKBONE.TYPE == 'vit') and ('BBOX_SHAPE' not in model_cfg.MODEL):
+        model_cfg.defrost()
+        assert model_cfg.MODEL.IMAGE_SIZE == 256, f"MODEL.IMAGE_SIZE ({model_cfg.MODEL.IMAGE_SIZE}) should be 256 for ViT backbone"
+        model_cfg.MODEL.BBOX_SHAPE = [192, 256]
+        model_cfg.freeze()
+    if (model_cfg.MODEL.BACKBONE.TYPE == 'dinov3') and ('BBOX_SHAPE' not in model_cfg.MODEL):
+        model_cfg.defrost()
+        assert model_cfg.MODEL.IMAGE_SIZE == 256, f"MODEL.IMAGE_SIZE ({model_cfg.MODEL.IMAGE_SIZE}) should be 256 for dino backbone"
+        model_cfg.MODEL.BBOX_SHAPE = [256, 256]
+        model_cfg.freeze()
+    if (model_cfg.MODEL.BACKBONE.TYPE == 'dinov2') and ('BBOX_SHAPE' not in model_cfg.MODEL):
+        model_cfg.defrost()
+        assert model_cfg.MODEL.IMAGE_SIZE == 252, f"MODEL.IMAGE_SIZE ({model_cfg.MODEL.IMAGE_SIZE}) should be 252 for dino backbone"
+        model_cfg.MODEL.BBOX_SHAPE = [252, 252]
+        model_cfg.freeze()
+    # Update config to be compatible with demo
+    if ('PRETRAINED_WEIGHTS' in model_cfg.MODEL.BACKBONE):
+        model_cfg.defrost()
+        model_cfg.MODEL.BACKBONE.pop('PRETRAINED_WEIGHTS')
+        model_cfg.freeze()
+    # Offscreen training renderer is not needed for demo/inference startup and
+    # can fail on some local OpenGL backends.
+    model = PRIMA.load_from_checkpoint(
+        checkpoint_path,
+        strict=False,
+        cfg=model_cfg,
+        map_location='cpu',
+        init_renderer=False,
+    )
+    return model, model_cfg

prima/models/backbones/__init__.py ADDED Viewed

	@@ -0,0 +1,19 @@

+"""
+PRIMA: Boosting Animal Mesh Recovery with Biological Priors and Test-Time Adaptation
+Official implementation of the paper:
+"PRIMA: Boosting Animal Mesh Recovery with Biological Priors and Test-Time Adaptation"
+by Xiaohang Yu, Ti Wang, and Mackenzie Weygandt Mathis
+Licensed under a modified MIT license
+"""
+from .vit import vith
+def create_backbone(cfg):
+    if cfg.MODEL.BACKBONE.TYPE in ['vith','concat','aa']:   # vit bb will be used in these three cases - animal feature extractor
+        return vith(cfg)
+    else:
+        raise NotImplementedError('Backbone type is not implemented')

prima/models/backbones/vit.py ADDED Viewed

	@@ -0,0 +1,375 @@

+"""
+PRIMA: Boosting Animal Mesh Recovery with Biological Priors and Test-Time Adaptation
+Official implementation of the paper:
+"PRIMA: Boosting Animal Mesh Recovery with Biological Priors and Test-Time Adaptation"
+by Xiaohang Yu, Ti Wang, and Mackenzie Weygandt Mathis
+Licensed under a modified MIT license
+"""
+# Copyright (c) OpenMMLab. All rights reserved.
+import math
+import torch
+from functools import partial
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.utils.checkpoint as checkpoint
+from timm.layers import drop_path, to_2tuple, trunc_normal_
+def vith(cfg):
+    return ViT(
+        img_size=(256, 192),
+        patch_size=16,
+        embed_dim=1280,
+        depth=32,
+        num_heads=16,
+        ratio=1,
+        use_checkpoint=False,
+        # use_checkpoint=True,
+        mlp_ratio=4,
+        qkv_bias=True,
+        drop_path_rate=0.55,
+        use_cls=True, # cls for animal family classification
+    )
+def get_abs_pos(abs_pos, h, w, ori_h, ori_w, has_cls_token=True):
+    """
+    Calculate absolute positional embeddings. If needed, resize embeddings and remove cls_token
+        dimension for the original embeddings.
+    Args:
+        abs_pos (Tensor): absolute positional embeddings with (1, num_position, C).
+        has_cls_token (bool): If true, has 1 embedding in abs_pos for cls token.
+        hw (Tuple): size of input image tokens.
+    Returns:
+        Absolute positional embeddings after processing with shape (1, H, W, C)
+    """
+    cls_token = None
+    B, L, C = abs_pos.shape
+    if has_cls_token:
+        cls_token = abs_pos[:, 0:1]
+        abs_pos = abs_pos[:, 1:]
+    if ori_h != h or ori_w != w:
+        new_abs_pos = F.interpolate(
+            abs_pos.reshape(1, ori_h, ori_w, -1).permute(0, 3, 1, 2),
+            size=(h, w),
+            mode="bicubic",
+            align_corners=False,
+        ).permute(0, 2, 3, 1).reshape(B, -1, C)
+    else:
+        new_abs_pos = abs_pos
+    if cls_token is not None:
+        new_abs_pos = torch.cat([cls_token, new_abs_pos], dim=1)
+    return new_abs_pos
+class DropPath(nn.Module):
+    """Drop paths (Stochastic Depth) per sample  (when applied in main path of residual blocks).
+    """
+    def __init__(self, drop_prob=None):
+        super(DropPath, self).__init__()
+        self.drop_prob = drop_prob
+    def forward(self, x):
+        return drop_path(x, self.drop_prob, self.training)
+    def extra_repr(self):
+        return 'p={}'.format(self.drop_prob)
+class Mlp(nn.Module):
+    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
+        super().__init__()
+        out_features = out_features or in_features
+        hidden_features = hidden_features or in_features
+        self.fc1 = nn.Linear(in_features, hidden_features)
+        self.act = act_layer()
+        self.fc2 = nn.Linear(hidden_features, out_features)
+        self.drop = nn.Dropout(drop)
+    def forward(self, x):
+        x = self.fc1(x)
+        x = self.act(x)
+        x = self.fc2(x)
+        x = self.drop(x)
+        return x
+class Attention(nn.Module):
+    def __init__(
+            self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0.,
+            proj_drop=0., attn_head_dim=None):
+        super().__init__()
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        self.dim = dim
+        if attn_head_dim is not None:
+            head_dim = attn_head_dim
+        all_head_dim = head_dim * self.num_heads
+        self.scale = qk_scale or head_dim ** -0.5
+        self.qkv = nn.Linear(dim, all_head_dim * 3, bias=qkv_bias)
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(all_head_dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+    def forward(self, x):
+        B, N, C = x.shape
+        qkv = self.qkv(x)
+        qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
+        q, k, v = qkv[0], qkv[1], qkv[2]  # make torchscript happy (cannot use tensor as tuple)
+        q = q * self.scale
+        attn = (q @ k.transpose(-2, -1))
+        attn = attn.softmax(dim=-1)
+        attn = self.attn_drop(attn)
+        x = (attn @ v).transpose(1, 2).reshape(B, N, -1)
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x
+class Block(nn.Module):
+    def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None,
+                 drop=0., attn_drop=0., drop_path=0., act_layer=nn.GELU,
+                 norm_layer=nn.LayerNorm, attn_head_dim=None,
+                 ):
+        super().__init__()
+        self.norm1 = norm_layer(dim)
+        self.attn = Attention(
+            dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale,
+            attn_drop=attn_drop, proj_drop=drop, attn_head_dim=attn_head_dim
+        )
+        # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
+        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
+        self.norm2 = norm_layer(dim)
+        mlp_hidden_dim = int(dim * mlp_ratio)
+        self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
+    def forward(self, x):
+        x = x + self.drop_path(self.attn(self.norm1(x)))
+        x = x + self.drop_path(self.mlp(self.norm2(x)))
+        return x
+class PatchEmbed(nn.Module):
+    """ Image to Patch Embedding
+    """
+    def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, ratio=1):
+        super().__init__()
+        img_size = to_2tuple(img_size)
+        patch_size = to_2tuple(patch_size)
+        num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0]) * (ratio ** 2)
+        self.patch_shape = (int(img_size[0] // patch_size[0] * ratio), int(img_size[1] // patch_size[1] * ratio))
+        self.origin_patch_shape = (int(img_size[0] // patch_size[0]), int(img_size[1] // patch_size[1]))
+        self.img_size = img_size
+        self.patch_size = patch_size
+        self.num_patches = num_patches
+        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=(patch_size[0] // ratio),
+                              padding=4 + 2 * (ratio // 2 - 1))
+    def forward(self, x, **kwargs):
+        B, C, H, W = x.shape
+        x = self.proj(x)
+        Hp, Wp = x.shape[2], x.shape[3]
+        x = x.flatten(2).transpose(1, 2)
+        return x, (Hp, Wp)
+class HybridEmbed(nn.Module):
+    """ CNN Feature Map Embedding
+    Extract feature map from CNN, flatten, project to embedding dim.
+    """
+    def __init__(self, backbone, img_size=224, feature_size=None, in_chans=3, embed_dim=768):
+        super().__init__()
+        assert isinstance(backbone, nn.Module)
+        img_size = to_2tuple(img_size)
+        self.img_size = img_size
+        self.backbone = backbone
+        if feature_size is None:
+            with torch.no_grad():
+                training = backbone.training
+                if training:
+                    backbone.eval()
+                o = self.backbone(torch.zeros(1, in_chans, img_size[0], img_size[1]))[-1]
+                feature_size = o.shape[-2:]
+                feature_dim = o.shape[1]
+                backbone.train(training)
+        else:
+            feature_size = to_2tuple(feature_size)
+            feature_dim = self.backbone.feature_info.channels()[-1]
+        self.num_patches = feature_size[0] * feature_size[1]
+        self.proj = nn.Linear(feature_dim, embed_dim)
+    def forward(self, x):
+        x = self.backbone(x)[-1]
+        x = x.flatten(2).transpose(1, 2)
+        x = self.proj(x)
+        return x
+class ViT(nn.Module):
+    def __init__(self,
+                 img_size=224, patch_size=16, in_chans=3, num_classes=80, embed_dim=768, depth=12,
+                 num_heads=12, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop_rate=0., attn_drop_rate=0.,
+                 drop_path_rate=0., hybrid_backbone=None, norm_layer=None, use_checkpoint=False,
+                 frozen_stages=-1, ratio=1, last_norm=True, use_cls=False,
+                 patch_padding='pad', freeze_attn=False, freeze_ffn=False,
+                 ):
+        # Protect mutable default arguments
+        super(ViT, self).__init__()
+        norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6)
+        self.num_classes = num_classes
+        self.num_features = self.embed_dim = embed_dim  # num_features for consistency with other models
+        self.frozen_stages = frozen_stages
+        self.use_checkpoint = use_checkpoint
+        self.patch_padding = patch_padding
+        self.freeze_attn = freeze_attn
+        self.freeze_ffn = freeze_ffn
+        self.depth = depth
+        if hybrid_backbone is not None:
+            self.patch_embed = HybridEmbed(
+                hybrid_backbone, img_size=img_size, in_chans=in_chans, embed_dim=embed_dim)
+        else:
+            self.patch_embed = PatchEmbed(
+                img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim, ratio=ratio)
+        num_patches = self.patch_embed.num_patches
+        # since the pretraining model has class token
+        self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim))
+        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]  # stochastic depth decay rule
+        self.blocks = nn.ModuleList([
+            Block(
+                dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
+                drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer,
+            )
+            for i in range(depth)])
+        self.last_norm = norm_layer(embed_dim) if last_norm else nn.Identity()
+        if self.pos_embed is not None:
+            trunc_normal_(self.pos_embed, std=.02)
+        self.use_cls = use_cls
+        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
+        nn.init.normal_(self.cls_token, std=1e-6)
+        self._freeze_stages()
+    def _freeze_stages(self):
+        """Freeze parameters."""
+        if self.frozen_stages >= 0:
+            self.patch_embed.eval()
+            for param in self.patch_embed.parameters():
+                param.requires_grad = False
+        for i in range(1, self.frozen_stages + 1):
+            m = self.blocks[i]
+            m.eval()
+            for param in m.parameters():
+                param.requires_grad = False
+        if self.freeze_attn:
+            for i in range(0, self.depth):
+                m = self.blocks[i]
+                m.attn.eval()
+                m.norm1.eval()
+                for param in m.attn.parameters():
+                    param.requires_grad = False
+                for param in m.norm1.parameters():
+                    param.requires_grad = False
+        if self.freeze_ffn:
+            self.pos_embed.requires_grad = False
+            self.patch_embed.eval()
+            for param in self.patch_embed.parameters():
+                param.requires_grad = False
+            for i in range(0, self.depth):
+                m = self.blocks[i]
+                m.mlp.eval()
+                m.norm2.eval()
+                for param in m.mlp.parameters():
+                    param.requires_grad = False
+                for param in m.norm2.parameters():
+                    param.requires_grad = False
+    def init_weights(self):
+        """Initialize the weights in backbone.
+        Args:
+            pretrained (str, optional): Path to pre-trained weights.
+                Defaults to None.
+        """
+        def _init_weights(m):
+            if isinstance(m, nn.Linear):
+                trunc_normal_(m.weight, std=.02)
+                if isinstance(m, nn.Linear) and m.bias is not None:
+                    nn.init.constant_(m.bias, 0)
+            elif isinstance(m, nn.LayerNorm):
+                nn.init.constant_(m.bias, 0)
+                nn.init.constant_(m.weight, 1.0)
+        self.apply(_init_weights)
+    def get_num_layers(self):
+        return len(self.blocks)
+    @torch.jit.ignore
+    def no_weight_decay(self):
+        return {'pos_embed', 'cls_token'}
+    def forward_features(self, x):
+        B, C, H, W = x.shape
+        x, (Hp, Wp) = self.patch_embed(x)
+        if self.pos_embed is not None:
+            # fit for multiple GPU training
+            # since the first element for pos embed (sin-cos manner) is zero, it will cause no difference
+            x = x + self.pos_embed[:, 1:] + self.pos_embed[:, :1]
+        x = torch.cat((self.cls_token.expand(B, -1, -1), x), dim=1) if self.use_cls else x
+        for blk in self.blocks:
+            if self.use_checkpoint:
+                x = checkpoint.checkpoint(blk, x)
+            else:
+                x = blk(x)
+        x = self.last_norm(x)
+        cls = x[:, 0] if self.use_cls else None
+        x = x[:, 1:] if self.use_cls else x
+        xp = x.permute(0, 2, 1).reshape(B, -1, Hp, Wp).contiguous()
+        return xp, cls # shape [B, D, Hp, Wp], [B, D]
+    def forward(self, x):
+        x, cls = self.forward_features(x)
+        return x, cls
+    def train(self, mode=True):
+        """Convert the model into training mode."""
+        super().train(mode)
+        self._freeze_stages()

prima/models/bioclip_embedding.py ADDED Viewed

	@@ -0,0 +1,70 @@

+"""
+PRIMA: Boosting Animal Mesh Recovery with Biological Priors and Test-Time Adaptation
+Official implementation of the paper:
+"PRIMA: Boosting Animal Mesh Recovery with Biological Priors and Test-Time Adaptation"
+by Xiaohang Yu, Ti Wang, and Mackenzie Weygandt Mathis
+Licensed under a modified MIT license
+"""
+"""
+bioclip Embedding Module
+Converts image batch to embeddings that can be concatenated with image features
+"""
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+class BioClipEmbedding(nn.Module):
+    """
+    Embeds images into a feature space using BioClip model that can be combined with image features.
+    Args:
+        embed_dim: Output embedding dimension, should match the dimension of image features for concatenation
+    """
+    def __init__(self, cfg, embed_dim: int = 1024):
+        super().__init__()
+        self.embed_dim = embed_dim
+        import open_clip
+        if cfg.MODEL.BIOCLIP_EMBEDDING.TYPE == 'bioclip2':
+            print("[BioClipEmbedding] Using BioClip2 model from Hugging Face Hub")
+            self.species_model, _,_ = open_clip.create_model_and_transforms('hf-hub:imageomics/bioclip-2')
+        else:
+            self.species_model, _,_ = open_clip.create_model_and_transforms('hf-hub:imageomics/bioclip')
+        # tokenizer = open_clip.get_tokenizer('hf-hub:imageomics/bioclip')
+        self.species_model.eval()
+        # Get the output dimension from the model
+        bioclip_output_dim = self.species_model.visual.output_dim
+        # Project to target dimension
+        self.projection = nn.Sequential(
+            nn.Linear(bioclip_output_dim, embed_dim),
+            nn.LayerNorm(embed_dim),
+        )
+    def forward(self, images: torch.Tensor) -> torch.Tensor:
+        """
+        Args:
+            images: Tensor of shape (B, C, H, W) representing a batch of images
+        Returns:
+            Tensor of shape (B, embed_dim) representing the embedded features
+        """
+        # BioClip expects 224x224 input, resize if needed
+        if images.shape[-2:] != (224, 224):
+            images_resized = F.interpolate(images, size=(224, 224), mode='bilinear', align_corners=False)
+        else:
+            images_resized = images
+        with torch.no_grad():
+            image_features = self.species_model.encode_image(images_resized)
+        projected_features = self.projection(image_features)
+        return projected_features

prima/models/components/__init__.py ADDED Viewed

File without changes

prima/models/components/model_utils.py ADDED Viewed

	@@ -0,0 +1,160 @@

+"""
+PRIMA: Boosting Animal Mesh Recovery with Biological Priors and Test-Time Adaptation
+Official implementation of the paper:
+"PRIMA: Boosting Animal Mesh Recovery with Biological Priors and Test-Time Adaptation"
+by Xiaohang Yu, Ti Wang, and Mackenzie Weygandt Mathis
+Licensed under a modified MIT license
+"""
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+import copy
+from typing import Tuple
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+def select_closest_cond_frames(frame_idx, cond_frame_outputs, max_cond_frame_num):
+    """
+    Select up to `max_cond_frame_num` conditioning frames from `cond_frame_outputs`
+    that are temporally closest to the current frame at `frame_idx`. Here, we take
+    - a) the closest conditioning frame before `frame_idx` (if any);
+    - b) the closest conditioning frame after `frame_idx` (if any);
+    - c) any other temporally closest conditioning frames until reaching a total
+         of `max_cond_frame_num` conditioning frames.
+    Outputs:
+    - selected_outputs: selected items (keys & values) from `cond_frame_outputs`.
+    - unselected_outputs: items (keys & values) not selected in `cond_frame_outputs`.
+    """
+    if max_cond_frame_num == -1 or len(cond_frame_outputs) <= max_cond_frame_num:
+        selected_outputs = cond_frame_outputs
+        unselected_outputs = {}
+    else:
+        assert max_cond_frame_num >= 2, "we should allow using 2+ conditioning frames"
+        selected_outputs = {}
+        # the closest conditioning frame before `frame_idx` (if any)
+        idx_before = max((t for t in cond_frame_outputs if t < frame_idx), default=None)
+        if idx_before is not None:
+            selected_outputs[idx_before] = cond_frame_outputs[idx_before]
+        # the closest conditioning frame after `frame_idx` (if any)
+        idx_after = min((t for t in cond_frame_outputs if t >= frame_idx), default=None)
+        if idx_after is not None:
+            selected_outputs[idx_after] = cond_frame_outputs[idx_after]
+        # add other temporally closest conditioning frames until reaching a total
+        # of `max_cond_frame_num` conditioning frames.
+        num_remain = max_cond_frame_num - len(selected_outputs)
+        inds_remain = sorted(
+            (t for t in cond_frame_outputs if t not in selected_outputs),
+            key=lambda x: abs(x - frame_idx),
+        )[:num_remain]
+        selected_outputs.update((t, cond_frame_outputs[t]) for t in inds_remain)
+        unselected_outputs = {
+            t: v for t, v in cond_frame_outputs.items() if t not in selected_outputs
+        }
+    return selected_outputs, unselected_outputs
+def get_1d_sine_pe(pos_inds, dim, temperature=10000):
+    """
+    Get 1D sine positional embedding as in the original Transformer paper.
+    """
+    pe_dim = dim // 2
+    dim_t = torch.arange(pe_dim, dtype=torch.float32, device=pos_inds.device)
+    dim_t = temperature ** (2 * (dim_t // 2) / pe_dim)
+    pos_embed = pos_inds.unsqueeze(-1) / dim_t
+    pos_embed = torch.cat([pos_embed.sin(), pos_embed.cos()], dim=-1)
+    return pos_embed
+def get_activation_fn(activation):
+    """Return an activation function given a string"""
+    if activation == "relu":
+        return F.relu
+    if activation == "gelu":
+        return F.gelu
+    if activation == "glu":
+        return F.glu
+    raise RuntimeError(f"activation should be relu/gelu, not {activation}.")
+def get_clones(module, N):
+    return nn.ModuleList([copy.deepcopy(module) for i in range(N)])
+class DropPath(nn.Module):
+    # adapted from https://github.com/huggingface/pytorch-image-models/blob/main/timm/layers/drop.py
+    def __init__(self, drop_prob=0.0, scale_by_keep=True):
+        super(DropPath, self).__init__()
+        self.drop_prob = drop_prob
+        self.scale_by_keep = scale_by_keep
+    def forward(self, x):
+        if self.drop_prob == 0.0 or not self.training:
+            return x
+        keep_prob = 1 - self.drop_prob
+        shape = (x.shape[0],) + (1,) * (x.ndim - 1)
+        random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
+        if keep_prob > 0.0 and self.scale_by_keep:
+            random_tensor.div_(keep_prob)
+        return x * random_tensor
+# Lightly adapted from
+# https://github.com/facebookresearch/MaskFormer/blob/main/mask_former/modeling/transformer/transformer_predictor.py # noqa
+class MLP(nn.Module):
+    def __init__(
+        self,
+        input_dim: int,
+        hidden_dim: int,
+        output_dim: int,
+        num_layers: int,
+        activation: nn.Module = nn.ReLU,
+        sigmoid_output: bool = False,
+    ) -> None:
+        super().__init__()
+        self.num_layers = num_layers
+        h = [hidden_dim] * (num_layers - 1)
+        self.layers = nn.ModuleList(
+            nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])
+        )
+        self.sigmoid_output = sigmoid_output
+        self.act = activation()
+    def forward(self, x):
+        for i, layer in enumerate(self.layers):
+            x = self.act(layer(x)) if i < self.num_layers - 1 else layer(x)
+        if self.sigmoid_output:
+            x = F.sigmoid(x)
+        return x
+# From https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/batch_norm.py # noqa
+# Itself from https://github.com/facebookresearch/ConvNeXt/blob/d1fa8f6fef0a165b27399986cc2bdacc92777e40/models/convnext.py#L119  # noqa
+class LayerNorm2d(nn.Module):
+    def __init__(self, num_channels: int, eps: float = 1e-6) -> None:
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(num_channels))
+        self.bias = nn.Parameter(torch.zeros(num_channels))
+        self.eps = eps
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        u = x.mean(1, keepdim=True)
+        s = (x - u).pow(2).mean(1, keepdim=True)
+        x = (x - u) / torch.sqrt(s + self.eps)
+        x = self.weight[:, None, None] * x + self.bias[:, None, None]
+        return x

prima/models/components/pose_transformer.py ADDED Viewed

	@@ -0,0 +1,366 @@

+"""
+PRIMA: Boosting Animal Mesh Recovery with Biological Priors and Test-Time Adaptation
+Official implementation of the paper:
+"PRIMA: Boosting Animal Mesh Recovery with Biological Priors and Test-Time Adaptation"
+by Xiaohang Yu, Ti Wang, and Mackenzie Weygandt Mathis
+Licensed under a modified MIT license
+"""
+from inspect import isfunction
+from typing import Callable, Optional
+import torch
+from einops import rearrange
+from einops.layers.torch import Rearrange
+from torch import nn
+from .t_cond_mlp import (
+    AdaptiveLayerNorm1D,
+    FrequencyEmbedder,
+    normalization_layer,
+)
+def exists(val):
+    return val is not None
+def default(val, d):
+    if exists(val):
+        return val
+    return d() if isfunction(d) else d
+class PreNorm(nn.Module):
+    def __init__(self, dim: int, fn: Callable, norm: str = "layer", norm_cond_dim: int = -1):
+        super().__init__()
+        self.norm = normalization_layer(norm, dim, norm_cond_dim)
+        self.fn = fn
+    def forward(self, x: torch.Tensor, *args, **kwargs):
+        if isinstance(self.norm, AdaptiveLayerNorm1D):
+            return self.fn(self.norm(x, *args), **kwargs)
+        else:
+            return self.fn(self.norm(x), **kwargs)
+class FeedForward(nn.Module):
+    def __init__(self, dim, hidden_dim, dropout=0.0):
+        super().__init__()
+        self.net = nn.Sequential(
+            nn.Linear(dim, hidden_dim),
+            nn.GELU(),
+            nn.Dropout(dropout),
+            nn.Linear(hidden_dim, dim),
+            nn.Dropout(dropout),
+        )
+    def forward(self, x):
+        return self.net(x)
+class Attention(nn.Module):
+    def __init__(self, dim, heads=8, dim_head=64, dropout=0.0):
+        super().__init__()
+        inner_dim = dim_head * heads
+        project_out = not (heads == 1 and dim_head == dim)
+        self.heads = heads
+        self.scale = dim_head**-0.5
+        self.attend = nn.Softmax(dim=-1)
+        self.dropout = nn.Dropout(dropout)
+        self.to_qkv = nn.Linear(dim, inner_dim * 3, bias=False)
+        self.to_out = (
+            nn.Sequential(nn.Linear(inner_dim, dim), nn.Dropout(dropout))
+            if project_out
+            else nn.Identity()
+        )
+    def forward(self, x):
+        qkv = self.to_qkv(x).chunk(3, dim=-1)
+        q, k, v = map(lambda t: rearrange(t, "b n (h d) -> b h n d", h=self.heads), qkv)
+        dots = torch.matmul(q, k.transpose(-1, -2)) * self.scale
+        attn = self.attend(dots)
+        attn = self.dropout(attn)
+        out = torch.matmul(attn, v)
+        out = rearrange(out, "b h n d -> b n (h d)")
+        return self.to_out(out)
+class CrossAttention(nn.Module):
+    def __init__(self, dim, context_dim=None, heads=8, dim_head=64, dropout=0.0):
+        super().__init__()
+        inner_dim = dim_head * heads
+        project_out = not (heads == 1 and dim_head == dim)
+        self.heads = heads
+        self.scale = dim_head**-0.5
+        self.attend = nn.Softmax(dim=-1)
+        self.dropout = nn.Dropout(dropout)
+        context_dim = default(context_dim, dim)
+        self.to_kv = nn.Linear(context_dim, inner_dim * 2, bias=False)
+        self.to_q = nn.Linear(dim, inner_dim, bias=False)
+        self.to_out = (
+            nn.Sequential(nn.Linear(inner_dim, dim), nn.Dropout(dropout))
+            if project_out
+            else nn.Identity()
+        )
+    def forward(self, x, context=None):
+        context = default(context, x)
+        k, v = self.to_kv(context).chunk(2, dim=-1)
+        q = self.to_q(x)
+        q, k, v = map(lambda t: rearrange(t, "b n (h d) -> b h n d", h=self.heads), [q, k, v])
+        dots = torch.matmul(q, k.transpose(-1, -2)) * self.scale
+        attn = self.attend(dots)
+        attn = self.dropout(attn)
+        out = torch.matmul(attn, v)
+        out = rearrange(out, "b h n d -> b n (h d)")
+        return self.to_out(out)
+class Transformer(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        depth: int,
+        heads: int,
+        dim_head: int,
+        mlp_dim: int,
+        dropout: float = 0.0,
+        norm: str = "layer",
+        norm_cond_dim: int = -1,
+    ):
+        super().__init__()
+        self.layers = nn.ModuleList([])
+        for _ in range(depth):
+            sa = Attention(dim, heads=heads, dim_head=dim_head, dropout=dropout)
+            ff = FeedForward(dim, mlp_dim, dropout=dropout)
+            self.layers.append(
+                nn.ModuleList(
+                    [
+                        PreNorm(dim, sa, norm=norm, norm_cond_dim=norm_cond_dim),
+                        PreNorm(dim, ff, norm=norm, norm_cond_dim=norm_cond_dim),
+                    ]
+                )
+            )
+    def forward(self, x: torch.Tensor, *args):
+        for attn, ff in self.layers:
+            x = attn(x, *args) + x
+            x = ff(x, *args) + x
+        return x
+class TransformerCrossAttn(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        depth: int,
+        heads: int,
+        dim_head: int,
+        mlp_dim: int,
+        dropout: float = 0.0,
+        norm: str = "layer",
+        norm_cond_dim: int = -1,
+        context_dim: Optional[int] = None,
+    ):
+        super().__init__()
+        self.layers = nn.ModuleList([])
+        for _ in range(depth):
+            sa = Attention(dim, heads=heads, dim_head=dim_head, dropout=dropout)
+            ca = CrossAttention(
+                dim, context_dim=context_dim, heads=heads, dim_head=dim_head, dropout=dropout
+            )
+            ff = FeedForward(dim, mlp_dim, dropout=dropout)
+            self.layers.append(
+                nn.ModuleList(
+                    [
+                        PreNorm(dim, sa, norm=norm, norm_cond_dim=norm_cond_dim),
+                        PreNorm(dim, ca, norm=norm, norm_cond_dim=norm_cond_dim),
+                        PreNorm(dim, ff, norm=norm, norm_cond_dim=norm_cond_dim),
+                    ]
+                )
+            )
+    def forward(self, x: torch.Tensor, *args, context=None, context_list=None):
+        if context_list is None:
+            context_list = [context] * len(self.layers)
+        if len(context_list) != len(self.layers):
+            raise ValueError(f"len(context_list) != len(self.layers) ({len(context_list)} != {len(self.layers)})")
+        for i, (self_attn, cross_attn, ff) in enumerate(self.layers):
+            x = self_attn(x, *args) + x
+            x = cross_attn(x, *args, context=context_list[i]) + x
+            x = ff(x, *args) + x
+        return x
+class DropTokenDropout(nn.Module):
+    def __init__(self, p: float = 0.1):
+        super().__init__()
+        if p < 0 or p > 1:
+            raise ValueError(
+                "dropout probability has to be between 0 and 1, " "but got {}".format(p)
+            )
+        self.p = p
+    def forward(self, x: torch.Tensor):
+        # x: (batch_size, seq_len, dim)
+        if self.training and self.p > 0:
+            zero_mask = torch.full_like(x[0, :, 0], self.p).bernoulli().bool()
+            if zero_mask.any():
+                x = x[:, ~zero_mask, :]
+        return x
+class ZeroTokenDropout(nn.Module):
+    def __init__(self, p: float = 0.1):
+        super().__init__()
+        if p < 0 or p > 1:
+            raise ValueError(
+                "dropout probability has to be between 0 and 1, " "but got {}".format(p)
+            )
+        self.p = p
+    def forward(self, x: torch.Tensor):
+        # x: (batch_size, seq_len, dim)
+        if self.training and self.p > 0:
+            zero_mask = torch.full_like(x[:, :, 0], self.p).bernoulli().bool()
+            # Zero-out the masked tokens
+            x[zero_mask, :] = 0
+        return x
+class TransformerEncoder(nn.Module):
+    def __init__(
+        self,
+        num_tokens: int,
+        token_dim: int,
+        dim: int,
+        depth: int,
+        heads: int,
+        mlp_dim: int,
+        dim_head: int = 64,
+        dropout: float = 0.0,
+        emb_dropout: float = 0.0,
+        emb_dropout_type: str = "drop",
+        emb_dropout_loc: str = "token",
+        norm: str = "layer",
+        norm_cond_dim: int = -1,
+        token_pe_numfreq: int = -1,
+    ):
+        super().__init__()
+        if token_pe_numfreq > 0:
+            token_dim_new = token_dim * (2 * token_pe_numfreq + 1)
+            self.to_token_embedding = nn.Sequential(
+                Rearrange("b n d -> (b n) d", n=num_tokens, d=token_dim),
+                FrequencyEmbedder(token_pe_numfreq, token_pe_numfreq - 1),
+                Rearrange("(b n) d -> b n d", n=num_tokens, d=token_dim_new),
+                nn.Linear(token_dim_new, dim),
+            )
+        else:
+            self.to_token_embedding = nn.Linear(token_dim, dim)
+        self.pos_embedding = nn.Parameter(torch.randn(1, num_tokens, dim))
+        if emb_dropout_type == "drop":
+            self.dropout = DropTokenDropout(emb_dropout)
+        elif emb_dropout_type == "zero":
+            self.dropout = ZeroTokenDropout(emb_dropout)
+        else:
+            raise ValueError(f"Unknown emb_dropout_type: {emb_dropout_type}")
+        self.emb_dropout_loc = emb_dropout_loc
+        self.transformer = Transformer(
+            dim, depth, heads, dim_head, mlp_dim, dropout, norm=norm, norm_cond_dim=norm_cond_dim
+        )
+    def forward(self, inp: torch.Tensor, *args, **kwargs):
+        x = inp
+        if self.emb_dropout_loc == "input":
+            x = self.dropout(x)
+        x = self.to_token_embedding(x)
+        if self.emb_dropout_loc == "token":
+            x = self.dropout(x)
+        b, n, _ = x.shape
+        x += self.pos_embedding[:, :n]
+        if self.emb_dropout_loc == "token_afterpos":
+            x = self.dropout(x)
+        x = self.transformer(x, *args)
+        return x
+class TransformerDecoder(nn.Module):
+    def __init__(
+        self,
+        num_tokens: int,
+        token_dim: int,
+        dim: int,
+        depth: int,
+        heads: int,
+        mlp_dim: int,
+        dim_head: int = 64,
+        dropout: float = 0.0,
+        emb_dropout: float = 0.0,
+        emb_dropout_type: str = 'drop',
+        norm: str = "layer",
+        norm_cond_dim: int = -1,
+        context_dim: Optional[int] = None,
+        skip_token_embedding: bool = False,
+    ):
+        super().__init__()
+        if not skip_token_embedding:
+            self.to_token_embedding = nn.Linear(token_dim, dim)
+        else:
+            self.to_token_embedding = nn.Identity()
+            if token_dim != dim:
+                raise ValueError(
+                    f"token_dim ({token_dim}) != dim ({dim}) when skip_token_embedding is True"
+                )
+        self.pos_embedding = nn.Parameter(torch.randn(1, num_tokens, dim))
+        if emb_dropout_type == "drop":
+            self.dropout = DropTokenDropout(emb_dropout)
+        elif emb_dropout_type == "zero":
+            self.dropout = ZeroTokenDropout(emb_dropout)
+        elif emb_dropout_type == "normal":
+            self.dropout = nn.Dropout(emb_dropout)
+        self.transformer = TransformerCrossAttn(
+            dim,
+            depth,
+            heads,
+            dim_head,
+            mlp_dim,
+            dropout,
+            norm=norm,
+            norm_cond_dim=norm_cond_dim,
+            context_dim=context_dim,
+        )
+    def forward(self, inp: torch.Tensor, *args, context=None, context_list=None):
+        x = self.to_token_embedding(inp)
+        b, n, _ = x.shape
+        x = self.dropout(x)
+        x += self.pos_embedding[:, :n]
+        x = self.transformer(x, *args, context=context, context_list=context_list)
+        return x

prima/models/components/position_encoding.py ADDED Viewed

	@@ -0,0 +1,84 @@

+"""
+PRIMA: Boosting Animal Mesh Recovery with Biological Priors and Test-Time Adaptation
+Official implementation of the paper:
+"PRIMA: Boosting Animal Mesh Recovery with Biological Priors and Test-Time Adaptation"
+by Xiaohang Yu, Ti Wang, and Mackenzie Weygandt Mathis
+Licensed under a modified MIT license
+"""
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+import math
+from typing import Any, Optional, Tuple
+import numpy as np
+import torch
+from torch import nn
+# Rotary Positional Encoding, adapted from:
+# 1. https://github.com/meta-llama/codellama/blob/main/llama/model.py
+# 2. https://github.com/naver-ai/rope-vit
+# 3. https://github.com/lucidrains/rotary-embedding-torch
+def init_t_xy(end_x: int, end_y: int):
+    t = torch.arange(end_x * end_y, dtype=torch.float32)
+    t_x = (t % end_x).float()
+    t_y = torch.div(t, end_x, rounding_mode="floor").float()
+    return t_x, t_y
+def compute_axial_cis(dim: int, end_x: int, end_y: int, theta: float = 10000.0):
+    freqs_x = 1.0 / (theta ** (torch.arange(0, dim, 4)[: (dim // 4)].float() / dim))
+    freqs_y = 1.0 / (theta ** (torch.arange(0, dim, 4)[: (dim // 4)].float() / dim))
+    t_x, t_y = init_t_xy(end_x, end_y)
+    freqs_x = torch.outer(t_x, freqs_x)
+    freqs_y = torch.outer(t_y, freqs_y)
+    freqs_cis_x = torch.polar(torch.ones_like(freqs_x), freqs_x)
+    freqs_cis_y = torch.polar(torch.ones_like(freqs_y), freqs_y)
+    return torch.cat([freqs_cis_x, freqs_cis_y], dim=-1)
+def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor):
+    ndim = x.ndim
+    assert 0 <= 1 < ndim
+    assert freqs_cis.shape == (x.shape[-2], x.shape[-1])
+    shape = [d if i >= ndim - 2 else 1 for i, d in enumerate(x.shape)]
+    return freqs_cis.view(*shape)
+def apply_rotary_enc(
+    xq: torch.Tensor,
+    xk: torch.Tensor,
+    freqs_cis: torch.Tensor,
+    repeat_freqs_k: bool = False,
+):
+    """Rotate queries and keys by complex multiplication with ``freqs_cis``.
+    Args:
+        xq: Query tensor; its last dimension is reinterpreted as (real, imag) pairs.
+            NOTE(review): the ``flatten(3)`` calls assume 4-D inputs - confirm with callers.
+        xk: Key tensor, handled like ``xq``. Returned untouched when its
+            second-to-last dimension is empty.
+        freqs_cis: Complex factors whose shape must equal the last two dimensions of
+            the complex view of ``xq`` (checked in ``reshape_for_broadcast``).
+        repeat_freqs_k: If True, tile ``freqs_cis`` along the sequence axis so it
+            covers a key sequence that is an integer multiple of the query length.
+    Returns:
+        Tuple ``(xq_out, xk_out)`` cast back to the dtypes/devices of ``xq`` and ``xk``.
+    """
+    # Pair up the last dimension and view each pair as one complex number.
+    xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))
+    xk_ = (
+        torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))
+        if xk.shape[-2] != 0
+        else None
+    )
+    freqs_cis = reshape_for_broadcast(freqs_cis, xq_)
+    # Complex multiply, then merge the (real, imag) pairs back into the last dimension.
+    xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)
+    if xk_ is None:
+        # no keys to rotate, due to dropout
+        return xq_out.type_as(xq).to(xq.device), xk
+    # repeat freqs along seq_len dim to match k seq_len
+    if repeat_freqs_k:
+        r = xk_.shape[-2] // xq_.shape[-2]
+        if freqs_cis.is_cuda:
+            freqs_cis = freqs_cis.repeat(*([1] * (freqs_cis.ndim - 2)), r, 1)
+        else:
+            # torch.repeat on complex numbers may not be supported on non-CUDA devices
+            # (freqs_cis has 4 dims and we repeat on dim 2) so we use expand + flatten
+            freqs_cis = freqs_cis.unsqueeze(2).expand(-1, -1, r, -1, -1).flatten(2, 3)
+    xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)
+    return xq_out.type_as(xq).to(xq.device), xk_out.type_as(xk).to(xk.device)

prima/models/components/t_cond_mlp.py ADDED Viewed

	@@ -0,0 +1,204 @@

+"""
+PRIMA: Boosting Animal Mesh Recovery with Biological Priors and Test-Time Adaptation
+Official implementation of the paper:
+"PRIMA: Boosting Animal Mesh Recovery with Biological Priors and Test-Time Adaptation"
+by Xiaohang Yu, Ti Wang, and Mackenzie Weygandt Mathis
+Licensed under a modified MIT license
+"""
+import copy
+from typing import List, Optional
+import torch
+class AdaptiveLayerNorm1D(torch.nn.Module):
+    def __init__(self, data_dim: int, norm_cond_dim: int):
+        super().__init__()
+        if data_dim <= 0:
+            raise ValueError(f"data_dim must be positive, but got {data_dim}")
+        if norm_cond_dim <= 0:
+            raise ValueError(f"norm_cond_dim must be positive, but got {norm_cond_dim}")
+        self.norm = torch.nn.LayerNorm(data_dim)
+        self.linear = torch.nn.Linear(norm_cond_dim, 2 * data_dim)
+        torch.nn.init.zeros_(self.linear.weight)
+        torch.nn.init.zeros_(self.linear.bias)
+    def forward(self, x: torch.Tensor, t: torch.Tensor) -> torch.Tensor:
+        # x: (batch, ..., data_dim)
+        # t: (batch, norm_cond_dim)
+        # return: (batch, data_dim)
+        x = self.norm(x)
+        alpha, beta = self.linear(t).chunk(2, dim=-1)
+        # Add singleton dimensions to alpha and beta
+        if x.dim() > 2:
+            alpha = alpha.view(alpha.shape[0], *([1] * (x.dim() - 2)), alpha.shape[1])
+            beta = beta.view(beta.shape[0], *([1] * (x.dim() - 2)), beta.shape[1])
+        return x * (1 + alpha) + beta
+class SequentialCond(torch.nn.Sequential):
+    def forward(self, input, *args, **kwargs):
+        for module in self:
+            if isinstance(module, (AdaptiveLayerNorm1D, SequentialCond, ResidualMLPBlock)):
+                input = module(input, *args, **kwargs)
+            else:
+                input = module(input)
+        return input
+def normalization_layer(norm: Optional[str], dim: int, norm_cond_dim: int = -1):
+    if norm == "batch":
+        return torch.nn.BatchNorm1d(dim)
+    elif norm == "layer":
+        return torch.nn.LayerNorm(dim)
+    elif norm == "ada":
+        assert norm_cond_dim > 0, f"norm_cond_dim must be positive, got {norm_cond_dim}"
+        return AdaptiveLayerNorm1D(dim, norm_cond_dim)
+    elif norm is None:
+        return torch.nn.Identity()
+    else:
+        raise ValueError(f"Unknown norm: {norm}")
+def linear_norm_activ_dropout(
+    input_dim: int,
+    output_dim: int,
+    activation: torch.nn.Module = torch.nn.ReLU(),
+    bias: bool = True,
+    norm: Optional[str] = "layer",  # Options: ada/batch/layer
+    dropout: float = 0.0,
+    norm_cond_dim: int = -1,
+) -> SequentialCond:
+    layers = []
+    layers.append(torch.nn.Linear(input_dim, output_dim, bias=bias))
+    if norm is not None:
+        layers.append(normalization_layer(norm, output_dim, norm_cond_dim))
+    layers.append(copy.deepcopy(activation))
+    if dropout > 0.0:
+        layers.append(torch.nn.Dropout(dropout))
+    return SequentialCond(*layers)
+def create_simple_mlp(
+    input_dim: int,
+    hidden_dims: List[int],
+    output_dim: int,
+    activation: torch.nn.Module = torch.nn.ReLU(),
+    bias: bool = True,
+    norm: Optional[str] = "layer",  # Options: ada/batch/layer
+    dropout: float = 0.0,
+    norm_cond_dim: int = -1,
+) -> SequentialCond:
+    layers = []
+    prev_dim = input_dim
+    for hidden_dim in hidden_dims:
+        layers.extend(
+            linear_norm_activ_dropout(
+                prev_dim, hidden_dim, activation, bias, norm, dropout, norm_cond_dim
+            )
+        )
+        prev_dim = hidden_dim
+    layers.append(torch.nn.Linear(prev_dim, output_dim, bias=bias))
+    return SequentialCond(*layers)
+class ResidualMLPBlock(torch.nn.Module):
+    def __init__(
+        self,
+        input_dim: int,
+        hidden_dim: int,
+        num_hidden_layers: int,
+        output_dim: int,
+        activation: torch.nn.Module = torch.nn.ReLU(),
+        bias: bool = True,
+        norm: Optional[str] = "layer",  # Options: ada/batch/layer
+        dropout: float = 0.0,
+        norm_cond_dim: int = -1,
+    ):
+        super().__init__()
+        if not (input_dim == output_dim == hidden_dim):
+            raise NotImplementedError(
+                f"input_dim {input_dim} != output_dim {output_dim} is not implemented"
+            )
+        layers = []
+        prev_dim = input_dim
+        for i in range(num_hidden_layers):
+            layers.append(
+                linear_norm_activ_dropout(
+                    prev_dim, hidden_dim, activation, bias, norm, dropout, norm_cond_dim
+                )
+            )
+            prev_dim = hidden_dim
+        self.model = SequentialCond(*layers)
+        self.skip = torch.nn.Identity()
+    def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
+        return x + self.model(x, *args, **kwargs)
+class ResidualMLP(torch.nn.Module):
+    def __init__(
+        self,
+        input_dim: int,
+        hidden_dim: int,
+        num_hidden_layers: int,
+        output_dim: int,
+        activation: torch.nn.Module = torch.nn.ReLU(),
+        bias: bool = True,
+        norm: Optional[str] = "layer",  # Options: ada/batch/layer
+        dropout: float = 0.0,
+        num_blocks: int = 1,
+        norm_cond_dim: int = -1,
+    ):
+        super().__init__()
+        self.input_dim = input_dim
+        self.model = SequentialCond(
+            linear_norm_activ_dropout(
+                input_dim, hidden_dim, activation, bias, norm, dropout, norm_cond_dim
+            ),
+            *[
+                ResidualMLPBlock(
+                    hidden_dim,
+                    hidden_dim,
+                    num_hidden_layers,
+                    hidden_dim,
+                    activation,
+                    bias,
+                    norm,
+                    dropout,
+                    norm_cond_dim,
+                )
+                for _ in range(num_blocks)
+            ],
+            torch.nn.Linear(hidden_dim, output_dim, bias=bias),
+        )
+    def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
+        return self.model(x, *args, **kwargs)
+class FrequencyEmbedder(torch.nn.Module):
+    def __init__(self, num_frequencies, max_freq_log2):
+        super().__init__()
+        frequencies = 2 ** torch.linspace(0, max_freq_log2, steps=num_frequencies)
+        self.register_buffer("frequencies", frequencies)
+    def forward(self, x):
+        # x should be of size (N,) or (N, D)
+        N = x.size(0)
+        if x.dim() == 1:  # (N,)
+            x = x.unsqueeze(1)  # (N, D) where D=1
+        x_unsqueezed = x.unsqueeze(-1)  # (N, D, 1)
+        scaled = self.frequencies.view(1, 1, -1) * x_unsqueezed  # (N, D, num_frequencies)
+        s = torch.sin(scaled)
+        c = torch.cos(scaled)
+        embedded = torch.cat([s, c, x_unsqueezed], dim=-1).view(
+            N, -1
+        )  # (N, D * 2 * num_frequencies + D)
+        return embedded

prima/models/components/transformer.py ADDED Viewed

	@@ -0,0 +1,400 @@

+"""
+PRIMA: Boosting Animal Mesh Recovery with Biological Priors and Test-Time Adaptation
+Official implementation of the paper:
+"PRIMA: Boosting Animal Mesh Recovery with Biological Priors and Test-Time Adaptation"
+by Xiaohang Yu, Ti Wang, and Mackenzie Weygandt Mathis
+Licensed under a modified MIT license
+"""
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+import contextlib
+import math
+import warnings
+from functools import partial
+from typing import Tuple, Type
+import torch
+import torch.nn.functional as F
+from torch import nn, Tensor
+from .position_encoding import apply_rotary_enc, compute_axial_cis
+from .model_utils import MLP
+warnings.simplefilter(action="ignore", category=FutureWarning)
+def get_sdpa_settings():
+    if torch.cuda.is_available():
+        old_gpu = torch.cuda.get_device_properties(0).major < 7
+        # only use Flash Attention on Ampere (8.0) or newer GPUs
+        use_flash_attn = torch.cuda.get_device_properties(0).major >= 8
+        if not use_flash_attn:
+            warnings.warn(
+                "Flash Attention is disabled as it requires a GPU with Ampere (8.0) CUDA capability.",
+                category=UserWarning,
+                stacklevel=2,
+            )
+        # keep math kernel for PyTorch versions before 2.2 (Flash Attention v2 is only
+        # available on PyTorch 2.2+, while Flash Attention v1 cannot handle all cases)
+        pytorch_version = tuple(int(v) for v in torch.__version__.split(".")[:2])
+        if pytorch_version < (2, 2):
+            warnings.warn(
+                f"You are using PyTorch {torch.__version__} without Flash Attention v2 support. "
+                "Consider upgrading to PyTorch 2.2+ for Flash Attention v2 (which could be faster).",
+                category=UserWarning,
+                stacklevel=2,
+            )
+        math_kernel_on = pytorch_version < (2, 2) or not use_flash_attn
+    else:
+        old_gpu = True
+        use_flash_attn = False
+        math_kernel_on = True
+    return old_gpu, use_flash_attn, math_kernel_on
+# Check whether Flash Attention is available (and use it by default).
+# Evaluated once at import time, so any capability warnings are emitted on import.
+OLD_GPU, USE_FLASH_ATTN, MATH_KERNEL_ON = get_sdpa_settings()
+# A fallback setting to allow all available kernels if Flash Attention fails.
+# Attention.forward and RoPEAttention.forward set this to True after a failed
+# attention call; sdp_kernel_context then stops restricting the kernel choice.
+ALLOW_ALL_KERNELS = False
+def sdp_kernel_context(dropout_p):
+    """
+    Get the context for the attention scaled dot-product kernel. We use Flash Attention
+    by default, but fall back to all available kernels if Flash Attention fails.
+    """
+    if ALLOW_ALL_KERNELS:
+        return contextlib.nullcontext()
+    return torch.backends.cuda.sdp_kernel(
+        enable_flash=USE_FLASH_ATTN,
+        # if Flash attention kernel is off, then math kernel needs to be enabled
+        enable_math=(OLD_GPU and dropout_p > 0.0) or MATH_KERNEL_ON,
+        enable_mem_efficient=OLD_GPU,
+    )
+class TwoWayTransformer(nn.Module):
+    def __init__(
+        self,
+        depth: int,
+        embedding_dim: int,
+        num_heads: int,
+        mlp_dim: int,
+        activation: Type[nn.Module] = nn.ReLU,
+        attention_downsample_rate: int = 2,
+    ) -> None:
+        """
+        A transformer decoder that attends to an input image using
+        queries whose positional embedding is supplied.
+        Args:
+          depth (int): number of layers in the transformer
+          embedding_dim (int): the channel dimension for the input embeddings
+          num_heads (int): the number of heads for multihead attention. Must
+            divide embedding_dim
+          mlp_dim (int): the channel dimension internal to the MLP block
+          activation (nn.Module): the activation to use in the MLP block
+        """
+        super().__init__()
+        self.depth = depth
+        self.embedding_dim = embedding_dim
+        self.num_heads = num_heads
+        self.mlp_dim = mlp_dim
+        self.layers = nn.ModuleList()
+        for i in range(depth):
+            self.layers.append(
+                TwoWayAttentionBlock(
+                    embedding_dim=embedding_dim,
+                    num_heads=num_heads,
+                    mlp_dim=mlp_dim,
+                    activation=activation,
+                    attention_downsample_rate=attention_downsample_rate,
+                    skip_first_layer_pe=(i == 0),
+                )
+            )
+        self.final_attn_token_to_image = Attention(
+            embedding_dim, num_heads, downsample_rate=attention_downsample_rate
+        )
+        self.norm_final_attn = nn.LayerNorm(embedding_dim)
+    def forward(
+        self,
+        image_embedding: Tensor,
+        image_pe: Tensor,
+        point_embedding: Tensor,
+    ) -> Tuple[Tensor, Tensor]:
+        """
+        Args:
+          image_embedding (torch.Tensor): image to attend to. Should be shape
+            B x embedding_dim x h x w for any h and w.
+          image_pe (torch.Tensor): the positional encoding to add to the image. Must
+            have the same shape as image_embedding.
+          point_embedding (torch.Tensor): the embedding to add to the query points.
+            Must have shape B x N_points x embedding_dim for any N_points.
+        Returns:
+          torch.Tensor: the processed point_embedding
+          torch.Tensor: the processed image_embedding
+        """
+        # BxCxHxW -> BxHWxC == B x N_image_tokens x C
+        bs, c, h, w = image_embedding.shape
+        image_embedding = image_embedding.flatten(2).permute(0, 2, 1)
+        image_pe = image_pe.flatten(2).permute(0, 2, 1)
+        # Prepare queries
+        queries = point_embedding
+        keys = image_embedding
+        # Apply transformer blocks and final layernorm
+        for layer in self.layers:
+            queries, keys = layer(
+                queries=queries,
+                keys=keys,
+                query_pe=point_embedding,
+                key_pe=image_pe,
+            )
+        # Apply the final attention layer from the points to the image
+        q = queries + point_embedding
+        k = keys + image_pe
+        attn_out = self.final_attn_token_to_image(q=q, k=k, v=keys)
+        queries = queries + attn_out
+        queries = self.norm_final_attn(queries)
+        return queries, keys
+class TwoWayAttentionBlock(nn.Module):
+    def __init__(
+        self,
+        embedding_dim: int,
+        num_heads: int,
+        mlp_dim: int = 2048,
+        activation: Type[nn.Module] = nn.ReLU,
+        attention_downsample_rate: int = 2,
+        skip_first_layer_pe: bool = False,
+    ) -> None:
+        """
+        A transformer block with four layers: (1) self-attention of sparse
+        inputs, (2) cross attention of sparse inputs to dense inputs, (3) mlp
+        block on sparse inputs, and (4) cross attention of dense inputs to sparse
+        inputs.
+        Arguments:
+          embedding_dim (int): the channel dimension of the embeddings
+          num_heads (int): the number of heads in the attention layers
+          mlp_dim (int): the hidden dimension of the mlp block
+          activation (nn.Module): the activation of the mlp block
+          skip_first_layer_pe (bool): skip the PE on the first layer
+        """
+        super().__init__()
+        self.self_attn = Attention(embedding_dim, num_heads)
+        self.norm1 = nn.LayerNorm(embedding_dim)
+        self.cross_attn_token_to_image = Attention(
+            embedding_dim, num_heads, downsample_rate=attention_downsample_rate
+        )
+        self.norm2 = nn.LayerNorm(embedding_dim)
+        self.mlp = MLP(
+            embedding_dim, mlp_dim, embedding_dim, num_layers=2, activation=activation
+        )
+        self.norm3 = nn.LayerNorm(embedding_dim)
+        self.norm4 = nn.LayerNorm(embedding_dim)
+        self.cross_attn_image_to_token = Attention(
+            embedding_dim, num_heads, downsample_rate=attention_downsample_rate
+        )
+        self.skip_first_layer_pe = skip_first_layer_pe
+    def forward(
+        self, queries: Tensor, keys: Tensor, query_pe: Tensor, key_pe: Tensor
+    ) -> Tuple[Tensor, Tensor]:
+        # Self attention block
+        if self.skip_first_layer_pe:
+            queries = self.self_attn(q=queries, k=queries, v=queries)
+        else:
+            q = queries + query_pe
+            attn_out = self.self_attn(q=q, k=q, v=queries)
+            queries = queries + attn_out
+        queries = self.norm1(queries)
+        # Cross attention block, tokens attending to image embedding
+        q = queries + query_pe
+        k = keys + key_pe
+        attn_out = self.cross_attn_token_to_image(q=q, k=k, v=keys)
+        queries = queries + attn_out
+        queries = self.norm2(queries)
+        # MLP block
+        mlp_out = self.mlp(queries)
+        queries = queries + mlp_out
+        queries = self.norm3(queries)
+        # Cross attention block, image embedding attending to tokens
+        q = queries + query_pe
+        k = keys + key_pe
+        attn_out = self.cross_attn_image_to_token(q=k, k=q, v=queries)
+        keys = keys + attn_out
+        keys = self.norm4(keys)
+        return queries, keys
+class Attention(nn.Module):
+    """
+    An attention layer that allows for downscaling the size of the embedding
+    after projection to queries, keys, and values.
+    """
+    def __init__(
+        self,
+        embedding_dim: int,
+        num_heads: int,
+        downsample_rate: int = 1,
+        dropout: float = 0.0,
+        kv_in_dim: int = None,
+    ) -> None:
+        """
+        Args:
+          embedding_dim (int): channel size of the query input and of the output
+          num_heads (int): number of attention heads
+          downsample_rate (int): the projections map to
+            embedding_dim // downsample_rate channels
+          dropout (float): attention dropout probability, applied only in training
+          kv_in_dim (int): channel size of the key/value inputs; None means
+            embedding_dim
+        """
+        super().__init__()
+        self.embedding_dim = embedding_dim
+        self.kv_in_dim = kv_in_dim if kv_in_dim is not None else embedding_dim
+        self.internal_dim = embedding_dim // downsample_rate
+        self.num_heads = num_heads
+        # The check is on the downsampled width (internal_dim), not embedding_dim.
+        assert (
+            self.internal_dim % num_heads == 0
+        ), "num_heads must divide embedding_dim."
+        self.q_proj = nn.Linear(embedding_dim, self.internal_dim)
+        self.k_proj = nn.Linear(self.kv_in_dim, self.internal_dim)
+        self.v_proj = nn.Linear(self.kv_in_dim, self.internal_dim)
+        self.out_proj = nn.Linear(self.internal_dim, embedding_dim)
+        self.dropout_p = dropout
+    def _separate_heads(self, x: Tensor, num_heads: int) -> Tensor:
+        """Split the channels of a B x N_tokens x C tensor into num_heads heads."""
+        b, n, c = x.shape
+        x = x.reshape(b, n, num_heads, c // num_heads)
+        return x.transpose(1, 2).contiguous()  # B x N_heads x N_tokens x C_per_head
+    def _recombine_heads(self, x: Tensor) -> Tensor:
+        """Merge B x N_heads x N_tokens x C_per_head back into B x N_tokens x C."""
+        b, n_heads, n_tokens, c_per_head = x.shape
+        x = x.transpose(1, 2).contiguous()
+        return x.reshape(b, n_tokens, n_heads * c_per_head)  # B x N_tokens x C
+    def forward(self, q: Tensor, k: Tensor, v: Tensor) -> Tensor:
+        """
+        Multi-head scaled dot-product attention.
+        Args:
+          q (torch.Tensor): B x N_q x embedding_dim
+          k (torch.Tensor): B x N_k x kv_in_dim
+          v (torch.Tensor): B x N_k x kv_in_dim
+        Returns:
+          torch.Tensor: B x N_q x embedding_dim
+        """
+        # Input projections
+        q = self.q_proj(q)
+        k = self.k_proj(k)
+        v = self.v_proj(v)
+        # Separate into heads
+        q = self._separate_heads(q, self.num_heads)
+        k = self._separate_heads(k, self.num_heads)
+        v = self._separate_heads(v, self.num_heads)
+        # Dropout is only active in training mode
+        dropout_p = self.dropout_p if self.training else 0.0
+        # Attention
+        try:
+            with sdp_kernel_context(dropout_p):
+                out = F.scaled_dot_product_attention(q, k, v, dropout_p=dropout_p)
+        except Exception as e:
+            # Fall back to all kernels if the Flash attention kernel fails
+            # NOTE: any exception raised by the attention call takes this path,
+            # and the retry below is unguarded, so a second failure propagates.
+            warnings.warn(
+                f"Flash Attention kernel failed due to: {e}\nFalling back to all available "
+                f"kernels for scaled_dot_product_attention (which may have a slower speed).",
+                category=UserWarning,
+                stacklevel=2,
+            )
+            # Module-wide switch: later calls from every attention layer in this
+            # module skip the kernel restriction as well.
+            global ALLOW_ALL_KERNELS
+            ALLOW_ALL_KERNELS = True
+            out = F.scaled_dot_product_attention(q, k, v, dropout_p=dropout_p)
+        out = self._recombine_heads(out)
+        out = self.out_proj(out)
+        return out
+class RoPEAttention(Attention):
+    """Attention with rotary position encoding."""
+    def __init__(
+        self,
+        *args,
+        rope_theta=10000.0,
+        # whether to repeat q rope to match k length
+        # this is needed for cross-attention to memories
+        rope_k_repeat=False,
+        feat_sizes=(32, 32),  # [w, h] for stride 16 feats at 512 resolution
+        **kwargs,
+    ):
+        """
+        Args:
+          *args, **kwargs: forwarded unchanged to Attention.__init__
+          rope_theta (float): passed as theta to compute_axial_cis
+          rope_k_repeat (bool): passed as repeat_freqs_k to apply_rotary_enc;
+            must be True when queries and keys differ in length
+          feat_sizes (tuple): (end_x, end_y) used to precompute the initial
+            rotary table
+        """
+        super().__init__(*args, **kwargs)
+        # Per-head channel width is bound once; end_x/end_y are supplied per call.
+        self.compute_cis = partial(
+            compute_axial_cis, dim=self.internal_dim // self.num_heads, theta=rope_theta
+        )
+        freqs_cis = self.compute_cis(end_x=feat_sizes[0], end_y=feat_sizes[1])
+        # Kept as a plain attribute (not a registered buffer), which is why
+        # forward moves it to the query device explicitly.
+        self.freqs_cis = freqs_cis
+        self.rope_k_repeat = rope_k_repeat
+    def forward(
+        self, q: Tensor, k: Tensor, v: Tensor, num_k_exclude_rope: int=0,
+    ) -> Tensor:
+        """
+        Attention in which rotary position encoding is applied to the projected
+        queries and to the leading keys.
+        Args:
+          q (torch.Tensor): B x N_q x embedding_dim
+          k (torch.Tensor): B x N_k x kv_in_dim
+          v (torch.Tensor): B x N_k x kv_in_dim
+          num_k_exclude_rope (int): number of trailing key tokens that are left
+            without rotary encoding
+        Returns:
+          torch.Tensor: B x N_q x embedding_dim
+        """
+        # Input projections
+        q = self.q_proj(q)
+        k = self.k_proj(k)
+        v = self.v_proj(v)
+        # Separate into heads
+        q = self._separate_heads(q, self.num_heads)
+        k = self._separate_heads(k, self.num_heads)
+        v = self._separate_heads(v, self.num_heads)
+        # Apply rotary position encoding
+        # NOTE(review): math.sqrt returns a float, so end_x/end_y below are floats
+        # and are non-integral when the query count is not a perfect square;
+        # presumably a square token grid is assumed - confirm against
+        # compute_axial_cis.
+        w = h = math.sqrt(q.shape[-2])
+        self.freqs_cis = self.freqs_cis.to(q.device)
+        # Rebuild (and cache on self) the table when the query count has changed
+        if self.freqs_cis.shape[0] != q.shape[-2]:
+            self.freqs_cis = self.compute_cis(end_x=w, end_y=h).to(q.device)
+        if q.shape[-2] != k.shape[-2]:
+            assert self.rope_k_repeat
+        num_k_rope = k.size(-2) - num_k_exclude_rope
+        # The encoded keys are written back in place into the leading slice of k;
+        # the last num_k_exclude_rope keys keep their un-rotated values.
+        q, k[:, :, :num_k_rope] = apply_rotary_enc(
+            q,
+            k[:, :, :num_k_rope],
+            freqs_cis=self.freqs_cis,
+            repeat_freqs_k=self.rope_k_repeat,
+        )
+        # Dropout is only active in training mode
+        dropout_p = self.dropout_p if self.training else 0.0
+        # Attention
+        try:
+            with sdp_kernel_context(dropout_p):
+                out = F.scaled_dot_product_attention(q, k, v, dropout_p=dropout_p)
+        except Exception as e:
+            # Fall back to all kernels if the Flash attention kernel fails
+            # NOTE: any exception raised by the attention call takes this path,
+            # and the retry below is unguarded, so a second failure propagates.
+            warnings.warn(
+                f"Flash Attention kernel failed due to: {e}\nFalling back to all available "
+                f"kernels for scaled_dot_product_attention (which may have a slower speed).",
+                category=UserWarning,
+                stacklevel=2,
+            )
+            # Module-wide switch shared with every other attention layer here
+            global ALLOW_ALL_KERNELS
+            ALLOW_ALL_KERNELS = True
+            out = F.scaled_dot_product_attention(q, k, v, dropout_p=dropout_p)
+        out = self._recombine_heads(out)
+        out = self.out_proj(out)
+        return out

prima/models/discriminator.py ADDED Viewed

	@@ -0,0 +1,129 @@

+"""
+PRIMA: Boosting Animal Mesh Recovery with Biological Priors and Test-Time Adaptation
+Official implementation of the paper:
+"PRIMA: Boosting Animal Mesh Recovery with Biological Priors and Test-Time Adaptation"
+by Xiaohang Yu, Ti Wang, and Mackenzie Weygandt Mathis
+Licensed under a modified MIT license
+"""
+import torch
+import torch.nn as nn
+class Discriminator(nn.Module):
+    def __init__(self):
+        """
+        Pose + Shape discriminator proposed in HMR
+        """
+        super(Discriminator, self).__init__()
+        self.num_joints = 34
+        # poses_alone
+        self.D_conv1 = nn.Conv2d(9, 32, kernel_size=1)
+        nn.init.xavier_uniform_(self.D_conv1.weight)
+        nn.init.zeros_(self.D_conv1.bias)
+        self.relu = nn.ReLU(inplace=True)
+        self.D_conv2 = nn.Conv2d(32, 32, kernel_size=1)
+        nn.init.xavier_uniform_(self.D_conv2.weight)
+        nn.init.zeros_(self.D_conv2.bias)
+        pose_out = []
+        for i in range(self.num_joints):
+            pose_out_temp = nn.Linear(32, 1)
+            nn.init.xavier_uniform_(pose_out_temp.weight)
+            nn.init.zeros_(pose_out_temp.bias)
+            pose_out.append(pose_out_temp)
+        self.pose_out = nn.ModuleList(pose_out)
+        # betas
+        self.betas_fc1 = nn.Linear(41, 10)  # SMAL betas is 41
+        nn.init.xavier_uniform_(self.betas_fc1.weight)
+        nn.init.zeros_(self.betas_fc1.bias)
+        self.betas_fc2 = nn.Linear(10, 5)
+        nn.init.xavier_uniform_(self.betas_fc2.weight)
+        nn.init.zeros_(self.betas_fc2.bias)
+        self.betas_out = nn.Linear(5, 1)
+        nn.init.xavier_uniform_(self.betas_out.weight)
+        nn.init.zeros_(self.betas_out.bias)
+        # bones
+        self.bone_fc1 = nn.Linear(24, 10)  # SMAL betas is 41
+        nn.init.xavier_uniform_(self.bone_fc1.weight)
+        nn.init.zeros_(self.bone_fc1.bias)
+        self.bone_fc2 = nn.Linear(10, 5)
+        nn.init.xavier_uniform_(self.bone_fc2.weight)
+        nn.init.zeros_(self.bone_fc2.bias)
+        self.bone_out = nn.Linear(5, 1)
+        nn.init.xavier_uniform_(self.bone_out.weight)
+        nn.init.zeros_(self.bone_out.bias)
+        # poses_joint
+        self.D_alljoints_fc1 = nn.Linear(32 * self.num_joints, 1024)
+        nn.init.xavier_uniform_(self.D_alljoints_fc1.weight)
+        nn.init.zeros_(self.D_alljoints_fc1.bias)
+        self.D_alljoints_fc2 = nn.Linear(1024, 1024)
+        nn.init.xavier_uniform_(self.D_alljoints_fc2.weight)
+        nn.init.zeros_(self.D_alljoints_fc2.bias)
+        self.D_alljoints_out = nn.Linear(1024, 1)
+        nn.init.xavier_uniform_(self.D_alljoints_out.weight)
+        nn.init.zeros_(self.D_alljoints_out.bias)
+    def forward(self, poses: torch.Tensor, betas: torch.Tensor, bone=None) -> torch.Tensor:
+        """
+        Forward pass of the discriminator.
+        Args:
+            poses (torch.Tensor): Tensor of shape (B, 23, 3, 3) containing a batch of poses (excluding the global orientation).
+            betas (torch.Tensor): Tensor of shape (B, 41) containing a batch of SMAL beta coefficients.
+        Returns:
+            torch.Tensor: Discriminator output with shape (B, 25)
+        """
+        # bn = poses.shape[0]
+        # poses B x 207
+        # poses = poses.reshape(bn, -1)
+        # poses B x num_joints x 1 x 9
+        poses = poses.reshape(-1, self.num_joints, 1, 9)
+        bn = poses.shape[0]
+        # poses B x 9 x num_joints x 1
+        poses = poses.permute(0, 3, 1, 2).contiguous()
+        # poses_alone
+        poses = self.D_conv1(poses)
+        poses = self.relu(poses)
+        poses = self.D_conv2(poses)
+        poses = self.relu(poses)
+        poses_out = []
+        for i in range(self.num_joints):
+            poses_out_ = self.pose_out[i](poses[:, :, i, 0])
+            poses_out.append(poses_out_)
+        poses_out = torch.cat(poses_out, dim=1)
+        # betas
+        betas = self.betas_fc1(betas)
+        betas = self.relu(betas)
+        betas = self.betas_fc2(betas)
+        betas = self.relu(betas)
+        betas_out = self.betas_out(betas)
+        # bone
+        if bone is not None:
+            bone = self.bone_fc1(bone)
+            bone = self.relu(bone)
+            bone = self.bone_fc2(bone)
+            bone = self.relu(bone)
+            bone_out = self.bone_out(bone)
+        # poses_joint
+        poses = poses.reshape(bn, -1)
+        poses_all = self.D_alljoints_fc1(poses)
+        poses_all = self.relu(poses_all)
+        poses_all = self.D_alljoints_fc2(poses_all)
+        poses_all = self.relu(poses_all)
+        poses_all_out = self.D_alljoints_out(poses_all)
+        if bone is not None:
+            disc_out = torch.cat((poses_out, betas_out, poses_all_out, bone_out), 1)
+        else:
+            disc_out = torch.cat((poses_out, betas_out, poses_all_out), 1)
+        return disc_out

prima/models/heads/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ from .smal_head import build_smal_head