Upload folder using huggingface_hub
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +16 -0
- .gitignore +5 -0
- .gitmodules +6 -0
- LICENSE +36 -0
- README.md +75 -0
- SyntheticRecorder.cs +667 -0
- assets/teaser.png +3 -0
- configs/__init__.py +30 -0
- configs/callbacks/ckpt_saver/every10000s_top100.yaml +5 -0
- configs/callbacks/lr_monitor/pl.yaml +2 -0
- configs/callbacks/metric/metric_3dpw.yaml +2 -0
- configs/callbacks/metric/metric_3dpw_occ.yaml +2 -0
- configs/callbacks/metric/metric_aistpp.yaml +2 -0
- configs/callbacks/metric/metric_emdb1.yaml +4 -0
- configs/callbacks/metric/metric_emdb2.yaml +4 -0
- configs/callbacks/metric/metric_rich.yaml +3 -0
- configs/callbacks/metric/metric_unity.yaml +4 -0
- configs/callbacks/prog_bar/prog_reporter_ed1.yaml +5 -0
- configs/callbacks/train_speed_timer/base.yaml +3 -0
- configs/callbacks/vis/vis_music.yaml +2 -0
- configs/callbacks/vis/vis_speech.yaml +2 -0
- configs/callbacks/vis/vis_text.yaml +2 -0
- configs/callbacks/vis/vis_unity_val.yaml +14 -0
- configs/data/collate_cfg/default.yaml +23 -0
- configs/data/mocap/trainX_testY.yaml +21 -0
- configs/demo.yaml +85 -0
- configs/diffusion/ddim.yaml +8 -0
- configs/endecoder/v1_amass_local_bedlam_cam.yaml +2 -0
- configs/exp/genmo_lg.yaml +64 -0
- configs/finetune_unity.yaml +95 -0
- configs/hydra/default.yaml +19 -0
- configs/infer_video.yaml +67 -0
- configs/model/genmo.yaml +45 -0
- configs/network/diffusion.yaml +25 -0
- configs/optimizer/adamw_2e-4.yaml +2 -0
- configs/pipeline/dual_mode.yaml +37 -0
- configs/scheduler/epoch_half_200_350.yaml +6 -0
- configs/test_datasets/3dpw_fliptest.yaml +3 -0
- configs/test_datasets/3dpw_occ_fliptest.yaml +3 -0
- configs/test_datasets/emdb1_fliptest.yaml +4 -0
- configs/test_datasets/emdb2_fliptest.yaml +4 -0
- configs/test_datasets/humanml3d_eval.yaml +7 -0
- configs/test_datasets/rich_test.yaml +2 -0
- configs/text_encoder/t5_3b.yaml +3 -0
- configs/train.yaml +55 -0
- configs/train_datasets/3dpw_occ_v1.yaml +2 -0
- configs/train_datasets/3dpw_v1.yaml +2 -0
- configs/train_datasets/aistpp_train.yaml +7 -0
- configs/train_datasets/amass_train_v11.yaml +9 -0
- configs/train_datasets/beat2_static_train.yaml +6 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,19 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
assets/teaser.png filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
test.mp4 filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
test_10.mp4 filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
test_11.mp4 filter=lfs diff=lfs merge=lfs -text
|
| 40 |
+
test_6.mp4 filter=lfs diff=lfs merge=lfs -text
|
| 41 |
+
third_party/DroidCalib/build/temp.linux-x86_64-3.10/.ninja_deps filter=lfs diff=lfs merge=lfs -text
|
| 42 |
+
third_party/DroidCalib/build/temp.linux-x86_64-3.10/src/droid.o filter=lfs diff=lfs merge=lfs -text
|
| 43 |
+
third_party/DroidCalib/misc/droidcalib.png filter=lfs diff=lfs merge=lfs -text
|
| 44 |
+
third_party/DroidCalib/thirdparty/lietorch/examples/registration/assets/image1.png filter=lfs diff=lfs merge=lfs -text
|
| 45 |
+
third_party/DroidCalib/thirdparty/lietorch/examples/registration/assets/image2.png filter=lfs diff=lfs merge=lfs -text
|
| 46 |
+
third_party/DroidCalib/thirdparty/lietorch/examples/registration/assets/image3.png filter=lfs diff=lfs merge=lfs -text
|
| 47 |
+
third_party/DroidCalib/thirdparty/lietorch/examples/registration/assets/image4.png filter=lfs diff=lfs merge=lfs -text
|
| 48 |
+
third_party/DroidCalib/thirdparty/lietorch/examples/registration/assets/registration.gif filter=lfs diff=lfs merge=lfs -text
|
| 49 |
+
third_party/DroidCalib/thirdparty/lietorch/examples/rgbdslam/assets/floor.png filter=lfs diff=lfs merge=lfs -text
|
| 50 |
+
third_party/DroidCalib/thirdparty/lietorch/examples/rgbdslam/assets/room.png filter=lfs diff=lfs merge=lfs -text
|
| 51 |
+
third_party/DroidCalib/thirdparty/lietorch/lietorch.png filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
outputs/*
|
| 2 |
+
dataset-generator/
|
| 3 |
+
out/
|
| 4 |
+
__pycache__/
|
| 5 |
+
*.pyc
|
.gitmodules
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[submodule "third-party/DROID-SLAM/thirdparty/eigen"]
|
| 2 |
+
path = third-party/DROID-SLAM/thirdparty/eigen
|
| 3 |
+
url = https://gitlab.com/libeigen/eigen.git
|
| 4 |
+
[submodule "third_party/GVHMR"]
|
| 5 |
+
path = third_party/GVHMR
|
| 6 |
+
url = git@github.com:zju3dv/GVHMR.git
|
LICENSE
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
NVIDIA License
|
| 2 |
+
|
| 3 |
+
1. Definitions
|
| 4 |
+
|
| 5 |
+
“Licensor” means any person or entity that distributes its Work.
|
| 6 |
+
“Work” means (a) the original work of authorship made available under this license, which may include software, documentation, or other files, and (b) any additions to or derivative works thereof that are made available under this license.
|
| 7 |
+
The terms “reproduce,” “reproduction,” “derivative works,” and “distribution” have the meaning as provided under U.S. copyright law; provided, however, that for the purposes of this license, derivative works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work.
|
| 8 |
+
Works are “made available” under this license by including in or with the Work either (a) a copyright notice referencing the applicability of this license to the Work, or (b) a copy of this license.
|
| 9 |
+
|
| 10 |
+
2. License Grant
|
| 11 |
+
|
| 12 |
+
2.1 Copyright Grant. Subject to the terms and conditions of this license, each Licensor grants to you a perpetual, worldwide, non-exclusive, royalty-free, copyright license to use, reproduce, prepare derivative works of, publicly display, publicly perform, sublicense and distribute its Work and any resulting derivative works in any form.
|
| 13 |
+
|
| 14 |
+
3. Limitations
|
| 15 |
+
|
| 16 |
+
3.1 Redistribution. You may reproduce or distribute the Work only if (a) you do so under this license, (b) you include a complete copy of this license with your distribution, and (c) you retain without modification any copyright, patent, trademark, or attribution notices that are present in the Work.
|
| 17 |
+
|
| 18 |
+
3.2 Derivative Works. You may specify that additional or different terms apply to the use, reproduction, and distribution of your derivative works of the Work (“Your Terms”) only if (a) Your Terms provide that the use limitation in Section 3.3 applies to your derivative works, and (b) you identify the specific derivative works that are subject to Your Terms. Notwithstanding Your Terms, this license (including the redistribution requirements in Section 3.1) will continue to apply to the Work itself.
|
| 19 |
+
|
| 20 |
+
3.3 Use Limitation. The Work and any derivative works thereof only may be used or intended for use non-commercially. Notwithstanding the foregoing, NVIDIA Corporation and its affiliates may use the Work and any derivative works commercially. As used herein, “non-commercially” means for non-commercial academic purposes only.
|
| 21 |
+
|
| 22 |
+
3.4 Patent Claims. If you bring or threaten to bring a patent claim against any Licensor (including any claim, cross-claim or counterclaim in a lawsuit) to enforce any patents that you allege are infringed by any Work, then your rights under this license from such Licensor (including the grant in Section 2.1) will terminate immediately.
|
| 23 |
+
|
| 24 |
+
3.5 Trademarks. This license does not grant any rights to use any Licensor’s or its affiliates’ names, logos, or trademarks, except as necessary to reproduce the notices described in this license.
|
| 25 |
+
|
| 26 |
+
3.6 Termination. If you violate any term of this license, then your rights under this license (including the grant in Section 2.1) will terminate immediately.
|
| 27 |
+
|
| 28 |
+
4. Disclaimer of Warranty.
|
| 29 |
+
|
| 30 |
+
THE WORK IS PROVIDED “AS IS” WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WARRANTIES OR CONDITIONS OF
|
| 31 |
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE OR NON-INFRINGEMENT. YOU BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER THIS LICENSE.
|
| 32 |
+
|
| 33 |
+
5. Limitation of Liability.
|
| 34 |
+
|
| 35 |
+
EXCEPT AS PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE SHALL ANY LICENSOR BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF OR RELATED TO THIS LICENSE, THE USE OR INABILITY TO USE THE WORK (INCLUDING BUT NOT LIMITED TO LOSS OF GOODWILL, BUSINESS INTERRUPTION, LOST PROFITS OR DATA, COMPUTER FAILURE OR MALFUNCTION, OR ANY OTHER DAMAGES OR LOSSES), EVEN IF THE LICENSOR HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
|
| 36 |
+
|
README.md
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<p align="center">
|
| 2 |
+
<h1 align="center"> GEM: A Generalist Model for Human Motion</h1>
|
| 3 |
+
<p align="center">
|
| 4 |
+
<a href="https://jeffli.site/"><strong>Jiefeng Li</strong></a>
|
| 5 |
+
·
|
| 6 |
+
<a href="https://www.jinkuncao.com/"><strong>Jinkun Cao</strong></a>
|
| 7 |
+
·
|
| 8 |
+
<a href="https://cs.stanford.edu/~haotianz/"><strong>Haotian Zhang</strong></a>
|
| 9 |
+
·
|
| 10 |
+
<a href="https://davrempe.github.io/"><strong>Davis Rempe</strong></a>
|
| 11 |
+
·
|
| 12 |
+
<a href="https://jankautz.com/"><strong>Jan Kautz</strong></a>
|
| 13 |
+
·
|
| 14 |
+
<a href="https://www.umariqbal.info/"><strong>Umar Iqbal</strong></a>
|
| 15 |
+
·
|
| 16 |
+
<a href="https://ye-yuan.com/"><strong>Ye Yuan</strong></a>
|
| 17 |
+
</p>
|
| 18 |
+
<h2 align="center">ICCV 2025 (Highlight)</h2>
|
| 19 |
+
<div align="center">
|
| 20 |
+
<img src="./assets/teaser.png" alt="Logo" width="100%">
|
| 21 |
+
</div>
|
| 22 |
+
</p>
|
| 23 |
+
<p align="center">
|
| 24 |
+
<a href="https://research.nvidia.com/labs/dair/gem/"><img src="https://img.shields.io/badge/Project-Page-0099cc"></a>
|
| 25 |
+
<a href="https://arxiv.org/abs/2505.01425"><img src="https://img.shields.io/badge/arXiv-2505.01425-b31b1b.svg"></a>
|
| 26 |
+
|
| 27 |
+
</p>
|
| 28 |
+
|
| 29 |
+
**GEM** is a generalist model for human motion that handles multiple tasks with a single model, supporting diverse conditioning signals including video, keypoints, text, audio, and 3D keyframes.
|
| 30 |
+
|
| 31 |
+
---
|
| 32 |
+
|
| 33 |
+
## 📰 News
|
| 34 |
+
- **[December 2025]** 📢 GENMO has been renamed to **GEM**.
|
| 35 |
+
- **[October 2025]** 📢 The **GEM** codebase is **released!**
|
| 36 |
+
Stay tuned for the pretrained models and evaluation scripts.
|
| 37 |
+
Follow the [project page](https://research.nvidia.com/labs/dair/gem/) for updates and announcements.
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
---
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
## 🚀 Highlights
|
| 44 |
+
|
| 45 |
+
GEM introduces a **unified generative framework** that connects motion estimation and generation through shared objectives.
|
| 46 |
+
|
| 47 |
+
- **Unified framework:** Reframes motion estimation as *constrained generation*, allowing a single model to perform both tasks.
|
| 48 |
+
- **Regression × Diffusion synergy:** Combines the accuracy of regression models with the diversity of diffusion-based generation.
|
| 49 |
+
- **Estimation-guided training:** Trains effectively on in-the-wild datasets using only 2D or textual supervision.
|
| 50 |
+
- **Multimodal conditioning:** Supports video, text, audio, 2D/3D keyframes, or even time-varying mixed inputs (e.g., video → text → video).
|
| 51 |
+
- **Arbitrary-length motion:** Generates continuous, coherent sequences of any duration in one diffusion pass.
|
| 52 |
+
- **State-of-the-art performance:** Achieves leading results on diverse motion estimation and generation benchmarks.
|
| 53 |
+
|
| 54 |
+
For more details, visit the **[GEM project page →](https://research.nvidia.com/labs/dair/gem/)**
|
| 55 |
+
|
| 56 |
+
---
|
| 57 |
+
|
| 58 |
+
### Pretrained Models
|
| 59 |
+
You can download pretrained models from [Google Drive](https://drive.google.com/file/d/1b1E84G7S0h2n5o0RmrcmKOhRKukOjgsJ/view?usp=sharing).
|
| 60 |
+
|
| 61 |
+
## 📖 Paper & Citation
|
| 62 |
+
|
| 63 |
+
**Paper:**
|
| 64 |
+
[GENMO: A GENeralist Model for Human MOtion](https://arxiv.org/abs/2505.01425)
|
| 65 |
+
*Jiefeng Li, Jinkun Cao, Haotian Zhang, Davis Rempe, Jan Kautz, Umar Iqbal, Ye Yuan*
|
| 66 |
+
ICCV, 2025
|
| 67 |
+
|
| 68 |
+
**BibTeX:**
|
| 69 |
+
```bibtex
|
| 70 |
+
@inproceedings{genmo2025,
|
| 71 |
+
title = {GENMO: A GENeralist Model for Human MOtion},
|
| 72 |
+
author = {Li, Jiefeng and Cao, Jinkun and Zhang, Haotian and Rempe, Davis and Kautz, Jan and Iqbal, Umar and Yuan, Ye},
|
| 73 |
+
booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)},
|
| 74 |
+
year = {2025}
|
| 75 |
+
}
|
SyntheticRecorder.cs
ADDED
|
@@ -0,0 +1,667 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
using UnityEngine;
|
| 2 |
+
using System;
|
| 3 |
+
using System.IO;
|
| 4 |
+
using System.Collections;
|
| 5 |
+
using System.Collections.Generic;
|
| 6 |
+
using Newtonsoft.Json;
|
| 7 |
+
using UnityEngine.SceneManagement;
|
| 8 |
+
using System.Diagnostics; // Required for FFmpeg Process
|
| 9 |
+
|
| 10 |
+
public class SyntheticRecorder : MonoBehaviour
|
| 11 |
+
{
|
| 12 |
+
// --- CONFIGURATION CLASSES ---
|
| 13 |
+
[System.Serializable]
|
| 14 |
+
public class AvatarConfig
|
| 15 |
+
{
|
| 16 |
+
public string avatarName = "Avatar";
|
| 17 |
+
public GameObject avatarObject;
|
| 18 |
+
[Header("Animation")]
|
| 19 |
+
public Animator animator;
|
| 20 |
+
[Header("Retargeting Link")]
|
| 21 |
+
public HybridPoseCopier retargeter;
|
| 22 |
+
public List<GameObject> extraMeshes = new List<GameObject>();
|
| 23 |
+
public float specificPadding = 40f;
|
| 24 |
+
[Header("Keypoint Markers (COCO-17 order)")]
|
| 25 |
+
public List<Transform> customMarkers = new List<Transform>();
|
| 26 |
+
}
|
| 27 |
+
|
| 28 |
+
// --- JSON STRUCTURES ---
|
| 29 |
+
public class SequenceData { public List<FrameData> frames; }
|
| 30 |
+
public class FrameData { public int i; public float[] p, t, b; public int s; }
|
| 31 |
+
|
| 32 |
+
public class OutputMeta
|
| 33 |
+
{
|
| 34 |
+
public int frame_index;
|
| 35 |
+
public string image_path;
|
| 36 |
+
public string avatar_name;
|
| 37 |
+
public int face_id;
|
| 38 |
+
public int left_hand_id;
|
| 39 |
+
public int right_hand_id;
|
| 40 |
+
public float[] bbox;
|
| 41 |
+
public float[] kpts_2d;
|
| 42 |
+
public int[] kpts_vis;
|
| 43 |
+
public float[] bbox_clip;
|
| 44 |
+
public float[] cam_intrinsics;
|
| 45 |
+
public float[] cam_pos_world;
|
| 46 |
+
public float[] cam_rot_world;
|
| 47 |
+
public float[] pelvis_pos_world;
|
| 48 |
+
public float[] pelvis_rot_world;
|
| 49 |
+
public float[] smpl_incam_quat;
|
| 50 |
+
public float[] smpl_incam_transl;
|
| 51 |
+
public float[] smpl_root_incam_transl;
|
| 52 |
+
public float smpl_root_world_scale;
|
| 53 |
+
public float[] kpts_3d_world;
|
| 54 |
+
public float[] smplx_pose;
|
| 55 |
+
public float[] smplx_betas;
|
| 56 |
+
}
|
| 57 |
+
|
| 58 |
+
[Header("Settings")]
|
| 59 |
+
public string inputFolderPath = "Assets/StreamingAssets";
|
| 60 |
+
public string outputFolder = "C:/Temp/SyntheticDataset";
|
| 61 |
+
public bool startRecordingOnPlay = true;
|
| 62 |
+
public bool showDebugUI = true;
|
| 63 |
+
[Tooltip("If true, saves depth_xxxxx.png files to check what the occlusion camera sees.")]
|
| 64 |
+
public bool saveDebugDepthImages = true;
|
| 65 |
+
|
| 66 |
+
[Header("Video Settings")]
|
| 67 |
+
public string ffmpegPath = "ffmpeg";
|
| 68 |
+
public int frameRate = 30;
|
| 69 |
+
|
| 70 |
+
[Header("Compression (Twitch VOD Simulation)")]
|
| 71 |
+
[Tooltip("Target Bitrate in kbps. 6000 is High Quality 1080p. 2500 is messy 720p.")]
|
| 72 |
+
public int targetBitrateKbps = 2500;
|
| 73 |
+
[Tooltip("GOP (Group of Pictures) size in seconds. Twitch uses 2 seconds.")]
|
| 74 |
+
public float gopSizeSeconds = 2.0f;
|
| 75 |
+
|
| 76 |
+
[Header("Parallel Processing")]
|
| 77 |
+
public int workerId = 0;
|
| 78 |
+
public int totalWorkers = 1;
|
| 79 |
+
|
| 80 |
+
[Header("Sequence Naming")]
|
| 81 |
+
public string sequenceName = "";
|
| 82 |
+
private string _currentInputJsonPath = "";
|
| 83 |
+
|
| 84 |
+
[Header("Occlusion Settings")]
|
| 85 |
+
public float occlusionBias = 0.02f;
|
| 86 |
+
|
| 87 |
+
[Header("References")]
|
| 88 |
+
public GameObject characterRoot;
|
| 89 |
+
public Camera vtuberCamera;
|
| 90 |
+
public SyntheticCameraDriver cameraDriver;
|
| 91 |
+
|
| 92 |
+
[Header("Randomization")]
|
| 93 |
+
public List<AvatarConfig> avatarList = new List<AvatarConfig>();
|
| 94 |
+
public List<string> worldSceneNames = new List<string>();
|
| 95 |
+
public LoadSceneMode worldSceneLoadMode = LoadSceneMode.Additive;
|
| 96 |
+
public bool setLoadedWorldSceneActive = true;
|
| 97 |
+
public string worldMainCameraName = "Main Camera";
|
| 98 |
+
public string spawnPointToken = "SpawnPoint";
|
| 99 |
+
|
| 100 |
+
[Header("Animation Indices")]
|
| 101 |
+
public int faceMaxId = 5;
|
| 102 |
+
public int handsMaxId = 5;
|
| 103 |
+
public int minSwitchFrames = 30;
|
| 104 |
+
public int maxSwitchFrames = 120;
|
| 105 |
+
public string paramFaceIndex = "FaceIndex";
|
| 106 |
+
public string paramLeftHandIndex = "LeftHandIndex";
|
| 107 |
+
public string paramRightHandIndex = "RightHandIndex";
|
| 108 |
+
|
| 109 |
+
[Header("BBOX Accuracy")]
|
| 110 |
+
public bool useBakedSkinnedMeshForBbox = true;
|
| 111 |
+
public int bakedVertexStride = 8;
|
| 112 |
+
|
| 113 |
+
[Header("Calibration")]
|
| 114 |
+
public float movementScale = 1.0f;
|
| 115 |
+
public Vector3 translationOffset = new Vector3(0, 0.05f, 0);
|
| 116 |
+
public Vector3 globalCoordinateCorrection = new Vector3(-90, 180, 0);
|
| 117 |
+
|
| 118 |
+
// --- PRIVATE STATE ---
|
| 119 |
+
private float _activePadding = 40f;
|
| 120 |
+
private SequenceData _data;
|
| 121 |
+
private Transform[] _bones;
|
| 122 |
+
private Transform _pelvisBone;
|
| 123 |
+
private List<Transform> _activeMarkers = new List<Transform>();
|
| 124 |
+
private HybridPoseCopier _activeRetargeter;
|
| 125 |
+
private Transform _activeAvatarRoot = null;
|
| 126 |
+
private Animator _activeAnimator = null;
|
| 127 |
+
private string _activeAvatarName = "";
|
| 128 |
+
private readonly List<Renderer> _activeBboxRenderers = new List<Renderer>();
|
| 129 |
+
private Mesh _bakeMesh;
|
| 130 |
+
private readonly List<Vector3> _bakedVerts = new List<Vector3>(8192);
|
| 131 |
+
private Texture2D _greenTex, _redTex, _occTex;
|
| 132 |
+
private Rect _cachedBbox = new Rect(0, 0, 0, 0);
|
| 133 |
+
private bool _cachedHasBbox = false;
|
| 134 |
+
private int[] _cachedMarkerVis = null;
|
| 135 |
+
private string _currentlyLoadedWorldScene = "";
|
| 136 |
+
|
| 137 |
+
private Shader _autoDepthShader;
|
| 138 |
+
private const int JOINT_COUNT = 22;
|
| 139 |
+
private static readonly string[] BONE_NAMES = {
|
| 140 |
+
"pelvis", "left_hip", "right_hip", "spine1", "left_knee", "right_knee", "spine2",
|
| 141 |
+
"left_ankle", "right_ankle", "spine3", "left_foot", "right_foot", "neck", "left_collar",
|
| 142 |
+
"right_collar", "head", "left_shoulder", "right_shoulder", "left_elbow", "right_elbow",
|
| 143 |
+
"left_wrist", "right_wrist"
|
| 144 |
+
};
|
| 145 |
+
|
| 146 |
+
void Start()
|
| 147 |
+
{
|
| 148 |
+
Screen.SetResolution(1280, 720, FullScreenMode.Windowed);
|
| 149 |
+
|
| 150 |
+
EnsureDepthShaderExists();
|
| 151 |
+
_autoDepthShader = Shader.Find("Custom/AutoLinearDepth");
|
| 152 |
+
if (!_autoDepthShader) UnityEngine.Debug.LogError("Could not load the auto-generated depth shader!");
|
| 153 |
+
|
| 154 |
+
_greenTex = new Texture2D(1, 1); _greenTex.SetPixel(0, 0, Color.green); _greenTex.Apply();
|
| 155 |
+
_redTex = new Texture2D(1, 1); _redTex.SetPixel(0, 0, Color.red); _redTex.Apply();
|
| 156 |
+
_occTex = new Texture2D(1, 1); _occTex.SetPixel(0, 0, new Color(1, 0, 0, 0.5f)); _occTex.Apply();
|
| 157 |
+
|
| 158 |
+
_bakeMesh = new Mesh();
|
| 159 |
+
_bakeMesh.MarkDynamic();
|
| 160 |
+
|
| 161 |
+
if (startRecordingOnPlay)
|
| 162 |
+
StartCoroutine(ProcessBatch());
|
| 163 |
+
}
|
| 164 |
+
|
| 165 |
+
private void EnsureDepthShaderExists()
|
| 166 |
+
{
|
| 167 |
+
string path = "Assets/SyntheticDepth.shader";
|
| 168 |
+
if (File.Exists(path)) return;
|
| 169 |
+
|
| 170 |
+
string shaderCode = @"
|
| 171 |
+
Shader ""Custom/AutoLinearDepth""
|
| 172 |
+
{
|
| 173 |
+
SubShader
|
| 174 |
+
{
|
| 175 |
+
Tags { ""RenderType""="""" ""Queue""=""Geometry"" ""ForceNoShadowCasting""=""True"" }
|
| 176 |
+
Cull Off
|
| 177 |
+
ZWrite On
|
| 178 |
+
ZTest LEqual
|
| 179 |
+
Pass
|
| 180 |
+
{
|
| 181 |
+
CGPROGRAM
|
| 182 |
+
#pragma vertex vert
|
| 183 |
+
#pragma fragment frag
|
| 184 |
+
#include ""UnityCG.cginc""
|
| 185 |
+
struct appdata { float4 vertex : POSITION; };
|
| 186 |
+
struct v2f { float4 pos : SV_POSITION; float depth : TEXCOORD0; };
|
| 187 |
+
v2f vert (appdata v) { v2f o; o.pos = UnityObjectToClipPos(v.vertex); o.depth = -UnityObjectToViewPos(v.vertex).z; return o; }
|
| 188 |
+
float4 frag (v2f i) : SV_Target { return float4(i.depth, 0, 0, 1); }
|
| 189 |
+
ENDCG
|
| 190 |
+
}
|
| 191 |
+
}
|
| 192 |
+
}";
|
| 193 |
+
File.WriteAllText(path, shaderCode);
|
| 194 |
+
#if UNITY_EDITOR
|
| 195 |
+
UnityEditor.AssetDatabase.Refresh();
|
| 196 |
+
#endif
|
| 197 |
+
UnityEngine.Debug.Log("Created Aggressive AutoLinearDepth shader at " + path);
|
| 198 |
+
}
|
| 199 |
+
|
| 200 |
+
private IEnumerator ProcessBatch()
|
| 201 |
+
{
|
| 202 |
+
string fullInputPath = Path.IsPathRooted(inputFolderPath) ? inputFolderPath : Path.Combine(Application.dataPath, "..", inputFolderPath);
|
| 203 |
+
if (!Directory.Exists(fullInputPath)) { UnityEngine.Debug.LogError("Input folder missing"); yield break; }
|
| 204 |
+
|
| 205 |
+
string[] allFiles = Directory.GetFiles(fullInputPath, "*.json");
|
| 206 |
+
Array.Sort(allFiles);
|
| 207 |
+
|
| 208 |
+
List<string> myFiles = new List<string>();
|
| 209 |
+
int safeTotalWorkers = Mathf.Max(1, totalWorkers);
|
| 210 |
+
|
| 211 |
+
for (int i = 0; i < allFiles.Length; i++)
|
| 212 |
+
if (i % safeTotalWorkers == workerId) myFiles.Add(allFiles[i]);
|
| 213 |
+
|
| 214 |
+
foreach (string file in myFiles)
|
| 215 |
+
{
|
| 216 |
+
_currentInputJsonPath = file;
|
| 217 |
+
sequenceName = Path.GetFileNameWithoutExtension(file);
|
| 218 |
+
|
| 219 |
+
Resources.UnloadUnusedAssets();
|
| 220 |
+
System.GC.Collect();
|
| 221 |
+
|
| 222 |
+
yield return StartCoroutine(LoadRandomWorldRoutine());
|
| 223 |
+
|
| 224 |
+
RandomizeAvatarAndGatherRenderers();
|
| 225 |
+
FindAndCacheBones();
|
| 226 |
+
ApplyRandomSpawnPoint(SceneManager.GetActiveScene());
|
| 227 |
+
|
| 228 |
+
yield return StartCoroutine(RecordSingleSequence());
|
| 229 |
+
}
|
| 230 |
+
|
| 231 |
+
#if UNITY_EDITOR
|
| 232 |
+
UnityEditor.EditorApplication.isPlaying = false;
|
| 233 |
+
#else
|
| 234 |
+
Application.Quit();
|
| 235 |
+
#endif
|
| 236 |
+
}
|
| 237 |
+
|
| 238 |
+
private IEnumerator LoadRandomWorldRoutine()
|
| 239 |
+
{
|
| 240 |
+
if (worldSceneNames == null || worldSceneNames.Count == 0) yield break;
|
| 241 |
+
|
| 242 |
+
if (!string.IsNullOrEmpty(_currentlyLoadedWorldScene) && worldSceneLoadMode == LoadSceneMode.Additive)
|
| 243 |
+
{
|
| 244 |
+
AsyncOperation unloadOp = SceneManager.UnloadSceneAsync(_currentlyLoadedWorldScene);
|
| 245 |
+
while (unloadOp != null && !unloadOp.isDone) yield return null;
|
| 246 |
+
}
|
| 247 |
+
|
| 248 |
+
string chosen = worldSceneNames[UnityEngine.Random.Range(0, worldSceneNames.Count)].Trim();
|
| 249 |
+
_currentlyLoadedWorldScene = chosen;
|
| 250 |
+
|
| 251 |
+
AsyncOperation loadOp = SceneManager.LoadSceneAsync(chosen, worldSceneLoadMode);
|
| 252 |
+
while (!loadOp.isDone) yield return null;
|
| 253 |
+
yield return null;
|
| 254 |
+
|
| 255 |
+
Scene loaded = SceneManager.GetSceneByName(chosen);
|
| 256 |
+
if (!loaded.IsValid()) loaded = SceneManager.GetSceneByPath(chosen);
|
| 257 |
+
|
| 258 |
+
if (loaded.IsValid() && loaded.isLoaded)
|
| 259 |
+
{
|
| 260 |
+
if (setLoadedWorldSceneActive) SceneManager.SetActiveScene(loaded);
|
| 261 |
+
BindToWorldMainCameraOrLog(loaded);
|
| 262 |
+
}
|
| 263 |
+
}
|
| 264 |
+
|
| 265 |
+
IEnumerator RecordSingleSequence()
|
| 266 |
+
{
|
| 267 |
+
_data = JsonConvert.DeserializeObject<SequenceData>(File.ReadAllText(_currentInputJsonPath));
|
| 268 |
+
|
| 269 |
+
if (!Directory.Exists(outputFolder)) Directory.CreateDirectory(outputFolder);
|
| 270 |
+
string seqImageDir = Path.Combine(outputFolder, "images", sequenceName);
|
| 271 |
+
if (!Directory.Exists(seqImageDir)) Directory.CreateDirectory(seqImageDir);
|
| 272 |
+
|
| 273 |
+
// --- FFmpeg Setup for VOD SIMULATION ---
|
| 274 |
+
string videoPath = Path.Combine(outputFolder, $"video_{sequenceName}.mp4").Replace("\\", "/");
|
| 275 |
+
|
| 276 |
+
// VOD SIMULATION LOGIC:
|
| 277 |
+
// 1. -b:v {bitrate}k -> Forces the encoder to target a specific bandwidth
|
| 278 |
+
// 2. -maxrate {bitrate}k -> Prevents it from spiking quality during high motion (causes artifacts)
|
| 279 |
+
// 3. -bufsize {bitrate*2}k -> Standard buffer size for streaming
|
| 280 |
+
// 4. -g {gop} -> Sets Keyframe Interval. Twitch uses 2 seconds fixed.
|
| 281 |
+
// 5. -preset ultrafast -> Keeps Unity realtime, but relies on bitrate starvation to cause the artifacts
|
| 282 |
+
|
| 283 |
+
int gopFrames = Mathf.RoundToInt(frameRate * gopSizeSeconds);
|
| 284 |
+
|
| 285 |
+
string ffmpegArgs = $"-y -f rawvideo -vcodec rawvideo -pix_fmt rgb24 " +
|
| 286 |
+
$"-s {Screen.width}x{Screen.height} -r {frameRate} -i - " +
|
| 287 |
+
$"-vf vflip " +
|
| 288 |
+
$"-c:v libx264 " +
|
| 289 |
+
$"-pix_fmt yuv420p " +
|
| 290 |
+
$"-preset ultrafast " +
|
| 291 |
+
$"-b:v {targetBitrateKbps}k -maxrate {targetBitrateKbps}k -bufsize {targetBitrateKbps * 2}k " +
|
| 292 |
+
$"-g {gopFrames} " +
|
| 293 |
+
$"\"{videoPath}\"";
|
| 294 |
+
|
| 295 |
+
ProcessStartInfo psi = new ProcessStartInfo
|
| 296 |
+
{
|
| 297 |
+
FileName = ffmpegPath,
|
| 298 |
+
Arguments = ffmpegArgs,
|
| 299 |
+
UseShellExecute = false,
|
| 300 |
+
RedirectStandardInput = true,
|
| 301 |
+
CreateNoWindow = true
|
| 302 |
+
};
|
| 303 |
+
|
| 304 |
+
Process ffmpegProcess = null;
|
| 305 |
+
try
|
| 306 |
+
{
|
| 307 |
+
ffmpegProcess = Process.Start(psi);
|
| 308 |
+
}
|
| 309 |
+
catch(Exception e)
|
| 310 |
+
{
|
| 311 |
+
UnityEngine.Debug.LogError($"Failed to start FFmpeg. Is it in PATH? Error: {e.Message}");
|
| 312 |
+
yield break;
|
| 313 |
+
}
|
| 314 |
+
|
| 315 |
+
Texture2D screenTex = new Texture2D(Screen.width, Screen.height, TextureFormat.RGB24, false);
|
| 316 |
+
RenderTexture depthRT = new RenderTexture(Screen.width, Screen.height, 24, RenderTextureFormat.RFloat);
|
| 317 |
+
Texture2D depthReadTex = new Texture2D(Screen.width, Screen.height, TextureFormat.RFloat, false);
|
| 318 |
+
|
| 319 |
+
string jsonlPath = Path.Combine(outputFolder, $"sequence_{sequenceName}.jsonl");
|
| 320 |
+
|
| 321 |
+
int framesUntilSwitch = 0;
|
| 322 |
+
int currentFaceId = 0, currentLeftHandId = 0, currentRightHandId = 0;
|
| 323 |
+
|
| 324 |
+
using (var sw = new StreamWriter(jsonlPath, false))
|
| 325 |
+
{
|
| 326 |
+
for (int i = 0; i < _data.frames.Count; i++)
|
| 327 |
+
{
|
| 328 |
+
if (framesUntilSwitch <= 0)
|
| 329 |
+
{
|
| 330 |
+
framesUntilSwitch = UnityEngine.Random.Range(minSwitchFrames, maxSwitchFrames + 1);
|
| 331 |
+
currentFaceId = UnityEngine.Random.Range(0, faceMaxId + 1);
|
| 332 |
+
currentLeftHandId = UnityEngine.Random.Range(0, handsMaxId + 1);
|
| 333 |
+
currentRightHandId = UnityEngine.Random.Range(0, handsMaxId + 1);
|
| 334 |
+
if (_activeAnimator != null)
|
| 335 |
+
{
|
| 336 |
+
_activeAnimator.SetInteger(paramFaceIndex, currentFaceId);
|
| 337 |
+
_activeAnimator.SetInteger(paramLeftHandIndex, currentLeftHandId);
|
| 338 |
+
_activeAnimator.SetInteger(paramRightHandIndex, currentRightHandId);
|
| 339 |
+
}
|
| 340 |
+
}
|
| 341 |
+
framesUntilSwitch--;
|
| 342 |
+
|
| 343 |
+
ApplyFrame(_data.frames[i]);
|
| 344 |
+
if (cameraDriver != null) cameraDriver.OnFrame(i);
|
| 345 |
+
if (_activeRetargeter != null) _activeRetargeter.ManualUpdatePose();
|
| 346 |
+
|
| 347 |
+
Physics.SyncTransforms();
|
| 348 |
+
|
| 349 |
+
vtuberCamera.clearFlags = CameraClearFlags.SolidColor;
|
| 350 |
+
vtuberCamera.backgroundColor = Color.black;
|
| 351 |
+
vtuberCamera.cullingMask = ~0;
|
| 352 |
+
|
| 353 |
+
yield return new WaitForEndOfFrame();
|
| 354 |
+
|
| 355 |
+
if (screenTex.width != Screen.width || screenTex.height != Screen.height)
|
| 356 |
+
screenTex.Reinitialize(Screen.width, Screen.height);
|
| 357 |
+
|
| 358 |
+
screenTex.ReadPixels(new Rect(0, 0, Screen.width, Screen.height), 0, 0);
|
| 359 |
+
screenTex.Apply();
|
| 360 |
+
|
| 361 |
+
byte[] rawFrame = screenTex.GetRawTextureData();
|
| 362 |
+
if (ffmpegProcess != null && !ffmpegProcess.HasExited)
|
| 363 |
+
{
|
| 364 |
+
try {
|
| 365 |
+
ffmpegProcess.StandardInput.BaseStream.Write(rawFrame, 0, rawFrame.Length);
|
| 366 |
+
ffmpegProcess.StandardInput.BaseStream.Flush();
|
| 367 |
+
} catch (Exception ex) {
|
| 368 |
+
UnityEngine.Debug.LogError("FFmpeg write error: " + ex.Message);
|
| 369 |
+
}
|
| 370 |
+
}
|
| 371 |
+
|
| 372 |
+
RenderTexture origRT = vtuberCamera.targetTexture;
|
| 373 |
+
CameraClearFlags origFlags = vtuberCamera.clearFlags;
|
| 374 |
+
Color origBG = vtuberCamera.backgroundColor;
|
| 375 |
+
|
| 376 |
+
vtuberCamera.targetTexture = depthRT;
|
| 377 |
+
vtuberCamera.clearFlags = CameraClearFlags.SolidColor;
|
| 378 |
+
vtuberCamera.backgroundColor = new Color(1000f, 0, 0, 0);
|
| 379 |
+
|
| 380 |
+
if (_autoDepthShader != null) vtuberCamera.RenderWithShader(_autoDepthShader, "");
|
| 381 |
+
else vtuberCamera.Render();
|
| 382 |
+
|
| 383 |
+
RenderTexture.active = depthRT;
|
| 384 |
+
if (depthReadTex.width != Screen.width || depthReadTex.height != Screen.height)
|
| 385 |
+
depthReadTex.Reinitialize(Screen.width, Screen.height);
|
| 386 |
+
depthReadTex.ReadPixels(new Rect(0, 0, Screen.width, Screen.height), 0, 0);
|
| 387 |
+
depthReadTex.Apply();
|
| 388 |
+
|
| 389 |
+
if (saveDebugDepthImages)
|
| 390 |
+
{
|
| 391 |
+
Texture2D visualDepth = new Texture2D(Screen.width, Screen.height, TextureFormat.RGB24, false);
|
| 392 |
+
Color[] rawPixels = depthReadTex.GetPixels();
|
| 393 |
+
Color[] visPixels = new Color[rawPixels.Length];
|
| 394 |
+
float displayRange = 3.0f;
|
| 395 |
+
for (int k = 0; k < rawPixels.Length; k++)
|
| 396 |
+
{
|
| 397 |
+
float d = rawPixels[k].r;
|
| 398 |
+
if (d > 999f) visPixels[k] = Color.white;
|
| 399 |
+
else
|
| 400 |
+
{
|
| 401 |
+
float norm = Mathf.Clamp01(d / displayRange);
|
| 402 |
+
visPixels[k] = new Color(norm, norm, norm);
|
| 403 |
+
}
|
| 404 |
+
}
|
| 405 |
+
visualDepth.SetPixels(visPixels);
|
| 406 |
+
visualDepth.Apply();
|
| 407 |
+
string depthFile = $"depth_{i:D5}.png";
|
| 408 |
+
File.WriteAllBytes(Path.Combine(seqImageDir, depthFile), visualDepth.EncodeToPNG());
|
| 409 |
+
Destroy(visualDepth);
|
| 410 |
+
}
|
| 411 |
+
|
| 412 |
+
vtuberCamera.targetTexture = origRT;
|
| 413 |
+
vtuberCamera.clearFlags = origFlags;
|
| 414 |
+
vtuberCamera.backgroundColor = origBG;
|
| 415 |
+
RenderTexture.active = null;
|
| 416 |
+
|
| 417 |
+
ComputeBoundingBoxCached();
|
| 418 |
+
|
| 419 |
+
float H = Screen.height; float W = Screen.width;
|
| 420 |
+
Vector3 camP0_W = vtuberCamera.transform.TransformPoint(new Vector3(0f, 0f, 1f));
|
| 421 |
+
Vector3 camPx_W = vtuberCamera.transform.TransformPoint(new Vector3(1f, 0f, 1f));
|
| 422 |
+
Vector3 camPyDown_W = vtuberCamera.transform.TransformPoint(new Vector3(0f, -1f, 1f));
|
| 423 |
+
Vector3 s0 = vtuberCamera.WorldToScreenPoint(camP0_W);
|
| 424 |
+
Vector3 sx = vtuberCamera.WorldToScreenPoint(camPx_W);
|
| 425 |
+
Vector3 sy = vtuberCamera.WorldToScreenPoint(camPyDown_W);
|
| 426 |
+
float cx = s0.x; float cy = H - s0.y;
|
| 427 |
+
float fx = sx.x - s0.x; float fy = (H - sy.y) - cy;
|
| 428 |
+
|
| 429 |
+
Rect rFull = _cachedHasBbox ? _cachedBbox : new Rect(0, 0, 0, 0);
|
| 430 |
+
float bbox_x = rFull.x; float bbox_y = H - (rFull.y + rFull.height);
|
| 431 |
+
float bbox_w = rFull.width; float bbox_h = rFull.height;
|
| 432 |
+
float clip_x0 = Mathf.Clamp(bbox_x, 0, W);
|
| 433 |
+
float clip_y0 = Mathf.Clamp(bbox_y, 0, H);
|
| 434 |
+
float clip_w = Mathf.Max(0, Mathf.Clamp(bbox_x + bbox_w, 0, W) - clip_x0);
|
| 435 |
+
float clip_h = Mathf.Max(0, Mathf.Clamp(bbox_y + bbox_h, 0, H) - clip_y0);
|
| 436 |
+
|
| 437 |
+
var kpts2D = new List<float>();
|
| 438 |
+
var kptsVis = new List<int>();
|
| 439 |
+
var kpts3D = new List<float>();
|
| 440 |
+
|
| 441 |
+
if (_activeMarkers != null)
|
| 442 |
+
{
|
| 443 |
+
for (int mi = 0; mi < _activeMarkers.Count; mi++)
|
| 444 |
+
{
|
| 445 |
+
Transform t = _activeMarkers[mi];
|
| 446 |
+
if (t == null) { continue; }
|
| 447 |
+
|
| 448 |
+
Vector3 wPos = t.position;
|
| 449 |
+
kpts3D.Add(wPos.x); kpts3D.Add(wPos.y); kpts3D.Add(wPos.z);
|
| 450 |
+
|
| 451 |
+
Vector3 sPos = vtuberCamera.WorldToScreenPoint(wPos);
|
| 452 |
+
float x_px = sPos.x;
|
| 453 |
+
float y_px = H - sPos.y;
|
| 454 |
+
kpts2D.Add(x_px); kpts2D.Add(y_px);
|
| 455 |
+
|
| 456 |
+
int vis = 0;
|
| 457 |
+
if (sPos.z > 0 && x_px >= 0 && x_px < W && sPos.y >= 0 && sPos.y < H)
|
| 458 |
+
{
|
| 459 |
+
int checkRadius = 2;
|
| 460 |
+
float requiredVisibilityRatio = 0.5f;
|
| 461 |
+
int totalSamples = 0;
|
| 462 |
+
int visibleSamples = 0;
|
| 463 |
+
float markerDistance = sPos.z;
|
| 464 |
+
|
| 465 |
+
for (int ox = -checkRadius; ox <= checkRadius; ox++)
|
| 466 |
+
{
|
| 467 |
+
for (int oy = -checkRadius; oy <= checkRadius; oy++)
|
| 468 |
+
{
|
| 469 |
+
int px = (int)sPos.x + ox;
|
| 470 |
+
int py = (int)sPos.y + oy;
|
| 471 |
+
if (px >= 0 && px < W && py >= 0 && py < H)
|
| 472 |
+
{
|
| 473 |
+
totalSamples++;
|
| 474 |
+
float pixelDepth = depthReadTex.GetPixel(px, py).r;
|
| 475 |
+
if (pixelDepth >= (markerDistance - occlusionBias))
|
| 476 |
+
visibleSamples++;
|
| 477 |
+
}
|
| 478 |
+
}
|
| 479 |
+
}
|
| 480 |
+
if (totalSamples > 0)
|
| 481 |
+
{
|
| 482 |
+
float visibilityPct = (float)visibleSamples / totalSamples;
|
| 483 |
+
vis = (visibilityPct >= requiredVisibilityRatio) ? 2 : 1;
|
| 484 |
+
}
|
| 485 |
+
else vis = 1;
|
| 486 |
+
}
|
| 487 |
+
kptsVis.Add(vis);
|
| 488 |
+
}
|
| 489 |
+
}
|
| 490 |
+
|
| 491 |
+
_cachedMarkerVis = kptsVis.ToArray();
|
| 492 |
+
Transform pelvis = (_bones != null && _bones.Length > 0) ? _bones[0] : null;
|
| 493 |
+
Quaternion correction = Quaternion.Euler(globalCoordinateCorrection);
|
| 494 |
+
Quaternion pelvisWorld = (pelvis != null) ? pelvis.rotation : Quaternion.identity;
|
| 495 |
+
Vector3 pelvisPos = (pelvis != null) ? pelvis.position : Vector3.zero;
|
| 496 |
+
Quaternion incamQ = Quaternion.Inverse(vtuberCamera.transform.rotation) * (Quaternion.Inverse(correction) * pelvisWorld);
|
| 497 |
+
Vector3 incamPos = vtuberCamera.transform.InverseTransformPoint(pelvisPos);
|
| 498 |
+
Vector3 rootIncamPos = (characterRoot != null) ? vtuberCamera.transform.InverseTransformPoint(characterRoot.transform.position) : Vector3.zero;
|
| 499 |
+
|
| 500 |
+
var meta = new OutputMeta
|
| 501 |
+
{
|
| 502 |
+
frame_index = i,
|
| 503 |
+
image_path = videoPath,
|
| 504 |
+
avatar_name = _activeAvatarName,
|
| 505 |
+
face_id = currentFaceId,
|
| 506 |
+
left_hand_id = currentLeftHandId,
|
| 507 |
+
right_hand_id = currentRightHandId,
|
| 508 |
+
bbox = new float[] { bbox_x, bbox_y, bbox_w, bbox_h },
|
| 509 |
+
bbox_clip = new float[] { clip_x0, clip_y0, clip_w, clip_h },
|
| 510 |
+
kpts_2d = kpts2D.ToArray(), kpts_vis = kptsVis.ToArray(),
|
| 511 |
+
cam_intrinsics = new float[] { fx, fy, cx, cy },
|
| 512 |
+
cam_pos_world = new float[] { vtuberCamera.transform.position.x, vtuberCamera.transform.position.y, vtuberCamera.transform.position.z },
|
| 513 |
+
cam_rot_world = new float[] { vtuberCamera.transform.rotation.x, vtuberCamera.transform.rotation.y, vtuberCamera.transform.rotation.z, vtuberCamera.transform.rotation.w },
|
| 514 |
+
pelvis_pos_world = new float[] { pelvisPos.x, pelvisPos.y, pelvisPos.z },
|
| 515 |
+
pelvis_rot_world = new float[] { pelvisWorld.x, pelvisWorld.y, pelvisWorld.z, pelvisWorld.w },
|
| 516 |
+
smpl_incam_quat = new float[] { incamQ.x, incamQ.y, incamQ.z, incamQ.w },
|
| 517 |
+
smpl_incam_transl = new float[] { incamPos.x, incamPos.y, incamPos.z },
|
| 518 |
+
smpl_root_incam_transl = new float[] { rootIncamPos.x, rootIncamPos.y, rootIncamPos.z },
|
| 519 |
+
smpl_root_world_scale = (characterRoot != null) ? characterRoot.transform.lossyScale.x : 1f,
|
| 520 |
+
kpts_3d_world = kpts3D.ToArray(),
|
| 521 |
+
smplx_pose = _data.frames[i].p, smplx_betas = _data.frames[i].b
|
| 522 |
+
};
|
| 523 |
+
sw.WriteLine(JsonConvert.SerializeObject(meta));
|
| 524 |
+
}
|
| 525 |
+
}
|
| 526 |
+
|
| 527 |
+
if (ffmpegProcess != null && !ffmpegProcess.HasExited)
|
| 528 |
+
{
|
| 529 |
+
ffmpegProcess.StandardInput.Close();
|
| 530 |
+
ffmpegProcess.WaitForExit();
|
| 531 |
+
ffmpegProcess.Close();
|
| 532 |
+
}
|
| 533 |
+
|
| 534 |
+
if(screenTex) Destroy(screenTex);
|
| 535 |
+
if(depthRT) Destroy(depthRT);
|
| 536 |
+
if(depthReadTex) Destroy(depthReadTex);
|
| 537 |
+
}
|
| 538 |
+
|
| 539 |
+
// Applies one frame of SMPL-X-style motion data to the character rig:
// root translation, per-joint axis-angle rotations, and the camera style tag.
void ApplyFrame(FrameData f)
{
    // Nothing to apply without pose data or a character to pose.
    if (f.p == null || characterRoot == null) return;

    // Global correction rotation (inspector-configured Euler angles) that maps the
    // source mocap coordinate frame into Unity's frame.
    Quaternion correction = Quaternion.Euler(globalCoordinateCorrection);

    // Root translation: X is negated — presumably converting a right-handed source
    // convention into Unity's left-handed one (TODO confirm against the exporter).
    // Scaled, rotated into Unity space, then offset.
    characterRoot.transform.localPosition = (correction * (new Vector3(-f.t[0], f.t[1], f.t[2]) * movementScale)) + translationOffset;

    // f.p is a flat array of axis-angle triplets, one per joint, in bone order.
    int floatIdx = 0;
    for (int i = 0; i < JOINT_COUNT; i++)
    {
        // Stop if the pose array does not contain a full (x, y, z) triplet for this joint.
        if (floatIdx + 2 >= f.p.Length) break;
        float x = f.p[floatIdx++], y = f.p[floatIdx++], z = f.p[floatIdx++];

        // Axis-angle to quaternion: the vector's magnitude is the rotation angle,
        // its direction the rotation axis.
        float angle = Mathf.Sqrt(x * x + y * y + z * z);
        Quaternion q = Quaternion.identity;
        // Note the negated x and w components: like the translation above, this flips
        // handedness of the rotation — assumed to match the source data's convention
        // (NOTE(review): verify sign convention against the data exporter).
        if (angle > 1e-6f) { float c = Mathf.Cos(angle * 0.5f), s = Mathf.Sin(angle * 0.5f); q = new Quaternion(-(x / angle) * s, (y / angle) * s, (z / angle) * s, -c); }

        // Joint 0 (pelvis) additionally receives the global coordinate correction.
        if (_bones != null && i < _bones.Length && _bones[i] != null) _bones[i].localRotation = (i == 0) ? (correction * q) : q;
    }

    // Forward the per-frame camera style tag to the camera driver, if present.
    if (cameraDriver != null) cameraDriver.SetStyleFromFrameData(f.s);
}
|
| 556 |
+
|
| 557 |
+
// Resolves every entry of BONE_NAMES to a transform under the character root
// and caches the results, including a shortcut reference to the pelvis.
private void FindAndCacheBones()
{
    int boneCount = BONE_NAMES.Length;
    _bones = new Transform[boneCount];
    for (int idx = 0; idx < boneCount; idx++)
    {
        _bones[idx] = FindDeep(characterRoot.transform, BONE_NAMES[idx]);
    }
    // By convention, index 0 of the bone table is the pelvis.
    _pelvisBone = (boneCount > 0) ? _bones[0] : null;
}
|
| 563 |
+
|
| 564 |
+
// Pre-order depth-first search of the transform hierarchy; returns the first
// transform whose name matches exactly, or null if none does.
private static Transform FindDeep(Transform root, string name)
{
    var pending = new Stack<Transform>();
    pending.Push(root);
    while (pending.Count > 0)
    {
        Transform current = pending.Pop();
        if (current.name == name) return current;
        // Push children in reverse so the first child is popped first,
        // preserving the visit order of the recursive formulation.
        for (int ci = current.childCount - 1; ci >= 0; ci--)
        {
            pending.Push(current.GetChild(ci));
        }
    }
    return null;
}
|
| 570 |
+
|
| 571 |
+
// Picks a random avatar from the configured list, activates only that avatar,
// and gathers its renderers/markers into the active caches used for bbox and
// keypoint computation.
private void RandomizeAvatarAndGatherRenderers()
{
    if (avatarList == null || avatarList.Count == 0) return;

    _activeBboxRenderers.Clear();
    _activeMarkers.Clear();

    int chosenIdx = UnityEngine.Random.Range(0, avatarList.Count);
    AvatarConfig chosen = avatarList[chosenIdx];

    _activeAvatarName = chosen.avatarName;

    // Prefer the explicitly assigned animator; otherwise look one up on the avatar object.
    _activeAnimator = chosen.animator;
    if (_activeAnimator == null && chosen.avatarObject != null)
    {
        _activeAnimator = chosen.avatarObject.GetComponent<Animator>();
    }

    // Enable only the chosen avatar; disable all others.
    for (int i = 0; i < avatarList.Count; i++)
    {
        GameObject obj = avatarList[i].avatarObject;
        if (obj != null) obj.SetActive(i == chosenIdx);
    }

    _activePadding = chosen.specificPadding;
    _activeRetargeter = chosen.retargeter;
    if (chosen.avatarObject != null) _activeAvatarRoot = chosen.avatarObject.transform;
    if (chosen.customMarkers != null) _activeMarkers.AddRange(chosen.customMarkers);

    // Collect every renderer under the character root (including inactive ones).
    if (characterRoot != null)
    {
        foreach (var r in characterRoot.GetComponentsInChildren<Renderer>(true))
        {
            _activeBboxRenderers.Add(r);
        }
    }

    // Extra meshes contribute their renderers too, de-duplicated against the root set.
    foreach (GameObject extra in chosen.extraMeshes)
    {
        if (!extra) continue;
        foreach (var cr in extra.GetComponentsInChildren<Renderer>(true))
        {
            if (!_activeBboxRenderers.Contains(cr)) _activeBboxRenderers.Add(cr);
        }
    }
}
|
| 587 |
+
|
| 588 |
+
// Teleports this rig to a randomly chosen spawn point in the loaded world scene,
// then re-centers the character under the rig.
private void ApplyRandomSpawnPoint(Scene worldScene)
{
    if (!worldScene.IsValid() || !worldScene.isLoaded) return;

    // Collect every transform whose name carries the spawn-point token.
    var candidates = new List<Transform>();
    foreach (GameObject sceneRoot in worldScene.GetRootGameObjects())
    {
        foreach (Transform node in sceneRoot.GetComponentsInChildren<Transform>(true))
        {
            if (node.name.Contains(spawnPointToken)) candidates.Add(node);
        }
    }

    if (candidates.Count == 0) return;

    Transform pick = candidates[UnityEngine.Random.Range(0, candidates.Count)];
    this.transform.position = pick.position;
    this.transform.rotation = pick.rotation;

    // Reset the character's local pose so it sits exactly at the spawn point.
    if (characterRoot != null)
    {
        characterRoot.transform.localPosition = Vector3.zero;
        characterRoot.transform.localRotation = Quaternion.identity;
    }
}
|
| 600 |
+
|
| 601 |
+
// Finds the camera named `worldMainCameraName` in the world scene and adopts it
// as the recording camera (`vtuberCamera`), wiring up a SyntheticCameraDriver.
// Logs a warning when the camera is missing (the original silently returned,
// despite the method name promising a log).
private void BindToWorldMainCameraOrLog(Scene worldScene)
{
    Camera found = null;
    foreach (GameObject root in worldScene.GetRootGameObjects())
    {
        foreach (Transform t in root.GetComponentsInChildren<Transform>(true))
        {
            if (t.name != worldMainCameraName) continue;
            // Fetch the component once (the original called GetComponent twice per match).
            Camera cam = t.GetComponent<Camera>();
            if (cam != null) { found = cam; break; }
        }
        if (found) break;
    }

    if (!found)
    {
        UnityEngine.Debug.LogWarning($"World main camera '{worldMainCameraName}' not found in scene '{worldScene.name}'; keeping current vtuberCamera.");
        return;
    }

    // Non-unit ancestor scale would corrupt WorldToScreenPoint-based intrinsics and
    // keypoint math, so detach the camera and normalize its scale in that case.
    Vector3 ls = found.transform.lossyScale;
    if (Mathf.Abs(ls.x - 1f) > 1e-4f || Mathf.Abs(ls.y - 1f) > 1e-4f || Mathf.Abs(ls.z - 1f) > 1e-4f)
    {
        found.transform.SetParent(null, true);
        found.transform.localScale = Vector3.one;
    }

    // Disable the previously bound camera so only the world camera renders.
    if (vtuberCamera && vtuberCamera != found) vtuberCamera.enabled = false;
    vtuberCamera = found;

    cameraDriver = found.GetComponent<SyntheticCameraDriver>() ?? found.gameObject.AddComponent<SyntheticCameraDriver>();

    // Guard against a null characterRoot, which previously threw a NullReferenceException here.
    if (characterRoot != null)
    {
        cameraDriver.BindAndInit(found, characterRoot.transform);
    }
    else
    {
        UnityEngine.Debug.LogWarning("BindToWorldMainCameraOrLog: characterRoot is null; camera driver left uninitialized.");
    }
}
|
| 622 |
+
|
| 623 |
+
// Computes the character's 2D screen-space bounding box from the active renderers
// and caches it in `_cachedBbox` / `_cachedHasBbox` (screen pixels, bottom-left
// origin as produced by WorldToViewportPoint scaling). Returns true when at least
// one point projected in front of the camera.
private bool ComputeBoundingBoxCached()
{
    _cachedHasBbox = false; _cachedBbox = new Rect(0, 0, 0, 0);
    if (vtuberCamera == null || _activeBboxRenderers.Count == 0) return false;

    // Running min/max in normalized viewport coordinates ([0,1] on-screen).
    float minVX = float.MaxValue, maxVX = float.MinValue, minVY = float.MaxValue, maxVY = float.MinValue;
    bool foundAny = false;
    // Sample every `stride`-th baked vertex to bound cost on dense meshes.
    int stride = Mathf.Max(1, bakedVertexStride);
    foreach (var rend in _activeBboxRenderers)
    {
        if (!rend) continue;
        if (useBakedSkinnedMeshForBbox && rend is SkinnedMeshRenderer smr)
        {
            // Tight path: bake the current skinned pose and project actual vertices.
            _bakeMesh.Clear(); smr.BakeMesh(_bakeMesh); _bakedVerts.Clear(); _bakeMesh.GetVertices(_bakedVerts);
            // BakeMesh output is in the renderer's local space; transform without scale
            // because BakeMesh already accounts for the transform's scale.
            // (NOTE(review): assumes uniform handling of scale by BakeMesh — confirm.)
            Matrix4x4 localToWorldNoScale = Matrix4x4.TRS(smr.transform.position, smr.transform.rotation, Vector3.one);
            for (int vi = 0; vi < _bakedVerts.Count; vi += stride)
            {
                Vector3 vp = vtuberCamera.WorldToViewportPoint(localToWorldNoScale.MultiplyPoint3x4(_bakedVerts[vi]));
                // Skip points behind the camera plane.
                if (vp.z <= 0f) continue;
                foundAny = true; minVX = Math.Min(minVX, vp.x); maxVX = Math.Max(maxVX, vp.x); minVY = Math.Min(minVY, vp.y); maxVY = Math.Max(maxVY, vp.y);
            }
        }
        else
        {
            // Cheap path: project the 8 corners of the renderer's world-space AABB.
            Bounds b = rend.bounds; Vector3 c = b.center, e = b.extents;
            Vector3[] corners = { c+new Vector3(-e.x,-e.y,-e.z), c+new Vector3(-e.x,-e.y,e.z), c+new Vector3(-e.x,e.y,-e.z), c+new Vector3(-e.x,e.y,e.z), c+new Vector3(e.x,-e.y,-e.z), c+new Vector3(e.x,-e.y,e.z), c+new Vector3(e.x,e.y,-e.z), c+new Vector3(e.x,e.y,e.z) };
            foreach (var corner in corners)
            {
                Vector3 vp = vtuberCamera.WorldToViewportPoint(corner);
                if (vp.z <= 0f) continue;
                foundAny = true; minVX = Math.Min(minVX, vp.x); maxVX = Math.Max(maxVX, vp.x); minVY = Math.Min(minVY, vp.y); maxVY = Math.Max(maxVY, vp.y);
            }
        }
    }
    if (!foundAny) return false;
    // Convert viewport extents to screen pixels and inflate by the active padding.
    _cachedBbox = new Rect(minVX * Screen.width - _activePadding, minVY * Screen.height - _activePadding, (maxVX - minVX) * Screen.width + _activePadding * 2, (maxVY - minVY) * Screen.height + _activePadding * 2);
    _cachedHasBbox = true; return true;
}
|
| 660 |
+
|
| 661 |
+
// Debug overlay: draws the cached bounding box as a green outline and each
// marker as a small dot (occlusion texture when vis == 1, green otherwise).
void OnGUI()
{
    if (!showDebugUI || vtuberCamera == null) return;

    if (_cachedHasBbox)
    {
        Rect box = _cachedBbox;
        // GUI coordinates have a top-left origin, so flip Y.
        float topY = Screen.height - (box.y + box.height);
        GUI.DrawTexture(new Rect(box.x, topY, box.width, 3), _greenTex);                 // top edge
        GUI.DrawTexture(new Rect(box.x, topY + box.height, box.width, 3), _greenTex);    // bottom edge
        GUI.DrawTexture(new Rect(box.x, topY, 3, box.height), _greenTex);                // left edge
        GUI.DrawTexture(new Rect(box.x + box.width, topY, 3, box.height), _greenTex);    // right edge
    }

    if (_activeMarkers == null) return;
    for (int mi = 0; mi < _activeMarkers.Count; mi++)
    {
        Transform marker = _activeMarkers[mi];
        if (!marker) continue;

        Vector3 sc = vtuberCamera.WorldToScreenPoint(marker.position);
        if (sc.z <= 0) continue; // behind the camera

        // vis == 1 means partially occluded; default to fully visible when uncached.
        int vis = (_cachedMarkerVis != null && mi < _cachedMarkerVis.Length) ? _cachedMarkerVis[mi] : 2;
        GUI.DrawTexture(new Rect(sc.x - 2, Screen.height - sc.y - 2, 4, 4), vis == 1 ? _occTex : _greenTex);
    }
}
|
| 667 |
+
}
|
assets/teaser.png
ADDED
|
Git LFS Details
|
configs/__init__.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
import os
|
| 3 |
+
|
| 4 |
+
from hydra import compose, initialize_config_module
|
| 5 |
+
from hydra.core.config_store import ConfigStore
|
| 6 |
+
|
| 7 |
+
os.environ["HYDRA_FULL_ERROR"] = "1"
|
| 8 |
+
|
| 9 |
+
MainStore = ConfigStore.instance()
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def parse_args_to_cfg():
    """Parse CLI args with a minimal Hydra API and return the composed config.

    Unlike ``hydra.main`` / ``_run_hydra``, this does not create Hydra's
    output/log directory hierarchy.

    Returns:
        omegaconf.DictConfig: the configuration composed from the ``configs``
        module, with any command-line overrides applied.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--config-name", "-cn", default="train")
    parser.add_argument(
        "overrides",
        nargs="*",
        # Fixed garbled help text ("use dots for.nested=overrides").
        help="Any key=value arguments to override config values (use dots for nested.overrides)",
    )
    args = parser.parse_args()

    # Compose directly from the `configs` package without Hydra's app wrapper.
    with initialize_config_module(version_base="1.3", config_module="configs"):
        cfg = compose(config_name=args.config_name, overrides=args.overrides)

    return cfg
|
configs/callbacks/ckpt_saver/every10000s_top100.yaml
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
every10000s_top100:
|
| 2 |
+
_target_: genmo.callbacks.simple_ckpt_saver.SimpleCkptSaver
|
| 3 |
+
output_dir: ${output_dir}/checkpoints/
|
| 4 |
+
every_n_steps: 10000
|
| 5 |
+
save_top_k: 100
|
configs/callbacks/lr_monitor/pl.yaml
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
pl:
|
| 2 |
+
_target_: pytorch_lightning.callbacks.lr_monitor.LearningRateMonitor
|
configs/callbacks/metric/metric_3dpw.yaml
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
metric_3dpw:
|
| 2 |
+
_target_: genmo.callbacks.metric.metric_3dpw.MetricMocap
|
configs/callbacks/metric/metric_3dpw_occ.yaml
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
metric_3dpw_occ:
|
| 2 |
+
_target_: genmo.callbacks.metric.metric_3dpw_occ.MetricMocap
|
configs/callbacks/metric/metric_aistpp.yaml
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
metric_aistpp:
|
| 2 |
+
_target_: genmo.callbacks.metric.metric_aistpp.MetricMusic
|
configs/callbacks/metric/metric_emdb1.yaml
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
metric_emdb1:
|
| 2 |
+
_target_: genmo.callbacks.metric.metric_emdb.MetricMocap
|
| 3 |
+
emdb_split: 1
|
| 4 |
+
occ: false
|
configs/callbacks/metric/metric_emdb2.yaml
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
metric_emdb2:
|
| 2 |
+
_target_: genmo.callbacks.metric.metric_emdb.MetricMocap
|
| 3 |
+
emdb_split: 2
|
| 4 |
+
occ: false
|
configs/callbacks/metric/metric_rich.yaml
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
metric_rich:
|
| 2 |
+
_target_: genmo.callbacks.metric.metric_rich.MetricMocap
|
| 3 |
+
occ: false
|
configs/callbacks/metric/metric_unity.yaml
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
metric_unity:
|
| 2 |
+
_target_: genmo.callbacks.metric.metric_unity.MetricUnity
|
| 3 |
+
# Disable the old scenepic HTML viz by default (use `vis/vis_unity_val` instead).
|
| 4 |
+
vis_every_n_val: 1000000000
|
configs/callbacks/prog_bar/prog_reporter_ed1.yaml
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
prog_reporter_ed1:
|
| 2 |
+
_target_: genmo.callbacks.prog_bar.ProgressReporter
|
| 3 |
+
log_every_percent: 0.1
|
| 4 |
+
exp_name: ${exp_name}
|
| 5 |
+
data_name: ${data_name}
|
configs/callbacks/train_speed_timer/base.yaml
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
base:
|
| 2 |
+
_target_: genmo.callbacks.train_speed_timer.TrainSpeedTimer
|
| 3 |
+
N_avg: 5
|
configs/callbacks/vis/vis_music.yaml
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
vis_music:
|
| 2 |
+
_target_: genmo.callbacks.vis.vis_music.VisMusic
|
configs/callbacks/vis/vis_speech.yaml
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
vis_speech:
|
| 2 |
+
_target_: genmo.callbacks.vis.vis_speech.VisSpeech
|
configs/callbacks/vis/vis_text.yaml
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
vis_text:
|
| 2 |
+
_target_: genmo.callbacks.vis.vis_text.VisText
|
configs/callbacks/vis/vis_unity_val.yaml
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
vis_unity_val:
|
| 2 |
+
_target_: genmo.callbacks.vis.vis_unity_val.VisUnityVal
|
| 3 |
+
enabled: false
|
| 4 |
+
every_n_epochs: 1
|
| 5 |
+
num_batches: 1
|
| 6 |
+
num_frames: 30
|
| 7 |
+
render_incam: true
|
| 8 |
+
render_global: true
|
| 9 |
+
use_gt_betas_for_pred: true
|
| 10 |
+
global_root_relative: false
|
| 11 |
+
crf: 23
|
| 12 |
+
save_dir: ${output_dir}/vis
|
| 13 |
+
pred_color: [176, 100, 244]
|
| 14 |
+
gt_color: [0, 255, 0]
|
configs/data/collate_cfg/default.yaml
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
max_motion_frames: ${data.dataset_opts.max_motion_frames}
|
| 2 |
+
default_frame_feature_dim:
|
| 3 |
+
music_array: [1024]
|
| 4 |
+
music_embed: [35]
|
| 5 |
+
music_beats: []
|
| 6 |
+
audio_array: []
|
| 7 |
+
use_det_kp: []
|
| 8 |
+
|
| 9 |
+
default_seq_feature_dim:
|
| 10 |
+
text_embed: [50, 1024]
|
| 11 |
+
|
| 12 |
+
default_seq_feature_length_multiplier:
|
| 13 |
+
audio_array: 600
|
| 14 |
+
|
| 15 |
+
default_feature_val:
|
| 16 |
+
caption: ""
|
| 17 |
+
music_fps: 30
|
| 18 |
+
audio_fps: 30
|
| 19 |
+
has_text: False
|
| 20 |
+
# has_audio: False
|
| 21 |
+
# has_music: False
|
| 22 |
+
|
| 23 |
+
default_feature_type: {}
|
configs/data/mocap/trainX_testY.yaml
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
defaults:
|
| 2 |
+
- collate_cfg: default
|
| 3 |
+
|
| 4 |
+
# definition of lightning datamodule (dataset + dataloader)
|
| 5 |
+
_target_: genmo.datamodule.mocap_trainX_testY.DataModule
|
| 6 |
+
|
| 7 |
+
dataset_opts:
|
| 8 |
+
train: ${train_datasets}
|
| 9 |
+
val: ${test_datasets}
|
| 10 |
+
max_motion_frames: 120
|
| 11 |
+
|
| 12 |
+
loader_opts:
|
| 13 |
+
train:
|
| 14 |
+
batch_size: 128
|
| 15 |
+
num_workers: 8
|
| 16 |
+
val:
|
| 17 |
+
batch_size: 1
|
| 18 |
+
num_workers: 1
|
| 19 |
+
encoded_music_dim: ${pipeline.args.encoded_music_dim}
|
| 20 |
+
|
| 21 |
+
limit_each_trainset: null
|
configs/demo.yaml
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
defaults:
|
| 2 |
+
# pytorch-lightning
|
| 3 |
+
- data: ???
|
| 4 |
+
- model: ???
|
| 5 |
+
- /text_encoder@model.model_cfg.text_encoder: t5_3b
|
| 6 |
+
- callbacks: null
|
| 7 |
+
|
| 8 |
+
# system
|
| 9 |
+
- hydra: default
|
| 10 |
+
|
| 11 |
+
# utility groups that changes a lot
|
| 12 |
+
- pipeline: null
|
| 13 |
+
- network: null
|
| 14 |
+
- optimizer: null
|
| 15 |
+
- scheduler: null
|
| 16 |
+
- train_datasets: null
|
| 17 |
+
- test_datasets: null
|
| 18 |
+
- endecoder: null # normalize/unnormalize data
|
| 19 |
+
- refiner: null
|
| 20 |
+
|
| 21 |
+
# global-override
|
| 22 |
+
- exp: mixed # set "data, model and callbacks" in yaml
|
| 23 |
+
- global/task: null # dump/test
|
| 24 |
+
- global/hsearch: null # hyper-param search
|
| 25 |
+
- global/debug: null # debug mode
|
| 26 |
+
- _self_
|
| 27 |
+
|
| 28 |
+
# ================================ #
|
| 29 |
+
# global setting #
|
| 30 |
+
# ================================ #
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
# experiment information
|
| 34 |
+
task: fit # [fit, predict]
|
| 35 |
+
exp_name_base: ???
|
| 36 |
+
exp_name_var: ""
|
| 37 |
+
exp_name: ${exp_name_base}_${exp_name_var}
|
| 38 |
+
data_name: ???
|
| 39 |
+
|
| 40 |
+
# utilities in the entry file
|
| 41 |
+
# output_dir: "outputs/${data_name}/${exp_name}"
|
| 42 |
+
resume_mode: null
|
| 43 |
+
seed: 42
|
| 44 |
+
|
| 45 |
+
version: null
|
| 46 |
+
ckpt_dir: outputs/${data_name}/${exp_name}/
|
| 47 |
+
remote_results_path: /lustre/fsw/portfolios/nvr/projects/nvr_torontoai_humanmotionfm/workspaces/motiondiff/motiondiff_results/jiefengl/gvhmr
|
| 48 |
+
ckpt_path: null
|
| 49 |
+
|
| 50 |
+
###
|
| 51 |
+
# W&B logging removed from this repo; TensorBoard is used by `scripts/train.py`.
|
| 52 |
+
rsync_ckpt: true
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
# ================================ #
|
| 56 |
+
# global setting #
|
| 57 |
+
# ================================ #
|
| 58 |
+
|
| 59 |
+
video_name: ???
|
| 60 |
+
output_root: outputs/demo
|
| 61 |
+
output_dir: "${output_root}/${text1_video_name}"
|
| 62 |
+
preprocess_dir: ${output_dir}/preprocess
|
| 63 |
+
video_path: "${output_dir}/0_input_video.mp4"
|
| 64 |
+
|
| 65 |
+
# Options
|
| 66 |
+
text1: null
|
| 67 |
+
text1_file: null
|
| 68 |
+
text1_video_path: null
|
| 69 |
+
text1_video_name: null
|
| 70 |
+
text_length: 300
|
| 71 |
+
static_cam: False
|
| 72 |
+
verbose: False
|
| 73 |
+
|
| 74 |
+
paths:
|
| 75 |
+
bbx: ${preprocess_dir}/bbx.pt
|
| 76 |
+
bbx_xyxy_video_overlay: ${preprocess_dir}/bbx_xyxy_video_overlay.mp4
|
| 77 |
+
vit_features: ${preprocess_dir}/vit_features.pt
|
| 78 |
+
vimo_pred: ${preprocess_dir}/vimo_pred.pt
|
| 79 |
+
vitpose: ${preprocess_dir}/vitpose.pt
|
| 80 |
+
vitpose_video_overlay: ${preprocess_dir}/vitpose_video_overlay.mp4
|
| 81 |
+
hmr4d_results: ${output_dir}/hmr4d_results.pt
|
| 82 |
+
incam_video: ${output_dir}/1_incam.mp4
|
| 83 |
+
global_video: ${output_dir}/2_global.mp4
|
| 84 |
+
incam_global_horiz_video: ${output_dir}/3_incam_global_horiz.mp4
|
| 85 |
+
slam: ${preprocess_dir}/camera.npy
|
configs/diffusion/ddim.yaml
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
sampler: ddim
|
| 2 |
+
train_timestep_respacing: ""
|
| 3 |
+
test_timestep_respacing: "50"
|
| 4 |
+
schedule_sampler_type: uniform
|
| 5 |
+
noise_schedule: cosine
|
| 6 |
+
sigma_small: true
|
| 7 |
+
guidance_param: 1.0
|
| 8 |
+
ddim_eta: 0.0
|
configs/endecoder/v1_amass_local_bedlam_cam.yaml
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
_target_: genmo.network.endecoder.EnDecoder
|
| 2 |
+
stats_name: MM_V1_AMASS_LOCAL_BEDLAM_CAM
|
configs/exp/genmo_lg.yaml
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# @package _global_
|
| 2 |
+
defaults:
|
| 3 |
+
- /diffusion@model_cfg.diffusion: ddim
|
| 4 |
+
- override /data: mocap/trainX_testY
|
| 5 |
+
- override /model: genmo
|
| 6 |
+
- override /network: diffusion
|
| 7 |
+
- override /pipeline: dual_mode
|
| 8 |
+
- override /endecoder: v1_amass_local_bedlam_cam
|
| 9 |
+
- override /optimizer: adamw_2e-4
|
| 10 |
+
- override /scheduler: epoch_half_200_350
|
| 11 |
+
- override /train_datasets:
|
| 12 |
+
- amass_train_v11
|
| 13 |
+
- humanml3d_static_train
|
| 14 |
+
- bedlam_v2
|
| 15 |
+
- h36m_v1
|
| 16 |
+
- 3dpw_v1
|
| 17 |
+
- 3dpw_occ_v1
|
| 18 |
+
- aistpp_train
|
| 19 |
+
- beat2_static_train
|
| 20 |
+
- override /test_datasets:
|
| 21 |
+
# - aistpp_test
|
| 22 |
+
- humanml3d_eval
|
| 23 |
+
- emdb1_fliptest
|
| 24 |
+
- emdb2_fliptest
|
| 25 |
+
- rich_test
|
| 26 |
+
- 3dpw_fliptest
|
| 27 |
+
- 3dpw_occ_fliptest
|
| 28 |
+
- override /callbacks:
|
| 29 |
+
- ckpt_saver/every10000s_top100
|
| 30 |
+
- prog_bar/prog_reporter_ed1
|
| 31 |
+
- train_speed_timer/base
|
| 32 |
+
- lr_monitor/pl
|
| 33 |
+
- vis/vis_text
|
| 34 |
+
- metric/metric_emdb1
|
| 35 |
+
- metric/metric_emdb2
|
| 36 |
+
- metric/metric_rich
|
| 37 |
+
- metric/metric_3dpw
|
| 38 |
+
- metric/metric_3dpw_occ
|
| 39 |
+
# - metric_aistpp
|
| 40 |
+
- _self_
|
| 41 |
+
|
| 42 |
+
exp_name_base: ${hydra:runtime.choices.exp}
|
| 43 |
+
exp_name_var: ""
|
| 44 |
+
exp_name: ${exp_name_base}_${exp_name_var}
|
| 45 |
+
data_name: genmo_mixed
|
| 46 |
+
|
| 47 |
+
multicond_args: null
|
| 48 |
+
|
| 49 |
+
pl_trainer:
|
| 50 |
+
precision: 16-mixed
|
| 51 |
+
log_every_n_steps: 10
|
| 52 |
+
gradient_clip_val: 0.5
|
| 53 |
+
max_epochs: null
|
| 54 |
+
check_val_every_n_epoch: null
|
| 55 |
+
val_check_interval: 3000
|
| 56 |
+
max_steps: 200000
|
| 57 |
+
devices: 1
|
| 58 |
+
strategy: ddp_find_unused_parameters_true
|
| 59 |
+
|
| 60 |
+
logger:
|
| 61 |
+
_target_: pytorch_lightning.loggers.tensorboard.TensorBoardLogger
|
| 62 |
+
save_dir: ${output_dir}
|
| 63 |
+
name: ""
|
| 64 |
+
version: ""
|
configs/finetune_unity.yaml
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# genmo/configs/finetune_unity.yaml
|
| 2 |
+
defaults:
|
| 3 |
+
- train
|
| 4 |
+
- override /exp: genmo_lg
|
| 5 |
+
# Keep only generic callbacks; drop dataset-specific metrics/visualizers.
|
| 6 |
+
- override /callbacks:
|
| 7 |
+
- ckpt_saver/every10000s_top100
|
| 8 |
+
- prog_bar/prog_reporter_ed1
|
| 9 |
+
- train_speed_timer/base
|
| 10 |
+
- lr_monitor/pl
|
| 11 |
+
- metric/metric_unity
|
| 12 |
+
- vis/vis_unity_val
|
| 13 |
+
- _self_
|
| 14 |
+
|
| 15 |
+
# Fix logging path mismatch by forcing filename to be local to the run dir
|
| 16 |
+
hydra:
|
| 17 |
+
job_logging:
|
| 18 |
+
handlers:
|
| 19 |
+
file:
|
| 20 |
+
filename: train.log
|
| 21 |
+
|
| 22 |
+
# Define mandatory variables and sync output_dir with Hydra run dir
|
| 23 |
+
data_name: "unity"
|
| 24 |
+
exp_name_base: "finetune"
|
| 25 |
+
# Keep `output_dir` from `configs/train.yaml` to avoid a Hydra/OmegaConf interpolation cycle:
|
| 26 |
+
# `hydra.run.dir` -> `${output_dir}` (configs/hydra/default.yaml) and `output_dir` -> `${hydra:run.dir}` would recurse.
|
| 27 |
+
|
| 28 |
+
# For tiny Unity sets, save a checkpoint at the end of every epoch.
|
| 29 |
+
callbacks:
|
| 30 |
+
ckpt_saver:
|
| 31 |
+
every10000s_top100:
|
| 32 |
+
every_n_steps: null
|
| 33 |
+
every_n_epochs: 1
|
| 34 |
+
save_top_k: 1
|
| 35 |
+
vis:
|
| 36 |
+
vis_unity_val:
|
| 37 |
+
enabled: true
|
| 38 |
+
|
| 39 |
+
train_datasets:
|
| 40 |
+
unity:
|
| 41 |
+
_target_: genmo.datasets.unity_dataset.UnityDataset
|
| 42 |
+
root: "./third_party/GVHMR/processed_dataset"
|
| 43 |
+
split: "train"
|
| 44 |
+
motion_frames: 120
|
| 45 |
+
# Explicitly disable datasets inherited from `exp=genmo_lg`.
|
| 46 |
+
amass_train_v11: null
|
| 47 |
+
humanml3d_static_train: null
|
| 48 |
+
bedlam_v2: null
|
| 49 |
+
h36m_v1: null
|
| 50 |
+
3dpw_v1: null
|
| 51 |
+
3dpw_occ_v1: null
|
| 52 |
+
aistpp_train: null
|
| 53 |
+
beat2_static_train: null
|
| 54 |
+
|
| 55 |
+
test_datasets:
|
| 56 |
+
unity_val:
|
| 57 |
+
_target_: genmo.datasets.unity_dataset.UnityDataset
|
| 58 |
+
root: "./third_party/GVHMR/processed_dataset"
|
| 59 |
+
split: "train"
|
| 60 |
+
motion_frames: 120
|
| 61 |
+
# Explicitly disable test datasets inherited from `exp=genmo_lg`.
|
| 62 |
+
humanml3d_eval: null
|
| 63 |
+
emdb1_fliptest: null
|
| 64 |
+
emdb2_fliptest: null
|
| 65 |
+
rich_test: null
|
| 66 |
+
3dpw_fliptest: null
|
| 67 |
+
3dpw_occ_fliptest: null
|
| 68 |
+
|
| 69 |
+
# Fine-tuning Hyperparameters
|
| 70 |
+
solver:
|
| 71 |
+
optimizer:
|
| 72 |
+
lr: 5e-6 # VERY IMPORTANT: Low LR to preserve pretrained knowledge
|
| 73 |
+
|
| 74 |
+
scheduler:
|
| 75 |
+
type: "constant" # Keep it simple for fine-tuning
|
| 76 |
+
|
| 77 |
+
# Lightning Trainer settings
|
| 78 |
+
pl_trainer:
|
| 79 |
+
max_epochs: 5
|
| 80 |
+
check_val_every_n_epoch: 1
|
| 81 |
+
log_every_n_steps: 1
|
| 82 |
+
precision: 16-mixed # Saves VRAM, faster
|
| 83 |
+
gradient_clip_val: 1.0
|
| 84 |
+
val_check_interval: 1.0
|
| 85 |
+
|
| 86 |
+
# Override the default dataloader settings from `exp=genmo_lg` (it uses batch_size=128
|
| 87 |
+
# and the DataModule uses `drop_last=True`, which yields 0 batches for small Unity sets).
|
| 88 |
+
data:
|
| 89 |
+
loader_opts:
|
| 90 |
+
train:
|
| 91 |
+
batch_size: 1
|
| 92 |
+
num_workers: 1
|
| 93 |
+
val:
|
| 94 |
+
batch_size: 1
|
| 95 |
+
num_workers: 1
|
configs/hydra/default.yaml
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# enable color logging
|
| 2 |
+
defaults:
|
| 3 |
+
- override hydra_logging: colorlog
|
| 4 |
+
- override job_logging: colorlog
|
| 5 |
+
|
| 6 |
+
job_logging:
|
| 7 |
+
formatters:
|
| 8 |
+
simple:
|
| 9 |
+
datefmt: "%m/%d %H:%M:%S"
|
| 10 |
+
format: "[%(asctime)s][%(levelname)s] %(message)s"
|
| 11 |
+
colorlog:
|
| 12 |
+
datefmt: "%m/%d %H:%M:%S"
|
| 13 |
+
format: "[%(cyan)s%(asctime)s%(reset)s][%(log_color)s%(levelname)s%(reset)s] %(message)s"
|
| 14 |
+
handlers:
|
| 15 |
+
file:
|
| 16 |
+
filename: ${output_dir}/${hydra.job.name}.log
|
| 17 |
+
|
| 18 |
+
run:
|
| 19 |
+
dir: ${output_dir}
|
configs/infer_video.yaml
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
defaults:
|
| 2 |
+
# pytorch-lightning / hydra wiring (kept for compatibility with `exp=...` configs)
|
| 3 |
+
- data: ???
|
| 4 |
+
- model: ???
|
| 5 |
+
- callbacks: null
|
| 6 |
+
- hydra: default
|
| 7 |
+
- pipeline: null
|
| 8 |
+
- network: null
|
| 9 |
+
- optimizer: null
|
| 10 |
+
- scheduler: null
|
| 11 |
+
- train_datasets: null
|
| 12 |
+
- test_datasets: null
|
| 13 |
+
- endecoder: null
|
| 14 |
+
- refiner: null
|
| 15 |
+
|
| 16 |
+
# pick an experiment preset (sets data/model/network/pipeline/etc)
|
| 17 |
+
- exp: genmo_lg
|
| 18 |
+
- _self_
|
| 19 |
+
|
| 20 |
+
# Video -> SMPL-X inference (GENMO/GEM)
|
| 21 |
+
video_path: null
|
| 22 |
+
video_name: null
|
| 23 |
+
|
| 24 |
+
output_root: outputs/infer_video
|
| 25 |
+
output_dir: ${output_root}/${video_name}
|
| 26 |
+
preprocess_dir: ${output_dir}/preprocess
|
| 27 |
+
|
| 28 |
+
# Checkpoint
|
| 29 |
+
ckpt_path: null
|
| 30 |
+
|
| 31 |
+
# Inference options
|
| 32 |
+
static_cam: true
|
| 33 |
+
use_kp2d: true
|
| 34 |
+
postproc: true
|
| 35 |
+
resample_to_30fps: true
|
| 36 |
+
verbose: false
|
| 37 |
+
|
| 38 |
+
# Rendering
|
| 39 |
+
render_incam: true
|
| 40 |
+
render_global: true
|
| 41 |
+
render_side_by_side: true
|
| 42 |
+
render_crf: 23
|
| 43 |
+
|
| 44 |
+
# Optional visualization: draw estimated camera axes in the global render.
|
| 45 |
+
draw_camera_axes: false
|
| 46 |
+
# Camera pose convention for `paths.slam` (affects camera-axis visualization only):
|
| 47 |
+
# - auto: choose the one closest to the person root each frame
|
| 48 |
+
# - w2c: interpret trajectory as world->camera
|
| 49 |
+
# - c2w: interpret trajectory as camera->world
|
| 50 |
+
camera_pose_convention: auto
|
| 51 |
+
camera_axis_length: 0.5
|
| 52 |
+
camera_axis_width: 3
|
| 53 |
+
|
| 54 |
+
paths:
|
| 55 |
+
input_video: ${output_dir}/0_input_video.mp4
|
| 56 |
+
video_30fps: ${output_dir}/0_input_video_30fps.mp4
|
| 57 |
+
bbx: ${preprocess_dir}/bbx.pt
|
| 58 |
+
vitpose: ${preprocess_dir}/vitpose.pt
|
| 59 |
+
vit_features: ${preprocess_dir}/vit_features.pt
|
| 60 |
+
hmr4d_results: ${output_dir}/hmr4d_results.pt
|
| 61 |
+
incam_video: ${output_dir}/1_incam.mp4
|
| 62 |
+
global_video: ${output_dir}/2_global.mp4
|
| 63 |
+
incam_global_horiz_video: ${output_dir}/3_incam_global_horiz.mp4
|
| 64 |
+
|
| 65 |
+
# Disable external logging by default for a local demo script.
|
| 66 |
+
###
|
| 67 |
+
# W&B logging removed from this repo; TensorBoard is used by `scripts/train.py` for training runs.
|
configs/model/genmo.yaml
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
_target_: genmo.genmo.GENMO
|
| 2 |
+
|
| 3 |
+
pipeline: ${pipeline}
|
| 4 |
+
optimizer: ${optimizer}
|
| 5 |
+
scheduler: ${scheduler}
|
| 6 |
+
|
| 7 |
+
model_cfg:
|
| 8 |
+
train_modes: ["regression", "diffusion"]
|
| 9 |
+
noisy_2d_obs: true
|
| 10 |
+
kp2d_noise_scale: 0.5
|
| 11 |
+
perframe_condition_exists: true
|
| 12 |
+
train2d_mask_invis_obs: true
|
| 13 |
+
mask_occluded_imgfeats: true
|
| 14 |
+
cond_merge_strategy: "add"
|
| 15 |
+
use_cond_exists_as_input: true
|
| 16 |
+
normalize_cam_angvel: true
|
| 17 |
+
|
| 18 |
+
diffusion:
|
| 19 |
+
test_timestep_respacing: "50"
|
| 20 |
+
guidance_param: 2.5
|
| 21 |
+
|
| 22 |
+
text_encoder:
|
| 23 |
+
load_llm: false
|
| 24 |
+
llm_version: "t5-3b"
|
| 25 |
+
max_text_len: 50
|
| 26 |
+
|
| 27 |
+
condition_mask:
|
| 28 |
+
mask_img_prob: 0.5
|
| 29 |
+
mask_cam_prob: 1.0
|
| 30 |
+
reuse_regression_mask: false
|
| 31 |
+
regression_no_img_mask: true
|
| 32 |
+
|
| 33 |
+
mask_cfg:
|
| 34 |
+
drop_prob: 0.75
|
| 35 |
+
max_num_drops: 3
|
| 36 |
+
min_drop_nframes: 1
|
| 37 |
+
max_drop_nframes: 30
|
| 38 |
+
body_mask_cfg:
|
| 39 |
+
drop_prob: 0.75
|
| 40 |
+
joint_drop_prob: 0.25
|
| 41 |
+
max_num_drops: 3
|
| 42 |
+
min_drop_nframes: 1
|
| 43 |
+
max_drop_nframes: 30
|
| 44 |
+
music_mask_prob: 0.1
|
| 45 |
+
audio_mask_prob: 0.1
|
configs/network/diffusion.yaml
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
_target_: genmo.network.genmo_diffusion.GENMODiffusion
|
| 2 |
+
args: ${pipeline.args}
|
| 3 |
+
latent_dim: ${.model_cfg.denoiser.latent_dim}
|
| 4 |
+
cond_merge_strategy: "add"
|
| 5 |
+
music_mask_prob: ${.model_cfg.denoiser.music_mask_prob}
|
| 6 |
+
speech_mask_prob: ${.model_cfg.denoiser.speech_mask_prob}
|
| 7 |
+
encoded_music_dim: ${pipeline.args.encoded_music_dim}
|
| 8 |
+
model_cfg:
|
| 9 |
+
diffusion: ${model_cfg.diffusion}
|
| 10 |
+
denoiser:
|
| 11 |
+
_target_: genmo.network.genmo_denoiser.NetworkEncoderRoPE
|
| 12 |
+
output_dim: 151
|
| 13 |
+
xt_dim: ${.output_dim}
|
| 14 |
+
njoints: ${.xt_dim}
|
| 15 |
+
text_mask_prob: 0.1
|
| 16 |
+
music_mask_prob: 0.1
|
| 17 |
+
speech_mask_prob: 0.1
|
| 18 |
+
use_text_pos_enc: true
|
| 19 |
+
text_encoder_cfg:
|
| 20 |
+
mode: all
|
| 21 |
+
cross_attn_type: mha
|
| 22 |
+
latent_dim: 1024
|
| 23 |
+
num_layers: 16
|
| 24 |
+
num_heads: 8
|
| 25 |
+
mlp_ratio: 4
|
configs/optimizer/adamw_2e-4.yaml
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
_target_: torch.optim.AdamW
|
| 2 |
+
lr: 2e-4
|
configs/pipeline/dual_mode.yaml
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
_target_: genmo.pipeline.genmo_pipeline.Pipeline
|
| 2 |
+
args_denoiser3d: ${network}
|
| 3 |
+
args:
|
| 4 |
+
endecoder_opt: ${endecoder}
|
| 5 |
+
use_regression_outputs_prob: 0.
|
| 6 |
+
use_cfg_sampler_for_gen: true
|
| 7 |
+
inpaint_x_start_gt: false
|
| 8 |
+
regression_only: true
|
| 9 |
+
encoded_music_dim: 35
|
| 10 |
+
multicond_args: ${multicond_args}
|
| 11 |
+
infer_version: 2
|
| 12 |
+
weights:
|
| 13 |
+
cr_j3d: 500.
|
| 14 |
+
transl_c: 1.
|
| 15 |
+
cr_verts: 500.
|
| 16 |
+
j2d: 1000.
|
| 17 |
+
j2d_17: 1000.
|
| 18 |
+
verts2d: 1000.
|
| 19 |
+
|
| 20 |
+
proj_gt_j2d_to_bi01: true
|
| 21 |
+
|
| 22 |
+
transl_w: 1.
|
| 23 |
+
static_conf_bce: 1.
|
| 24 |
+
|
| 25 |
+
static_conf:
|
| 26 |
+
vel_thr: 0.15
|
| 27 |
+
|
| 28 |
+
in_attr:
|
| 29 |
+
- obs
|
| 30 |
+
- f_cliffcam
|
| 31 |
+
- f_imgseq
|
| 32 |
+
- f_cam_angvel
|
| 33 |
+
- encoded_music
|
| 34 |
+
- encoded_audio
|
| 35 |
+
mask_out_attr: [] # ${.in_attr}
|
| 36 |
+
out_attr:
|
| 37 |
+
pred_cam: 3
|
configs/scheduler/epoch_half_200_350.yaml
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
scheduler:
|
| 2 |
+
_target_: torch.optim.lr_scheduler.MultiStepLR
|
| 3 |
+
milestones: [200, 350]
|
| 4 |
+
gamma: 0.5
|
| 5 |
+
interval: epoch
|
| 6 |
+
frequency: 1
|
configs/test_datasets/3dpw_fliptest.yaml
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
3dpw_fliptest:
|
| 2 |
+
_target_: genmo.datasets.threedpw.threedpw_motion_test.ThreedpwSmplFullSeqDataset
|
| 3 |
+
flip_test: true
|
configs/test_datasets/3dpw_occ_fliptest.yaml
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
3dpw_occ_fliptest:
|
| 2 |
+
_target_: genmo.datasets.threedpw.threedpw_occ_motion_test.ThreedpwOccSmplFullSeqDataset
|
| 3 |
+
flip_test: true
|
configs/test_datasets/emdb1_fliptest.yaml
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
emdb1_fliptest:
|
| 2 |
+
_target_: genmo.datasets.emdb.emdb_motion_test.EmdbSmplFullSeqDataset
|
| 3 |
+
split: 1
|
| 4 |
+
flip_test: true
|
configs/test_datasets/emdb2_fliptest.yaml
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
emdb2_fliptest:
|
| 2 |
+
_target_: genmo.datasets.emdb.emdb_motion_test.EmdbSmplFullSeqDataset
|
| 3 |
+
split: 2
|
| 4 |
+
flip_test: true
|
configs/test_datasets/humanml3d_eval.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
humanml3d_eval:
|
| 2 |
+
_target_: genmo.datasets.pure_motion.humanml3d.Humanml3dDataset
|
| 3 |
+
eval_gen_only: true
|
| 4 |
+
cam_augmentation: v11
|
| 5 |
+
use_random_subset: true
|
| 6 |
+
random_subset_size: 2
|
| 7 |
+
random_subset_seed: 7
|
configs/test_datasets/rich_test.yaml
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
rich_test:
|
| 2 |
+
_target_: genmo.datasets.rich.rich_motion_test.RichSmplFullSeqDataset
|
configs/text_encoder/t5_3b.yaml
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
load_llm: true
|
| 2 |
+
llm_version: "t5-3b"
|
| 3 |
+
max_text_len: 50
|
configs/train.yaml
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
defaults:
|
| 2 |
+
- _self_
|
| 3 |
+
# pytorch-lightning
|
| 4 |
+
- data: ???
|
| 5 |
+
- model: ???
|
| 6 |
+
- callbacks: null
|
| 7 |
+
|
| 8 |
+
# system
|
| 9 |
+
- hydra: default
|
| 10 |
+
|
| 11 |
+
# utility groups that change a lot
|
| 12 |
+
- pipeline: null
|
| 13 |
+
- network: null
|
| 14 |
+
- optimizer: null
|
| 15 |
+
- scheduler: null
|
| 16 |
+
- train_datasets: null
|
| 17 |
+
- test_datasets: null
|
| 18 |
+
- endecoder: null # normalize/unnormalize data
|
| 19 |
+
- refiner: null
|
| 20 |
+
|
| 21 |
+
# global-override
|
| 22 |
+
- exp: mixed # set "data, model and callbacks" in yaml
|
| 23 |
+
- global/task: null # dump/test
|
| 24 |
+
- global/hsearch: null # hyper-param search
|
| 25 |
+
- global/debug: null # debug mode
|
| 26 |
+
|
| 27 |
+
# ================================ #
|
| 28 |
+
# global setting #
|
| 29 |
+
# ================================ #
|
| 30 |
+
# experiment information
|
| 31 |
+
task: fit # [fit, predict]
|
| 32 |
+
exp_name_base: ???
|
| 33 |
+
exp_name_var: ""
|
| 34 |
+
exp_name: ${exp_name_base}_${exp_name_var}
|
| 35 |
+
data_name: ???
|
| 36 |
+
num_test_data: 32
|
| 37 |
+
|
| 38 |
+
# utilities in the entry file
|
| 39 |
+
output_dir: "outputs/${data_name}/${exp_name}"
|
| 40 |
+
ckpt_path: null
|
| 41 |
+
resume_mode: null
|
| 42 |
+
seed: 42
|
| 43 |
+
|
| 44 |
+
# lightning default settings
|
| 45 |
+
pl_trainer:
|
| 46 |
+
devices: 1
|
| 47 |
+
num_sanity_val_steps: 0 # disable sanity check
|
| 48 |
+
precision: 32
|
| 49 |
+
inference_mode: False
|
| 50 |
+
|
| 51 |
+
logger:
|
| 52 |
+
_target_: pytorch_lightning.loggers.tensorboard.TensorBoardLogger
|
| 53 |
+
save_dir: ${output_dir}
|
| 54 |
+
name: ""
|
| 55 |
+
version: ""
|
configs/train_datasets/3dpw_occ_v1.yaml
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
3dpw_occ_v1:
|
| 2 |
+
_target_: genmo.datasets.threedpw.threedpw_occ_motion_train.ThreedpwOccSmplDataset
|
configs/train_datasets/3dpw_v1.yaml
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
3dpw_v1:
|
| 2 |
+
_target_: genmo.datasets.threedpw.threedpw_motion_train.ThreedpwSmplDataset
|
configs/train_datasets/aistpp_train.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
aistpp_train:
|
| 2 |
+
_target_: genmo.datasets.aistplusplus.aistplusplus.AISTPlusPlusSmplDataset
|
| 3 |
+
split: train
|
| 4 |
+
motion_frames: 120
|
| 5 |
+
lazy_load: false
|
| 6 |
+
eval_gen_only: true
|
| 7 |
+
feat_version: v2
|
configs/train_datasets/amass_train_v11.yaml
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
amass_train_v11:
|
| 2 |
+
_target_: genmo.datasets.pure_motion.amass.AmassDataset
|
| 3 |
+
|
| 4 |
+
motion_frames: 120
|
| 5 |
+
l_factor: 1.5
|
| 6 |
+
skip_moyo: True
|
| 7 |
+
cam_augmentation: v11
|
| 8 |
+
random1024: False
|
| 9 |
+
limit_size: null
|
configs/train_datasets/beat2_static_train.yaml
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
beat2_static_train:
|
| 2 |
+
_target_: genmo.datasets.beat2.beat2.BEAT2SmplDataset
|
| 3 |
+
split: train
|
| 4 |
+
cam_augmentation: static
|
| 5 |
+
motion_frames: 120
|
| 6 |
+
lazy_load: false
|