zirobtc committed on
Commit
fbb20ff
·
verified ·
1 Parent(s): c7f6307

Upload folder using huggingface_hub

This view is limited to 50 files because it contains too many changes. See the raw diff for the full set.
Files changed (50)
  1. .gitattributes +16 -0
  2. .gitignore +5 -0
  3. .gitmodules +6 -0
  4. LICENSE +36 -0
  5. README.md +75 -0
  6. SyntheticRecorder.cs +667 -0
  7. assets/teaser.png +3 -0
  8. configs/__init__.py +30 -0
  9. configs/callbacks/ckpt_saver/every10000s_top100.yaml +5 -0
  10. configs/callbacks/lr_monitor/pl.yaml +2 -0
  11. configs/callbacks/metric/metric_3dpw.yaml +2 -0
  12. configs/callbacks/metric/metric_3dpw_occ.yaml +2 -0
  13. configs/callbacks/metric/metric_aistpp.yaml +2 -0
  14. configs/callbacks/metric/metric_emdb1.yaml +4 -0
  15. configs/callbacks/metric/metric_emdb2.yaml +4 -0
  16. configs/callbacks/metric/metric_rich.yaml +3 -0
  17. configs/callbacks/metric/metric_unity.yaml +4 -0
  18. configs/callbacks/prog_bar/prog_reporter_ed1.yaml +5 -0
  19. configs/callbacks/train_speed_timer/base.yaml +3 -0
  20. configs/callbacks/vis/vis_music.yaml +2 -0
  21. configs/callbacks/vis/vis_speech.yaml +2 -0
  22. configs/callbacks/vis/vis_text.yaml +2 -0
  23. configs/callbacks/vis/vis_unity_val.yaml +14 -0
  24. configs/data/collate_cfg/default.yaml +23 -0
  25. configs/data/mocap/trainX_testY.yaml +21 -0
  26. configs/demo.yaml +85 -0
  27. configs/diffusion/ddim.yaml +8 -0
  28. configs/endecoder/v1_amass_local_bedlam_cam.yaml +2 -0
  29. configs/exp/genmo_lg.yaml +64 -0
  30. configs/finetune_unity.yaml +95 -0
  31. configs/hydra/default.yaml +19 -0
  32. configs/infer_video.yaml +67 -0
  33. configs/model/genmo.yaml +45 -0
  34. configs/network/diffusion.yaml +25 -0
  35. configs/optimizer/adamw_2e-4.yaml +2 -0
  36. configs/pipeline/dual_mode.yaml +37 -0
  37. configs/scheduler/epoch_half_200_350.yaml +6 -0
  38. configs/test_datasets/3dpw_fliptest.yaml +3 -0
  39. configs/test_datasets/3dpw_occ_fliptest.yaml +3 -0
  40. configs/test_datasets/emdb1_fliptest.yaml +4 -0
  41. configs/test_datasets/emdb2_fliptest.yaml +4 -0
  42. configs/test_datasets/humanml3d_eval.yaml +7 -0
  43. configs/test_datasets/rich_test.yaml +2 -0
  44. configs/text_encoder/t5_3b.yaml +3 -0
  45. configs/train.yaml +55 -0
  46. configs/train_datasets/3dpw_occ_v1.yaml +2 -0
  47. configs/train_datasets/3dpw_v1.yaml +2 -0
  48. configs/train_datasets/aistpp_train.yaml +7 -0
  49. configs/train_datasets/amass_train_v11.yaml +9 -0
  50. configs/train_datasets/beat2_static_train.yaml +6 -0
.gitattributes CHANGED
@@ -33,3 +33,19 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+assets/teaser.png filter=lfs diff=lfs merge=lfs -text
+test.mp4 filter=lfs diff=lfs merge=lfs -text
+test_10.mp4 filter=lfs diff=lfs merge=lfs -text
+test_11.mp4 filter=lfs diff=lfs merge=lfs -text
+test_6.mp4 filter=lfs diff=lfs merge=lfs -text
+third_party/DroidCalib/build/temp.linux-x86_64-3.10/.ninja_deps filter=lfs diff=lfs merge=lfs -text
+third_party/DroidCalib/build/temp.linux-x86_64-3.10/src/droid.o filter=lfs diff=lfs merge=lfs -text
+third_party/DroidCalib/misc/droidcalib.png filter=lfs diff=lfs merge=lfs -text
+third_party/DroidCalib/thirdparty/lietorch/examples/registration/assets/image1.png filter=lfs diff=lfs merge=lfs -text
+third_party/DroidCalib/thirdparty/lietorch/examples/registration/assets/image2.png filter=lfs diff=lfs merge=lfs -text
+third_party/DroidCalib/thirdparty/lietorch/examples/registration/assets/image3.png filter=lfs diff=lfs merge=lfs -text
+third_party/DroidCalib/thirdparty/lietorch/examples/registration/assets/image4.png filter=lfs diff=lfs merge=lfs -text
+third_party/DroidCalib/thirdparty/lietorch/examples/registration/assets/registration.gif filter=lfs diff=lfs merge=lfs -text
+third_party/DroidCalib/thirdparty/lietorch/examples/rgbdslam/assets/floor.png filter=lfs diff=lfs merge=lfs -text
+third_party/DroidCalib/thirdparty/lietorch/examples/rgbdslam/assets/room.png filter=lfs diff=lfs merge=lfs -text
+third_party/DroidCalib/thirdparty/lietorch/lietorch.png filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,5 @@
+outputs/*
+dataset-generator/
+out/
+__pycache__/
+*.pyc
.gitmodules ADDED
@@ -0,0 +1,6 @@
+[submodule "third-party/DROID-SLAM/thirdparty/eigen"]
+	path = third-party/DROID-SLAM/thirdparty/eigen
+	url = https://gitlab.com/libeigen/eigen.git
+[submodule "third_party/GVHMR"]
+	path = third_party/GVHMR
+	url = git@github.com:zju3dv/GVHMR.git
LICENSE ADDED
@@ -0,0 +1,36 @@
+NVIDIA License
+
+1. Definitions
+
+“Licensor” means any person or entity that distributes its Work.
+“Work” means (a) the original work of authorship made available under this license, which may include software, documentation, or other files, and (b) any additions to or derivative works thereof that are made available under this license.
+The terms “reproduce,” “reproduction,” “derivative works,” and “distribution” have the meaning as provided under U.S. copyright law; provided, however, that for the purposes of this license, derivative works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work.
+Works are “made available” under this license by including in or with the Work either (a) a copyright notice referencing the applicability of this license to the Work, or (b) a copy of this license.
+
+2. License Grant
+
+2.1 Copyright Grant. Subject to the terms and conditions of this license, each Licensor grants to you a perpetual, worldwide, non-exclusive, royalty-free, copyright license to use, reproduce, prepare derivative works of, publicly display, publicly perform, sublicense and distribute its Work and any resulting derivative works in any form.
+
+3. Limitations
+
+3.1 Redistribution. You may reproduce or distribute the Work only if (a) you do so under this license, (b) you include a complete copy of this license with your distribution, and (c) you retain without modification any copyright, patent, trademark, or attribution notices that are present in the Work.
+
+3.2 Derivative Works. You may specify that additional or different terms apply to the use, reproduction, and distribution of your derivative works of the Work (“Your Terms”) only if (a) Your Terms provide that the use limitation in Section 3.3 applies to your derivative works, and (b) you identify the specific derivative works that are subject to Your Terms. Notwithstanding Your Terms, this license (including the redistribution requirements in Section 3.1) will continue to apply to the Work itself.
+
+3.3 Use Limitation. The Work and any derivative works thereof only may be used or intended for use non-commercially. Notwithstanding the foregoing, NVIDIA Corporation and its affiliates may use the Work and any derivative works commercially. As used herein, “non-commercially” means for non-commercial academic purposes only.
+
+3.4 Patent Claims. If you bring or threaten to bring a patent claim against any Licensor (including any claim, cross-claim or counterclaim in a lawsuit) to enforce any patents that you allege are infringed by any Work, then your rights under this license from such Licensor (including the grant in Section 2.1) will terminate immediately.
+
+3.5 Trademarks. This license does not grant any rights to use any Licensor’s or its affiliates’ names, logos, or trademarks, except as necessary to reproduce the notices described in this license.
+
+3.6 Termination. If you violate any term of this license, then your rights under this license (including the grant in Section 2.1) will terminate immediately.
+
+4. Disclaimer of Warranty.
+
+THE WORK IS PROVIDED “AS IS” WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WARRANTIES OR CONDITIONS OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE OR NON-INFRINGEMENT. YOU BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER THIS LICENSE.
+
+5. Limitation of Liability.
+
+EXCEPT AS PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE SHALL ANY LICENSOR BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF OR RELATED TO THIS LICENSE, THE USE OR INABILITY TO USE THE WORK (INCLUDING BUT NOT LIMITED TO LOSS OF GOODWILL, BUSINESS INTERRUPTION, LOST PROFITS OR DATA, COMPUTER FAILURE OR MALFUNCTION, OR ANY OTHER DAMAGES OR LOSSES), EVEN IF THE LICENSOR HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
+
README.md ADDED
@@ -0,0 +1,75 @@
+<p align="center">
+  <h1 align="center">GEM: A Generalist Model for Human Motion</h1>
+  <p align="center">
+    <a href="https://jeffli.site/"><strong>Jiefeng Li</strong></a>
+    ·
+    <a href="https://www.jinkuncao.com/"><strong>Jinkun Cao</strong></a>
+    ·
+    <a href="https://cs.stanford.edu/~haotianz/"><strong>Haotian Zhang</strong></a>
+    ·
+    <a href="https://davrempe.github.io/"><strong>Davis Rempe</strong></a>
+    ·
+    <a href="https://jankautz.com/"><strong>Jan Kautz</strong></a>
+    ·
+    <a href="https://www.umariqbal.info/"><strong>Umar Iqbal</strong></a>
+    ·
+    <a href="https://ye-yuan.com/"><strong>Ye Yuan</strong></a>
+  </p>
+  <h2 align="center">ICCV 2025 (Highlight)</h2>
+  <div align="center">
+    <img src="./assets/teaser.png" alt="Logo" width="100%">
+  </div>
+</p>
+<p align="center">
+  <a href="https://research.nvidia.com/labs/dair/gem/"><img src="https://img.shields.io/badge/Project-Page-0099cc"></a>
+  <a href="https://arxiv.org/abs/2505.01425"><img src="https://img.shields.io/badge/arXiv-2505.01425-b31b1b.svg"></a>
+</p>
+
+**GEM** is a generalist model for human motion that handles multiple tasks with a single model, supporting diverse conditioning signals including video, keypoints, text, audio, and 3D keyframes.
+
+---
+
+## 📰 News
+- **[December 2025]** 📢 GENMO has been renamed to **GEM**.
+- **[October 2025]** 📢 The **GEM** codebase is **released!**
+  Stay tuned for the pretrained models and evaluation scripts.
+  Follow the [project page](https://research.nvidia.com/labs/dair/gem/) for updates and announcements.
+
+---
+
+## 🚀 Highlights
+
+GEM introduces a **unified generative framework** that connects motion estimation and generation through shared objectives.
+
+- **Unified framework:** Reframes motion estimation as *constrained generation*, allowing a single model to perform both tasks.
+- **Regression × Diffusion synergy:** Combines the accuracy of regression models with the diversity of diffusion-based generation.
+- **Estimation-guided training:** Trains effectively on in-the-wild datasets using only 2D or textual supervision.
+- **Multimodal conditioning:** Supports video, text, audio, 2D/3D keyframes, or even time-varying mixed inputs (e.g., video → text → video).
+- **Arbitrary-length motion:** Generates continuous, coherent sequences of any duration in one diffusion pass.
+- **State-of-the-art performance:** Achieves leading results on diverse motion estimation and generation benchmarks.
+
+For more details, visit the **[GEM project page →](https://research.nvidia.com/labs/dair/gem/)**
+
+---
+
+### Pretrained Models
+You can download pretrained models from [Google Drive](https://drive.google.com/file/d/1b1E84G7S0h2n5o0RmrcmKOhRKukOjgsJ/view?usp=sharing).
+
+## 📖 Paper & Citation
+
+**Paper:**
+[GENMO: A GENeralist Model for Human MOtion](https://arxiv.org/abs/2505.01425)
+*Jiefeng Li, Jinkun Cao, Haotian Zhang, Davis Rempe, Jan Kautz, Umar Iqbal, Ye Yuan*
+ICCV, 2025
+
+**BibTeX:**
+```bibtex
+@inproceedings{genmo2025,
+  title     = {GENMO: A GENeralist Model for Human MOtion},
+  author    = {Li, Jiefeng and Cao, Jinkun and Zhang, Haotian and Rempe, Davis and Kautz, Jan and Iqbal, Umar and Yuan, Ye},
+  booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)},
+  year      = {2025}
+}
+```
SyntheticRecorder.cs ADDED
@@ -0,0 +1,667 @@
+using UnityEngine;
+using System;
+using System.IO;
+using System.Collections;
+using System.Collections.Generic;
+using Newtonsoft.Json;
+using UnityEngine.SceneManagement;
+using System.Diagnostics; // Required for FFmpeg Process
+
+public class SyntheticRecorder : MonoBehaviour
+{
+    // --- CONFIGURATION CLASSES ---
+    [System.Serializable]
+    public class AvatarConfig
+    {
+        public string avatarName = "Avatar";
+        public GameObject avatarObject;
+        [Header("Animation")]
+        public Animator animator;
+        [Header("Retargeting Link")]
+        public HybridPoseCopier retargeter;
+        public List<GameObject> extraMeshes = new List<GameObject>();
+        public float specificPadding = 40f;
+        [Header("Keypoint Markers (COCO-17 order)")]
+        public List<Transform> customMarkers = new List<Transform>();
+    }
+
+    // --- JSON STRUCTURES ---
+    public class SequenceData { public List<FrameData> frames; }
+    public class FrameData { public int i; public float[] p, t, b; public int s; }
+
+    public class OutputMeta
+    {
+        public int frame_index;
+        public string image_path;
+        public string avatar_name;
+        public int face_id;
+        public int left_hand_id;
+        public int right_hand_id;
+        public float[] bbox;
+        public float[] kpts_2d;
+        public int[] kpts_vis;
+        public float[] bbox_clip;
+        public float[] cam_intrinsics;
+        public float[] cam_pos_world;
+        public float[] cam_rot_world;
+        public float[] pelvis_pos_world;
+        public float[] pelvis_rot_world;
+        public float[] smpl_incam_quat;
+        public float[] smpl_incam_transl;
+        public float[] smpl_root_incam_transl;
+        public float smpl_root_world_scale;
+        public float[] kpts_3d_world;
+        public float[] smplx_pose;
+        public float[] smplx_betas;
+    }
+
+    [Header("Settings")]
+    public string inputFolderPath = "Assets/StreamingAssets";
+    public string outputFolder = "C:/Temp/SyntheticDataset";
+    public bool startRecordingOnPlay = true;
+    public bool showDebugUI = true;
+    [Tooltip("If true, saves depth_xxxxx.png files to check what the occlusion camera sees.")]
+    public bool saveDebugDepthImages = true;
+
+    [Header("Video Settings")]
+    public string ffmpegPath = "ffmpeg";
+    public int frameRate = 30;
+
+    [Header("Compression (Twitch VOD Simulation)")]
+    [Tooltip("Target Bitrate in kbps. 6000 is High Quality 1080p. 2500 is messy 720p.")]
+    public int targetBitrateKbps = 2500;
+    [Tooltip("GOP (Group of Pictures) size in seconds. Twitch uses 2 seconds.")]
+    public float gopSizeSeconds = 2.0f;
+
+    [Header("Parallel Processing")]
+    public int workerId = 0;
+    public int totalWorkers = 1;
+
+    [Header("Sequence Naming")]
+    public string sequenceName = "";
+    private string _currentInputJsonPath = "";
+
+    [Header("Occlusion Settings")]
+    public float occlusionBias = 0.02f;
+
+    [Header("References")]
+    public GameObject characterRoot;
+    public Camera vtuberCamera;
+    public SyntheticCameraDriver cameraDriver;
+
+    [Header("Randomization")]
+    public List<AvatarConfig> avatarList = new List<AvatarConfig>();
+    public List<string> worldSceneNames = new List<string>();
+    public LoadSceneMode worldSceneLoadMode = LoadSceneMode.Additive;
+    public bool setLoadedWorldSceneActive = true;
+    public string worldMainCameraName = "Main Camera";
+    public string spawnPointToken = "SpawnPoint";
+
+    [Header("Animation Indices")]
+    public int faceMaxId = 5;
+    public int handsMaxId = 5;
+    public int minSwitchFrames = 30;
+    public int maxSwitchFrames = 120;
+    public string paramFaceIndex = "FaceIndex";
+    public string paramLeftHandIndex = "LeftHandIndex";
+    public string paramRightHandIndex = "RightHandIndex";
+
+    [Header("BBOX Accuracy")]
+    public bool useBakedSkinnedMeshForBbox = true;
+    public int bakedVertexStride = 8;
+
+    [Header("Calibration")]
+    public float movementScale = 1.0f;
+    public Vector3 translationOffset = new Vector3(0, 0.05f, 0);
+    public Vector3 globalCoordinateCorrection = new Vector3(-90, 180, 0);
+
+    // --- PRIVATE STATE ---
+    private float _activePadding = 40f;
+    private SequenceData _data;
+    private Transform[] _bones;
+    private Transform _pelvisBone;
+    private List<Transform> _activeMarkers = new List<Transform>();
+    private HybridPoseCopier _activeRetargeter;
+    private Transform _activeAvatarRoot = null;
+    private Animator _activeAnimator = null;
+    private string _activeAvatarName = "";
+    private readonly List<Renderer> _activeBboxRenderers = new List<Renderer>();
+    private Mesh _bakeMesh;
+    private readonly List<Vector3> _bakedVerts = new List<Vector3>(8192);
+    private Texture2D _greenTex, _redTex, _occTex;
+    private Rect _cachedBbox = new Rect(0, 0, 0, 0);
+    private bool _cachedHasBbox = false;
+    private int[] _cachedMarkerVis = null;
+    private string _currentlyLoadedWorldScene = "";
+
+    private Shader _autoDepthShader;
+    private const int JOINT_COUNT = 22;
+    private static readonly string[] BONE_NAMES = {
+        "pelvis", "left_hip", "right_hip", "spine1", "left_knee", "right_knee", "spine2",
+        "left_ankle", "right_ankle", "spine3", "left_foot", "right_foot", "neck", "left_collar",
+        "right_collar", "head", "left_shoulder", "right_shoulder", "left_elbow", "right_elbow",
+        "left_wrist", "right_wrist"
+    };
+
+    void Start()
+    {
+        Screen.SetResolution(1280, 720, FullScreenMode.Windowed);
+
+        EnsureDepthShaderExists();
+        _autoDepthShader = Shader.Find("Custom/AutoLinearDepth");
+        if (!_autoDepthShader) UnityEngine.Debug.LogError("Could not load the auto-generated depth shader!");
+
+        _greenTex = new Texture2D(1, 1); _greenTex.SetPixel(0, 0, Color.green); _greenTex.Apply();
+        _redTex = new Texture2D(1, 1); _redTex.SetPixel(0, 0, Color.red); _redTex.Apply();
+        _occTex = new Texture2D(1, 1); _occTex.SetPixel(0, 0, new Color(1, 0, 0, 0.5f)); _occTex.Apply();
+
+        _bakeMesh = new Mesh();
+        _bakeMesh.MarkDynamic();
+
+        if (startRecordingOnPlay)
+            StartCoroutine(ProcessBatch());
+    }
+
+    private void EnsureDepthShaderExists()
+    {
+        string path = "Assets/SyntheticDepth.shader";
+        if (File.Exists(path)) return;
+
+        string shaderCode = @"
+Shader ""Custom/AutoLinearDepth""
+{
+    SubShader
+    {
+        Tags { ""RenderType""="""" ""Queue""=""Geometry"" ""ForceNoShadowCasting""=""True"" }
+        Cull Off
+        ZWrite On
+        ZTest LEqual
+        Pass
+        {
+            CGPROGRAM
+            #pragma vertex vert
+            #pragma fragment frag
+            #include ""UnityCG.cginc""
+            struct appdata { float4 vertex : POSITION; };
+            struct v2f { float4 pos : SV_POSITION; float depth : TEXCOORD0; };
+            v2f vert (appdata v) { v2f o; o.pos = UnityObjectToClipPos(v.vertex); o.depth = -UnityObjectToViewPos(v.vertex).z; return o; }
+            float4 frag (v2f i) : SV_Target { return float4(i.depth, 0, 0, 1); }
+            ENDCG
+        }
+    }
+}";
+        File.WriteAllText(path, shaderCode);
+#if UNITY_EDITOR
+        UnityEditor.AssetDatabase.Refresh();
+#endif
+        UnityEngine.Debug.Log("Created Aggressive AutoLinearDepth shader at " + path);
+    }
+
+    private IEnumerator ProcessBatch()
+    {
+        string fullInputPath = Path.IsPathRooted(inputFolderPath) ? inputFolderPath : Path.Combine(Application.dataPath, "..", inputFolderPath);
+        if (!Directory.Exists(fullInputPath)) { UnityEngine.Debug.LogError("Input folder missing"); yield break; }
+
+        string[] allFiles = Directory.GetFiles(fullInputPath, "*.json");
+        Array.Sort(allFiles);
+
+        List<string> myFiles = new List<string>();
+        int safeTotalWorkers = Mathf.Max(1, totalWorkers);
+
+        for (int i = 0; i < allFiles.Length; i++)
+            if (i % safeTotalWorkers == workerId) myFiles.Add(allFiles[i]);
+
+        foreach (string file in myFiles)
+        {
+            _currentInputJsonPath = file;
+            sequenceName = Path.GetFileNameWithoutExtension(file);
+
+            Resources.UnloadUnusedAssets();
+            System.GC.Collect();
+
+            yield return StartCoroutine(LoadRandomWorldRoutine());
+
+            RandomizeAvatarAndGatherRenderers();
+            FindAndCacheBones();
+            ApplyRandomSpawnPoint(SceneManager.GetActiveScene());
+
+            yield return StartCoroutine(RecordSingleSequence());
+        }
+
+#if UNITY_EDITOR
+        UnityEditor.EditorApplication.isPlaying = false;
+#else
+        Application.Quit();
+#endif
+    }
+
+    private IEnumerator LoadRandomWorldRoutine()
+    {
+        if (worldSceneNames == null || worldSceneNames.Count == 0) yield break;
+
+        if (!string.IsNullOrEmpty(_currentlyLoadedWorldScene) && worldSceneLoadMode == LoadSceneMode.Additive)
+        {
+            AsyncOperation unloadOp = SceneManager.UnloadSceneAsync(_currentlyLoadedWorldScene);
+            while (unloadOp != null && !unloadOp.isDone) yield return null;
+        }
+
+        string chosen = worldSceneNames[UnityEngine.Random.Range(0, worldSceneNames.Count)].Trim();
+        _currentlyLoadedWorldScene = chosen;
+
+        AsyncOperation loadOp = SceneManager.LoadSceneAsync(chosen, worldSceneLoadMode);
+        while (!loadOp.isDone) yield return null;
+        yield return null;
+
+        Scene loaded = SceneManager.GetSceneByName(chosen);
+        if (!loaded.IsValid()) loaded = SceneManager.GetSceneByPath(chosen);
+
+        if (loaded.IsValid() && loaded.isLoaded)
+        {
+            if (setLoadedWorldSceneActive) SceneManager.SetActiveScene(loaded);
+            BindToWorldMainCameraOrLog(loaded);
+        }
+    }
+
+    IEnumerator RecordSingleSequence()
+    {
+        _data = JsonConvert.DeserializeObject<SequenceData>(File.ReadAllText(_currentInputJsonPath));
+
+        if (!Directory.Exists(outputFolder)) Directory.CreateDirectory(outputFolder);
+        string seqImageDir = Path.Combine(outputFolder, "images", sequenceName);
+        if (!Directory.Exists(seqImageDir)) Directory.CreateDirectory(seqImageDir);
+
+        // --- FFmpeg Setup for VOD SIMULATION ---
+        string videoPath = Path.Combine(outputFolder, $"video_{sequenceName}.mp4").Replace("\\", "/");
+
+        // VOD SIMULATION LOGIC:
+        // 1. -b:v {bitrate}k    -> Forces the encoder to target a specific bandwidth
+        // 2. -maxrate {bitrate}k -> Prevents it from spiking quality during high motion (causes artifacts)
+        // 3. -bufsize {bitrate*2}k -> Standard buffer size for streaming
+        // 4. -g {gop}           -> Sets Keyframe Interval. Twitch uses 2 seconds fixed.
+        // 5. -preset ultrafast  -> Keeps Unity realtime, but relies on bitrate starvation to cause the artifacts
+
+        int gopFrames = Mathf.RoundToInt(frameRate * gopSizeSeconds);
+
+        string ffmpegArgs = $"-y -f rawvideo -vcodec rawvideo -pix_fmt rgb24 " +
+                            $"-s {Screen.width}x{Screen.height} -r {frameRate} -i - " +
+                            $"-vf vflip " +
+                            $"-c:v libx264 " +
+                            $"-pix_fmt yuv420p " +
+                            $"-preset ultrafast " +
+                            $"-b:v {targetBitrateKbps}k -maxrate {targetBitrateKbps}k -bufsize {targetBitrateKbps * 2}k " +
+                            $"-g {gopFrames} " +
+                            $"\"{videoPath}\"";
+
+        ProcessStartInfo psi = new ProcessStartInfo
+        {
+            FileName = ffmpegPath,
+            Arguments = ffmpegArgs,
+            UseShellExecute = false,
+            RedirectStandardInput = true,
+            CreateNoWindow = true
+        };
+
+        Process ffmpegProcess = null;
+        try
+        {
+            ffmpegProcess = Process.Start(psi);
+        }
+        catch (Exception e)
+        {
+            UnityEngine.Debug.LogError($"Failed to start FFmpeg. Is it in PATH? Error: {e.Message}");
+            yield break;
+        }
+
+        Texture2D screenTex = new Texture2D(Screen.width, Screen.height, TextureFormat.RGB24, false);
+        RenderTexture depthRT = new RenderTexture(Screen.width, Screen.height, 24, RenderTextureFormat.RFloat);
+        Texture2D depthReadTex = new Texture2D(Screen.width, Screen.height, TextureFormat.RFloat, false);
+
+        string jsonlPath = Path.Combine(outputFolder, $"sequence_{sequenceName}.jsonl");
+
+        int framesUntilSwitch = 0;
+        int currentFaceId = 0, currentLeftHandId = 0, currentRightHandId = 0;
+
+        using (var sw = new StreamWriter(jsonlPath, false))
+        {
+            for (int i = 0; i < _data.frames.Count; i++)
+            {
+                if (framesUntilSwitch <= 0)
+                {
+                    framesUntilSwitch = UnityEngine.Random.Range(minSwitchFrames, maxSwitchFrames + 1);
+                    currentFaceId = UnityEngine.Random.Range(0, faceMaxId + 1);
+                    currentLeftHandId = UnityEngine.Random.Range(0, handsMaxId + 1);
+                    currentRightHandId = UnityEngine.Random.Range(0, handsMaxId + 1);
+                    if (_activeAnimator != null)
+                    {
+                        _activeAnimator.SetInteger(paramFaceIndex, currentFaceId);
+                        _activeAnimator.SetInteger(paramLeftHandIndex, currentLeftHandId);
+                        _activeAnimator.SetInteger(paramRightHandIndex, currentRightHandId);
+                    }
+                }
+                framesUntilSwitch--;
+
+                ApplyFrame(_data.frames[i]);
+                if (cameraDriver != null) cameraDriver.OnFrame(i);
+                if (_activeRetargeter != null) _activeRetargeter.ManualUpdatePose();
+
+                Physics.SyncTransforms();
+
+                vtuberCamera.clearFlags = CameraClearFlags.SolidColor;
+                vtuberCamera.backgroundColor = Color.black;
+                vtuberCamera.cullingMask = ~0;
+
+                yield return new WaitForEndOfFrame();
+
+                if (screenTex.width != Screen.width || screenTex.height != Screen.height)
+                    screenTex.Reinitialize(Screen.width, Screen.height);
+
+                screenTex.ReadPixels(new Rect(0, 0, Screen.width, Screen.height), 0, 0);
+                screenTex.Apply();
+
+                byte[] rawFrame = screenTex.GetRawTextureData();
+                if (ffmpegProcess != null && !ffmpegProcess.HasExited)
+                {
+                    try {
+                        ffmpegProcess.StandardInput.BaseStream.Write(rawFrame, 0, rawFrame.Length);
+                        ffmpegProcess.StandardInput.BaseStream.Flush();
+                    } catch (Exception ex) {
+                        UnityEngine.Debug.LogError("FFmpeg write error: " + ex.Message);
+                    }
+                }
+
+                RenderTexture origRT = vtuberCamera.targetTexture;
+                CameraClearFlags origFlags = vtuberCamera.clearFlags;
+                Color origBG = vtuberCamera.backgroundColor;
+
+                vtuberCamera.targetTexture = depthRT;
+                vtuberCamera.clearFlags = CameraClearFlags.SolidColor;
+                vtuberCamera.backgroundColor = new Color(1000f, 0, 0, 0);
+
+                if (_autoDepthShader != null) vtuberCamera.RenderWithShader(_autoDepthShader, "");
+                else vtuberCamera.Render();
+
+                RenderTexture.active = depthRT;
+                if (depthReadTex.width != Screen.width || depthReadTex.height != Screen.height)
+                    depthReadTex.Reinitialize(Screen.width, Screen.height);
+                depthReadTex.ReadPixels(new Rect(0, 0, Screen.width, Screen.height), 0, 0);
+                depthReadTex.Apply();
+
+                if (saveDebugDepthImages)
+                {
+                    Texture2D visualDepth = new Texture2D(Screen.width, Screen.height, TextureFormat.RGB24, false);
+                    Color[] rawPixels = depthReadTex.GetPixels();
+                    Color[] visPixels = new Color[rawPixels.Length];
+                    float displayRange = 3.0f;
+                    for (int k = 0; k < rawPixels.Length; k++)
+                    {
+                        float d = rawPixels[k].r;
+                        if (d > 999f) visPixels[k] = Color.white;
+                        else
+                        {
+                            float norm = Mathf.Clamp01(d / displayRange);
+                            visPixels[k] = new Color(norm, norm, norm);
+                        }
+                    }
+                    visualDepth.SetPixels(visPixels);
+                    visualDepth.Apply();
+                    string depthFile = $"depth_{i:D5}.png";
+                    File.WriteAllBytes(Path.Combine(seqImageDir, depthFile), visualDepth.EncodeToPNG());
+                    Destroy(visualDepth);
+                }
+
+                vtuberCamera.targetTexture = origRT;
+                vtuberCamera.clearFlags = origFlags;
+                vtuberCamera.backgroundColor = origBG;
+                RenderTexture.active = null;
+
+                ComputeBoundingBoxCached();
+
+                float H = Screen.height; float W = Screen.width;
+                Vector3 camP0_W = vtuberCamera.transform.TransformPoint(new Vector3(0f, 0f, 1f));
+                Vector3 camPx_W = vtuberCamera.transform.TransformPoint(new Vector3(1f, 0f, 1f));
+                Vector3 camPyDown_W = vtuberCamera.transform.TransformPoint(new Vector3(0f, -1f, 1f));
+                Vector3 s0 = vtuberCamera.WorldToScreenPoint(camP0_W);
+                Vector3 sx = vtuberCamera.WorldToScreenPoint(camPx_W);
+                Vector3 sy = vtuberCamera.WorldToScreenPoint(camPyDown_W);
+                float cx = s0.x; float cy = H - s0.y;
+                float fx = sx.x - s0.x; float fy = (H - sy.y) - cy;
+
+                Rect rFull = _cachedHasBbox ? _cachedBbox : new Rect(0, 0, 0, 0);
+                float bbox_x = rFull.x; float bbox_y = H - (rFull.y + rFull.height);
+                float bbox_w = rFull.width; float bbox_h = rFull.height;
+                float clip_x0 = Mathf.Clamp(bbox_x, 0, W);
+                float clip_y0 = Mathf.Clamp(bbox_y, 0, H);
+                float clip_w = Mathf.Max(0, Mathf.Clamp(bbox_x + bbox_w, 0, W) - clip_x0);
+                float clip_h = Mathf.Max(0, Mathf.Clamp(bbox_y + bbox_h, 0, H) - clip_y0);
+
+                var kpts2D = new List<float>();
+                var kptsVis = new List<int>();
+                var kpts3D = new List<float>();
+
+                if (_activeMarkers != null)
+                {
+                    for (int mi = 0; mi < _activeMarkers.Count; mi++)
+                    {
+                        Transform t = _activeMarkers[mi];
+                        if (t == null) { continue; }
+
+                        Vector3 wPos = t.position;
+                        kpts3D.Add(wPos.x); kpts3D.Add(wPos.y); kpts3D.Add(wPos.z);
+
+                        Vector3 sPos = vtuberCamera.WorldToScreenPoint(wPos);
+                        float x_px = sPos.x;
+                        float y_px = H - sPos.y;
+                        kpts2D.Add(x_px); kpts2D.Add(y_px);
+
+                        int vis = 0;
+                        if (sPos.z > 0 && x_px >= 0 && x_px < W && sPos.y >= 0 && sPos.y < H)
+                        {
+                            int checkRadius = 2;
+                            float requiredVisibilityRatio = 0.5f;
+                            int totalSamples = 0;
+                            int visibleSamples = 0;
+                            float markerDistance = sPos.z;
+
+                            for (int ox = -checkRadius; ox <= checkRadius; ox++)
+                            {
+                                for (int oy = -checkRadius; oy <= checkRadius; oy++)
+                                {
+                                    int px = (int)sPos.x + ox;
+                                    int py = (int)sPos.y + oy;
+                                    if (px >= 0 && px < W && py >= 0 && py < H)
+                                    {
+                                        totalSamples++;
+                                        float pixelDepth = depthReadTex.GetPixel(px, py).r;
+                                        if (pixelDepth >= (markerDistance - occlusionBias))
+                                            visibleSamples++;
+                                    }
+                                }
+                            }
+                            if (totalSamples > 0)
+                            {
+                                float visibilityPct = (float)visibleSamples / totalSamples;
+                                vis = (visibilityPct >= requiredVisibilityRatio) ? 2 : 1;
+                            }
+                            else vis = 1;
+                        }
+                        kptsVis.Add(vis);
+                    }
+                }
+
+                _cachedMarkerVis = kptsVis.ToArray();
+                Transform pelvis = (_bones != null && _bones.Length > 0) ? _bones[0] : null;
+                Quaternion correction = Quaternion.Euler(globalCoordinateCorrection);
+                Quaternion pelvisWorld = (pelvis != null) ? pelvis.rotation : Quaternion.identity;
+                Vector3 pelvisPos = (pelvis != null) ? pelvis.position : Vector3.zero;
+                Quaternion incamQ = Quaternion.Inverse(vtuberCamera.transform.rotation) * (Quaternion.Inverse(correction) * pelvisWorld);
+                Vector3 incamPos = vtuberCamera.transform.InverseTransformPoint(pelvisPos);
+                Vector3 rootIncamPos = (characterRoot != null) ? vtuberCamera.transform.InverseTransformPoint(characterRoot.transform.position) : Vector3.zero;
+
+                var meta = new OutputMeta
+                {
+                    frame_index = i,
+                    image_path = videoPath,
+                    avatar_name = _activeAvatarName,
+                    face_id = currentFaceId,
+                    left_hand_id = currentLeftHandId,
+                    right_hand_id = currentRightHandId,
+                    bbox = new float[] { bbox_x, bbox_y, bbox_w, bbox_h },
+                    bbox_clip = new float[] { clip_x0, clip_y0, clip_w, clip_h },
+                    kpts_2d = kpts2D.ToArray(), kpts_vis = kptsVis.ToArray(),
+                    cam_intrinsics = new float[] { fx, fy, cx, cy },
+                    cam_pos_world = new float[] { vtuberCamera.transform.position.x, vtuberCamera.transform.position.y, vtuberCamera.transform.position.z },
+                    cam_rot_world = new float[] { vtuberCamera.transform.rotation.x, vtuberCamera.transform.rotation.y, vtuberCamera.transform.rotation.z, vtuberCamera.transform.rotation.w },
+                    pelvis_pos_world = new float[] { pelvisPos.x, pelvisPos.y, pelvisPos.z },
+                    pelvis_rot_world = new float[] { pelvisWorld.x, pelvisWorld.y, pelvisWorld.z, pelvisWorld.w },
+                    smpl_incam_quat = new float[] { incamQ.x, incamQ.y, incamQ.z, incamQ.w },
+                    smpl_incam_transl = new float[] { incamPos.x, incamPos.y, incamPos.z },
+                    smpl_root_incam_transl = new float[] { rootIncamPos.x, rootIncamPos.y, rootIncamPos.z },
+                    smpl_root_world_scale = (characterRoot != null) ? characterRoot.transform.lossyScale.x : 1f,
+                    kpts_3d_world = kpts3D.ToArray(),
+                    smplx_pose = _data.frames[i].p, smplx_betas = _data.frames[i].b
+                };
+                sw.WriteLine(JsonConvert.SerializeObject(meta));
+            }
+        }
+
+        if (ffmpegProcess != null && !ffmpegProcess.HasExited)
+        {
+            ffmpegProcess.StandardInput.Close();
+            ffmpegProcess.WaitForExit();
+            ffmpegProcess.Close();
+        }
+
+        if (screenTex) Destroy(screenTex);
+        if (depthRT) Destroy(depthRT);
+        if (depthReadTex) Destroy(depthReadTex);
+    }
+
+    void ApplyFrame(FrameData f)
+    {
+        if (f.p == null || characterRoot == null) return;
+        Quaternion correction = Quaternion.Euler(globalCoordinateCorrection);
+        characterRoot.transform.localPosition = (correction * (new Vector3(-f.t[0], f.t[1], f.t[2]) * movementScale)) + translationOffset;
+        int floatIdx = 0;
+        for (int i = 0; i < JOINT_COUNT; i++)
+        {
+            if (floatIdx + 2 >= f.p.Length) break;
+            float x = f.p[floatIdx++], y = f.p[floatIdx++], z = f.p[floatIdx++];
+            float angle = Mathf.Sqrt(x * x + y * y + z * z);
+            Quaternion q = Quaternion.identity;
+            if (angle > 1e-6f) { float c = Mathf.Cos(angle * 0.5f), s = Mathf.Sin(angle * 0.5f); q = new Quaternion(-(x / angle) * s, (y / angle) * s, (z / angle) * s, -c); }
+            if (_bones != null && i < _bones.Length && _bones[i] != null) _bones[i].localRotation = (i == 0) ? (correction * q) : q;
+        }
+        if (cameraDriver != null) cameraDriver.SetStyleFromFrameData(f.s);
+    }
+
+    private void FindAndCacheBones()
+    {
+        _bones = new Transform[BONE_NAMES.Length];
+        for (int i = 0; i < BONE_NAMES.Length; i++) _bones[i] = FindDeep(characterRoot.transform, BONE_NAMES[i]);
+        _pelvisBone = (_bones != null && _bones.Length > 0) ? _bones[0] : null;
+    }
+
+    private static Transform FindDeep(Transform root, string name)
+    {
+        if (root.name == name) return root;
+        foreach (Transform child in root) { var res = FindDeep(child, name); if (res) return res; }
+        return null;
+    }
+
+    private void RandomizeAvatarAndGatherRenderers()
+    {
+        if (avatarList == null || avatarList.Count == 0) return;
+        _activeBboxRenderers.Clear(); _activeMarkers.Clear();
+        int randomIndex = UnityEngine.Random.Range(0, avatarList.Count);
+        AvatarConfig selected = avatarList[randomIndex];
+        _activeAvatarName = selected.avatarName;
+        _activeAnimator = selected.animator;
+        if (_activeAnimator == null && selected.avatarObject != null) _activeAnimator = selected.avatarObject.GetComponent<Animator>();
+        for (int i = 0; i < avatarList.Count; i++) if (avatarList[i].avatarObject != null) avatarList[i].avatarObject.SetActive(i == randomIndex);
+        _activePadding = selected.specificPadding; _activeRetargeter = selected.retargeter;
+        if (selected.avatarObject != null) _activeAvatarRoot = selected.avatarObject.transform;
+        if (selected.customMarkers != null) _activeMarkers.AddRange(selected.customMarkers);
+        if (characterRoot != null) { foreach (var r in characterRoot.GetComponentsInChildren<Renderer>(true)) _activeBboxRenderers.Add(r); }
+        foreach (GameObject extra in selected.extraMeshes) if (extra) { foreach (var cr in extra.GetComponentsInChildren<Renderer>(true)) if (!_activeBboxRenderers.Contains(cr)) _activeBboxRenderers.Add(cr); }
+    }
+
+    private void ApplyRandomSpawnPoint(Scene worldScene)
+    {
+        if (!worldScene.IsValid() || !worldScene.isLoaded) return;
+        List<Transform> spawns = new List<Transform>();
+        foreach (GameObject root in worldScene.GetRootGameObjects()) { foreach (Transform child in root.GetComponentsInChildren<Transform>(true)) if (child.name.Contains(spawnPointToken)) spawns.Add(child); }
+        if (spawns.Count > 0)
+        {
+            Transform chosen = spawns[UnityEngine.Random.Range(0, spawns.Count)];
+            this.transform.position = chosen.position; this.transform.rotation = chosen.rotation;
+            if (characterRoot != null) { characterRoot.transform.localPosition = Vector3.zero; characterRoot.transform.localRotation = Quaternion.identity; }
+        }
+    }
+
+    private void BindToWorldMainCameraOrLog(Scene worldScene)
+    {
+        Camera found = null;
+        foreach (GameObject root in worldScene.GetRootGameObjects())
+        {
+            foreach (Transform t in root.GetComponentsInChildren<Transform>(true))
+                if (t.name == worldMainCameraName && t.GetComponent<Camera>()) { found = t.GetComponent<Camera>(); break; }
+            if (found) break;
+        }
+        if (!found) return;
+        Vector3 ls = found.transform.lossyScale;
+        if (Mathf.Abs(ls.x - 1f) > 1e-4f || Mathf.Abs(ls.y - 1f) > 1e-4f || Mathf.Abs(ls.z - 1f) > 1e-4f)
+        {
+            found.transform.SetParent(null, true);
+            found.transform.localScale = Vector3.one;
+        }
+        if (vtuberCamera && vtuberCamera != found) vtuberCamera.enabled = false;
+        vtuberCamera = found;
+        cameraDriver = found.GetComponent<SyntheticCameraDriver>() ?? found.gameObject.AddComponent<SyntheticCameraDriver>();
+        cameraDriver.BindAndInit(found, characterRoot.transform);
+    }
+
+    private bool ComputeBoundingBoxCached()
+    {
+        _cachedHasBbox = false; _cachedBbox = new Rect(0, 0, 0, 0);
+        if (vtuberCamera == null || _activeBboxRenderers.Count == 0) return false;
+        float minVX = float.MaxValue, maxVX = float.MinValue, minVY = float.MaxValue, maxVY = float.MinValue;
+        bool foundAny = false;
+        int stride = Mathf.Max(1, bakedVertexStride);
+        foreach (var rend in _activeBboxRenderers)
+        {
+            if (!rend) continue;
+            if (useBakedSkinnedMeshForBbox && rend is SkinnedMeshRenderer smr)
+            {
+                _bakeMesh.Clear(); smr.BakeMesh(_bakeMesh); _bakedVerts.Clear(); _bakeMesh.GetVertices(_bakedVerts);
+                Matrix4x4 localToWorldNoScale = Matrix4x4.TRS(smr.transform.position, smr.transform.rotation, Vector3.one);
+                for (int vi = 0; vi < _bakedVerts.Count; vi += stride)
+                {
+                    Vector3 vp = vtuberCamera.WorldToViewportPoint(localToWorldNoScale.MultiplyPoint3x4(_bakedVerts[vi]));
+                    if (vp.z <= 0f) continue;
+                    foundAny = true; minVX = Math.Min(minVX, vp.x); maxVX = Math.Max(maxVX, vp.x); minVY = Math.Min(minVY, vp.y); maxVY = Math.Max(maxVY, vp.y);
+                }
+            }
+            else
+            {
+                Bounds b = rend.bounds; Vector3 c = b.center, e = b.extents;
+                Vector3[] corners = { c + new Vector3(-e.x, -e.y, -e.z), c + new Vector3(-e.x, -e.y, e.z), c + new Vector3(-e.x, e.y, -e.z), c + new Vector3(-e.x, e.y, e.z), c + new Vector3(e.x, -e.y, -e.z), c + new Vector3(e.x, -e.y, e.z), c + new Vector3(e.x, e.y, -e.z), c + new Vector3(e.x, e.y, e.z) };
+                foreach (var corner in corners)
+                {
+                    Vector3 vp = vtuberCamera.WorldToViewportPoint(corner);
+                    if (vp.z <= 0f) continue;
+                    foundAny = true; minVX = Math.Min(minVX, vp.x); maxVX = Math.Max(maxVX, vp.x); minVY = Math.Min(minVY, vp.y); maxVY = Math.Max(maxVY, vp.y);
+                }
+            }
+        }
+        if (!foundAny) return false;
+        _cachedBbox = new Rect(minVX * Screen.width - _activePadding, minVY * Screen.height - _activePadding, (maxVX - minVX) * Screen.width + _activePadding * 2, (maxVY - minVY) * Screen.height + _activePadding * 2);
+        _cachedHasBbox = true; return true;
+    }
+
+    void OnGUI()
+    {
+        if (!showDebugUI || vtuberCamera == null) return;
+        if (_cachedHasBbox) { Rect r = _cachedBbox; float invY = Screen.height - (r.y + r.height); GUI.DrawTexture(new Rect(r.x, invY, r.width, 3), _greenTex); GUI.DrawTexture(new Rect(r.x, invY + r.height, r.width, 3), _greenTex); GUI.DrawTexture(new Rect(r.x, invY, 3, r.height), _greenTex); GUI.DrawTexture(new Rect(r.x + r.width, invY, 3, r.height), _greenTex); }
+        if (_activeMarkers != null) { for (int mi = 0; mi < _activeMarkers.Count; mi++) { if (!_activeMarkers[mi]) continue; Vector3 sc = vtuberCamera.WorldToScreenPoint(_activeMarkers[mi].position); if (sc.z > 0) { int vis = (_cachedMarkerVis != null && mi < _cachedMarkerVis.Length) ? _cachedMarkerVis[mi] : 2; GUI.DrawTexture(new Rect(sc.x - 2, Screen.height - sc.y - 2, 4, 4), vis == 1 ? _occTex : _greenTex); } } }
+    }
+}
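`RecordSingleSequence` above streams raw frames to FFmpeg and writes one JSON object per frame to `sequence_{name}.jsonl`. Below is a minimal consumer sketch for that metadata, assuming only the `OutputMeta` fields shown above; the sequence filename is illustrative, and the visibility encoding (2 = visible, 1 = occluded, 0 = behind the camera or out of frame) follows the marker loop in the recorder.

```python
import json

def load_sequence(jsonl_path):
    """Yield one OutputMeta-style dict per rendered frame."""
    with open(jsonl_path, "r") as f:
        for line in f:
            if line.strip():
                yield json.loads(line)

vis_names = {0: "out-of-view", 1: "occluded", 2: "visible"}
for meta in load_sequence("sequence_clip01.jsonl"):  # illustrative filename
    x, y, w, h = meta["bbox"]                # top-left origin, pixels (y already flipped)
    fx, fy, cx, cy = meta["cam_intrinsics"]  # pinhole intrinsics in pixels
    counts = {v: meta["kpts_vis"].count(v) for v in set(meta["kpts_vis"])}
    summary = ", ".join(f"{vis_names[v]}: {n}" for v, n in sorted(counts.items()))
    print(f"frame {meta['frame_index']:05d}  bbox=({x:.0f},{y:.0f},{w:.0f},{h:.0f})  kpts: {summary}")
```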
assets/teaser.png ADDED

Git LFS Details

  • SHA256: e79a5a30c0073bab36a6946337adda54824d0ecf7fb86af117d1b2d2c01d6775
  • Pointer size: 132 Bytes
  • Size of remote file: 2.73 MB
configs/__init__.py ADDED
@@ -0,0 +1,30 @@
+import argparse
+import os
+
+from hydra import compose, initialize_config_module
+from hydra.core.config_store import ConfigStore
+
+os.environ["HYDRA_FULL_ERROR"] = "1"
+
+MainStore = ConfigStore.instance()
+
+
+def parse_args_to_cfg():
+    """
+    Use a minimal Hydra API to parse args and return cfg.
+    This function doesn't call _run_hydra, which creates the log-file hierarchy.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--config-name", "-cn", default="train")
+    parser.add_argument(
+        "overrides",
+        nargs="*",
+        help="Any key=value arguments to override config values (use dots for nested.overrides)",
+    )
+    args = parser.parse_args()
+
+    # Cfg
+    with initialize_config_module(version_base="1.3", config_module="configs"):
+        cfg = compose(config_name=args.config_name, overrides=args.overrides)
+
+    return cfg
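A hypothetical entry-point sketch showing how `parse_args_to_cfg` is meant to be used; the script name and overrides are illustrative, not from this repo.

```python
# sketch.py -- illustrative entry point, not a repo script
from omegaconf import OmegaConf  # Hydra configs are OmegaConf objects
from configs import parse_args_to_cfg

if __name__ == "__main__":
    # e.g. invoked as:  python sketch.py -cn train exp=genmo_lg seed=0
    cfg = parse_args_to_cfg()
    # to_yaml does not resolve interpolations by default, so ${...} refs print as-is
    print(OmegaConf.to_yaml(cfg))
```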
configs/callbacks/ckpt_saver/every10000s_top100.yaml ADDED
@@ -0,0 +1,5 @@
+every10000s_top100:
+  _target_: genmo.callbacks.simple_ckpt_saver.SimpleCkptSaver
+  output_dir: ${output_dir}/checkpoints/
+  every_n_steps: 10000
+  save_top_k: 100
configs/callbacks/lr_monitor/pl.yaml ADDED
@@ -0,0 +1,2 @@
+pl:
+  _target_: pytorch_lightning.callbacks.lr_monitor.LearningRateMonitor
configs/callbacks/metric/metric_3dpw.yaml ADDED
@@ -0,0 +1,2 @@
+metric_3dpw:
+  _target_: genmo.callbacks.metric.metric_3dpw.MetricMocap
configs/callbacks/metric/metric_3dpw_occ.yaml ADDED
@@ -0,0 +1,2 @@
+metric_3dpw_occ:
+  _target_: genmo.callbacks.metric.metric_3dpw_occ.MetricMocap
configs/callbacks/metric/metric_aistpp.yaml ADDED
@@ -0,0 +1,2 @@
+metric_aistpp:
+  _target_: genmo.callbacks.metric.metric_aistpp.MetricMusic
configs/callbacks/metric/metric_emdb1.yaml ADDED
@@ -0,0 +1,4 @@
+metric_emdb1:
+  _target_: genmo.callbacks.metric.metric_emdb.MetricMocap
+  emdb_split: 1
+  occ: false
configs/callbacks/metric/metric_emdb2.yaml ADDED
@@ -0,0 +1,4 @@
+metric_emdb2:
+  _target_: genmo.callbacks.metric.metric_emdb.MetricMocap
+  emdb_split: 2
+  occ: false
configs/callbacks/metric/metric_rich.yaml ADDED
@@ -0,0 +1,3 @@
+metric_rich:
+  _target_: genmo.callbacks.metric.metric_rich.MetricMocap
+  occ: false
configs/callbacks/metric/metric_unity.yaml ADDED
@@ -0,0 +1,4 @@
+metric_unity:
+  _target_: genmo.callbacks.metric.metric_unity.MetricUnity
+  # Disable the old scenepic HTML viz by default (use `vis/vis_unity_val` instead).
+  vis_every_n_val: 1000000000
configs/callbacks/prog_bar/prog_reporter_ed1.yaml ADDED
@@ -0,0 +1,5 @@
+prog_reporter_ed1:
+  _target_: genmo.callbacks.prog_bar.ProgressReporter
+  log_every_percent: 0.1
+  exp_name: ${exp_name}
+  data_name: ${data_name}
configs/callbacks/train_speed_timer/base.yaml ADDED
@@ -0,0 +1,3 @@
+base:
+  _target_: genmo.callbacks.train_speed_timer.TrainSpeedTimer
+  N_avg: 5
configs/callbacks/vis/vis_music.yaml ADDED
@@ -0,0 +1,2 @@
+vis_music:
+  _target_: genmo.callbacks.vis.vis_music.VisMusic
configs/callbacks/vis/vis_speech.yaml ADDED
@@ -0,0 +1,2 @@
+vis_speech:
+  _target_: genmo.callbacks.vis.vis_speech.VisSpeech
configs/callbacks/vis/vis_text.yaml ADDED
@@ -0,0 +1,2 @@
+vis_text:
+  _target_: genmo.callbacks.vis.vis_text.VisText
configs/callbacks/vis/vis_unity_val.yaml ADDED
@@ -0,0 +1,14 @@
+vis_unity_val:
+  _target_: genmo.callbacks.vis.vis_unity_val.VisUnityVal
+  enabled: false
+  every_n_epochs: 1
+  num_batches: 1
+  num_frames: 30
+  render_incam: true
+  render_global: true
+  use_gt_betas_for_pred: true
+  global_root_relative: false
+  crf: 23
+  save_dir: ${output_dir}/vis
+  pred_color: [176, 100, 244]
+  gt_color: [0, 255, 0]
configs/data/collate_cfg/default.yaml ADDED
@@ -0,0 +1,23 @@
+max_motion_frames: ${data.dataset_opts.max_motion_frames}
+default_frame_feature_dim:
+  music_array: [1024]
+  music_embed: [35]
+  music_beats: []
+  audio_array: []
+  use_det_kp: []
+
+default_seq_feature_dim:
+  text_embed: [50, 1024]
+
+default_seq_feature_length_multiplier:
+  audio_array: 600
+
+default_feature_val:
+  caption: ""
+  music_fps: 30
+  audio_fps: 30
+  has_text: False
+  # has_audio: False
+  # has_music: False
+
+default_feature_type: {}
configs/data/mocap/trainX_testY.yaml ADDED
@@ -0,0 +1,21 @@
+defaults:
+  - collate_cfg: default
+
+# definition of the lightning datamodule (dataset + dataloader)
+_target_: genmo.datamodule.mocap_trainX_testY.DataModule
+
+dataset_opts:
+  train: ${train_datasets}
+  val: ${test_datasets}
+  max_motion_frames: 120
+
+loader_opts:
+  train:
+    batch_size: 128
+    num_workers: 8
+  val:
+    batch_size: 1
+    num_workers: 1
+  encoded_music_dim: ${pipeline.args.encoded_music_dim}
+
+limit_each_trainset: null
configs/demo.yaml ADDED
@@ -0,0 +1,85 @@
+defaults:
+  # pytorch-lightning
+  - data: ???
+  - model: ???
+  - /text_encoder@model.model_cfg.text_encoder: t5_3b
+  - callbacks: null
+
+  # system
+  - hydra: default
+
+  # utility groups that change a lot
+  - pipeline: null
+  - network: null
+  - optimizer: null
+  - scheduler: null
+  - train_datasets: null
+  - test_datasets: null
+  - endecoder: null # normalize/unnormalize data
+  - refiner: null
+
+  # global-override
+  - exp: mixed # sets "data, model and callbacks" in yaml
+  - global/task: null # dump/test
+  - global/hsearch: null # hyper-param search
+  - global/debug: null # debug mode
+  - _self_
+
+# ================================ #
+#          global setting          #
+# ================================ #
+
+
+# experiment information
+task: fit # [fit, predict]
+exp_name_base: ???
+exp_name_var: ""
+exp_name: ${exp_name_base}_${exp_name_var}
+data_name: ???
+
+# utilities in the entry file
+# output_dir: "outputs/${data_name}/${exp_name}"
+resume_mode: null
+seed: 42
+
+version: null
+ckpt_dir: outputs/${data_name}/${exp_name}/
+remote_results_path: /lustre/fsw/portfolios/nvr/projects/nvr_torontoai_humanmotionfm/workspaces/motiondiff/motiondiff_results/jiefengl/gvhmr
+ckpt_path: null
+
+###
+# W&B logging removed from this repo; TensorBoard is used by `scripts/train.py`.
+rsync_ckpt: true
+
+
+# ================================ #
+#          global setting          #
+# ================================ #
+
+video_name: ???
+output_root: outputs/demo
+output_dir: "${output_root}/${text1_video_name}"
+preprocess_dir: ${output_dir}/preprocess
+video_path: "${output_dir}/0_input_video.mp4"
+
+# Options
+text1: null
+text1_file: null
+text1_video_path: null
+text1_video_name: null
+text_length: 300
+static_cam: False
+verbose: False
+
+paths:
+  bbx: ${preprocess_dir}/bbx.pt
+  bbx_xyxy_video_overlay: ${preprocess_dir}/bbx_xyxy_video_overlay.mp4
+  vit_features: ${preprocess_dir}/vit_features.pt
+  vimo_pred: ${preprocess_dir}/vimo_pred.pt
+  vitpose: ${preprocess_dir}/vitpose.pt
+  vitpose_video_overlay: ${preprocess_dir}/vitpose_video_overlay.mp4
+  hmr4d_results: ${output_dir}/hmr4d_results.pt
+  incam_video: ${output_dir}/1_incam.mp4
+  global_video: ${output_dir}/2_global.mp4
+  incam_global_horiz_video: ${output_dir}/3_incam_global_horiz.mp4
+  slam: ${preprocess_dir}/camera.npy
configs/diffusion/ddim.yaml ADDED
@@ -0,0 +1,8 @@
+sampler: ddim
+train_timestep_respacing: ""
+test_timestep_respacing: "50"
+schedule_sampler_type: uniform
+noise_schedule: cosine
+sigma_small: true
+guidance_param: 1.0
+ddim_eta: 0.0
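For context, `test_timestep_respacing: "50"` follows the guided-diffusion-style convention (an assumption about this repo): an empty string keeps the full training schedule, while "50" respaces sampling to roughly 50 evenly spaced timesteps. A simplified sketch of that selection, with a 1000-step training schedule as an assumed example:

```python
import numpy as np

def space_timesteps(num_train_steps, respacing):
    """Pick which diffusion timesteps to keep at test time (simplified sketch)."""
    if respacing == "":  # "" -> use every training timestep
        return list(range(num_train_steps))
    n = int(respacing)
    return np.linspace(0, num_train_steps - 1, n).round().astype(int).tolist()

steps = space_timesteps(1000, "50")      # 1000 is an assumed training-step count
print(len(steps), steps[:4], steps[-1])  # 50 [0, 20, 41, 61] 999
```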
configs/endecoder/v1_amass_local_bedlam_cam.yaml ADDED
@@ -0,0 +1,2 @@
+_target_: genmo.network.endecoder.EnDecoder
+stats_name: MM_V1_AMASS_LOCAL_BEDLAM_CAM
configs/exp/genmo_lg.yaml ADDED
@@ -0,0 +1,64 @@
+# @package _global_
+defaults:
+  - /diffusion@model_cfg.diffusion: ddim
+  - override /data: mocap/trainX_testY
+  - override /model: genmo
+  - override /network: diffusion
+  - override /pipeline: dual_mode
+  - override /endecoder: v1_amass_local_bedlam_cam
+  - override /optimizer: adamw_2e-4
+  - override /scheduler: epoch_half_200_350
+  - override /train_datasets:
+      - amass_train_v11
+      - humanml3d_static_train
+      - bedlam_v2
+      - h36m_v1
+      - 3dpw_v1
+      - 3dpw_occ_v1
+      - aistpp_train
+      - beat2_static_train
+  - override /test_datasets:
+      # - aistpp_test
+      - humanml3d_eval
+      - emdb1_fliptest
+      - emdb2_fliptest
+      - rich_test
+      - 3dpw_fliptest
+      - 3dpw_occ_fliptest
+  - override /callbacks:
+      - ckpt_saver/every10000s_top100
+      - prog_bar/prog_reporter_ed1
+      - train_speed_timer/base
+      - lr_monitor/pl
+      - vis/vis_text
+      - metric/metric_emdb1
+      - metric/metric_emdb2
+      - metric/metric_rich
+      - metric/metric_3dpw
+      - metric/metric_3dpw_occ
+      # - metric_aistpp
+  - _self_
+
+exp_name_base: ${hydra:runtime.choices.exp}
+exp_name_var: ""
+exp_name: ${exp_name_base}_${exp_name_var}
+data_name: genmo_mixed
+
+multicond_args: null
+
+pl_trainer:
+  precision: 16-mixed
+  log_every_n_steps: 10
+  gradient_clip_val: 0.5
+  max_epochs: null
+  check_val_every_n_epoch: null
+  val_check_interval: 3000
+  max_steps: 200000
+  devices: 1
+  strategy: ddp_find_unused_parameters_true
+
+logger:
+  _target_: pytorch_lightning.loggers.tensorboard.TensorBoardLogger
+  save_dir: ${output_dir}
+  name: ""
+  version: ""
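This preset is selected through the `exp` group. A sketch of composing it programmatically with the same Hydra API as `configs/__init__.py`, assuming the default `train` config declares the `exp` group the way `configs/demo.yaml` and `configs/infer_video.yaml` do:

```python
from hydra import compose, initialize_config_module

with initialize_config_module(version_base="1.3", config_module="configs"):
    cfg = compose(config_name="train", overrides=["exp=genmo_lg"])

# Each entry under `override /train_datasets:` merges one named dataset yaml,
# so the composed config should carry one key per listed dataset.
print(list(cfg.train_datasets.keys()))  # e.g. ['amass_train_v11', ..., 'beat2_static_train']
print(cfg.pl_trainer.max_steps)         # 200000, from this preset
```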
configs/finetune_unity.yaml ADDED
@@ -0,0 +1,95 @@
+# genmo/configs/finetune_unity.yaml
+defaults:
+  - train
+  - override /exp: genmo_lg
+  # Keep only generic callbacks; drop dataset-specific metrics/visualizers.
+  - override /callbacks:
+      - ckpt_saver/every10000s_top100
+      - prog_bar/prog_reporter_ed1
+      - train_speed_timer/base
+      - lr_monitor/pl
+      - metric/metric_unity
+      - vis/vis_unity_val
+  - _self_
+
+# Fix logging path mismatch by forcing the filename to be local to the run dir
+hydra:
+  job_logging:
+    handlers:
+      file:
+        filename: train.log
+
+# Define mandatory variables and sync output_dir with the Hydra run dir
+data_name: "unity"
+exp_name_base: "finetune"
+# Keep `output_dir` from `configs/train.yaml` to avoid a Hydra/OmegaConf interpolation cycle:
+# `hydra.run.dir` -> `${output_dir}` (configs/hydra/default.yaml) and `output_dir` -> `${hydra:run.dir}` would recurse.
+
+# For tiny Unity sets, save a checkpoint at the end of every epoch.
+callbacks:
+  ckpt_saver:
+    every10000s_top100:
+      every_n_steps: null
+      every_n_epochs: 1
+      save_top_k: 1
+  vis:
+    vis_unity_val:
+      enabled: true
+
+train_datasets:
+  unity:
+    _target_: genmo.datasets.unity_dataset.UnityDataset
+    root: "./third_party/GVHMR/processed_dataset"
+    split: "train"
+    motion_frames: 120
+  # Explicitly disable datasets inherited from `exp=genmo_lg`.
+  amass_train_v11: null
+  humanml3d_static_train: null
+  bedlam_v2: null
+  h36m_v1: null
+  3dpw_v1: null
+  3dpw_occ_v1: null
+  aistpp_train: null
+  beat2_static_train: null
+
+test_datasets:
+  unity_val:
+    _target_: genmo.datasets.unity_dataset.UnityDataset
+    root: "./third_party/GVHMR/processed_dataset"
+    split: "train"
+    motion_frames: 120
+  # Explicitly disable test datasets inherited from `exp=genmo_lg`.
+  humanml3d_eval: null
+  emdb1_fliptest: null
+  emdb2_fliptest: null
+  rich_test: null
+  3dpw_fliptest: null
+  3dpw_occ_fliptest: null
+
+# Fine-tuning hyperparameters
+solver:
+  optimizer:
+    lr: 5e-6 # VERY IMPORTANT: low LR to preserve pretrained knowledge
+
+  scheduler:
+    type: "constant" # keep it simple for fine-tuning
+
+# Lightning Trainer settings
+pl_trainer:
+  max_epochs: 5
+  check_val_every_n_epoch: 1
+  log_every_n_steps: 1
+  precision: 16-mixed # saves VRAM, faster
+  gradient_clip_val: 1.0
+  val_check_interval: 1.0
+
+# Override the default dataloader settings from `exp=genmo_lg` (it uses batch_size=128
+# and the DataModule uses `drop_last=True`, which yields 0 batches for small Unity sets).
+data:
+  loader_opts:
+    train:
+      batch_size: 1
+      num_workers: 1
+    val:
+      batch_size: 1
+      num_workers: 1
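The loader override at the bottom matters more than it looks: as the comment above notes, with `drop_last=True` any dataset smaller than the batch size produces zero training batches. A two-line check (dataset sizes are illustrative):

```python
def num_batches(dataset_len, batch_size, drop_last=True):
    # floor division when incomplete batches are dropped, ceiling otherwise
    return dataset_len // batch_size if drop_last else -(-dataset_len // batch_size)

print(num_batches(40, 128))  # 0  -> a tiny Unity set is silently skipped at batch_size=128
print(num_batches(40, 1))    # 40 -> the batch_size=1 override trains on every sample
```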
configs/hydra/default.yaml ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
+ # enable color logging
+ defaults:
+   - override hydra_logging: colorlog
+   - override job_logging: colorlog
+
+ job_logging:
+   formatters:
+     simple:
+       datefmt: "%m/%d %H:%M:%S"
+       format: "[%(asctime)s][%(levelname)s] %(message)s"
+     colorlog:
+       datefmt: "%m/%d %H:%M:%S"
+       format: "[%(cyan)s%(asctime)s%(reset)s][%(log_color)s%(levelname)s%(reset)s] %(message)s"
+   handlers:
+     file:
+       filename: ${output_dir}/${hydra.job.name}.log
+
+ run:
+   dir: ${output_dir}
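
For reference, the `simple` formatter above can be reproduced with the standard-library `logging` module; the `colorlog` variant only wraps the same fields in ANSI color codes:

    import logging

    handler = logging.StreamHandler()
    handler.setFormatter(logging.Formatter(
        fmt="[%(asctime)s][%(levelname)s] %(message)s",
        datefmt="%m/%d %H:%M:%S",
    ))
    log = logging.getLogger("demo")
    log.addHandler(handler)
    log.setLevel(logging.INFO)
    log.info("training started")  # -> [06/01 12:00:00][INFO] training started
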
configs/infer_video.yaml ADDED
@@ -0,0 +1,67 @@
+ defaults:
+   # pytorch-lightning / hydra wiring (kept for compatibility with `exp=...` configs)
+   - data: ???
+   - model: ???
+   - callbacks: null
+   - hydra: default
+   - pipeline: null
+   - network: null
+   - optimizer: null
+   - scheduler: null
+   - train_datasets: null
+   - test_datasets: null
+   - endecoder: null
+   - refiner: null
+
+   # pick an experiment preset (sets data/model/network/pipeline/etc.)
+   - exp: genmo_lg
+   - _self_
+
+ # Video -> SMPL-X inference (GENMO/GEM)
+ video_path: null
+ video_name: null
+
+ output_root: outputs/infer_video
+ output_dir: ${output_root}/${video_name}
+ preprocess_dir: ${output_dir}/preprocess
+
+ # Checkpoint
+ ckpt_path: null
+
+ # Inference options
+ static_cam: true
+ use_kp2d: true
+ postproc: true
+ resample_to_30fps: true
+ verbose: false
+
+ # Rendering
+ render_incam: true
+ render_global: true
+ render_side_by_side: true
+ render_crf: 23
+
+ # Optional visualization: draw estimated camera axes in the global render.
+ draw_camera_axes: false
+ # Camera-pose convention for `paths.slam` (affects the camera-axis visualization only):
+ #   - auto: choose the one closest to the person root each frame
+ #   - w2c:  interpret the trajectory as world->camera
+ #   - c2w:  interpret the trajectory as camera->world
+ camera_pose_convention: auto
+ camera_axis_length: 0.5
+ camera_axis_width: 3
+
+ paths:
+   input_video: ${output_dir}/0_input_video.mp4
+   video_30fps: ${output_dir}/0_input_video_30fps.mp4
+   bbx: ${preprocess_dir}/bbx.pt
+   vitpose: ${preprocess_dir}/vitpose.pt
+   vit_features: ${preprocess_dir}/vit_features.pt
+   hmr4d_results: ${output_dir}/hmr4d_results.pt
+   incam_video: ${output_dir}/1_incam.mp4
+   global_video: ${output_dir}/2_global.mp4
+   incam_global_horiz_video: ${output_dir}/3_incam_global_horiz.mp4
+
+ # External logging is disabled by default for this local demo script:
+ # W&B logging was removed from this repo; `scripts/train.py` uses
+ # TensorBoard for training runs.
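
Beyond the CLI, this config can be composed programmatically with Hydra's compose API. A sketch, assuming it runs from the repo root (where `configs/` lives) and that the `exp=genmo_lg` preset fills the mandatory `data`/`model` groups:

    from hydra import compose, initialize

    with initialize(config_path="configs", version_base=None):
        cfg = compose(
            config_name="infer_video",
            overrides=["video_path=test.mp4", "video_name=test"],
        )
    # Interpolations resolve on access:
    print(cfg.paths.incam_video)  # outputs/infer_video/test/1_incam.mp4
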
configs/model/genmo.yaml ADDED
@@ -0,0 +1,45 @@
+ _target_: genmo.genmo.GENMO
+
+ pipeline: ${pipeline}
+ optimizer: ${optimizer}
+ scheduler: ${scheduler}
+
+ model_cfg:
+   train_modes: ["regression", "diffusion"]
+   noisy_2d_obs: true
+   kp2d_noise_scale: 0.5
+   perframe_condition_exists: true
+   train2d_mask_invis_obs: true
+   mask_occluded_imgfeats: true
+   cond_merge_strategy: "add"
+   use_cond_exists_as_input: true
+   normalize_cam_angvel: true
+
+   diffusion:
+     test_timestep_respacing: "50"
+     guidance_param: 2.5
+
+   text_encoder:
+     load_llm: false
+     llm_version: "t5-3b"
+     max_text_len: 50
+
+   condition_mask:
+     mask_img_prob: 0.5
+     mask_cam_prob: 1.0
+     reuse_regression_mask: false
+     regression_no_img_mask: true
+
+     mask_cfg:
+       drop_prob: 0.75
+       max_num_drops: 3
+       min_drop_nframes: 1
+       max_drop_nframes: 30
+     body_mask_cfg:
+       drop_prob: 0.75
+       joint_drop_prob: 0.25
+       max_num_drops: 3
+       min_drop_nframes: 1
+       max_drop_nframes: 30
+     music_mask_prob: 0.1
+     audio_mask_prob: 0.1
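
The masking code itself is not part of this diff, so the following is only a guessed sketch of what `mask_cfg` plausibly parameterizes: with probability `drop_prob`, drop up to `max_num_drops` random temporal spans of `min_drop_nframes`..`max_drop_nframes` frames. The function name and exact semantics are assumptions:

    import numpy as np

    def sample_drop_mask(n_frames: int, drop_prob=0.75, max_num_drops=3,
                         min_drop_nframes=1, max_drop_nframes=30, rng=None):
        """Return a bool mask (True = frame kept) with random spans dropped.
        Hypothetical illustration of `mask_cfg`, not the repo's implementation."""
        rng = rng or np.random.default_rng()
        keep = np.ones(n_frames, dtype=bool)
        if rng.random() < drop_prob:  # apply dropping at all?
            for _ in range(rng.integers(1, max_num_drops + 1)):
                length = rng.integers(min_drop_nframes, max_drop_nframes + 1)
                start = rng.integers(0, max(1, n_frames - length + 1))
                keep[start:start + length] = False
        return keep

    mask = sample_drop_mask(120)  # 120 matches `motion_frames` used in these configs
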
configs/network/diffusion.yaml ADDED
@@ -0,0 +1,25 @@
+ _target_: genmo.network.genmo_diffusion.GENMODiffusion
+ args: ${pipeline.args}
+ latent_dim: ${.model_cfg.denoiser.latent_dim}
+ cond_merge_strategy: "add"
+ music_mask_prob: ${.model_cfg.denoiser.music_mask_prob}
+ speech_mask_prob: ${.model_cfg.denoiser.speech_mask_prob}
+ encoded_music_dim: ${pipeline.args.encoded_music_dim}
+ model_cfg:
+   diffusion: ${model_cfg.diffusion}
+   denoiser:
+     _target_: genmo.network.genmo_denoiser.NetworkEncoderRoPE
+     output_dim: 151
+     xt_dim: ${.output_dim}
+     njoints: ${.xt_dim}
+     text_mask_prob: 0.1
+     music_mask_prob: 0.1
+     speech_mask_prob: 0.1
+     use_text_pos_enc: true
+     text_encoder_cfg:
+       mode: all
+       cross_attn_type: mha
+     latent_dim: 1024
+     num_layers: 16
+     num_heads: 8
+     mlp_ratio: 4
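
The leading-dot interpolations above (`${.output_dim}`, `${.xt_dim}`) are OmegaConf *relative* interpolations: they resolve against the node that contains them. A standalone example, not the repo's code:

    from omegaconf import OmegaConf

    cfg = OmegaConf.create({
        "denoiser": {
            "output_dim": 151,
            "xt_dim": "${.output_dim}",  # `.` = relative to this node
            "njoints": "${.xt_dim}",     # chains through xt_dim
        }
    })
    assert cfg.denoiser.njoints == 151
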
configs/optimizer/adamw_2e-4.yaml ADDED
@@ -0,0 +1,2 @@
+ _target_: torch.optim.AdamW
+ lr: 2e-4
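
This node deliberately omits the optimizer's `params` argument; with Hydra, the usual pattern is to supply it at instantiation time. A generic sketch, not the repo's exact training code:

    import torch
    from hydra.utils import instantiate
    from omegaconf import OmegaConf

    opt_cfg = OmegaConf.create({"_target_": "torch.optim.AdamW", "lr": 2e-4})
    model = torch.nn.Linear(8, 8)
    # Extra kwargs to instantiate() are merged into the target's arguments.
    optimizer = instantiate(opt_cfg, params=model.parameters())
    print(optimizer.defaults["lr"])  # 0.0002
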
configs/pipeline/dual_mode.yaml ADDED
@@ -0,0 +1,37 @@
+ _target_: genmo.pipeline.genmo_pipeline.Pipeline
+ args_denoiser3d: ${network}
+ args:
+   endecoder_opt: ${endecoder}
+   use_regression_outputs_prob: 0.
+   use_cfg_sampler_for_gen: true
+   inpaint_x_start_gt: false
+   regression_only: true
+   encoded_music_dim: 35
+   multicond_args: ${multicond_args}
+   infer_version: 2
+   weights:
+     cr_j3d: 500.
+     transl_c: 1.
+     cr_verts: 500.
+     j2d: 1000.
+     j2d_17: 1000.
+     verts2d: 1000.
+
+     proj_gt_j2d_to_bi01: true
+
+     transl_w: 1.
+     static_conf_bce: 1.
+
+   static_conf:
+     vel_thr: 0.15
+
+   in_attr:
+     - obs
+     - f_cliffcam
+     - f_imgseq
+     - f_cam_angvel
+     - encoded_music
+     - encoded_audio
+   mask_out_attr: []  # ${.in_attr}
+   out_attr:
+     pred_cam: 3
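
The `weights` table above is a loss-weighting map. A generic (assumed, not repo-specific) consumption pattern scales each named loss term and sums:

    import torch

    weights = {"cr_j3d": 500.0, "transl_c": 1.0, "j2d": 1000.0}
    losses = {  # per-term raw losses, e.g. from a forward pass (made-up values)
        "cr_j3d": torch.tensor(0.002),
        "transl_c": torch.tensor(0.5),
        "j2d": torch.tensor(0.001),
    }
    total = sum(weights[k] * losses[k] for k in losses)
    print(float(total))  # 500*0.002 + 1*0.5 + 1000*0.001 = 2.5
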
configs/scheduler/epoch_half_200_350.yaml ADDED
@@ -0,0 +1,6 @@
+ scheduler:
+   _target_: torch.optim.lr_scheduler.MultiStepLR
+   milestones: [200, 350]
+   gamma: 0.5
+ interval: epoch
+ frequency: 1
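
In isolation, this schedule halves the learning rate at epochs 200 and 350; `interval: epoch` with `frequency: 1` tells Lightning to step it once per epoch. A self-contained check with plain PyTorch:

    import torch

    model = torch.nn.Linear(4, 4)
    opt = torch.optim.AdamW(model.parameters(), lr=2e-4)
    sched = torch.optim.lr_scheduler.MultiStepLR(opt, milestones=[200, 350], gamma=0.5)
    for epoch in range(400):
        opt.step()   # placeholder for one training epoch
        sched.step()
        if epoch in (199, 349):
            print(epoch + 1, opt.param_groups[0]["lr"])  # 200 -> 1e-4, 350 -> 5e-5
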
configs/test_datasets/3dpw_fliptest.yaml ADDED
@@ -0,0 +1,3 @@
+ 3dpw_fliptest:
+   _target_: genmo.datasets.threedpw.threedpw_motion_test.ThreedpwSmplFullSeqDataset
+   flip_test: true
configs/test_datasets/3dpw_occ_fliptest.yaml ADDED
@@ -0,0 +1,3 @@
+ 3dpw_occ_fliptest:
+   _target_: genmo.datasets.threedpw.threedpw_occ_motion_test.ThreedpwOccSmplFullSeqDataset
+   flip_test: true
configs/test_datasets/emdb1_fliptest.yaml ADDED
@@ -0,0 +1,4 @@
+ emdb1_fliptest:
+   _target_: genmo.datasets.emdb.emdb_motion_test.EmdbSmplFullSeqDataset
+   split: 1
+   flip_test: true
configs/test_datasets/emdb2_fliptest.yaml ADDED
@@ -0,0 +1,4 @@
+ emdb2_fliptest:
+   _target_: genmo.datasets.emdb.emdb_motion_test.EmdbSmplFullSeqDataset
+   split: 2
+   flip_test: true
configs/test_datasets/humanml3d_eval.yaml ADDED
@@ -0,0 +1,7 @@
+ humanml3d_eval:
+   _target_: genmo.datasets.pure_motion.humanml3d.Humanml3dDataset
+   eval_gen_only: true
+   cam_augmentation: v11
+   use_random_subset: true
+   random_subset_size: 2
+   random_subset_seed: 7
configs/test_datasets/rich_test.yaml ADDED
@@ -0,0 +1,2 @@
+ rich_test:
+   _target_: genmo.datasets.rich.rich_motion_test.RichSmplFullSeqDataset
configs/text_encoder/t5_3b.yaml ADDED
@@ -0,0 +1,3 @@
+ load_llm: true
+ llm_version: "t5-3b"
+ max_text_len: 50
configs/train.yaml ADDED
@@ -0,0 +1,55 @@
+ defaults:
+   - _self_
+   # pytorch-lightning
+   - data: ???
+   - model: ???
+   - callbacks: null
+
+   # system
+   - hydra: default
+
+   # utility groups that change a lot
+   - pipeline: null
+   - network: null
+   - optimizer: null
+   - scheduler: null
+   - train_datasets: null
+   - test_datasets: null
+   - endecoder: null  # normalize/unnormalize data
+   - refiner: null
+
+   # global overrides
+   - exp: mixed  # sets "data, model and callbacks" in yaml
+   - global/task: null  # dump/test
+   - global/hsearch: null  # hyper-parameter search
+   - global/debug: null  # debug mode
+
+ # ================================ #
+ #          global setting          #
+ # ================================ #
+ # experiment information
+ task: fit  # [fit, predict]
+ exp_name_base: ???
+ exp_name_var: ""
+ exp_name: ${exp_name_base}_${exp_name_var}
+ data_name: ???
+ num_test_data: 32
+
+ # utilities in the entry file
+ output_dir: "outputs/${data_name}/${exp_name}"
+ ckpt_path: null
+ resume_mode: null
+ seed: 42
+
+ # lightning default settings
+ pl_trainer:
+   devices: 1
+   num_sanity_val_steps: 0  # disable the sanity check
+   precision: 32
+   inference_mode: False
+
+ logger:
+   _target_: pytorch_lightning.loggers.tensorboard.TensorBoardLogger
+   save_dir: ${output_dir}
+   name: ""
+   version: ""
configs/train_datasets/3dpw_occ_v1.yaml ADDED
@@ -0,0 +1,2 @@
+ 3dpw_occ_v1:
+   _target_: genmo.datasets.threedpw.threedpw_occ_motion_train.ThreedpwOccSmplDataset
configs/train_datasets/3dpw_v1.yaml ADDED
@@ -0,0 +1,2 @@
+ 3dpw_v1:
+   _target_: genmo.datasets.threedpw.threedpw_motion_train.ThreedpwSmplDataset
configs/train_datasets/aistpp_train.yaml ADDED
@@ -0,0 +1,7 @@
+ aistpp_train:
+   _target_: genmo.datasets.aistplusplus.aistplusplus.AISTPlusPlusSmplDataset
+   split: train
+   motion_frames: 120
+   lazy_load: false
+   eval_gen_only: true
+   feat_version: v2
configs/train_datasets/amass_train_v11.yaml ADDED
@@ -0,0 +1,9 @@
+ amass_train_v11:
+   _target_: genmo.datasets.pure_motion.amass.AmassDataset
+
+   motion_frames: 120
+   l_factor: 1.5
+   skip_moyo: True
+   cam_augmentation: v11
+   random1024: False
+   limit_size: null
configs/train_datasets/beat2_static_train.yaml ADDED
@@ -0,0 +1,6 @@
+ beat2_static_train:
+   _target_: genmo.datasets.beat2.beat2.BEAT2SmplDataset
+   split: train
+   cam_augmentation: static
+   motion_frames: 120
+   lazy_load: false