NikhilMarisetty committed on
Commit eb71a72 · verified · 1 Parent(s): 8b783f1

Upload folder using huggingface_hub

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ teaser/teaser.png filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,187 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # Project-specific
+ experiments
+ data/finedance
+ generated
+ eval
+ wandb
+ assets/checkpoints/
+ assets/smpl_model/
+ assets/ffmpeg-6.0-amd64-static/
+ assets/NORMAL_new.obj
+
+ # User-generated output and uploaded music
+ output/
+ custom_music/
+ *.mp4
+ *.wav
+ *.mp3
+ *.flac
+ *.ogg
+ *.m4a
+
+ # macOS
+ .DS_Store
+
+ # VSCode
+ .vscode/
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ #pdm.lock
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+ # in version control.
+ # https://pdm.fming.dev/#use-with-ide
+ .pdm.toml
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
LICENSE ADDED
@@ -0,0 +1,52 @@
+ License
+ Software Copyright License for non-commercial scientific research purposes
+ Please read carefully the following terms and conditions and any accompanying documentation before you download and/or use the FineDance data, model and software (the "Data & Software"), including 3D meshes, images, videos, textures, software, scripts, and animations. By downloading and/or using the Data & Software (including downloading, cloning, installing, and any other use of the corresponding github repository), you acknowledge that you have read these terms and conditions, understand them, and agree to be bound by them. If you do not agree with these terms and conditions, you must not download and/or use the Data & Software. Any infringement of the terms of this agreement will automatically terminate your rights under this License.
+
+ Ownership / Licensees
+ The Software and the associated materials have been developed by
+
+ Professor Xiu Li, Tsinghua University.
+
+ Any copyright or patent right is owned by and proprietary material of
+
+ Professor Xiu Li, Tsinghua University.
+
+ hereinafter the “Licensor”.
+
+ License Grant
+ Licensor grants you (Licensee) personally a single-user, non-exclusive, non-transferable, free of charge right:
+
+ To install the Data & Software on computers owned, leased or otherwise controlled by you and/or your organization;
+ To use the Data & Software for the sole purpose of performing non-commercial scientific research, non-commercial education, or non-commercial artistic projects;
+ Any other use, in particular any use for commercial, pornographic, military, or surveillance purposes, is prohibited. This includes, without limitation, incorporation in a commercial product, use in a commercial service, or production of other artifacts for commercial purposes. The Data & Software may not be used to create fake, libelous, misleading, or defamatory content of any kind excluding analyses in peer-reviewed scientific research. The Data & Software may not be reproduced, modified and/or made available in any form to any third party without Xiu Li’s prior written permission.
+
+ The Data & Software may not be used for pornographic purposes or to generate pornographic material whether commercial or not. This license also prohibits the use of the Software to train methods/algorithms/neural networks/etc. for commercial, pornographic, military, surveillance, or defamatory use of any kind. By downloading the Data & Software, you agree not to reverse engineer it.
+
+ No Distribution
+ The Data & Software and the license herein granted shall not be copied, shared, distributed, re-sold, offered for re-sale, transferred or sub-licensed in whole or in part except that you may make one copy for archive purposes only.
+
+ Disclaimer of Representations and Warranties
+ You expressly acknowledge and agree that the Data & Software results from basic research, is provided “AS IS”, may contain errors, and that any use of the Data & Software is at your sole risk. LICENSOR MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE DATA & SOFTWARE, NEITHER EXPRESS NOR IMPLIED, AND THE ABSENCE OF ANY LEGAL OR ACTUAL DEFECTS, WHETHER DISCOVERABLE OR NOT. Specifically, and not to limit the foregoing, licensor makes no representations or warranties (i) regarding the merchantability or fitness for a particular purpose of the Data & Software, (ii) that the use of the Data & Software will not infringe any patents, copyrights or other intellectual property rights of a third party, and (iii) that the use of the Data & Software will not cause any damage of any kind to you or a third party.
+
+ Limitation of Liability
+ The Data & Software is provided in the state of development the licensor defines. If modified or extended by Licensee, the Licensor makes no claims about the fitness of the Data & Software and is not responsible for any problems such modifications cause.
+
+ No Maintenance Services
+ You understand and agree that Licensor is under no obligation to provide either maintenance services, update services, notices of latent defects, or corrections of defects with regard to the Data & Software. Licensor nevertheless reserves the right to update, modify, or discontinue the Data & Software at any time.
+
+ Defects of the Data & Software must be notified in writing to the Licensor with a comprehensible description of the error symptoms. The notification of the defect should enable the reproduction of the error. The Licensee is encouraged to communicate any use, results, modification or publication.
+
+ Publications using the Data & Software
+ You acknowledge that the Data & Software is a valuable scientific resource and agree to appropriately reference the following paper in any publication making use of the Data & Software.
+
+ Citation:
+
+ @InProceedings{Li_2023_ICCV,
+     author    = {Li, Ronghui and Zhao, Junfan and Zhang, Yachao and Su, Mingyang and Ren, Zeping and Zhang, Han and Tang, Yansong and Li, Xiu},
+     title     = {FineDance: A Fine-grained Choreography Dataset for 3D Full Body Dance Generation},
+     booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)},
+     month     = {October},
+     year      = {2023},
+     pages     = {10234-10243}
+ }
README.md CHANGED
@@ -1,3 +1,164 @@
- ---
- license: mit
- ---
+ # [FineDance: A Fine-grained Choreography Dataset for 3D Full Body Dance Generation (ICCV 2023)](https://github.com/li-ronghui/FineDance)
+
+ [[Project Page](https://li-ronghui.github.io/finedance)] | [[Preprint](https://arxiv.org/abs/2212.03741)] | [[pdf](https://arxiv.org/pdf/2212.03741.pdf)] | [[video](https://li-ronghui.github.io/finedance)]
+
+ <img src="teaser/teaser.png">
+
+ ## Quick Start
+
+ ### Prerequisites
+
+ Install the conda environment and activate it:
+
+ ```bash
+ conda env create -f environment.yaml
+ conda activate FineNet
+ ```
+
+ Download the pretrained checkpoints and asset files from [Google Drive](https://drive.google.com/file/d/1ENoeUn-X-3Vw2Gon-voVLlndy3hZXdWD/view?usp=drive_link).
+
+ ### Web UI (Recommended)
+
+ Launch the Gradio web interface:
+
+ ```bash
+ python app.py
+ ```
+
+ Open `http://127.0.0.1:7861` in your browser. Upload a music file and click "Generate Dance" to produce a video.
+
+ ### Command Line
+
+ ```bash
+ python generate_dance.py /path/to/music.mp3
+ ```
+
+ The output will be saved to `output/<songname>_dance.mp4`.
+
+ To specify a custom output path:
+
+ ```bash
+ python generate_dance.py /path/to/music.mp3 --output my_dance.mp4
+ ```
+
+ Supported audio formats: any format ffmpeg can read (`.mp3`, `.wav`, `.m4a`, `.flac`, `.ogg`, etc.).
+
+ ### Output Details
+
+ - Resolution: 1200x1200
+ - Frame rate: 30 fps
+ - Duration: ~30 seconds
+ - Background: black
+ - Body model: SMPLX (full body with hands)
+
+ ## How It Works
+
+ 1. **Audio conversion** - Converts the input to WAV if needed
+ 2. **Feature extraction** - Slices the audio into 4-second windows with a 2-second stride, then extracts 35-dim features per slice (onset envelope, 20 MFCC, 12 chroma, peak one-hot, beat one-hot) using librosa; see the sketch after this list
+ 3. **Dance generation** - Feeds the audio features into a pretrained diffusion model (`assets/checkpoints/train-2000.pt`), which generates SMPLX body motion (319-dim: 4 contact + 3 translation + 52 joints x 6D rotation)
+ 4. **Rendering** - Converts the generated motion to SMPLX meshes and renders 900 frames at 30 fps using pyrender
+ 5. **Final output** - Merges the rendered video with the original audio via ffmpeg
+
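+ As a reference for step 2, here is a minimal sketch of the slicing and feature assembly. The extraction calls mirror `data/code/pre_music.py` in this repository; the `extract_35d`/`slice_windows` helpers and the hard-coded 4-second window / 2-second stride are illustrative assumptions based on the description above, not the actual `generate_dance.py` API.
+
+ ```python
+ import librosa
+ import numpy as np
+
+ FPS, HOP = 30, 512
+ SR = FPS * HOP  # one librosa frame per video frame, as in data/code/pre_music.py
+
+ def extract_35d(y):
+     # envelope (1) + MFCC (20) + chroma (12) + peak one-hot (1) + beat one-hot (1) = 35 dims
+     env = librosa.onset.onset_strength(y=y, sr=SR)
+     mfcc = librosa.feature.mfcc(y=y, sr=SR, n_mfcc=20).T
+     chroma = librosa.feature.chroma_cens(y=y, sr=SR, hop_length=HOP, n_chroma=12).T
+     peak = np.zeros_like(env)
+     peak[librosa.onset.onset_detect(onset_envelope=env, sr=SR, hop_length=HOP)] = 1.0
+     beat = np.zeros_like(env)
+     beat[librosa.beat.beat_track(onset_envelope=env, sr=SR, hop_length=HOP)[1]] = 1.0
+     return np.concatenate([env[:, None], mfcc, chroma, peak[:, None], beat[:, None]], axis=-1)
+
+ def slice_windows(feats, win_s=4.0, stride_s=2.0):
+     # hypothetical slicer: 4-second windows, 2-second stride, 30 feature frames per second
+     win, stride = int(win_s * FPS), int(stride_s * FPS)
+     return [feats[i:i + win] for i in range(0, len(feats) - win + 1, stride)]
+
+ y, _ = librosa.load("song.wav", sr=SR)
+ slices = slice_windows(extract_35d(y))  # each slice has shape (120, 35)
+ ```
+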
+ ## FineDance Dataset
+
+ The dataset (7.7 hours) can be downloaded from [Google Drive](https://drive.google.com/file/d/1zQvWG9I0H4U3Zrm8d_QD_ehenZvqfQfS/view?usp=sharing) or [Baidu Cloud](https://pan.baidu.com/s/1gynUC7pMdpsE31wAwq177w?pwd=o9pw).
+
+ Put the downloaded data into `./data`. The data directory contains:
+
+ - **label_json** - Song name, coarse style, and fine-grained genre
+ - **motion** - [SMPLH](https://smpl-x.is.tue.mpg.de/) format motion data
+ - **music_wav** - Music data in WAV format
+ - **music_npy** - Music features extracted by [librosa](https://github.com/librosa/librosa) following [AIST++](https://github.com/google/aistplusplus_api/tree/main)
+
+ Reading a motion file:
+
+ ```python
+ import numpy as np
+ data = np.load("motion/001.npy")
+ T, C = data.shape  # T is the number of frames
+ smpl_poses = data[:, 3:]
+ smpl_trans = data[:, :3]
+ ```
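+
+ Note that the raw `motion/*.npy` files store root translation in the first 3 dims followed by poses, while the preprocessed 319-dim training features written by `data/code/pre_motion.py` use a different layout. Below is a sketch of unpacking them; the slice boundaries are taken from `pre_motion.py` and `dataset/FineDance_dataset.py`, and the file path is illustrative.
+
+ ```python
+ import numpy as np
+
+ fea = np.load("data/finedance/motion_fea319/001.npy")  # (T, 319)
+ contacts = fea[:, :4]                  # binary foot-contact flags for 4 foot joints
+ trans = fea[:, 4:7]                    # root translation
+ rot6d = fea[:, 7:].reshape(-1, 52, 6)  # 52 joint rotations in the 6D representation
+ ```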
+
+ ### Dataset Split
+
+ The dataset is split into train, val, and test sets in two ways:
+
+ 1. **FineDance@Genre** - The test set covers a broader range of dance genres; the same dancer may appear across splits, but with different motions. Recommended for dance generation.
+ 2. **FineDance@Dancer** - The splits are divided by dancer, so the same dancer never appears in different sets, but the test set contains fewer genres.
+
+ ## Training
+
+ Training is only needed if you want to start from scratch; the pretrained checkpoint is already provided.
+
+ ```bash
+ # Data preprocessing
+ python data/code/pre_motion.py
+
+ # Train
+ accelerate launch train_seq.py --batch_size 32 --epochs 200
+ ```
+
+ Key flags (see the example after this list):
+ - `--batch_size` - Default is 400; reduce to 32 or lower for Mac MPS (limited to ~30 GB)
+ - `--epochs` - Default is 2000
+ - `--checkpoint` - Resume from a saved checkpoint
+
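+ For example, to fine-tune from the provided checkpoint (a sketch; these flags are defined in `args.py`, but whether optimizer state is restored depends on `train_seq.py`):
+
+ ```bash
+ accelerate launch train_seq.py --batch_size 32 --epochs 2000 \
+     --checkpoint assets/checkpoints/train-2000.pt
+ ```
+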
+ ## Advanced Usage
+
+ ### Generate on the test set
+
+ ```bash
+ python data/code/slice_music_motion.py
+ python generate_all.py --motion_save_dir generated/finedance_seq_120_dancer --save_motions
+ ```
+
+ ### Render a pre-generated motion file
+
+ ```bash
+ python render.py --modir eval/motions --mode smplx
+ ```
+
+ ## Project Structure
+
+ ```
+ FineDance/
+ ├── app.py                    # Gradio web UI
+ ├── generate_dance.py         # One-command dance generation (CLI)
+ ├── train_seq.py              # Training script
+ ├── test.py                   # Original test/inference script
+ ├── render.py                 # Video rendering (SMPLX mesh to MP4)
+ ├── args.py                   # CLI argument definitions
+ ├── vis.py                    # Skeleton/FK utilities
+ ├── assets/
+ │   ├── checkpoints/
+ │   │   └── train-2000.pt     # Pretrained model (2000 epochs)
+ │   └── smpl_model/
+ │       └── smplx/
+ │           └── SMPLX_NEUTRAL.npz  # SMPLX body model
+ ├── model/
+ │   ├── model.py              # SeqModel (transformer decoder)
+ │   └── diffusion.py          # Gaussian diffusion (training + sampling)
+ ├── dataset/
+ │   └── FineDance_dataset.py  # Dataset loader
+ └── data/
+     └── finedance/            # Training data (music + motion pairs)
+ ```
+
+ ## Acknowledgments
+
+ We would like to express our sincere gratitude to Dr. [Yan Zhang](https://yz-cnsdqz.github.io/) and [Yulun Zhang](https://yulunzhang.com/) for their invaluable guidance and insights during the course of our research.
+
+ This code is based on: [EDGE](https://github.com/Stanford-TML/EDGE/tree/main), [MDM](https://github.com/GuyTevet/motion-diffusion-model), [Adan](https://github.com/lucidrains/Adan-pytorch), [Diffusion](https://github.com/lucidrains/denoising-diffusion-pytorch), [SMPLX](https://smpl-x.is.tue.mpg.de/).
+
+ ## Citation
+
+ ```
+ @inproceedings{li2023finedance,
+   title={FineDance: A Fine-grained Choreography Dataset for 3D Full Body Dance Generation},
+   author={Li, Ronghui and Zhao, Junfan and Zhang, Yachao and Su, Mingyang and Ren, Zeping and Zhang, Han and Tang, Yansong and Li, Xiu},
+   booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision},
+   pages={10234--10243},
+   year={2023}
+ }
+ ```
app.py ADDED
@@ -0,0 +1,84 @@
+ """
+ Gradio UI for FineDance — generate dance videos from music.
+
+ Usage:
+     conda activate FineNet
+     python app.py
+ """
+
+ import os
+ import tempfile
+
+ os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
+
+ import gradio as gr
+
+ # Monkey-patch gradio_client bug: additionalProperties can be a bool,
+ # but the code assumes it's always a dict.
+ import gradio_client.utils as _gc_utils
+
+ _orig_json_schema_to_python_type = _gc_utils._json_schema_to_python_type
+
+
+ def _patched_json_schema_to_python_type(schema, defs=None):
+     if isinstance(schema, bool):
+         return "Any"
+     return _orig_json_schema_to_python_type(schema, defs)
+
+
+ _gc_utils._json_schema_to_python_type = _patched_json_schema_to_python_type
+
+ from generate_dance import load_model, generate, _setup_render_args
+ from render import MovieMaker
+
+ # Preload model once at startup
+ print("Loading model...")
+ MODEL = load_model()
+ print("Model loaded. Initializing renderer...")
+
+ # Create MovieMaker on the main thread so pyglet's signal handler works.
+ _setup_render_args()
+ VISUALIZER = MovieMaker(save_path=".")
+ print("Starting UI...")
+
+
+ def run(audio_path):
+     if audio_path is None:
+         raise gr.Error("Please upload a music file.")
+
+     logs = []
+
+     def log_fn(msg):
+         logs.append(msg)
+         print(msg)
+
+     songname = os.path.splitext(os.path.basename(audio_path))[0]
+     output_path = os.path.join(tempfile.gettempdir(), f"{songname}_dance.mp4")
+
+     generate(audio_path, output_path, model=MODEL, visualizer=VISUALIZER, log_fn=log_fn)
+
+     return output_path, "\n".join(logs)
+
+
+ with gr.Blocks(title="FineDance") as demo:
+     gr.Markdown("# FineDance\nUpload a music file to generate a 3D dance video.")
+
+     with gr.Row():
+         with gr.Column():
+             audio_input = gr.Audio(
+                 label="Upload Music",
+                 type="filepath",
+             )
+             generate_btn = gr.Button("Generate Dance", variant="primary")
+
+         with gr.Column():
+             video_output = gr.Video(label="Generated Dance")
+             status_output = gr.Textbox(label="Status", lines=6, interactive=False)
+
+     generate_btn.click(
+         fn=run,
+         inputs=[audio_input],
+         outputs=[video_output, status_output],
+     )
+
+ if __name__ == "__main__":
+     demo.launch(server_name="127.0.0.1")
args.py ADDED
@@ -0,0 +1,119 @@
+ import argparse
+ import yaml
+
+
+ def FineDance_parse_train_opt():
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--project", default="experiments/finedance_seq_120_genre/train", help="project/name")
+     parser.add_argument("--exp_name", default="finedance_seq_120_genre", help="save to project/name")
+     parser.add_argument("--feature_type", type=str, default="baseline")
+     parser.add_argument("--datasplit", type=str, default="cross_genre", choices=["cross_genre", "cross_dancer"])
+     parser.add_argument(
+         "--render_dir", type=str, default="experiments/finedance_seq_120_genre/renders", help="Sample render path"
+     )
+     parser.add_argument(
+         "--full_seq_len", type=int, default=120, help="full_seq_len"
+     )
+     parser.add_argument(
+         "--windows", type=int, default=10, help="windows"
+     )
+     parser.add_argument(
+         "--mix", action="store_true", help="Saves the motions for evaluation"
+     )
+     # parser.add_argument("--feature_type", type=str, default="jukebox")
+     parser.add_argument(
+         "--wandb_pj_name", type=str, default="finedance_seq", help="project name"
+     )
+     parser.add_argument("--batch_size", type=int, default=400, help="batch size")  # default=64
+     parser.add_argument("--epochs", type=int, default=2000)
+     parser.add_argument(
+         "--save_interval",
+         type=int,
+         default=10,  # default=100,
+         help='Log model after every "save_period" epoch',
+     )
+     parser.add_argument("--ema_interval", type=int, default=1, help="ema every x steps")
+     parser.add_argument(
+         "--checkpoint", type=str, default="", help="trained checkpoint path (optional)"
+     )
+     parser.add_argument(
+         "--do_normalize",
+         action="store_true",
+         help="normalize",
+     )
+     parser.add_argument(
+         "--nfeats", type=int, default=319, help="nfeats"
+     )
+     opt = parser.parse_args()
+     return opt
+
+
+ def FineDance_parse_test_opt():
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--feature_type", type=str, default="baseline")
+     parser.add_argument(
+         "--full_seq_len", type=int, default=120, help="full_seq_len"
+     )
+     parser.add_argument("--datasplit", type=str, default="cross_genre", choices=["cross_genre", "cross_dancer"])
+     parser.add_argument(
+         "--windows", type=int, default=10, help="windows"
+     )
+     parser.add_argument("--out_length", type=float, default=30, help="max. length of output, in seconds")
+     parser.add_argument(
+         "--render_dir", type=str, default="FineDance_test_renders/", help="Sample render path"
+     )
+     parser.add_argument(
+         "--checkpoint", type=str, default="assets/checkpoints/train-2000.pt", help="checkpoint"
+     )
+     parser.add_argument(
+         "--nfeats", type=int, default=319, help="nfeats"
+     )
+     parser.add_argument(
+         "--music_dir",
+         type=str,
+         default="data/finedance/music_wav",
+         help="folder containing input music",
+     )
+     parser.add_argument(
+         "--save_motions", action="store_true", help="Saves the motions for evaluation"
+     )
+     parser.add_argument(
+         "--motion_save_dir",
+         type=str,
+         default="eval/motions",
+         help="Where to save the motions",
+     )
+     parser.add_argument(
+         "--cache_features",
+         action="store_true",
+         help="Save the jukebox features for later reuse",
+     )
+     parser.add_argument(
+         "--do_normalize",
+         action="store_true",
+         help="normalize",
+     )
+     parser.add_argument(
+         "--no_render",
+         action="store_true",
+         help="Don't render the video",
+     )
+     parser.add_argument(
+         "--use_cached_features",
+         action="store_true",
+         help="Use precomputed features instead of music folder",
+     )
+     parser.add_argument(
+         "--feature_cache_dir",
+         type=str,
+         default="cached_features/",
+         help="Where to save/load the features",
+     )
+     opt = parser.parse_args()
+     return opt
+
+
+ def save_arguments_to_yaml(args, file_path):
+     arg_dict = vars(args)  # convert the Namespace object to a dict
+     yaml_str = yaml.dump(arg_dict, default_flow_style=False)
+
+     with open(file_path, 'w') as file:
+         file.write(yaml_str)
data/code/pre_motion.py ADDED
@@ -0,0 +1,72 @@
+ import argparse
+ import os
+ from pathlib import Path
+ import smplx, pickle
+ import torch
+ import sys
+ from tqdm import tqdm
+ import glob
+ import numpy as np
+
+ sys.path.append(os.getcwd())
+ from dataset.quaternion import ax_to_6v, ax_from_6v
+ from dataset.preprocess import Normalizer, vectorize_many
+
+
+ def motion_feats_extract(inputs_dir, outputs_dir):
+     device = "cuda:0" if torch.cuda.is_available() else "cpu"
+     print("extracting")
+     raw_fps = 30
+     data_fps = 30
+     assert data_fps <= raw_fps
+     if not os.path.exists(outputs_dir):
+         os.makedirs(outputs_dir)
+     # All motion is retargeted to this standard model.
+     smplx_model = smplx.SMPLX(model_path='assets/smpl_model/smplx', ext='npz', gender='neutral',
+                               num_betas=10, flat_hand_mean=True, num_expression_coeffs=10, use_pca=False).eval().to(device)
+
+     motions = sorted(glob.glob(os.path.join(inputs_dir, "*.npy")))
+     for motion in tqdm(motions):
+         name = os.path.splitext(os.path.basename(motion))[0].split(".")[0]
+         print("name is", name)
+         data = np.load(motion, allow_pickle=True)
+         print(data.shape)
+         pos = data[:, :3]  # T, 3
+         q = data[:, 3:]
+         root_pos = torch.Tensor(pos).to(device)  # T, 3
+         length = root_pos.shape[0]
+         local_q_rot6d = torch.Tensor(q).to(device)  # T, 312
+         print("local_q_rot6d", local_q_rot6d.shape)
+         local_q = local_q_rot6d.reshape(length, 52, 6).clone()
+         local_q = ax_from_6v(local_q).view(length, 156)  # T, 156
+
+         smplx_output = smplx_model(
+             betas=torch.zeros([root_pos.shape[0], 10], device=device, dtype=torch.float32),
+             transl=root_pos,  # global translation
+             global_orient=local_q[:, :3],
+             body_pose=local_q[:, 3:66],  # 21 joints
+             jaw_pose=torch.zeros([root_pos.shape[0], 3], device=device, dtype=torch.float32),   # 1 joint
+             leye_pose=torch.zeros([root_pos.shape[0], 3], device=device, dtype=torch.float32),  # 1 joint
+             reye_pose=torch.zeros([root_pos.shape[0], 3], device=device, dtype=torch.float32),  # 1 joint
+             left_hand_pose=local_q[:, 66:66 + 45],  # 15 joints
+             right_hand_pose=local_q[:, 66 + 45:],   # 15 joints
+             expression=torch.zeros([root_pos.shape[0], 10], device=device, dtype=torch.float32),
+             return_verts=False
+         )
+
+         positions = smplx_output.joints.view(length, -1, 3)  # T, j, 3
+         feet = positions[:, (7, 8, 10, 11)]  # T, 4, 3
+         feetv = torch.zeros(feet.shape[:2], device=device)  # T, 4
+         feetv[:-1] = (feet[1:] - feet[:-1]).norm(dim=-1)
+         contacts = (feetv < 0.01).to(local_q)  # cast to the right dtype; T, 4
+
+         mofea319 = torch.cat([contacts, root_pos, local_q_rot6d], dim=1)
+         assert mofea319.shape[1] == 319
+         mofea319 = mofea319.detach().cpu().numpy()
+         np.save(os.path.join(outputs_dir, name + '.npy'), mofea319)
+     return
+
+
+ if __name__ == "__main__":
+     motion_feats_extract("data/finedance/motion", "data/finedance/motion_fea319")
data/code/pre_music.py ADDED
@@ -0,0 +1,90 @@
+ import librosa
+ import numpy as np
+ import os
+ import wave
+ from tqdm import tqdm
+ import librosa as lr
+
+ FPS = 30  # * 5
+ HOP_LENGTH = 512
+ SR = FPS * HOP_LENGTH
+ EPS = 1e-6
+
+ # HOP_LENGTH = 160
+ # SR = 16000
+
+ audio_dir = 'data/finedance/music_wav'
+ # audio_dir = '/home/human/datasets/aist_plusplus_final/music'
+ # audio_dir = "/home/human/datasets/data/Clip/music_clip_rhythm"
+
+ target_dir_ori = "data/finedance/music_wav_test"
+ os.makedirs(target_dir_ori, exist_ok=True)
+
+
+ # AIST++
+ def _get_tempo(audio_name):
+     """Get the tempo (BPM) of a music clip by parsing its name."""
+     # the name has many fields; only the 5th element encodes the tempo
+     audio_name = audio_name.split("_")[4]
+     assert len(audio_name) == 4
+     if audio_name[0:3] in [
+         "mBR",
+         "mPO",
+         "mLO",
+         "mMH",
+         "mLH",
+         "mWA",
+         "mKR",
+         "mJS",
+         "mJB",
+     ]:
+         return int(audio_name[3]) * 10 + 80
+     elif audio_name[0:3] == "mHO":
+         return int(audio_name[3]) * 5 + 110
+     else:
+         assert False, audio_name
+
+
+ for file in tqdm(os.listdir(audio_dir)):
+     audio_name = file[:-4]
+
+     save_path = os.path.join(target_dir_ori, f"{audio_name}.npy")  # path to save the extracted features
+     music_file = os.path.join(audio_dir, file)
+
+     data, _ = librosa.load(music_file, sr=SR)
+
+     envelope = librosa.onset.onset_strength(y=data, sr=SR)  # (seq_len,)
+     mfcc = librosa.feature.mfcc(y=data, sr=SR, n_mfcc=20).T  # (seq_len, 20)
+     chroma = librosa.feature.chroma_cens(
+         y=data, sr=SR, hop_length=HOP_LENGTH, n_chroma=12
+     ).T  # (seq_len, 12)
+
+     peak_idxs = librosa.onset.onset_detect(
+         onset_envelope=envelope.flatten(), sr=SR, hop_length=HOP_LENGTH
+     )
+     peak_onehot = np.zeros_like(envelope, dtype=np.float32)
+     peak_onehot[peak_idxs] = 1.0  # (seq_len,)
+
+     try:
+         start_bpm = _get_tempo(audio_name)
+     except Exception:
+         # the name does not follow the AIST++ scheme, so estimate the tempo instead
+         start_bpm = lr.beat.tempo(y=lr.load(music_file)[0])[0]
+
+     tempo, beat_idxs = librosa.beat.beat_track(
+         onset_envelope=envelope,
+         sr=SR,
+         hop_length=HOP_LENGTH,
+         start_bpm=start_bpm,
+         tightness=100,
+     )
+     beat_onehot = np.zeros_like(envelope, dtype=np.float32)
+     beat_onehot[beat_idxs] = 1.0  # (seq_len,)
+
+     audio_feature = np.concatenate(
+         [envelope[:, None], mfcc, chroma, peak_onehot[:, None], beat_onehot[:, None]],
+         axis=-1,
+     )
+     np.save(save_path, audio_feature)
data/code/slice_music_motion.py ADDED
@@ -0,0 +1,41 @@
+ import numpy as np
+ import os
+ import sys
+
+ music_dir = "data/finedance/music_npy"
+ motion_dir = "data/finedance/motion_fea319"
+
+ music_out = "data/finedance/div_by_time/music_npy_"
+ motion_out = "data/finedance/div_by_time/motion_fea319_"
+
+ timelen = 120
+
+
+ music_out = music_out + str(timelen)
+ motion_out = motion_out + str(timelen)
+ if not os.path.exists(music_out):
+     os.makedirs(music_out)
+ if not os.path.exists(motion_out):
+     os.makedirs(motion_out)
+
+
+ for file in os.listdir(motion_dir):
+     if file[-3:] != 'npy':
+         print(file[-3:])
+         continue
+     name = file.split(".")[0]
+     music_fea = np.load(os.path.join(music_dir, file))
+     motion_fea = np.load(os.path.join(motion_dir, file))
+     max_length = min(music_fea.shape[0], motion_fea.shape[0])
+
+     iters = (max_length // timelen)
+     max_length = iters * timelen
+     music_fea = music_fea[:max_length, :]
+     motion_fea = motion_fea[:max_length, :]
+
+     for i in range(iters):
+         music_clip = music_fea[i * timelen: (i + 1) * timelen, :]
+         motion_clip = motion_fea[i * timelen: (i + 1) * timelen, :]
+         np.save(os.path.join(music_out, name + "z@" + str(i).zfill(3) + ".npy"), music_clip)
+         np.save(os.path.join(motion_out, name + "z@" + str(i).zfill(3) + ".npy"), motion_clip)
dataset/FineDance_dataset.py ADDED
@@ -0,0 +1,180 @@
+ import torch
+ from torch.utils import data
+ import numpy as np
+ import os
+ from tqdm import tqdm
+ import json
+
+ import sys
+ sys.path.insert(0, '.')
+
+ SMPL_JOINTS_FLIP_PERM = [0, 2, 1, 3, 5, 4, 6, 8, 7, 9, 11, 10, 12, 14, 13, 15, 17, 16, 19, 18, 21, 20, 23, 22]
+
+ SMPLX_JOINTS_FLIP_PERM = [0, 2, 1, 3, 5, 4, 6, 8, 7, 9, 11, 10, 12, 14, 13,
+                           15, 17, 16, 19, 18, 21, 20, 22, 24, 23,
+                           40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54,
+                           25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39]
+ SMPLX_POSE_FLIP_PERM = []
+ for i in SMPLX_JOINTS_FLIP_PERM:
+     SMPLX_POSE_FLIP_PERM.append(3 * i)
+     SMPLX_POSE_FLIP_PERM.append(3 * i + 1)
+     SMPLX_POSE_FLIP_PERM.append(3 * i + 2)
+
+
+ def flip_pose(pose):
+     # Flip pose. The flipping is based on SMPLX parameters.
+     pose = pose[:, SMPLX_POSE_FLIP_PERM]
+     # we also negate the second and the third dimension of the axis-angle
+     pose[:, 1::3] = -pose[:, 1::3]
+     pose[:, 2::3] = -pose[:, 2::3]
+     return pose
+
+
+ def get_train_test_list(datasplit):
+     all_list = []
+     train_list = []
+     for i in range(1, 212):
+         all_list.append(str(i).zfill(3))
+
+     if datasplit == "cross_genre":
+         test_list = ["063", "132", "143", "036", "098", "198", "130", "012", "211", "193", "179", "065", "137", "161", "092", "120", "037", "109", "204", "144"]
+         ignor_list = ["116", "117", "118", "119", "120", "121", "122", "123", "202"] + ["130"]
+     elif datasplit == "cross_dancer":
+         test_list = ['001', '002', '003', '004', '005', '006', '007', '008', '009', '010', '011', '012', '013', '124', '126', '128', '130', '132']
+         ignor_list = ['115', '117', '119', '121', '122', '135', '137', '139', '141', '143', '145', '147'] + ["116", "118", "120", "123", "202", "159"] + ["130"]  # the first list is the val set, the second is the ignore set
+     else:
+         raise ValueError("error of data split!")
+     for one in all_list:
+         if one not in test_list:
+             if one not in ignor_list:
+                 train_list.append(one)
+
+     return train_list, test_list, ignor_list
+
+
+ class FineDance_Smpl(data.Dataset):
+     def __init__(self, args, istrain):
+         self.motion_dir = './data/finedance/motion_fea319'
+         self.music_dir = './data/finedance/music_npy'
+         self.istrain = istrain
+         self.seq_len = args.full_seq_len
+         slide = args.full_seq_len // args.windows
+
+         self.motion_index = []
+         self.music_index = []
+         self.name = []
+         motion_all = []
+         music_all = []
+
+         train_list, test_list, ignor_list = get_train_test_list(args.datasplit)
+         if self.istrain:
+             self.datalist = train_list
+         else:
+             self.datalist = test_list
+
+         total_length = 0  # index all motions in the dataset with a single global index
+
+         for name in tqdm(self.datalist):
+             save_name = name
+             name = name + ".npy"
+
+             if name[:-4] in ignor_list:
+                 continue
+
+             motion = np.load(os.path.join(self.motion_dir, name))
+             music = np.load(os.path.join(self.music_dir, name))
+
+             min_all_len = min(motion.shape[0], music.shape[0])
+             motion = motion[:min_all_len]
+             if motion.shape[-1] == 168:
+                 motion = np.concatenate([motion[:, :69], motion[:, 78:]], axis=1)  # 22, 25
+             elif motion.shape[-1] == 319:
+                 pass
+             elif motion.shape[-1] == 315:
+                 pass
+                 # motion = np.concatenate([motion[:, :135], motion[:, 153:]], axis=1)
+             else:
+                 raise ValueError("input motion shape error! not 168, 315, or 319!")
+             music = music[:min_all_len]  # motion = motion[:min_all_len]
+             nums = (min_all_len - self.seq_len) // slide + 1  # the trailing segment shorter than seq_len is discarded
+
+             if self.istrain:
+                 clip_index = []
+                 for i in range(nums):
+                     motion_clip = motion[i * slide: i * slide + self.seq_len]
+                     if motion_clip.std(axis=0).mean() > 0.07:  # check whether the clip is a valid motion; if this is too slow, consider removing it
+                         clip_index.append(i)
+                 index = np.array(clip_index) * slide + total_length  # clip_index is a local index
+                 index_ = np.array(clip_index) * slide
+             else:
+                 index = np.arange(nums) * slide + total_length
+                 index_ = np.arange(nums) * slide
+
+             motion_all.append(motion)
+             music_all.append(music)
+
+             if args.mix:
+                 motion_index = []
+                 music_index = []
+                 num = (len(index) - 1) // 8 + 1
+                 for i in range(num):
+                     motion_index_tmp, music_index_tmp = np.meshgrid(index[i * 8:(i + 1) * 8], index[i * 8:(i + 1) * 8])  # is i a problem here? apparently not
+                     motion_index += motion_index_tmp.reshape((-1)).tolist()
+                     music_index += music_index_tmp.reshape((-1)).tolist()
+                     index_tmp = np.meshgrid(index_[i * 8:(i + 1) * 8])
+                     index_ += index_tmp.reshape((-1)).tolist()
+             else:
+                 motion_index = index.tolist()
+                 music_index = index.tolist()
+                 index_ = index_.tolist()
+             index_ = [save_name + "_" + str(element).zfill(5) for element in index_]
+
+             self.motion_index += motion_index
+             self.music_index += music_index
+             total_length += min_all_len
+             self.name += index_
+
+         self.motion = np.concatenate(motion_all, axis=0).astype(np.float32)
+         self.music = np.concatenate(music_all, axis=0).astype(np.float32)
+
+         self.len = len(self.motion_index)
+         print(f'FineDance has {self.len} samples..')
+
+     def __len__(self):
+         return self.len
+
+     def __getitem__(self, index):
+         motion_index = self.motion_index[index]
+         music_index = self.music_index[index]
+         motion = self.motion[motion_index:motion_index + self.seq_len]
+         if motion.shape[-1] == 319 or motion.shape[-1] == 139:
+             motion[:, 4:7] = motion[:, 4:7] - motion[:1, 4:7]  # the first 4 dimensions are foot contacts
+         else:
+             motion[:, :3] = motion[:, :3] - motion[:1, :3]
+         music = self.music[music_index:music_index + self.seq_len]
+         filename = self.name[index]
+         # if np.random.rand(1) > 0.5:
+         #     motion = motion[:, self.mirror_idx]
+         return motion, music, filename
+
+
+ if __name__ == '__main__':
+     data_split = {}
+     all_list = []
+     train_list = []
+     for i in range(1, 212):
+         all_list.append(str(i).zfill(3))
+     test_list = ["001", "002", "003", "004", "005", "006", "007", "008", "009", "010", "011", "012", "013", "124", "126", "128", "130", "132"]
+     val_list = ["115", "117", "119", "121", "122", "135", "137", "139", "141", "143", "145", "147"]
+     for one in all_list:
+         if one not in test_list:
+             if one not in val_list:
+                 train_list.append(one)
+
+     data_split["train"] = train_list
+     data_split["test"] = test_list
+     data_split["val"] = val_list
+     data_split["ignore"] = ["116", "117", "118", "119", "120", "121", "122", "123", "202"]
+
+     with open("data_crossdancer.json", "w") as f:
+         json.dump(data_split, f)
+
+     print(train_list)
dataset/__init__.py ADDED
File without changes
dataset/preprocess.py ADDED
@@ -0,0 +1,93 @@
+ import glob
+ import os
+ import re
+ from pathlib import Path
+
+ import torch
+
+ from .scaler import MinMaxScaler
+ import pickle
+
+
+ def increment_path(path, exist_ok=False, sep="", mkdir=False):
+     # Increment a file or directory path, i.e. runs/exp --> runs/exp{sep}2, runs/exp{sep}3, ... etc.
+     path = Path(path)  # os-agnostic
+     if path.exists() and not exist_ok:
+         suffix = path.suffix
+         path = path.with_suffix("")
+         dirs = glob.glob(f"{path}{sep}*")  # similar paths
+         matches = [re.search(rf"%s{sep}(\d+)" % path.stem, d) for d in dirs]
+         i = [int(m.groups()[0]) for m in matches if m]  # indices
+         n = max(i) + 1 if i else 2  # increment number
+         path = Path(f"{path}{sep}{n}{suffix}")  # update path
+     dir = path if path.suffix == "" else path.parent  # directory
+     if not dir.exists() and mkdir:
+         dir.mkdir(parents=True, exist_ok=True)  # make directory
+     return path
+
+
+ class Normalizer:
+     def __init__(self, data):
+         flat = data.reshape(-1, data.shape[-1])  # bxt, 151
+         self.scaler = MinMaxScaler((-1, 1), clip=True)
+         self.scaler.fit(flat)
+
+     def normalize(self, x):
+         batch, seq, ch = x.shape
+         x = x.reshape(-1, ch)
+         return self.scaler.transform(x).reshape((batch, seq, ch))
+
+     def unnormalize(self, x):
+         batch, seq, ch = x.shape
+         x = x.reshape(-1, ch)
+         x = torch.clip(x, -1, 1)  # clip to force compatibility
+         return self.scaler.inverse_transform(x).reshape((batch, seq, ch))
+
+
+ class My_Normalizer:
+     def __init__(self, data):
+         if isinstance(data, str):
+             self.scaler = MinMaxScaler((-1, 1), clip=True)
+             with open(data, 'rb') as f:
+                 normalizer_state_dict = pickle.load(f)
+             # normalizer_state_dict = torch.load(data)
+             self.scaler.scale_ = normalizer_state_dict["scale"]
+             self.scaler.min_ = normalizer_state_dict["min"]
+         else:
+             flat = data.reshape(-1, data.shape[-1])  # bxt, 151
+             self.scaler = MinMaxScaler((-1, 1), clip=True)
+             self.scaler.fit(flat)
+
+     def normalize(self, x):
+         if len(x.shape) == 3:
+             batch, seq, ch = x.shape
+             x = x.reshape(-1, ch)
+             return self.scaler.transform(x).reshape((batch, seq, ch))
+         elif len(x.shape) == 2:
+             batch, ch = x.shape
+             return self.scaler.transform(x)
+         else:
+             raise ValueError("input error!")
+
+     def unnormalize(self, x):
+         if len(x.shape) == 3:
+             batch, seq, ch = x.shape
+             x = x.reshape(-1, ch)
+             x = torch.clip(x, -1, 1)  # clip to force compatibility
+             return self.scaler.inverse_transform(x).reshape((batch, seq, ch))
+         elif len(x.shape) == 2:
+             x = torch.clip(x, -1, 1)
+             return self.scaler.inverse_transform(x)
+         else:
+             raise ValueError("input error!")
+
+
+ def vectorize_many(data):
+     # given a list of batch x seqlen x joints? x channels, flatten all to batch x seqlen x -1, concatenate
+     batch_size = data[0].shape[0]
+     seq_len = data[0].shape[1]
+
+     out = [x.reshape(batch_size, seq_len, -1).contiguous() for x in data]
+
+     global_pose_vec_gt = torch.cat(out, dim=2)
+     return global_pose_vec_gt
dataset/quaternion.py ADDED
@@ -0,0 +1,71 @@
+ import torch
+ from pytorch3d.transforms import (axis_angle_to_matrix, matrix_to_axis_angle,
+                                   matrix_to_quaternion, matrix_to_rotation_6d,
+                                   quaternion_to_matrix, rotation_6d_to_matrix)
+
+
+ def quat_to_6v(q):
+     assert q.shape[-1] == 4
+     mat = quaternion_to_matrix(q)
+     mat = matrix_to_rotation_6d(mat)
+     return mat
+
+
+ def quat_from_6v(q):
+     assert q.shape[-1] == 6
+     mat = rotation_6d_to_matrix(q)
+     quat = matrix_to_quaternion(mat)
+     return quat
+
+
+ def ax_to_6v(q):
+     assert q.shape[-1] == 3
+     mat = axis_angle_to_matrix(q)
+     mat = matrix_to_rotation_6d(mat)
+     return mat
+
+
+ def ax_from_6v(q):
+     assert q.shape[-1] == 6
+     mat = rotation_6d_to_matrix(q)
+     ax = matrix_to_axis_angle(mat)
+     return ax
+
+
+ def quat_slerp(x, y, a):
+     """
+     Performs spherical linear interpolation (SLERP) between x and y, with proportion a.
+
+     :param x: quaternion tensor (N, S, J, 4)
+     :param y: quaternion tensor (N, S, J, 4)
+     :param a: interpolation weight (S, )
+     :return: tensor of interpolation results
+     """
+     len = torch.sum(x * y, axis=-1)
+
+     neg = len < 0.0
+     len[neg] = -len[neg]
+     y[neg] = -y[neg]
+
+     a = torch.zeros_like(x[..., 0]) + a
+
+     amount0 = torch.zeros_like(a)
+     amount1 = torch.zeros_like(a)
+
+     linear = (1.0 - len) < 0.01
+     omegas = torch.arccos(len[~linear])
+     sinoms = torch.sin(omegas)
+
+     amount0[linear] = 1.0 - a[linear]
+     amount0[~linear] = torch.sin((1.0 - a[~linear]) * omegas) / sinoms
+
+     amount1[linear] = a[linear]
+     amount1[~linear] = torch.sin(a[~linear] * omegas) / sinoms
+
+     # reshape
+     amount0 = amount0[..., None]
+     amount1 = amount1[..., None]
+
+     res = amount0 * x + amount1 * y
+
+     return res
dataset/scaler.py ADDED
@@ -0,0 +1,83 @@
+ import torch
+
+
+ def _handle_zeros_in_scale(scale, copy=True, constant_mask=None):
+     # if we are fitting on 1D arrays, scale might be a scalar
+     if constant_mask is None:
+         # Detect near constant values to avoid dividing by a very small
+         # value that could lead to surprising results and numerical
+         # stability issues.
+         constant_mask = scale < 10 * torch.finfo(scale.dtype).eps
+
+     if copy:
+         # New array to avoid side-effects
+         scale = scale.clone()
+     scale[constant_mask] = 1.0
+     return scale
+
+
+ class MinMaxScaler:
+     _parameter_constraints: dict = {
+         "feature_range": [tuple],
+         "copy": ["boolean"],
+         "clip": ["boolean"],
+     }
+
+     def __init__(self, feature_range=(0, 1), *, copy=True, clip=False):
+         self.feature_range = feature_range
+         self.copy = copy
+         self.clip = clip
+
+     def _reset(self):
+         """Reset internal data-dependent state of the scaler, if necessary.
+
+         __init__ parameters are not touched.
+         """
+         # Checking one attribute is enough, because they are all set together
+         # in partial_fit
+         if hasattr(self, "scale_"):
+             del self.scale_
+             del self.min_
+             del self.n_samples_seen_
+             del self.data_min_
+             del self.data_max_
+             del self.data_range_
+
+     def fit(self, X):
+         # Reset internal state before fitting
+         self._reset()
+         return self.partial_fit(X)
+
+     def partial_fit(self, X):
+         feature_range = self.feature_range
+         if feature_range[0] >= feature_range[1]:
+             raise ValueError(
+                 "Minimum of desired feature range must be smaller than maximum. Got %s."
+                 % str(feature_range)
+             )
+
+         data_min = torch.min(X, axis=0)[0]
+         data_max = torch.max(X, axis=0)[0]
+
+         self.n_samples_seen_ = X.shape[0]
+
+         data_range = data_max - data_min
+         self.scale_ = (feature_range[1] - feature_range[0]) / _handle_zeros_in_scale(
+             data_range, copy=True
+         )
+         self.min_ = feature_range[0] - data_min * self.scale_
+         self.data_min_ = data_min
+         self.data_max_ = data_max
+         self.data_range_ = data_range
+         return self
+
+     def transform(self, X):
+         X *= self.scale_.to(X.device)
+         X += self.min_.to(X.device)
+         if self.clip:
+             torch.clip(X, self.feature_range[0], self.feature_range[1], out=X)
+         return X
+
+     def inverse_transform(self, X):
+         X -= self.min_[-X.shape[1]:].to(X.device)
+         X /= self.scale_[-X.shape[1]:].to(X.device)
+         return X
environment.yaml ADDED
@@ -0,0 +1,343 @@
1
+ name: FineNet
2
+ channels:
3
+ - anaconda
4
+ - pytorch
5
+ - conda-forge
6
+ - https://repo.anaconda.com/pkgs/main
7
+ - https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/peterjc123/
8
+ - https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/pytorch/
9
+ - https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/conda-forge/
10
+ - defaults
11
+ dependencies:
12
+ - _libgcc_mutex=0.1=main
13
+ - _openmp_mutex=5.1=1_gnu
14
+ - aiohttp=3.8.1=py38h0a891b7_1
15
+ - aiosignal=1.3.1=pyhd8ed1ab_0
16
+ - asttokens=2.2.1=pyhd8ed1ab_0
17
+ - async-timeout=4.0.3=pyhd8ed1ab_0
18
+ - attrs=23.1.0=pyh71513ae_1
19
+ - backcall=0.2.0=pyh9f0ad1d_0
20
+ - backports=1.0=pyhd8ed1ab_3
21
+ - backports.functools_lru_cache=1.6.5=pyhd8ed1ab_0
22
+ - blas=1.0=mkl
23
+ - blinker=1.7.0=pyhd8ed1ab_0
24
+ - brotlipy=0.7.0=py38h0a891b7_1004
25
+ - bzip2=1.0.8=h7f98852_4
26
+ - c-ares=1.18.1=h7f98852_0
27
+ - ca-certificates=2023.11.17=hbcca054_0
28
+ - certifi=2023.11.17=pyhd8ed1ab_0
29
+ - cffi=1.15.0=py38h3931269_0
30
+ - charset-normalizer=2.1.1=pyhd8ed1ab_0
31
+ - colorama=0.4.6=pyhd8ed1ab_0
32
+ - cpuonly=1.0=0
33
+ - cryptography=37.0.2=py38h2b5fc30_0
34
+ - cudatoolkit=11.3.1=h9edb442_10
35
+ - cycler=0.11.0=pyhd8ed1ab_0
36
+ - dataclasses=0.8=pyhc8e2a94_3
37
+ - debugpy=1.5.1=py38h295c915_0
38
+ - entrypoints=0.4=pyhd8ed1ab_0
39
+ - executing=1.2.0=pyhd8ed1ab_0
40
+ - ffmpeg=4.3=hf484d3e_0
41
+ - freetype=2.10.4=h0708190_1
42
+ - frozenlist=1.3.0=py38h0a891b7_1
43
+ - fsspec=2023.5.0=pyh1a96a4e_0
44
+ - future=0.18.3=pyhd8ed1ab_0
45
+ - geos=3.10.2=h9c3ff4c_0
46
+ - giflib=5.2.1=h5eee18b_1
47
+ - gmp=6.2.1=h58526e2_0
48
+ - gnutls=3.6.13=h85f3911_1
49
+ - icu=67.1=he1b5a44_0
50
+ - idna=3.4=pyhd8ed1ab_0
51
+ - importlib-metadata=6.8.0=pyha770c72_0
52
+ - intel-openmp=2021.4.0=h06a4308_3561
53
+ - ipykernel=6.14.0=py38h7f3c49e_0
54
+ - ipython=8.4.0=py38h578d9bd_0
55
+ - jedi=0.18.2=pyhd8ed1ab_0
56
+ - jpeg=9e=h166bdaf_1
57
+ - jupyter_client=7.0.6=pyhd8ed1ab_0
58
+ - jupyter_core=4.12.0=py38h578d9bd_0
59
+ - kiwisolver=1.4.4=py38h6a678d5_0
60
+ - lame=3.100=h7f98852_1001
61
+ - lcms2=2.12=hddcbb42_0
62
+ - ld_impl_linux-64=2.38=h1181459_1
63
+ - libffi=3.4.2=h6a678d5_6
64
+ - libgcc-ng=11.2.0=h1234567_1
65
+ - libgfortran-ng=7.5.0=ha8ba4b0_17
66
+ - libgfortran4=7.5.0=ha8ba4b0_17
67
+ - libgomp=11.2.0=h1234567_1
68
+ - libiconv=1.17=h166bdaf_0
69
+ - libpng=1.6.37=h21135ba_2
70
+ - libprotobuf=3.18.0=h780b84a_1
71
+ - libsodium=1.0.18=h36c2ea0_1
72
+ - libstdcxx-ng=11.2.0=h1234567_1
73
+ - libtiff=4.2.0=hf544144_3
74
+ - libuv=1.43.0=h7f98852_0
75
+ - libwebp=1.2.2=h55f646e_0
76
+ - libwebp-base=1.2.2=h7f98852_1
77
+ - lightning-utilities=0.8.0=pyhd8ed1ab_0
78
+ - lz4-c=1.9.3=h9c3ff4c_1
79
+ - mapbox_earcut=1.0.0=py38h43d8883_3
80
+ - matplotlib-base=3.2.2=py38h5d868c9_1
81
+ - matplotlib-inline=0.1.6=pyhd8ed1ab_0
82
+ - mkl=2021.4.0=h06a4308_640
83
+ - mkl-service=2.4.0=py38h95df7f1_0
84
+ - mkl_fft=1.3.1=py38h8666266_1
85
+ - mkl_random=1.2.2=py38h1abd341_0
86
+ - mpi=1.0=mpich
87
+ - mpi4py=3.1.4=py38hfc96bbd_0
88
+ - mpich=3.3.2=hc856adb_0
89
+ - multidict=6.0.2=py38h0a891b7_1
90
+ - ncurses=6.3=h5eee18b_3
91
+ - nest-asyncio=1.5.6=pyhd8ed1ab_0
92
+ - nettle=3.6=he412f7d_0
93
+ - networkx=3.1=pyhd8ed1ab_0
94
+ - ninja=1.11.0=h924138e_0
95
+ - oauthlib=3.2.2=pyhd8ed1ab_0
96
+ - olefile=0.46=pyh9f0ad1d_1
97
+ - openh264=2.1.1=h780b84a_0
98
+ - openjpeg=2.4.0=hb52868f_1
99
+ - openssl=1.1.1w=h7f8727e_0
100
+ - parso=0.8.3=pyhd8ed1ab_0
101
+ - pexpect=4.8.0=pyh1a96a4e_2
102
+ - pickleshare=0.7.5=py_1003
103
+ - prompt-toolkit=3.0.39=pyha770c72_0
104
+ - ptyprocess=0.7.0=pyhd3deb0d_0
105
+ - pure_eval=0.2.2=pyhd8ed1ab_0
106
+ - pyasn1=0.5.0=pyhd8ed1ab_0
107
+ - pyasn1-modules=0.3.0=pyhd8ed1ab_0
108
+ - pycparser=2.21=pyhd8ed1ab_0
109
+ - pyjwt=2.8.0=pyhd8ed1ab_0
110
+ - pyopenssl=22.0.0=pyhd8ed1ab_1
111
+ - pyrender=0.1.45=pyh8a188c0_3
112
+ - pysocks=1.7.1=pyha2e5f31_6
113
+ - python=3.8.15=h7a1cb2a_2
114
+ - python-dateutil=2.8.2=pyhd8ed1ab_0
115
+ - python_abi=3.8=2_cp38
116
+ - pytorch-lightning=1.5.8=pyhd8ed1ab_0
117
+ - pytorch-mutex=1.0=cuda
118
+ - pyu2f=0.1.5=pyhd8ed1ab_0
119
+ - pyyaml=6.0=py38h0a891b7_4
120
+ - pyzmq=19.0.2=py38ha71036d_2
121
+ - readline=8.2=h5eee18b_0
122
+ - requests=2.28.2=pyhd8ed1ab_0
123
+ - requests-oauthlib=1.3.1=pyhd8ed1ab_0
124
+ - rsa=4.9=pyhd8ed1ab_0
125
+ - six=1.16.0=pyh6c4a22f_0
126
+ - sqlite=3.40.0=h5082296_0
127
+ - stack_data=0.6.2=pyhd8ed1ab_0
128
+ - tensorboard-plugin-wit=1.8.1=pyhd8ed1ab_0
129
+ - tk=8.6.12=h1ccaba5_0
130
+ - tornado=6.1=py38h0a891b7_3
131
+ - tqdm=4.65.0=pyhd8ed1ab_1
132
+ - traitlets=5.9.0=pyhd8ed1ab_0
133
+ - typing-extensions=4.4.0=hd8ed1ab_0
134
+ - typing_extensions=4.4.0=pyha770c72_0
135
+ - urllib3=1.26.14=pyhd8ed1ab_0
136
+ - wcwidth=0.2.6=pyhd8ed1ab_0
137
+ - xz=5.2.8=h5eee18b_0
138
+ - yaml=0.2.5=h7f98852_2
139
+ - yarl=1.7.2=py38h0a891b7_2
140
+ - zeromq=4.3.4=h9c3ff4c_1
141
+ - zlib=1.2.13=h5eee18b_0
142
+ - zstd=1.5.0=ha95c52a_0
143
+ - pip:
144
+ - absl-py==1.4.0
145
+ - accelerate==0.19.0
146
+ - alembic==1.12.0
147
+ - aniposelib==0.4.3
148
+ - antlr4-python3-runtime==4.8
149
+ - appdirs==1.4.4
150
+ - audioread==3.0.0
151
+ - autopage==0.5.1
152
+ - backoff==2.2.1
153
+ - beautifulsoup4==4.12.2
154
+ - bertopic==0.15.0
155
+ - blobfile==2.0.2
156
+ - boilerpy3==1.0.6
157
+ - cachetools==5.3.1
158
+ - canals==0.2.2
159
+ - cattrs==23.1.2
160
+ - chumpy==0.69
161
+ - click==8.1.3
162
+ - cliff==4.3.0
163
+ - clip==1.0
164
+ - cmaes==0.10.0
165
+ - cmd2==2.4.3
166
+ - colorlog==6.7.0
167
+ - commonmark==0.9.1
168
+ - configer==1.3.1
169
+ - configparser==5.3.0
170
+ - contourpy==1.0.7
171
+ - coremltools==6.1
172
+ - cython==0.29.35
173
+ - decorator==4.4.2
174
+ - diffusers==0.16.1
175
+ - dill==0.3.6
176
+ - docker-pycreds==0.4.0
177
+ - docopt==0.6.2
178
+ - easydict==1.7
179
+ - einops==0.6.1
180
+ - etils==0.9.0
181
+ - events==0.4
182
+ - exceptiongroup==1.1.2
183
+ - farm-haystack==1.18.1
184
+ - filelock==3.12.0
185
+ - fire==0.1.3
186
+ - fonttools==4.39.4
187
+ - freetype-py==2.3.0
188
+ - ftfy==6.1.1
189
+ - fvcore==0.1.5.post20221221
190
+ - gdown==4.7.1
191
+ - gitdb==4.0.10
192
+ - gitpython==3.1.31
193
+ - google-auth==2.22.0
194
+ - google-auth-oauthlib==1.0.0
195
+ - googleapis-common-protos==1.57.0
196
+ - greenlet==2.0.2
197
+ - grpcio==1.56.0
198
+ - h5py==3.9.0
199
+ - hdbscan==0.8.33
200
+ - huggingface-hub==0.14.1
201
+ - hydra==2.5
202
+ - hydra-colorlog==1.1.0.dev1
203
+ - hydra-core==1.1.0rc1
204
+ - hydra-optuna-sweeper==1.1.0.dev2
205
+ - imageio==2.27.0
206
+ - imageio-ffmpeg==0.4.9
207
+ - importlib-resources==5.10.1
208
+ - inflect==7.0.0
209
+ - iopath==0.1.10
210
+ - joblib==1.2.0
211
+ - json-tricks==3.17.1
212
+ - jsonschema==4.18.4
213
+ - jsonschema-specifications==2023.7.1
214
+ - jukebox==1.0
215
+ - lazy-imports==0.3.1
216
+ - lazy-loader==0.2
217
+ - librosa==0.7.2
218
+ - llvmlite==0.31.0
219
+ - lxml==4.9.2
220
+ - mako==1.2.4
221
+ - markdown==3.4.3
222
+ - markupsafe==2.1.3
223
+ - matplotlib==3.7.3
224
+ - monotonic==1.6
225
+ - more-itertools==10.0.0
226
+ - moviepy==1.0.3
227
+ - mpmath==1.2.1
228
+ - msgpack==1.0.5
229
+ - multiprocess==0.70.14
230
+ - netifaces==0.11.0
231
+ - nltk==3.8.1
232
+ - num2words==0.5.12
233
+ - numba==0.48.0
234
+ - numpy==1.24.4
235
+ - omegaconf==2.1.0rc1
236
+ - onnx==1.12.0
237
+ - onnxoptimizer==0.3.2
238
+ - onnxsim==0.4.10
239
+ - opencv-contrib-python==4.8.0.74
240
+ - opencv-python==4.7.0.72
241
+ - optuna==2.4.0
242
+ - p-tqdm==1.4.0
243
+ - packaging==22.0
244
+ - pandas==1.2.4
245
+ - pathos==0.3.0
246
+ - pathtools==0.1.2
247
+ - pbr==5.11.1
248
+ - pickle5==0.0.11
249
+ - pillow==9.5.0
250
+ - pip==23.3.1
251
+ - pkgutil-resolve-name==1.3.10
252
+ - platformdirs==3.9.1
253
+ - plotly==5.17.0
254
+ - pooch==1.6.0
255
+ - portalocker==2.7.0
256
+ - posthog==3.0.1
257
+ - pox==0.3.2
258
+ - ppft==1.7.6.6
259
+ - prettytable==3.9.0
260
+ - proglog==0.1.10
261
+ - promise==2.3
262
+ - prompthub-py==4.0.0
263
+ - protobuf==3.20.1
264
+ - psutil==5.9.5
265
+ - publicip==1.0.1
266
+ - pycocotools==2.0.6
267
+ - pycryptodomex==3.17
268
+ - pydantic==1.10.11
269
+ - pydeprecate==0.3.2
270
+ - pydub==0.25.1
271
+ - pyglet==1.4.0b1
272
+ - pygments==2.13.0
273
+ - pynndescent==0.5.10
274
+ - pyopengl==3.1.0
275
+ - pyopengl-accelerate==3.1.7
276
+ - pyparsing==3.0.9
277
+ - pyperclip==1.8.2
278
+ - python-dotenv==0.17.1
279
+ - pytorch3d==0.3.0
280
+ - pytz==2022.6
281
+ - pywavelets==1.4.1
282
+ - quantulum3==0.9.0
283
+ - rank-bm25==0.2.2
284
+ - referencing==0.30.0
285
+ - regex==2023.5.5
286
+ - requests-cache==0.9.8
287
+ - resampy==0.3.1
288
+ - rich==12.6.0
289
+ - rpds-py==0.9.2
290
+ - safetensors==0.3.1
291
+ - scikit-image==0.18.0
292
+ - scikit-learn==1.2.2
293
+ - scipy==1.10.1
294
+ - sentence-transformers==2.2.2
295
+ - sentencepiece==0.1.99
296
+ - sentry-sdk==1.25.0
297
+ - setproctitle==1.3.2
298
+ - setuptools==68.2.2
299
+ - shapely==2.0.1
300
+ - smmap==5.0.0
301
+ - smplx==0.1.28
302
+ - soundfile==0.10.3.post1
303
+ - soupsieve==2.4.1
304
+ - soxr==0.3.5
305
+ - sqlalchemy==2.0.20
306
+ - sseclient-py==1.7.2
307
+ - stevedore==5.1.0
308
+ - sympy==1.11.1
309
+ - tabulate==0.9.0
310
+ - tbb==2021.10.0
311
+ - tenacity==8.2.2
312
+ - tensorboard==2.13.0
313
+ - tensorboard-data-server==0.7.1
314
+ - tensorboardx==1.6
315
+ - tensorflow-datasets==4.7.0
316
+ - tensorflow-metadata==1.12.0
317
+ - tf2onnx==1.13.0
318
+ - threadpoolctl==3.1.0
319
+ - tifffile==2023.7.4
320
+ - tiktoken==0.4.0
321
+ - timm==0.4.5
322
+ - tokenizers==0.13.3
323
+ - toml==0.10.2
324
+ - torch==1.13.1+cu116
325
+ - torch-tb-profiler==0.4.1
326
+ - torchaudio==0.13.1+cu116
327
+ - torchgeometry==0.1.2
328
+ - torchmetrics==0.7.0
329
+ - torchvision==0.14.1+cu116
330
+ - transformers==4.30.1
331
+ - transforms3d==0.4.1
332
+ - trimesh==3.9.24
333
+ - tzdata==2023.3
334
+ - umap-learn==0.5.4
335
+ - unidecode==1.1.1
336
+ - url-normalize==1.4.3
337
+ - wandb==0.15.2
338
+ - werkzeug==2.3.6
339
+ - wget==3.2
340
+ - wheel==0.41.2
341
+ - yacs==0.1.8
342
+ - zipp==3.16.2
343
+ prefix: /home/lrh/.conda/envs/py38
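The Linux spec above pins torch==1.13.1+cu116 with matching torchvision/torchaudio builds. A quick post-install sanity check, as a minimal sketch (assumes an NVIDIA machine; the +cu116 wheels come from PyTorch's extra index, not PyPI):

    import torch, torchaudio, torchvision

    # Expect "1.13.1+cu116" and True on a correctly provisioned CUDA 11.6 box.
    print(torch.__version__, torchvision.__version__, torchaudio.__version__)
    print(torch.cuda.is_available())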
environment_macos.yaml ADDED
@@ -0,0 +1,64 @@
+ name: FineNet
+ channels:
+   - pytorch
+   - conda-forge
+   - defaults
+ dependencies:
+   - python=3.9
+   - numpy
+   - scipy
+   - matplotlib
+   - pandas
+   - pyyaml
+   - h5py
+   - tqdm
+   - ipython
+   - jupyter_client
+   - cython
+   - ffmpeg
+   - pip
+   - pip:
+       - torch
+       - torchaudio
+       - torchvision
+       - pytorch-lightning==1.9.5
+       - torchmetrics==0.11.4
+       - accelerate
+       - einops
+       - smplx
+       - trimesh
+       - pyrender
+       - opencv-python
+       - opencv-contrib-python
+       - scikit-learn
+       - scikit-image
+       - transformers
+       - diffusers
+       - librosa
+       - soundfile
+       - moviepy
+       - imageio
+       - imageio-ffmpeg
+       - hydra-core==1.3.2
+       - omegaconf==2.3.0
+       - wandb
+       - tensorboard
+       - tensorboardx
+       - easydict
+       - fire
+       - ftfy
+       - regex
+       - pillow
+       - plotly
+       - gdown
+       - huggingface-hub
+       - safetensors
+       - sentence-transformers
+       - pydub
+       - json-tricks
+       - yacs
+       - fvcore
+       - iopath
+       - tabulate
+       - rich
+       - click
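The macOS spec leaves the PyTorch packages unpinned so pip can pick wheels with Metal (MPS) support. generate_all.py below selects "mps" when available, so it is worth confirming the backend after creating this environment (a minimal sketch, nothing repo-specific):

    import torch

    # True on Apple silicon with an MPS-enabled torch build;
    # otherwise the scripts in this repo fall back to CPU.
    print(torch.backends.mps.is_available())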
generate_all.py ADDED
@@ -0,0 +1,57 @@
+ import os
+
+ import numpy as np
+ import torch
+
+ from args import FineDance_parse_test_opt
+ from train_seq import EDGE
+ from dataset.FineDance_dataset import get_train_test_list
+
+ # full test split:
+ # test_list = ["063", "132", "143", "036", "098", "198", "130", "012", "211", "193", "179", "065", "137", "161", "092", "120", "037", "109", "204", "144"]
+ # NOTE: the split returned by get_train_test_list() inside test() shadows this module-level list.
+ test_list = ["063", "144"]
+
+ music_dir = "data/finedance/div_by_time/music_npy_120"
+ count = 10  # number of dances sampled per song
+
+
+ def test(opt):
+     train_list, test_list, ignore_list = get_train_test_list(opt.datasplit)
+
+     # load the checkpoint once, outside the per-song loop
+     model = EDGE(opt, opt.feature_type, opt.checkpoint)
+     model.eval()
+
+     device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
+
+     # directory for optionally saving the dances for eval
+     fk_out = None
+     if opt.save_motions:
+         fk_out = opt.motion_save_dir
+
+     for file in os.listdir(music_dir):
+         if file[:3] in ignore_list:
+             continue
+         if file[:3] not in test_list:
+             continue
+
+         file_name = file[:-4]
+         music_fea = np.load(os.path.join(music_dir, file))
+         music_fea = torch.from_numpy(music_fea).float().to(device).unsqueeze(0)
+         music_fea = music_fea.repeat(count, 1, 1)  # `count` samples per song
+         all_filenames = [file_name] * count
+
+         data_tuple = None, music_fea, all_filenames
+         model.render_sample(
+             data_tuple, "test", opt.render_dir, render_count=10, mode='normal', fk_out=fk_out, render=not opt.no_render
+         )
+     print("Done")
+
+
+ if __name__ == "__main__":
+     opt = FineDance_parse_test_opt()
+     test(opt)
+
+ # python generate_all.py --save_motions
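generate_all.py reads precomputed per-slice music features rather than raw audio. A quick sanity check on one file (the file name is an example, and the 35-D layout is assumed to match extract_features in generate_dance.py below):

    import numpy as np

    fea = np.load("data/finedance/div_by_time/music_npy_120/063_0.npy")
    print(fea.shape)  # expected (120, 35): full_seq_len frames of 35-D features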
generate_dance.py ADDED
@@ -0,0 +1,240 @@
+ """
+ End-to-end dance generation from a music file.
+
+ Usage:
+     python generate_dance.py /path/to/music.mp3
+     python generate_dance.py /path/to/music.mp3 --output my_dance.mp4
+ """
+
+ import argparse
+ import glob
+ import os
+ import shutil
+ import subprocess
+ from functools import cmp_to_key
+ from tempfile import TemporaryDirectory
+
+ os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
+
+ import librosa
+ import numpy as np
+ import soundfile as sf
+ import torch
+ from tqdm import tqdm
+
+ from train_seq import EDGE
+ from render import MovieMaker, motion_data_load_process
+
+
+ # --- Audio utilities (from test.py) ---
+
+ def slice_audio(audio_file, stride, length, out_dir):
+     # cut the song into overlapping windows of `length` seconds, one every `stride` seconds
+     audio, sr = librosa.load(audio_file, sr=None)
+     file_name = os.path.splitext(os.path.basename(audio_file))[0]
+     start_idx = 0
+     idx = 0
+     window = int(length * sr)
+     stride_step = int(stride * sr)
+     while start_idx <= len(audio) - window:
+         audio_slice = audio[start_idx : start_idx + window]
+         sf.write(f"{out_dir}/{file_name}_slice{idx}.wav", audio_slice, sr)
+         start_idx += stride_step
+         idx += 1
+     return idx
+
+
+ def extract_features(fpath, full_seq_len=120):
+     # 35-D music features at 30 FPS: onset envelope (1) + 20 MFCCs + 12 chroma
+     # + onset one-hot (1) + beat one-hot (1), truncated to 4 s (120 frames)
+     FPS = 30
+     HOP_LENGTH = 512
+     SR = FPS * HOP_LENGTH
+
+     data, _ = librosa.load(fpath, sr=SR)
+     envelope = librosa.onset.onset_strength(y=data, sr=SR)
+     mfcc = librosa.feature.mfcc(y=data, sr=SR, n_mfcc=20).T
+     chroma = librosa.feature.chroma_cens(y=data, sr=SR, hop_length=HOP_LENGTH, n_chroma=12).T
+
+     peak_idxs = librosa.onset.onset_detect(
+         onset_envelope=envelope.flatten(), sr=SR, hop_length=HOP_LENGTH
+     )
+     peak_onehot = np.zeros_like(envelope, dtype=np.float32)
+     peak_onehot[peak_idxs] = 1.0
+
+     start_bpm = librosa.beat.tempo(y=librosa.load(fpath)[0])[0]
+     tempo, beat_idxs = librosa.beat.beat_track(
+         onset_envelope=envelope, sr=SR, hop_length=HOP_LENGTH,
+         start_bpm=start_bpm, tightness=100,
+     )
+     beat_onehot = np.zeros_like(envelope, dtype=np.float32)
+     beat_onehot[beat_idxs] = 1.0
+
+     audio_feature = np.concatenate(
+         [envelope[:, None], mfcc, chroma, peak_onehot[:, None], beat_onehot[:, None]],
+         axis=-1,
+     )
+     audio_feature = audio_feature[:4 * FPS]
+     return audio_feature
+
+
+ key_func = lambda x: int(os.path.splitext(x)[0].split("_")[-1].split("slice")[-1])
+
+ def stringintcmp_(a, b):
+     # sort first by song name, then by numeric slice index
+     aa, bb = "".join(a.split("_")[:-1]), "".join(b.split("_")[:-1])
+     ka, kb = key_func(a), key_func(b)
+     if aa < bb:
+         return -1
+     if aa > bb:
+         return 1
+     if ka < kb:
+         return -1
+     if ka > kb:
+         return 1
+     return 0
+
+ stringintkey = cmp_to_key(stringintcmp_)
+
+
+ # --- Model loading ---
+
+ class _Opt:
+     """Minimal config namespace for the EDGE model."""
+     feature_type = "baseline"
+     full_seq_len = 120
+     windows = 10
+     nfeats = 319
+     do_normalize = False
+     datasplit = "cross_genre"
+     project = "experiments/finedance_seq_120_genre/train"
+     exp_name = "finedance_seq_120_genre"
+     render_dir = "tmp_renders"
+     batch_size = 64
+     epochs = 1
+     save_interval = 10
+     ema_interval = 1
+     checkpoint = ""
+     wandb_pj_name = "finedance_seq"
+
+
+ def load_model(checkpoint_path="assets/checkpoints/train-2000.pt"):
+     """Load the EDGE model once. Returns (model, opt)."""
+     opt = _Opt()
+     model = EDGE(opt, opt.feature_type, checkpoint_path)
+     model.eval()
+     return model, opt
+
+
+ def _setup_render_args():
+     """Inject render.py global args for MovieMaker."""
+     import render as render_module
+     render_module.args = argparse.Namespace(
+         mode="smplx", fps=30, gpu="0", modir="", save_path=None
+     )
+
+
+ # --- Main pipeline ---
+
+ def generate(music_path, output_path, model=None, visualizer=None, log_fn=print):
+     """
+     Generate a dance video from a music file.
+
+     Args:
+         music_path: Path to input audio (mp3, wav, etc.)
+         output_path: Where to save the output mp4
+         model: Pre-loaded (model, opt) tuple. If None, loads fresh.
+         visualizer: Pre-built MovieMaker instance. If None, creates one.
+         log_fn: Callable for status messages (default: print)
+     """
+     music_path = os.path.abspath(music_path)
+     songname = os.path.splitext(os.path.basename(music_path))[0]
+
+     # Step 1: Convert to WAV if needed
+     log_fn(f"[1/5] Preparing audio: {os.path.basename(music_path)}")
+     temp_root = TemporaryDirectory()
+     wav_dir = os.path.join(temp_root.name, "wav")
+     os.makedirs(wav_dir)
+     wav_path = os.path.join(wav_dir, songname + ".wav")
+
+     if music_path.lower().endswith(".wav"):
+         shutil.copy2(music_path, wav_path)
+     else:
+         subprocess.run(
+             ["ffmpeg", "-i", music_path, wav_path, "-y"],
+             capture_output=True, check=True,
+         )
+
+     # Step 2: Slice and extract features
+     log_fn("[2/5] Extracting audio features...")
+     slice_dir = os.path.join(temp_root.name, "slices")
+     os.makedirs(slice_dir)
+     stride = 60 / 30  # 2 seconds
+     full_seq_len = 120
+     slice_audio(wav_path, stride, full_seq_len / 30, slice_dir)
+
+     file_list = sorted(glob.glob(f"{slice_dir}/*.wav"), key=stringintkey)
+     out_length = 30  # seconds
+     sample_size = int(out_length / stride) - 1
+
+     cond_list = []
+     for file in tqdm(file_list[:sample_size]):
+         reps = extract_features(file)[:full_seq_len]
+         cond_list.append(reps)
+     cond = torch.from_numpy(np.array(cond_list))
+     filenames = file_list[:sample_size]
+
+     # Step 3: Generate motion
+     log_fn("[3/5] Generating dance motion...")
+     if model is None:
+         edge_model, opt = load_model()
+     else:
+         edge_model, opt = model
+
+     motion_dir = os.path.join(temp_root.name, "motions")
+     os.makedirs(motion_dir)
+
+     data_tuple = (None, cond, filenames)
+     edge_model.render_sample(
+         data_tuple, "gen", temp_root.name, render_count=-1,
+         fk_out=motion_dir, mode="long", render=False,
+     )
+
+     # Step 4: Render video
+     log_fn("[4/5] Rendering video...")
+     motion_file = glob.glob(os.path.join(motion_dir, "*.pkl"))[0]
+     modata = motion_data_load_process(motion_file)
+
+     video_dir = os.path.join(temp_root.name, "video")
+     os.makedirs(video_dir)
+
+     _setup_render_args()
+     if visualizer is None:
+         visualizer = MovieMaker(save_path=video_dir)
+     else:
+         visualizer.save_path = video_dir
+     visualizer.run(modata, tab=songname, music_file=wav_path)
+
+     # Step 5: Copy final output
+     log_fn("[5/5] Saving output...")
+     rendered_file = os.path.join(video_dir, songname + "z.mp4")
+     output_path = os.path.abspath(output_path)
+     os.makedirs(os.path.dirname(output_path), exist_ok=True)
+     shutil.move(rendered_file, output_path)
+
+     temp_root.cleanup()
+     log_fn(f"Done! Output saved to: {output_path}")
+     return output_path
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser(description="Generate a dance video from a music file.")
+     parser.add_argument("music", type=str, help="Path to the input music file (mp3, wav, etc.)")
+     parser.add_argument("--output", type=str, default=None, help="Output video path (default: output/<songname>_dance.mp4)")
+     args = parser.parse_args()
+
+     if args.output is None:
+         songname = os.path.splitext(os.path.basename(args.music))[0]
+         args.output = os.path.join("output", f"{songname}_dance.mp4")
+
+     generate(args.music, args.output)
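generate() accepts a pre-loaded (model, opt) tuple, so the checkpoint is read once when several songs are processed in a row. A short sketch (the song paths are examples):

    from generate_dance import generate, load_model

    model_and_opt = load_model()  # defaults to assets/checkpoints/train-2000.pt
    for song in ["custom_music/demo1.mp3", "custom_music/demo2.wav"]:
        name = song.rsplit("/", 1)[-1].rsplit(".", 1)[0]
        generate(song, f"output/{name}_dance.mp4", model=model_and_opt)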
model/adan.py ADDED
@@ -0,0 +1,123 @@
+ import torch
+ from torch.optim import Optimizer
+
+
+ def exists(val):
+     return val is not None
+
+
+ class Adan(Optimizer):
+     def __init__(
+         self,
+         params,
+         lr=1e-3,
+         betas=(0.02, 0.08, 0.01),
+         eps=1e-8,
+         weight_decay=0,
+         restart_cond: callable = None,
+     ):
+         assert len(betas) == 3
+
+         defaults = dict(
+             lr=lr,
+             betas=betas,
+             eps=eps,
+             weight_decay=weight_decay,
+             restart_cond=restart_cond,
+         )
+
+         super().__init__(params, defaults)
+
+     def step(self, closure=None):
+         loss = None
+
+         if exists(closure):
+             loss = closure()
+
+         for group in self.param_groups:
+
+             lr = group["lr"]
+             beta1, beta2, beta3 = group["betas"]
+             weight_decay = group["weight_decay"]
+             eps = group["eps"]
+             restart_cond = group["restart_cond"]
+
+             for p in group["params"]:
+                 if not exists(p.grad):
+                     continue
+
+                 data, grad = p.data, p.grad.data
+                 assert not grad.is_sparse
+
+                 state = self.state[p]
+
+                 if len(state) == 0:
+                     state["step"] = 0
+                     state["prev_grad"] = torch.zeros_like(grad)
+                     state["m"] = torch.zeros_like(grad)
+                     state["v"] = torch.zeros_like(grad)
+                     state["n"] = torch.zeros_like(grad)
+
+                 step, m, v, n, prev_grad = (
+                     state["step"],
+                     state["m"],
+                     state["v"],
+                     state["n"],
+                     state["prev_grad"],
+                 )
+
+                 if step > 0:
+                     prev_grad = state["prev_grad"]
+
+                 # main algorithm: first moment, gradient-difference moment, second moment
+
+                 m.mul_(1 - beta1).add_(grad, alpha=beta1)
+
+                 grad_diff = grad - prev_grad
+
+                 v.mul_(1 - beta2).add_(grad_diff, alpha=beta2)
+
+                 next_n = (grad + (1 - beta2) * grad_diff) ** 2
+
+                 n.mul_(1 - beta3).add_(next_n, alpha=beta3)
+
+                 # bias correction terms
+
+                 step += 1
+
+                 correct_m, correct_v, correct_n = map(
+                     lambda beta: 1 / (1 - (1 - beta) ** step), (beta1, beta2, beta3)
+                 )
+
+                 # gradient step
+
+                 def grad_step_(data, m, v, n):
+                     weighted_step_size = lr / (n * correct_n).sqrt().add_(eps)
+
+                     denom = 1 + weight_decay * lr
+
+                     data.addcmul_(
+                         weighted_step_size,
+                         (m * correct_m + (1 - beta2) * v * correct_v),
+                         value=-1.0,
+                     ).div_(denom)
+
+                 grad_step_(data, m, v, n)
+
+                 # restart condition
+
+                 if exists(restart_cond) and restart_cond(state):
+                     m.data.copy_(grad)
+                     v.zero_()
+                     n.data.copy_(grad ** 2)
+
+                     grad_step_(data, m, v, n)
+
+                 # set new incremented step
+
+                 prev_grad.copy_(grad)
+                 state["step"] = step
+
+         return loss
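A minimal usage sketch for the optimizer above (the hyperparameters are illustrative, not the repo's training configuration):

    import torch
    import torch.nn.functional as F
    from model.adan import Adan

    net = torch.nn.Linear(10, 1)
    optimizer = Adan(net.parameters(), lr=4e-4, betas=(0.02, 0.08, 0.01), weight_decay=0.02)

    x, y = torch.randn(8, 10), torch.randn(8, 1)
    loss = F.mse_loss(net(x), y)
    loss.backward()
    optimizer.step()       # update from m, v, n and the gradient difference
    optimizer.zero_grad()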
model/diffusion.py ADDED
@@ -0,0 +1,741 @@
1
+ import copy
2
+ import os
3
+ import pickle
4
+ from pathlib import Path
5
+ from functools import partial
6
+
7
+ import numpy as np
8
+ import torch
9
+ import torch.nn as nn
10
+ import torch.nn.functional as F
11
+ from einops import reduce
12
+ from p_tqdm import p_map
13
+ from pytorch3d.transforms import (axis_angle_to_quaternion,
14
+ quaternion_to_axis_angle)
15
+ from tqdm import tqdm
16
+
17
+ from dataset.quaternion import ax_from_6v, quat_slerp
18
+ from vis import skeleton_render
19
+ from vis import SMPLX_Skeleton
20
+ from dataset.preprocess import My_Normalizer as Normalizer
21
+
22
+ from .utils import extract, make_beta_schedule
23
+
24
+ def identity(t, *args, **kwargs):
25
+ return t
26
+
27
+ class EMA:
28
+ def __init__(self, beta):
29
+ super().__init__()
30
+ self.beta = beta
31
+
32
+ def update_model_average(self, ma_model, current_model):
33
+ for current_params, ma_params in zip(
34
+ current_model.parameters(), ma_model.parameters()
35
+ ):
36
+ old_weight, up_weight = ma_params.data, current_params.data
37
+ ma_params.data = self.update_average(old_weight, up_weight)
38
+
39
+ def update_average(self, old, new):
40
+ if old is None:
41
+ return new
42
+ return old * self.beta + (1 - self.beta) * new
43
+
44
+
45
+ class GaussianDiffusion(nn.Module):
46
+ def __init__(
47
+ self,
48
+ model,
49
+ opt,
50
+ horizon,
51
+ repr_dim,
52
+ smplx_model,
53
+ n_timestep=1000,
54
+ schedule="linear",
55
+ loss_type="l1",
56
+ clip_denoised=True,
57
+ predict_epsilon=True,
58
+ guidance_weight=3,
59
+ use_p2=False,
60
+ cond_drop_prob=0.2,
61
+ do_normalize=False,
62
+ ):
63
+ super().__init__()
64
+ self.horizon = horizon
65
+ self.transition_dim = repr_dim
66
+ self.model = model
67
+ self.ema = EMA(0.9999)
68
+ self.master_model = copy.deepcopy(self.model)
69
+ self.normalizer = None
70
+ self.do_normalize = do_normalize
71
+ self.opt = opt
72
+
73
+ self.cond_drop_prob = cond_drop_prob
74
+
75
+ # make a SMPL instance for FK module
76
+ self.smplx_fk = smplx_model
77
+
78
+ betas = torch.Tensor(
79
+ make_beta_schedule(schedule=schedule, n_timestep=n_timestep)
80
+ )
81
+ alphas = 1.0 - betas
82
+ alphas_cumprod = torch.cumprod(alphas, axis=0)
83
+ alphas_cumprod_prev = torch.cat([torch.ones(1), alphas_cumprod[:-1]])
84
+
85
+ self.n_timestep = int(n_timestep)
86
+ self.clip_denoised = clip_denoised
87
+ self.predict_epsilon = predict_epsilon
88
+
89
+ self.register_buffer("betas", betas)
90
+ self.register_buffer("alphas_cumprod", alphas_cumprod)
91
+ self.register_buffer("alphas_cumprod_prev", alphas_cumprod_prev)
92
+
93
+ self.guidance_weight = guidance_weight
94
+
95
+ # calculations for diffusion q(x_t | x_{t-1}) and others
96
+ self.register_buffer("sqrt_alphas_cumprod", torch.sqrt(alphas_cumprod))
97
+ self.register_buffer(
98
+ "sqrt_one_minus_alphas_cumprod", torch.sqrt(1.0 - alphas_cumprod)
99
+ )
100
+ self.register_buffer(
101
+ "log_one_minus_alphas_cumprod", torch.log(1.0 - alphas_cumprod)
102
+ )
103
+ self.register_buffer(
104
+ "sqrt_recip_alphas_cumprod", torch.sqrt(1.0 / alphas_cumprod)
105
+ )
106
+ self.register_buffer(
107
+ "sqrt_recipm1_alphas_cumprod", torch.sqrt(1.0 / alphas_cumprod - 1)
108
+ )
109
+
110
+ # calculations for posterior q(x_{t-1} | x_t, x_0)
111
+ posterior_variance = (
112
+ betas * (1.0 - alphas_cumprod_prev) / (1.0 - alphas_cumprod)
113
+ )
114
+ self.register_buffer("posterior_variance", posterior_variance)
115
+
116
+ ## log calculation clipped because the posterior variance
117
+ ## is 0 at the beginning of the diffusion chain
118
+ self.register_buffer(
119
+ "posterior_log_variance_clipped",
120
+ torch.log(torch.clamp(posterior_variance, min=1e-20)),
121
+ )
122
+ self.register_buffer(
123
+ "posterior_mean_coef1",
124
+ betas * np.sqrt(alphas_cumprod_prev) / (1.0 - alphas_cumprod),
125
+ )
126
+ self.register_buffer(
127
+ "posterior_mean_coef2",
128
+ (1.0 - alphas_cumprod_prev) * np.sqrt(alphas) / (1.0 - alphas_cumprod),
129
+ )
130
+
131
+ # p2 weighting
132
+ self.p2_loss_weight_k = 1
133
+ self.p2_loss_weight_gamma = 0.5 if use_p2 else 0
134
+ self.register_buffer(
135
+ "p2_loss_weight",
136
+ (self.p2_loss_weight_k + alphas_cumprod / (1 - alphas_cumprod))
137
+ ** -self.p2_loss_weight_gamma,
138
+ )
139
+
140
+ ## get loss coefficients and initialize objective
141
+ self.loss_fn = F.mse_loss if loss_type == "l2" else F.l1_loss
142
+
143
+ # ------------------------------------------ sampling ------------------------------------------#
144
+
145
+ def predict_start_from_noise(self, x_t, t, noise):
146
+ """
147
+ if self.predict_epsilon, model output is (scaled) noise;
148
+ otherwise, model predicts x0 directly
149
+ """
150
+ if self.predict_epsilon:
151
+ return (
152
+ extract(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t
153
+ - extract(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape) * noise
154
+ )
155
+ else:
156
+ return noise
157
+
158
+ def predict_noise_from_start(self, x_t, t, x0):
159
+ return (
160
+ (extract(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t - x0) / \
161
+ extract(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape)
162
+ )
163
+
164
+ def model_predictions(self, x, cond, t, weight=None, clip_x_start = False):
165
+ weight = weight if weight is not None else self.guidance_weight
166
+ model_output = self.model.guided_forward(x, cond, t, weight)
167
+ maybe_clip = partial(torch.clamp, min = -1., max = 1.) if clip_x_start else identity
168
+
169
+ x_start = model_output
170
+ x_start = maybe_clip(x_start)
171
+ pred_noise = self.predict_noise_from_start(x, t, x_start)
172
+
173
+ return pred_noise, x_start
174
+
175
+ def q_posterior(self, x_start, x_t, t):
176
+ posterior_mean = (
177
+ extract(self.posterior_mean_coef1, t, x_t.shape) * x_start
178
+ + extract(self.posterior_mean_coef2, t, x_t.shape) * x_t
179
+ )
180
+ posterior_variance = extract(self.posterior_variance, t, x_t.shape)
181
+ posterior_log_variance_clipped = extract(
182
+ self.posterior_log_variance_clipped, t, x_t.shape
183
+ )
184
+ return posterior_mean, posterior_variance, posterior_log_variance_clipped
185
+
186
+ def p_mean_variance(self, x, cond, t):
187
+ # guidance clipping
188
+ if t[0] > 1.0 * self.n_timestep:
189
+ weight = min(self.guidance_weight, 0)
190
+ elif t[0] < 0.1 * self.n_timestep:
191
+ weight = min(self.guidance_weight, 1)
192
+ else:
193
+ weight = self.guidance_weight
194
+
195
+ x_recon = self.predict_start_from_noise(
196
+ x, t=t, noise=self.model.guided_forward(x, cond, t, weight)
197
+ )
198
+
199
+ if self.clip_denoised:
200
+ x_recon.clamp_(-1.0, 1.0)
201
+ else:
202
+ raise RuntimeError("clip_denoised=False is not supported")
203
+
204
+ model_mean, posterior_variance, posterior_log_variance = self.q_posterior(
205
+ x_start=x_recon, x_t=x, t=t
206
+ )
207
+ return model_mean, posterior_variance, posterior_log_variance, x_recon
208
+
209
+ @torch.no_grad()
210
+ def p_sample(self, x, cond, t):
211
+ b, *_, device = *x.shape, x.device
212
+ model_mean, _, model_log_variance, x_start = self.p_mean_variance(
213
+ x=x, cond=cond, t=t
214
+ )
215
+ noise = torch.randn_like(model_mean)
216
+ # no noise when t == 0
217
+ nonzero_mask = (1 - (t == 0).float()).reshape(
218
+ b, *((1,) * (len(noise.shape) - 1))
219
+ )
220
+ x_out = model_mean + nonzero_mask * (0.5 * model_log_variance).exp() * noise
221
+ return x_out, x_start
222
+
223
+ @torch.no_grad()
224
+ def p_sample_loop(
225
+ self,
226
+ shape,
227
+ cond,
228
+ noise=None,
229
+ constraint=None,
230
+ return_diffusion=False,
231
+ start_point=None,
232
+ ):
233
+ device = self.betas.device
234
+
235
+ # default to diffusion over whole timescale
236
+ start_point = self.n_timestep if start_point is None else start_point
237
+ batch_size = shape[0]
238
+ x = torch.randn(shape, device=device) if noise is None else noise.to(device)
239
+ cond = cond.to(device)
240
+
241
+ if return_diffusion:
242
+ diffusion = [x]
243
+
244
+ for i in tqdm(reversed(range(0, start_point))):
245
+ # fill with i
246
+ timesteps = torch.full((batch_size,), i, device=device, dtype=torch.long)
247
+ x, _ = self.p_sample(x, cond, timesteps)
248
+
249
+ if return_diffusion:
250
+ diffusion.append(x)
251
+
252
+ if return_diffusion:
253
+ return x, diffusion
254
+ else:
255
+ return x
256
+
257
+ @torch.no_grad()
258
+ def ddim_sample(self, shape, cond, **kwargs):
259
+ batch, device, total_timesteps, sampling_timesteps, eta = shape[0], self.betas.device, self.n_timestep, 50, 1
260
+
261
+ times = torch.linspace(-1, total_timesteps - 1, steps=sampling_timesteps + 1) # [-1, 0, 1, 2, ..., T-1] when sampling_timesteps == total_timesteps
262
+ times = list(reversed(times.int().tolist()))
263
+ time_pairs = list(zip(times[:-1], times[1:])) # [(T-1, T-2), (T-2, T-3), ..., (1, 0), (0, -1)]
264
+
265
+ x = torch.randn(shape, device = device)
266
+ cond = cond.to(device)
267
+
268
+ x_start = None
269
+
270
+ for time, time_next in tqdm(time_pairs, desc = 'sampling loop time step'):
271
+ time_cond = torch.full((batch,), time, device=device, dtype=torch.long)
272
+ pred_noise, x_start, *_ = self.model_predictions(x, cond, time_cond, clip_x_start = self.clip_denoised)
273
+
274
+ if time_next < 0:
275
+ x = x_start
276
+ continue
277
+
278
+ alpha = self.alphas_cumprod[time]
279
+ alpha_next = self.alphas_cumprod[time_next]
280
+
281
+ sigma = eta * ((1 - alpha / alpha_next) * (1 - alpha_next) / (1 - alpha)).sqrt()
282
+ c = (1 - alpha_next - sigma ** 2).sqrt()
283
+
284
+ noise = torch.randn_like(x)
285
+
286
+ x = x_start * alpha_next.sqrt() + \
287
+ c * pred_noise + \
288
+ sigma * noise
289
+ return x
290
+
291
+ @torch.no_grad()
292
+ def long_ddim_sample(self, shape, cond, **kwargs):
293
+ batch, device, total_timesteps, sampling_timesteps, eta = shape[0], self.betas.device, self.n_timestep, 50, 1
294
+
295
+ if batch == 1:
296
+ return self.ddim_sample(shape, cond)
297
+
298
+ times = torch.linspace(-1, total_timesteps - 1, steps=sampling_timesteps + 1) # [-1, 0, 1, 2, ..., T-1] when sampling_timesteps == total_timesteps
299
+ times = list(reversed(times.int().tolist()))
300
+ weights = np.clip(np.linspace(0, self.guidance_weight * 2, sampling_timesteps), None, self.guidance_weight)
301
+ time_pairs = list(zip(times[:-1], times[1:], weights)) # [(T-1, T-2), (T-2, T-3), ..., (1, 0), (0, -1)]
302
+
303
+ x = torch.randn(shape, device = device)
304
+ cond = cond.to(device)
305
+
306
+ assert batch > 1
307
+ assert x.shape[1] % 2 == 0
308
+ half = x.shape[1] // 2
309
+
310
+ x_start = None
311
+
312
+ for time, time_next, weight in tqdm(time_pairs, desc = 'sampling loop time step'):
313
+ time_cond = torch.full((batch,), time, device=device, dtype=torch.long)
314
+ pred_noise, x_start, *_ = self.model_predictions(x, cond, time_cond, weight=weight, clip_x_start = self.clip_denoised)
315
+
316
+ if time_next < 0:
317
+ x = x_start
318
+ continue
319
+
320
+ alpha = self.alphas_cumprod[time]
321
+ alpha_next = self.alphas_cumprod[time_next]
322
+
323
+ sigma = eta * ((1 - alpha / alpha_next) * (1 - alpha_next) / (1 - alpha)).sqrt()
324
+ c = (1 - alpha_next - sigma ** 2).sqrt()
325
+
326
+ noise = torch.randn_like(x)
327
+
328
+ x = x_start * alpha_next.sqrt() + \
329
+ c * pred_noise + \
330
+ sigma * noise
331
+
332
+ if time > 0:
333
+ # the first half of each sequence is the second half of the previous one
334
+ x[1:, :half] = x[:-1, half:]
335
+ return x
336
+
337
+ @torch.no_grad()
338
+ def inpaint_loop(
339
+ self,
340
+ shape,
341
+ cond,
342
+ noise=None,
343
+ constraint=None,
344
+ return_diffusion=False,
345
+ start_point=None,
346
+ ):
347
+ device = self.betas.device
348
+
349
+ batch_size = shape[0]
350
+ x = torch.randn(shape, device=device) if noise is None else noise.to(device)
351
+ cond = cond.to(device)
352
+ if return_diffusion:
353
+ diffusion = [x]
354
+
355
+ mask = constraint["mask"].to(device) # batch x horizon x channels
356
+ value = constraint["value"].to(device) # batch x horizon x channels
357
+
358
+ start_point = self.n_timestep if start_point is None else start_point
359
+ for i in tqdm(reversed(range(0, start_point))):
360
+ # fill with i
361
+ timesteps = torch.full((batch_size,), i, device=device, dtype=torch.long)
362
+
363
+ # sample x from step i to step i-1
364
+ x, _ = self.p_sample(x, cond, timesteps)
365
+ # enforce constraint between each denoising step
366
+ value_ = self.q_sample(value, timesteps - 1) if (i > 0) else x
367
+ x = value_ * mask + (1.0 - mask) * x
368
+
369
+ if return_diffusion:
370
+ diffusion.append(x)
371
+
372
+ if return_diffusion:
373
+ return x, diffusion
374
+ else:
375
+ return x
376
+
377
+ @torch.no_grad()
378
+ def long_inpaint_loop(
379
+ self,
380
+ shape,
381
+ cond,
382
+ noise=None,
383
+ constraint=None,
384
+ return_diffusion=False,
385
+ start_point=None,
386
+ ):
387
+ device = self.betas.device
388
+
389
+ batch_size = shape[0]
390
+ x = torch.randn(shape, device=device) if noise is None else noise.to(device)
391
+ cond = cond.to(device)
392
+ if return_diffusion:
393
+ diffusion = [x]
394
+
395
+ assert x.shape[1] % 2 == 0
396
+ if batch_size == 1:
397
+ # there's no continuation to do, just do normal
398
+ return self.p_sample_loop(
399
+ shape,
400
+ cond,
401
+ noise=noise,
402
+ constraint=constraint,
403
+ return_diffusion=return_diffusion,
404
+ start_point=start_point,
405
+ )
406
+ assert batch_size > 1
407
+ half = x.shape[1] // 2
408
+
409
+ start_point = self.n_timestep if start_point is None else start_point
410
+ for i in tqdm(reversed(range(0, start_point))):
411
+ # fill with i
412
+ timesteps = torch.full((batch_size,), i, device=device, dtype=torch.long)
413
+
414
+ # sample x from step i to step i-1
415
+ x, _ = self.p_sample(x, cond, timesteps)
416
+ # enforce constraint between each denoising step
417
+ if i > 0:
418
+ # the first half of each sequence is the second half of the previous one
419
+ x[1:, :half] = x[:-1, half:]
420
+
421
+ if return_diffusion:
422
+ diffusion.append(x)
423
+
424
+ if return_diffusion:
425
+ return x, diffusion
426
+ else:
427
+ return x
428
+
429
+ @torch.no_grad()
430
+ def conditional_sample(
431
+ self, shape, cond, constraint=None, *args, horizon=None, **kwargs
432
+ ):
433
+ """
434
+ conditions : [ (time, state), ... ]
435
+ """
436
+ device = self.betas.device
437
+ horizon = horizon or self.horizon
438
+
439
+ return self.p_sample_loop(shape, cond, *args, **kwargs)
440
+
441
+ # ------------------------------------------ training ------------------------------------------#
442
+
443
+ def q_sample(self, x_start, t, noise=None):
444
+ if noise is None:
445
+ noise = torch.randn_like(x_start)
446
+
447
+ sample = (
448
+ extract(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start
449
+ + extract(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape) * noise
450
+ )
451
+
452
+ return sample
453
+
454
+ def p_losses(self, x_start, cond, t):
455
+ noise = torch.randn_like(x_start)
456
+ x_noisy = self.q_sample(x_start=x_start, t=t, noise=noise)  # diffuse x0 forward to x_t
457
+
458
+ # reconstruct
459
+ x_recon = self.model(x_noisy, cond, t, cond_drop_prob=self.cond_drop_prob)
460
+ assert noise.shape == x_recon.shape
461
+
462
+ model_out = x_recon
463
+ if self.predict_epsilon:
464
+ target = noise
465
+ else:
466
+ target = x_start
467
+
468
+ # full reconstruction loss
469
+ loss = self.loss_fn(model_out, target, reduction="none") # mse loss
470
+ loss = reduce(loss, "b ... -> b (...)", "mean")
471
+ loss = loss * extract(self.p2_loss_weight, t, loss.shape)
472
+
473
+ # split off contact from the rest
474
+ _, model_out_ = torch.split(
475
+ model_out, (4, model_out.shape[2] - 4), dim=2  # first 4 dims are foot contacts
476
+ )
477
+ _, target_ = torch.split(target, (4, target.shape[2] - 4), dim=2) # b, length, jxc
478
+
479
+ # velocity loss
480
+ target_v = target_[:, 1:] - target_[:, :-1]
481
+ model_out_v = model_out_[:, 1:] - model_out_[:, :-1]
482
+ v_loss = self.loss_fn(model_out_v, target_v, reduction="none")
483
+ v_loss = reduce(v_loss, "b ... -> b (...)", "mean")
484
+ v_loss = v_loss * extract(self.p2_loss_weight, t, v_loss.shape)
485
+
486
+ # FK loss
487
+ b, s, c = model_out.shape
488
+ model_contact, model_out = torch.split(model_out, (4, model_out.shape[2] - 4), dim=2)
489
+ target_contact, target = torch.split(target, (4, target.shape[2] - 4), dim=2) # b, length, jxc
490
+ model_x = model_out[:, :, :3] # root position
491
+ model_q = ax_from_6v(model_out[:, :, 3:].reshape(b, s, -1, 6))
492
+ target_x = target[:, :, :3]
493
+ target_q = ax_from_6v(target[:, :, 3:].reshape(b, s, -1, 6))
494
+ b, s, nums, c_ = model_q.shape
495
+
496
+ if self.opt.nfeats == 139 or self.opt.nfeats==135:
497
+ model_xp = self.smplx_fk.forward(model_q, model_x)
498
+ target_xp = self.smplx_fk.forward(target_q, target_x)
499
+ else:
500
+ model_q = model_q.view(b*s, -1)
501
+ target_q = target_q.view(b*s, -1)
502
+ model_x = model_x.view(-1, 3)
503
+ target_x = target_x.view(-1, 3)
504
+ model_xp = self.smplx_fk.forward(model_q, model_x)
505
+ target_xp = self.smplx_fk.forward(target_q, target_x)
506
+ model_xp = model_xp.view(b, s, -1, 3)
507
+ target_xp = target_xp.view(b, s, -1, 3)
508
+
511
+ fk_loss = self.loss_fn(model_xp, target_xp, reduction="none")
512
+ fk_loss = reduce(fk_loss, "b ... -> b (...)", "mean")
513
+ fk_loss = fk_loss * extract(self.p2_loss_weight, t, fk_loss.shape)
514
+
515
+ # foot skate loss
516
+ foot_idx = [7, 8, 10, 11]
517
+ # find static indices consistent with model's own predictions
518
+ static_idx = model_contact > 0.95 # N x S x 4
519
+ model_feet = model_xp[:, :, foot_idx] # foot positions (N, S, 4, 3)
520
+ model_foot_v = torch.zeros_like(model_feet)
521
+ model_foot_v[:, :-1] = (
522
+ model_feet[:, 1:, :, :] - model_feet[:, :-1, :, :]
523
+ ) # (N, S-1, 4, 3)
524
+ model_foot_v[~static_idx] = 0
525
+ foot_loss = self.loss_fn(
526
+ model_foot_v, torch.zeros_like(model_foot_v), reduction="none"
527
+ )
528
+ foot_loss = reduce(foot_loss, "b ... -> b (...)", "mean")
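+ # hard-coded weights for the (reconstruction, velocity, FK joint position, foot-skate) loss terms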
529
+ losses = (
530
+ 0.636 * loss.mean(),
531
+ 2.964 * v_loss.mean(),
532
+ 0.646 * fk_loss.mean(),
533
+ 10.942 * foot_loss.mean(),
534
+ )
535
+
536
+ return sum(losses), losses
537
+
538
+ def loss(self, x, cond, t_override=None):
539
+ batch_size = len(x)
540
+ if t_override is None:
541
+ t = torch.randint(0, self.n_timestep, (batch_size,), device=x.device).long()
542
+ else:
543
+ t = torch.full((batch_size,), t_override, device=x.device).long()
544
+ return self.p_losses(x, cond, t)
545
+
546
+ def forward(self, x, cond, t_override=None):
547
+ return self.loss(x, cond, t_override)
548
+
549
+ def partial_denoise(self, x, cond, t):
550
+ x_noisy = self.noise_to_t(x, t)
551
+ return self.p_sample_loop(x.shape, cond, noise=x_noisy, start_point=t)
552
+
553
+ def noise_to_t(self, x, timestep):
554
+ batch_size = len(x)
555
+ t = torch.full((batch_size,), timestep, device=x.device).long()
556
+ return self.q_sample(x, t) if timestep > 0 else x
557
+
558
+ def smplxmodel_fk(self, local_q, root_pos): # input
559
+ b, s, nums, c = local_q.shape
560
+ local_q = local_q.view(b*s, -1)
561
+ full_pose = self.smplx_model(
562
+ betas = torch.zeros([b*s, 10], device=local_q.device, dtype=torch.float32),
563
+ transl = root_pos.view(b*s, -1), # global translation
564
+ global_orient = local_q[:, :3],
565
+ body_pose = local_q[:, 3:66], # 21
566
+ jaw_pose = torch.zeros([b*s, 3], device=local_q.device, dtype=torch.float32), # 1
567
+ leye_pose = torch.zeros([b*s, 3], device=local_q.device, dtype=torch.float32), # 1
568
+ reye_pose= torch.zeros([b*s, 3], device=local_q.device, dtype=torch.float32), # 1
569
+ left_hand_pose = local_q[:, 66:111], # 15
570
+ right_hand_pose = local_q[:, 111:], # 15
571
+ expression = torch.zeros([b*s, 10], device=local_q.device, dtype=torch.float32),
572
+ return_verts = False
573
+ )
574
+ full_pose = full_pose.joints.view(b, s, -1, 3) # b, s, 55, 3
575
+ return full_pose
576
+
577
+
578
+ def render_sample(
579
+ self,
580
+ shape,
581
+ cond,
582
+ normalizer,
583
+ epoch,
584
+ render_out,
585
+ fk_out=None,
586
+ name=None,
587
+ sound=True,
588
+ mode="normal",
589
+ noise=None,
590
+ constraint=None,
591
+ sound_folder="ood_sliced",
592
+ start_point=None,
593
+ render=True,
594
+ # do_normalize=True,
595
+ ):
596
+ if isinstance(shape, tuple):
597
+ if mode == "inpaint":
598
+ func_class = self.inpaint_loop
599
+ elif mode == "normal":
600
+ func_class = self.ddim_sample
601
+ elif mode == "long":
602
+ func_class = self.long_ddim_sample
603
+ else:
604
+ assert False, "Unrecognized inference mode"
605
+ samples = (
606
+ func_class(
607
+ shape,
608
+ cond,
609
+ noise=noise,
610
+ constraint=constraint,
611
+ start_point=start_point,
612
+ )
613
+ .detach()
614
+ .cpu()
615
+ )
616
+ else:
617
+ samples = shape
618
+
619
+ if self.do_normalize:
620
+ with torch.no_grad():
621
+ samples = normalizer.unnormalize(samples)
622
+
623
+ if samples.shape[2] == 319 or samples.shape[2] == 151 or samples.shape[2] == 139:  # feature dims that include 4 contact channels
624
+ sample_contact, samples = torch.split(
625
+ samples, (4, samples.shape[2] - 4), dim=2
626
+ )
627
+ else:
628
+ sample_contact = None
629
+ # do the FK all at once
630
+ b, s, c = samples.shape
631
+ pos = samples[:, :, :3].to(cond.device) # np.zeros((sample.shape[0], 3))
632
+ q = samples[:, :, 3:].reshape(b, s, -1, 6)  # 6D rotation per joint
633
+ # go 6d to ax
634
+ q = ax_from_6v(q).to(cond.device)
635
+
636
+ if self.opt.nfeats == 139 or self.opt.nfeats==135:
637
+ reshape_size = 66
638
+ else:
639
+ reshape_size = 156
640
+
641
+ if mode == "long":
642
+ b, s, c1, c2 = q.shape
643
+ assert s % 2 == 0
644
+ half = s // 2
645
+ if b > 1:
646
+ # if long mode, stitch position using linear interp
647
+ fade_out = torch.ones((1, s, 1)).to(pos.device)
648
+ fade_in = torch.ones((1, s, 1)).to(pos.device)
649
+ fade_out[:, half:, :] = torch.linspace(1, 0, half)[None, :, None].to(
650
+ pos.device
651
+ )
652
+ fade_in[:, :half, :] = torch.linspace(0, 1, half)[None, :, None].to(
653
+ pos.device
654
+ )
655
+
656
+ pos[:-1] *= fade_out
657
+ pos[1:] *= fade_in
658
+
659
+ full_pos = torch.zeros((s + half * (b - 1), 3)).to(pos.device)
660
+ idx = 0
661
+ for pos_slice in pos:
662
+ full_pos[idx : idx + s] += pos_slice
663
+ idx += half
664
+
665
+ # stitch joint angles with slerp
666
+ slerp_weight = torch.linspace(0, 1, half)[None, :, None].to(pos.device)
667
+
668
+ left, right = q[:-1, half:], q[1:, :half]
669
+ # convert to quat
670
+ left, right = (
671
+ axis_angle_to_quaternion(left),
672
+ axis_angle_to_quaternion(right),
673
+ )
674
+ merged = quat_slerp(left, right, slerp_weight) # (b-1) x half x ...
675
+ # convert back
676
+ merged = quaternion_to_axis_angle(merged)
677
+
678
+ full_q = torch.zeros((s + half * (b - 1), c1, c2)).to(pos.device)
679
+ full_q[:half] += q[0, :half]
680
+ idx = half
681
+ for q_slice in merged:
682
+ full_q[idx : idx + half] += q_slice
683
+ idx += half
684
+ full_q[idx : idx + half] += q[-1, half:]
685
+
686
+ # unsqueeze for fk
687
+ full_pos = full_pos.unsqueeze(0)
688
+ full_q = full_q.unsqueeze(0)
689
+ else:
690
+ full_pos = pos
691
+ full_q = q
692
+
693
+
694
+ if fk_out is not None:
695
+ outname = f'{epoch}_{"_".join(os.path.splitext(os.path.basename(name[0]))[0].split("_")[:-1])}.pkl'
696
+ Path(fk_out).mkdir(parents=True, exist_ok=True)
697
+ pickle.dump(
698
+ {
699
+ "smpl_poses": full_q.squeeze(0).reshape((-1, reshape_size)).cpu().numpy(), # local rotations
700
+ "smpl_trans": full_pos.squeeze(0).cpu().numpy(), # root translation
701
+ # "full_pose": full_pose[0], # 3d positions
702
+ },
703
+ open(os.path.join(fk_out, outname), "wb"),
704
+ )
705
+ return
706
+
707
+
708
+ sample_contact = (
709
+ sample_contact.detach().cpu().numpy()
710
+ if sample_contact is not None
711
+ else None
712
+ )
713
+ def inner(xx):
714
+ num, pose = xx
715
+ filename = name[num] if name is not None else None
716
+ contact = sample_contact[num] if sample_contact is not None else None
717
+ skeleton_render(
718
+ pose,
719
+ epoch=f"e{epoch}_b{num}",
720
+ out=render_out,
721
+ name=filename,
722
+ sound=sound,
723
+ contact=contact,
724
+ )
725
+
726
+ # p_map(inner, enumerate(poses)) # poses: 2, 150, 52, 3
728
+ if fk_out is not None and mode != "long":
729
+ Path(fk_out).mkdir(parents=True, exist_ok=True)
730
+ # for num, (qq, pos_, filename, pose) in enumerate(zip(q, pos, name, poses)):
731
+ for num, (qq, pos_, filename) in enumerate(zip(q, pos, name)):
732
+ filename = os.path.basename(filename).split(".")[0]
733
+ outname = f"{epoch}_{num}_{filename}.pkl"
734
+ pickle.dump(
735
+ {
736
+ "smpl_poses": qq.reshape((-1, reshape_size)).cpu().numpy(),
737
+ "smpl_trans": pos_.cpu().numpy(),
738
+ # "full_pose": pose,
739
+ },
740
+ open(f"{fk_out}/{outname}", "wb"),
741
+ )
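The sampling helpers above rely on predict_start_from_noise being the exact inverse of q_sample. A self-contained check with a toy linear schedule (the constants are illustrative, standing in for make_beta_schedule("linear")):

    import torch

    T = 1000
    betas = torch.linspace(1e-4, 2e-2, T)
    alphas_cumprod = torch.cumprod(1.0 - betas, dim=0)

    x0 = torch.randn(2, 8)
    t = torch.tensor([10, 500])
    eps = torch.randn_like(x0)

    ac = alphas_cumprod[t].unsqueeze(-1)
    x_t = ac.sqrt() * x0 + (1 - ac).sqrt() * eps                      # q_sample
    x0_rec = (1.0 / ac).sqrt() * x_t - (1.0 / ac - 1.0).sqrt() * eps  # predict_start_from_noise
    assert torch.allclose(x0, x0_rec, atol=1e-5)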
model/model.py ADDED
@@ -0,0 +1,444 @@
1
+ from typing import Any, Callable, List, Optional, Union
2
+
3
+ import numpy as np
4
+ import torch
5
+ import torch.nn as nn
6
+ from einops import rearrange, reduce, repeat
7
+ from einops.layers.torch import Rearrange, Reduce
8
+ from torch import Tensor
9
+ from torch.nn import functional as F
10
+
11
+ from model.rotary_embedding_torch import RotaryEmbedding
12
+ from model.utils import PositionalEncoding, SinusoidalPosEmb, prob_mask_like
13
+
14
+
15
+ class DenseFiLM(nn.Module):
16
+ """Feature-wise linear modulation (FiLM) generator."""
17
+
18
+ def __init__(self, embed_channels):
19
+ super().__init__()
20
+ self.embed_channels = embed_channels
21
+ self.block = nn.Sequential(
22
+ nn.Mish(), nn.Linear(embed_channels, embed_channels * 2)
23
+ )
24
+
25
+ def forward(self, position):
26
+ pos_encoding = self.block(position)
27
+ pos_encoding = rearrange(pos_encoding, "b c -> b 1 c")
28
+ scale_shift = pos_encoding.chunk(2, dim=-1)
29
+ return scale_shift
30
+
31
+
32
+ def featurewise_affine(x, scale_shift):
33
+ scale, shift = scale_shift
34
+ return (scale + 1) * x + shift
35
+
36
+
37
+ class TransformerEncoderLayer(nn.Module):
38
+ def __init__(
39
+ self,
40
+ d_model: int,
41
+ nhead: int,
42
+ dim_feedforward: int = 2048,
43
+ dropout: float = 0.1,
44
+ activation: Union[str, Callable[[Tensor], Tensor]] = F.relu,
45
+ layer_norm_eps: float = 1e-5,
46
+ batch_first: bool = False,
47
+ norm_first: bool = True,
48
+ device=None,
49
+ dtype=None,
50
+ rotary=None,
51
+ ) -> None:
52
+ super().__init__()
53
+ self.self_attn = nn.MultiheadAttention(
54
+ d_model, nhead, dropout=dropout, batch_first=batch_first
55
+ )
56
+ # Implementation of Feedforward model
57
+ self.linear1 = nn.Linear(d_model, dim_feedforward)
58
+ self.dropout = nn.Dropout(dropout)
59
+ self.linear2 = nn.Linear(dim_feedforward, d_model)
60
+
61
+ self.norm_first = norm_first
62
+ self.norm1 = nn.LayerNorm(d_model, eps=layer_norm_eps)
63
+ self.norm2 = nn.LayerNorm(d_model, eps=layer_norm_eps)
64
+ self.dropout1 = nn.Dropout(dropout)
65
+ self.dropout2 = nn.Dropout(dropout)
66
+ self.activation = activation
67
+
68
+ self.rotary = rotary
69
+ self.use_rotary = rotary is not None
70
+
71
+ def forward(
72
+ self,
73
+ src: Tensor,
74
+ src_mask: Optional[Tensor] = None,
75
+ src_key_padding_mask: Optional[Tensor] = None,
76
+ ) -> Tensor:
77
+ x = src
78
+ if self.norm_first:
79
+ x = x + self._sa_block(self.norm1(x), src_mask, src_key_padding_mask)
80
+ x = x + self._ff_block(self.norm2(x))
81
+ else:
82
+ x = self.norm1(x + self._sa_block(x, src_mask, src_key_padding_mask))
83
+ x = self.norm2(x + self._ff_block(x))
84
+
85
+ return x
86
+
87
+ # self-attention block
88
+ def _sa_block(
89
+ self, x: Tensor, attn_mask: Optional[Tensor], key_padding_mask: Optional[Tensor]
90
+ ) -> Tensor:
91
+ qk = self.rotary.rotate_queries_or_keys(x) if self.use_rotary else x
92
+ x = self.self_attn(
93
+ qk,
94
+ qk,
95
+ x,
96
+ attn_mask=attn_mask,
97
+ key_padding_mask=key_padding_mask,
98
+ need_weights=False,
99
+ )[0]
100
+ return self.dropout1(x)
101
+
102
+ # feed forward block
103
+ def _ff_block(self, x: Tensor) -> Tensor:
104
+ x = self.linear2(self.dropout(self.activation(self.linear1(x))))
105
+ return self.dropout2(x)
106
+
107
+
108
+ class FiLMTransformerDecoderLayer(nn.Module):
109
+ def __init__(
110
+ self,
111
+ d_model: int,
112
+ nhead: int,
113
+ dim_feedforward=2048,
114
+ dropout=0.1,
115
+ activation=F.relu,
116
+ layer_norm_eps=1e-5,
117
+ batch_first=False,
118
+ norm_first=True,
119
+ device=None,
120
+ dtype=None,
121
+ rotary=None,
122
+ ):
123
+ super().__init__()
124
+ self.self_attn = nn.MultiheadAttention(
125
+ d_model, nhead, dropout=dropout, batch_first=batch_first
126
+ )
127
+ self.multihead_attn = nn.MultiheadAttention(
128
+ d_model, nhead, dropout=dropout, batch_first=batch_first
129
+ )
130
+ # Feedforward
131
+ self.linear1 = nn.Linear(d_model, dim_feedforward)
132
+ self.dropout = nn.Dropout(dropout)
133
+ self.linear2 = nn.Linear(dim_feedforward, d_model)
134
+
135
+ self.norm_first = norm_first
136
+ self.norm1 = nn.LayerNorm(d_model, eps=layer_norm_eps)
137
+ self.norm2 = nn.LayerNorm(d_model, eps=layer_norm_eps)
138
+ self.norm3 = nn.LayerNorm(d_model, eps=layer_norm_eps)
139
+ self.dropout1 = nn.Dropout(dropout)
140
+ self.dropout2 = nn.Dropout(dropout)
141
+ self.dropout3 = nn.Dropout(dropout)
142
+ self.activation = activation
143
+
144
+ self.film1 = DenseFiLM(d_model)
145
+ self.film2 = DenseFiLM(d_model)
146
+ self.film3 = DenseFiLM(d_model)
147
+
148
+ self.rotary = rotary
149
+ self.use_rotary = rotary is not None
150
+
151
+ # x, cond, t
152
+ def forward(
153
+ self,
154
+ tgt,
155
+ memory,
156
+ t,
157
+ tgt_mask=None,
158
+ memory_mask=None,
159
+ tgt_key_padding_mask=None,
160
+ memory_key_padding_mask=None,
161
+ ):
162
+ x = tgt
163
+ if self.norm_first:
164
+ # self-attention -> film -> residual
165
+ x_1 = self._sa_block(self.norm1(x), tgt_mask, tgt_key_padding_mask)
166
+ x = x + featurewise_affine(x_1, self.film1(t))
167
+ # cross-attention -> film -> residual
168
+ x_2 = self._mha_block(
169
+ self.norm2(x), memory, memory_mask, memory_key_padding_mask
170
+ )
171
+ x = x + featurewise_affine(x_2, self.film2(t))
172
+ # feedforward -> film -> residual
173
+ x_3 = self._ff_block(self.norm3(x))
174
+ x = x + featurewise_affine(x_3, self.film3(t))
175
+ else:
176
+ x = self.norm1(
177
+ x
178
+ + featurewise_affine(
179
+ self._sa_block(x, tgt_mask, tgt_key_padding_mask), self.film1(t)
180
+ )
181
+ )
182
+ x = self.norm2(
183
+ x
184
+ + featurewise_affine(
185
+ self._mha_block(x, memory, memory_mask, memory_key_padding_mask),
186
+ self.film2(t),
187
+ )
188
+ )
189
+ x = self.norm3(x + featurewise_affine(self._ff_block(x), self.film3(t)))
190
+ return x
191
+
192
+ # self-attention block
193
+ # qkv
194
+ def _sa_block(self, x, attn_mask, key_padding_mask):
195
+ qk = self.rotary.rotate_queries_or_keys(x) if self.use_rotary else x
196
+ x = self.self_attn(
197
+ qk,
198
+ qk,
199
+ x,
200
+ attn_mask=attn_mask,
201
+ key_padding_mask=key_padding_mask,
202
+ need_weights=False,
203
+ )[0]
204
+ return self.dropout1(x)
205
+
206
+ # multihead attention block
207
+ # qkv
208
+ def _mha_block(self, x, mem, attn_mask, key_padding_mask):
209
+ q = self.rotary.rotate_queries_or_keys(x) if self.use_rotary else x
210
+ k = self.rotary.rotate_queries_or_keys(mem) if self.use_rotary else mem
211
+ x = self.multihead_attn(
212
+ q,
213
+ k,
214
+ mem,
215
+ attn_mask=attn_mask,
216
+ key_padding_mask=key_padding_mask,
217
+ need_weights=False,
218
+ )[0]
219
+ return self.dropout2(x)
220
+
221
+ # feed forward block
222
+ def _ff_block(self, x):
223
+ x = self.linear2(self.dropout(self.activation(self.linear1(x))))
224
+ return self.dropout3(x)
225
+
226
+
227
+ class DecoderLayerStack(nn.Module):
228
+ def __init__(self, stack):
229
+ super().__init__()
230
+ self.stack = stack
231
+
232
+ def forward(self, x, cond, t):
233
+ for layer in self.stack:
234
+ x = layer(x, cond, t)
235
+ return x
236
+
237
+
238
+
239
+ class SeqModel(nn.Module):
240
+ def __init__(self,
241
+ nfeats: int,
242
+ seq_len: int = 150, # 5 seconds, 30 fps
243
+ latent_dim: int = 256,
244
+ ff_size: int = 1024,
245
+ num_layers: int = 4,
246
+ num_heads: int = 4,
247
+ dropout: float = 0.1,
248
+ cond_feature_dim: int = 35,
249
+ activation: Callable[[Tensor], Tensor] = F.gelu,
250
+ use_rotary=True,
251
+ **kwargs
252
+ ) -> None:
253
+ super().__init__()
254
+
255
+ self.network = nn.ModuleDict()
256
+ self.network['body_net'] = DanceDecoder(
257
+ nfeats=4+3+22*6,
258
+ seq_len=seq_len,
259
+ latent_dim=latent_dim,
260
+ ff_size=ff_size,
261
+ num_layers=num_layers,
262
+ num_heads=num_heads,
263
+ dropout=dropout,
264
+ cond_feature_dim=cond_feature_dim,
265
+ activation=activation
266
+ )
267
+ self.network['hand_net'] = DanceDecoder(
268
+ nfeats=30*6,
269
+ seq_len=seq_len,
270
+ latent_dim=latent_dim,
271
+ ff_size=ff_size,
272
+ num_layers=num_layers,
273
+ num_heads=num_heads,
274
+ dropout=dropout,
275
+ cond_feature_dim=35+139,  # music features (35) + body-branch output (139)
276
+ activation=activation
277
+ )
278
+
279
+
280
+ def forward(self, x: Tensor, cond_embed: Tensor, times: Tensor, cond_drop_prob: float = 0.0):
281
+ x_body_start = x[:,:,:4+135]
282
+ x_hand_start = x[:,:,4+135:]
283
+ body_output = self.network['body_net'](x_body_start, cond_embed, times, cond_drop_prob)
284
+
285
+ cond_embed = torch.cat([body_output, cond_embed], dim = -1)
286
+ hand_output = self.network['hand_net'](x_hand_start, cond_embed, times, cond_drop_prob)
287
+
288
+ output = torch.cat([body_output, hand_output], dim=-1)
289
+ return output
290
+
291
+ def guided_forward(self, x, cond_embed, times, guidance_weight):
292
+ unc = self.forward(x, cond_embed, times, cond_drop_prob=1)
293
+
294
+ conditioned = self.forward(x, cond_embed, times, cond_drop_prob=0)
295
+ return unc + (conditioned - unc) * guidance_weight
296
+
297
+
298
+ class DanceDecoder(nn.Module):
299
+ def __init__(
300
+ self,
301
+ nfeats: int,
302
+ seq_len: int = 150, # 5 seconds, 30 fps
303
+ latent_dim: int = 256,
304
+ ff_size: int = 1024,
305
+ num_layers: int = 4,
306
+ num_heads: int = 4,
307
+ dropout: float = 0.1,
308
+ cond_feature_dim: int = 35,
309
+ activation: Callable[[Tensor], Tensor] = F.gelu,
310
+ use_rotary=True,
311
+ **kwargs
312
+ ) -> None:
313
+
314
+ super().__init__()
315
+
316
+ output_feats = nfeats
317
+
318
+ # positional embeddings
319
+ self.rotary = None
320
+ self.abs_pos_encoding = nn.Identity()
321
+ # if rotary, replace absolute embedding with a rotary embedding instance (absolute becomes an identity)
322
+ if use_rotary:
323
+ self.rotary = RotaryEmbedding(dim=latent_dim)
324
+ else:
325
+ self.abs_pos_encoding = PositionalEncoding(
326
+ latent_dim, dropout, batch_first=True
327
+ )
328
+
329
+ # time embedding processing
330
+ self.time_mlp = nn.Sequential(
331
+ SinusoidalPosEmb(latent_dim),
332
+ nn.Linear(latent_dim, latent_dim * 4),
333
+ nn.Mish(),
334
+ )
335
+
336
+ self.to_time_cond = nn.Sequential(nn.Linear(latent_dim * 4, latent_dim),)
337
+
338
+ self.to_time_tokens = nn.Sequential(
339
+ nn.Linear(latent_dim * 4, latent_dim * 2), # 2 time tokens
340
+ Rearrange("b (r d) -> b r d", r=2),
341
+ )
342
+
343
+ # null embeddings for guidance dropout
344
+ self.null_cond_embed = nn.Parameter(torch.randn(1, seq_len, latent_dim))
345
+ self.null_cond_hidden = nn.Parameter(torch.randn(1, latent_dim))
346
+
347
+ self.norm_cond = nn.LayerNorm(latent_dim)
348
+
349
+ # input projection
350
+ self.input_projection = nn.Linear(nfeats, latent_dim)
351
+ self.cond_encoder = nn.Sequential()
352
+ for _ in range(2):
353
+ self.cond_encoder.append(
354
+ TransformerEncoderLayer(
355
+ d_model=latent_dim,
356
+ nhead=num_heads,
357
+ dim_feedforward=ff_size,
358
+ dropout=dropout,
359
+ activation=activation,
360
+ batch_first=True,
361
+ rotary=self.rotary,
362
+ )
363
+ )
364
+ # conditional projection
365
+ self.cond_projection = nn.Linear(cond_feature_dim, latent_dim)
366
+ self.non_attn_cond_projection = nn.Sequential(
367
+ nn.LayerNorm(latent_dim),
368
+ nn.Linear(latent_dim, latent_dim),
369
+ nn.SiLU(),
370
+ nn.Linear(latent_dim, latent_dim),
371
+ )
372
+ # decoder
373
+ decoderstack = nn.ModuleList([])
374
+ for _ in range(num_layers):
375
+ decoderstack.append(
376
+ FiLMTransformerDecoderLayer(
377
+ latent_dim,
378
+ num_heads,
379
+ dim_feedforward=ff_size,
380
+ dropout=dropout,
381
+ activation=activation,
382
+ batch_first=True,
383
+ rotary=self.rotary,
384
+ )
385
+ )
386
+
387
+ self.seqTransDecoder = DecoderLayerStack(decoderstack)
388
+
389
+ self.final_layer = nn.Linear(latent_dim, output_feats)
390
+
391
+ def guided_forward(self, x, cond_embed, times, guidance_weight):
392
+ unc = self.forward(x, cond_embed, times, cond_drop_prob=1)
393
+ conditioned = self.forward(x, cond_embed, times, cond_drop_prob=0)
394
+
395
+ return unc + (conditioned - unc) * guidance_weight
396
+
397
+ def forward(
398
+ self, x: Tensor, cond_embed: Tensor, times: Tensor, cond_drop_prob: float = 0.0
399
+ ):
400
+ batch_size, device = x.shape[0], x.device
401
+
402
+ # project to latent space
403
+ x = self.input_projection(x)
404
+ # add the positional embeddings of the input sequence to provide temporal information
405
+ x = self.abs_pos_encoding(x)
406
+
407
+ # create music conditional embedding with conditional dropout
408
+ keep_mask = prob_mask_like((batch_size,), 1 - cond_drop_prob, device=device)
409
+ keep_mask_embed = rearrange(keep_mask, "b -> b 1 1")
410
+ keep_mask_hidden = rearrange(keep_mask, "b -> b 1")
411
+
412
+ cond_tokens = self.cond_projection(cond_embed)
413
+ # encode tokens
414
+ cond_tokens = self.abs_pos_encoding(cond_tokens)
415
+ cond_tokens = self.cond_encoder(cond_tokens)
416
+
417
+ null_cond_embed = self.null_cond_embed.to(cond_tokens.dtype)
418
+ cond_tokens = torch.where(keep_mask_embed, cond_tokens, null_cond_embed)
419
+
420
+ mean_pooled_cond_tokens = cond_tokens.mean(dim=-2)
421
+ cond_hidden = self.non_attn_cond_projection(mean_pooled_cond_tokens)
422
+
423
+ # create the diffusion timestep embedding, add the extra music projection
424
+ t_hidden = self.time_mlp(times)
425
+
426
+ # project to attention and FiLM conditioning
427
+ t = self.to_time_cond(t_hidden)
428
+ t_tokens = self.to_time_tokens(t_hidden)
429
+
430
+ # FiLM conditioning
431
+ null_cond_hidden = self.null_cond_hidden.to(t.dtype)
432
+ cond_hidden = torch.where(keep_mask_hidden, cond_hidden, null_cond_hidden)
433
+ t += cond_hidden
434
+
435
+ # cross-attention conditioning
436
+ c = torch.cat((cond_tokens, t_tokens), dim=-2)
437
+ cond_tokens = self.norm_cond(c)
438
+
439
+ # Pass through the transformer decoder
440
+ # attending to the conditional embedding
441
+ output = self.seqTransDecoder(x, cond_tokens, t)
442
+
443
+ output = self.final_layer(output)
444
+ return output
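The guided_forward method above implements classifier-free guidance: one pass with the music condition fully dropped, one with it fully kept, then a linear extrapolation by guidance_weight. A minimal, self-contained sketch of just that combination step (the tensors are dummies and the shapes are illustrative):

    import torch

    def cfg_combine(unconditional, conditioned, guidance_weight):
        # extrapolate from the unconditional prediction toward (and, for
        # weight > 1, past) the conditioned prediction
        return unconditional + (conditioned - unconditional) * guidance_weight

    unc = torch.zeros(2, 150, 319)   # stands in for forward(..., cond_drop_prob=1)
    cond = torch.ones(2, 150, 319)   # stands in for forward(..., cond_drop_prob=0)
    out = cfg_combine(unc, cond, guidance_weight=2.0)
    print(out[0, 0, 0].item())       # 2.0, pushed past the conditioned value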
model/rotary_embedding_torch.py ADDED
@@ -0,0 +1,132 @@
1
+ from inspect import isfunction
2
+ from math import log, pi
3
+
4
+ import torch
5
+ from einops import rearrange, repeat
6
+ from torch import einsum, nn
7
+
8
+ # helper functions
9
+
10
+
11
+ def exists(val):
12
+ return val is not None
13
+
14
+
15
+ def broadcat(tensors, dim=-1):
16
+ num_tensors = len(tensors)
17
+ shape_lens = set(list(map(lambda t: len(t.shape), tensors)))
18
+ assert len(shape_lens) == 1, "tensors must all have the same number of dimensions"
19
+ shape_len = list(shape_lens)[0]
20
+
21
+ dim = (dim + shape_len) if dim < 0 else dim
22
+ dims = list(zip(*map(lambda t: list(t.shape), tensors)))
23
+
24
+ expandable_dims = [(i, val) for i, val in enumerate(dims) if i != dim]
25
+ assert all(
26
+ [*map(lambda t: len(set(t[1])) <= 2, expandable_dims)]
27
+ ), "invalid dimensions for broadcastable concatentation"
28
+ max_dims = list(map(lambda t: (t[0], max(t[1])), expandable_dims))
29
+ expanded_dims = list(map(lambda t: (t[0], (t[1],) * num_tensors), max_dims))
30
+ expanded_dims.insert(dim, (dim, dims[dim]))
31
+ expandable_shapes = list(zip(*map(lambda t: t[1], expanded_dims)))
32
+ tensors = list(map(lambda t: t[0].expand(*t[1]), zip(tensors, expandable_shapes)))
33
+ return torch.cat(tensors, dim=dim)
34
+
35
+
36
+ # rotary embedding helper functions
37
+
38
+
39
+ def rotate_half(x):
40
+ x = rearrange(x, "... (d r) -> ... d r", r=2)
41
+ x1, x2 = x.unbind(dim=-1)
42
+ x = torch.stack((-x2, x1), dim=-1)
43
+ return rearrange(x, "... d r -> ... (d r)")
44
+
45
+
46
+ def apply_rotary_emb(freqs, t, start_index=0):
47
+ freqs = freqs.to(t)
48
+ rot_dim = freqs.shape[-1]
49
+ end_index = start_index + rot_dim
50
+ assert (
51
+ rot_dim <= t.shape[-1]
52
+ ), f"feature dimension {t.shape[-1]} is not of sufficient size to rotate in all the positions {rot_dim}"
53
+ t_left, t, t_right = (
54
+ t[..., :start_index],
55
+ t[..., start_index:end_index],
56
+ t[..., end_index:],
57
+ )
58
+ t = (t * freqs.cos()) + (rotate_half(t) * freqs.sin())
59
+ return torch.cat((t_left, t, t_right), dim=-1)
60
+
61
+
62
+ # learned rotation helpers
63
+
64
+
65
+ def apply_learned_rotations(rotations, t, start_index=0, freq_ranges=None):
66
+ if exists(freq_ranges):
67
+ rotations = einsum("..., f -> ... f", rotations, freq_ranges)
68
+ rotations = rearrange(rotations, "... r f -> ... (r f)")
69
+
70
+ rotations = repeat(rotations, "... n -> ... (n r)", r=2)
71
+ return apply_rotary_emb(rotations, t, start_index=start_index)
72
+
73
+
74
+ # classes
75
+
76
+
77
+ class RotaryEmbedding(nn.Module):
78
+ def __init__(
79
+ self,
80
+ dim,
81
+ custom_freqs=None,
82
+ freqs_for="lang",
83
+ theta=10000,
84
+ max_freq=10,
85
+ num_freqs=1,
86
+ learned_freq=False,
87
+ ):
88
+ super().__init__()
89
+ if exists(custom_freqs):
90
+ freqs = custom_freqs
91
+ elif freqs_for == "lang":
92
+ freqs = 1.0 / (
93
+ theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim)
94
+ )
95
+ elif freqs_for == "pixel":
96
+ freqs = torch.linspace(1.0, max_freq / 2, dim // 2) * pi
97
+ elif freqs_for == "constant":
98
+ freqs = torch.ones(num_freqs).float()
99
+ else:
100
+ raise ValueError(f"unknown modality {freqs_for}")
101
+
102
+ self.cache = dict()
103
+
104
+ if learned_freq:
105
+ self.freqs = nn.Parameter(freqs)
106
+ else:
107
+ self.register_buffer("freqs", freqs)
108
+
109
+ def rotate_queries_or_keys(self, t, seq_dim=-2):
110
+ device = t.device
111
+ seq_len = t.shape[seq_dim]
112
+ freqs = self.forward(
113
+ lambda: torch.arange(seq_len, device=device), cache_key=seq_len
114
+ )
115
+ return apply_rotary_emb(freqs, t)
116
+
117
+ def forward(self, t, cache_key=None):
118
+ if exists(cache_key) and cache_key in self.cache:
119
+ return self.cache[cache_key]
120
+
121
+ if isfunction(t):
122
+ t = t()
123
+
124
+ freqs = self.freqs
125
+
126
+ freqs = torch.einsum("..., f -> ... f", t.type(freqs.dtype), freqs)
127
+ freqs = repeat(freqs, "... n -> ... (n r)", r=2)
128
+
129
+ if exists(cache_key):
130
+ self.cache[cache_key] = freqs
131
+
132
+ return freqs
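A minimal usage sketch for the RotaryEmbedding class above, assuming the repository root is on sys.path and query/key tensors shaped (batch, heads, seq, head_dim); the data is random:

    import torch
    from model.rotary_embedding_torch import RotaryEmbedding

    rotary = RotaryEmbedding(dim=32)        # rotate the first 32 of 64 head dims
    q = torch.randn(1, 8, 150, 64)          # (batch, heads, seq, head_dim)
    k = torch.randn(1, 8, 150, 64)
    q = rotary.rotate_queries_or_keys(q)    # positions enter as rotations, so the
    k = rotary.rotate_queries_or_keys(k)    # q-k dot product depends only on relative offset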
model/utils.py ADDED
@@ -0,0 +1,99 @@
1
+ import math
2
+
3
+ import numpy as np
4
+ import torch
5
+ from einops import rearrange, reduce, repeat
6
+ from einops.layers.torch import Rearrange
7
+ from torch import nn
8
+
9
+
10
+ # absolute positional embedding used for vanilla transformer sequential data
11
+ class PositionalEncoding(nn.Module):
12
+ def __init__(self, d_model, dropout=0.1, max_len=500, batch_first=False):
13
+ super().__init__()
14
+ self.batch_first = batch_first
15
+
16
+ self.dropout = nn.Dropout(p=dropout)
17
+
18
+ pe = torch.zeros(max_len, d_model)
19
+ position = torch.arange(0, max_len).unsqueeze(1)
20
+ div_term = torch.exp(torch.arange(0, d_model, 2) * (-np.log(10000.0) / d_model))
21
+ pe[:, 0::2] = torch.sin(position * div_term)
22
+ pe[:, 1::2] = torch.cos(position * div_term)
23
+ pe = pe.unsqueeze(0).transpose(0, 1)
24
+
25
+ self.register_buffer("pe", pe)
26
+
27
+ def forward(self, x):
28
+ if self.batch_first:
29
+ x = x + self.pe.permute(1, 0, 2)[:, : x.shape[1], :]
30
+ else:
31
+ x = x + self.pe[: x.shape[0], :]
32
+ return self.dropout(x)
33
+
34
+
35
+ # very similar positional embedding used for diffusion timesteps
36
+ class SinusoidalPosEmb(nn.Module):
37
+ def __init__(self, dim):
38
+ super().__init__()
39
+ self.dim = dim
40
+
41
+ def forward(self, x):
42
+ device = x.device
43
+ half_dim = self.dim // 2
44
+ emb = math.log(10000) / (half_dim - 1)
45
+ emb = torch.exp(torch.arange(half_dim, device=device) * -emb)
46
+ emb = x[:, None] * emb[None, :]
47
+ emb = torch.cat((emb.sin(), emb.cos()), dim=-1)
48
+ return emb
49
+
50
+
51
+ # dropout mask
52
+ def prob_mask_like(shape, prob, device):
53
+ if prob == 1:
54
+ return torch.ones(shape, device=device, dtype=torch.bool)
55
+ elif prob == 0:
56
+ return torch.zeros(shape, device=device, dtype=torch.bool)
57
+ else:
58
+ return torch.zeros(shape, device=device).float().uniform_(0, 1) < prob
59
+
60
+
61
+ def extract(a, t, x_shape):
62
+ b, *_ = t.shape
63
+ out = a.gather(-1, t)
64
+ return out.reshape(b, *((1,) * (len(x_shape) - 1)))
65
+
66
+
67
+ def make_beta_schedule(
68
+ schedule, n_timestep, linear_start=1e-4, linear_end=2e-2, cosine_s=8e-3
69
+ ):
70
+ if schedule == "linear":
71
+ betas = (
72
+ torch.linspace(
73
+ linear_start ** 0.5, linear_end ** 0.5, n_timestep, dtype=torch.float64
74
+ )
75
+ ** 2
76
+ )
77
+
78
+ elif schedule == "cosine":
79
+ timesteps = (
80
+ torch.arange(n_timestep + 1, dtype=torch.float64) / n_timestep + cosine_s
81
+ )
82
+ alphas = timesteps / (1 + cosine_s) * np.pi / 2
83
+ alphas = torch.cos(alphas).pow(2)
84
+ alphas = alphas / alphas[0]
85
+ betas = 1 - alphas[1:] / alphas[:-1]
86
+ betas = np.clip(betas, a_min=0, a_max=0.999)
87
+
88
+ elif schedule == "sqrt_linear":
89
+ betas = torch.linspace(
90
+ linear_start, linear_end, n_timestep, dtype=torch.float64
91
+ )
92
+ elif schedule == "sqrt":
93
+ betas = (
94
+ torch.linspace(linear_start, linear_end, n_timestep, dtype=torch.float64)
95
+ ** 0.5
96
+ )
97
+ else:
98
+ raise ValueError(f"schedule '{schedule}' unknown.")
99
+ return betas.numpy()
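A quick sketch of how these schedule utilities compose in a DDPM-style setup. The training script passes schedule="cosine" and n_timestep=1000 to GaussianDiffusion; "linear" is used here only to keep the example minimal:

    import numpy as np
    import torch
    from model.utils import make_beta_schedule, extract

    betas = make_beta_schedule("linear", n_timestep=1000)       # np.ndarray, shape (1000,)
    alphas_cumprod = torch.from_numpy(np.cumprod(1.0 - betas))  # cumulative alpha-bar

    t = torch.tensor([0, 999])                  # per-sample diffusion timesteps
    x_shape = (2, 150, 319)                     # hypothetical motion batch shape
    coef = extract(alphas_cumprod, t, x_shape)  # shape (2, 1, 1), broadcastable over x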
render.py ADDED
@@ -0,0 +1,395 @@
1
+ import pickle
2
+ import numpy as np
3
+ import torch
4
+ import cv2
5
+ import os
6
+ # os.environ["PYOPENGL_PLATFORM"] = "osmesa" # Not available on macOS
7
+ os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
8
+ from tqdm import tqdm
9
+ from smplx import SMPL, SMPLX, SMPLH
10
+ import pyrender
11
+ import trimesh
12
+ import subprocess
13
+ import pickle
14
+ from pytorch3d.transforms import (axis_angle_to_matrix, matrix_to_axis_angle,
15
+ matrix_to_quaternion, matrix_to_rotation_6d,
16
+ quaternion_to_matrix, rotation_6d_to_matrix)
17
+
18
+ import sys
19
+ sys.path.append('.')
20
+ import argparse
21
+
22
+
23
+ def quat_to_6v(q):
24
+ assert q.shape[-1] == 4
25
+ mat = quaternion_to_matrix(q)
26
+ mat = matrix_to_rotation_6d(mat)
27
+ return mat
28
+
29
+
30
+ def quat_from_6v(q):
31
+ assert q.shape[-1] == 6
32
+ mat = rotation_6d_to_matrix(q)
33
+ quat = matrix_to_quaternion(mat)
34
+ return quat
35
+
36
+
37
+ def ax_to_6v(q):
38
+ assert q.shape[-1] == 3
39
+ mat = axis_angle_to_matrix(q)
40
+ mat = matrix_to_rotation_6d(mat)
41
+ return mat
42
+
43
+
44
+ def ax_from_6v(q):
45
+ assert q.shape[-1] == 6
46
+ mat = rotation_6d_to_matrix(q)
47
+ ax = matrix_to_axis_angle(mat)
48
+ return ax
49
+
50
+ class MovieMaker():
51
+ def __init__(self, save_path) -> None:
52
+ self.mag = 2
53
+ self.eyes = np.array([[3,-3,2], [0,0,-2], [0,0,4], [-8,-8,1], [0,-2,4], [0,2,4]])
54
+ self.centers = np.array([[0,0,0],[0,0,0],[0,0.5,0],[0,0,-1], [0,0.5,0], [0,0.5,0]])
55
+ self.ups = np.array([[0,0,1],[0,1,0],[0,1,0],[0,0,-1], [0,1,0], [0,1,0]])
56
+ self.save_path = save_path
57
+ self.fps = args.fps
58
+ self.img_size = (1200,1200)
59
+
60
+
61
+ # SMPLH_path = "assets/smpl_model/smplh/SMPLH_MALE.pkl"
62
+ # SMPL_path = "assets/smpl_model/smpl/SMPL_MALE.pkl"
63
+ SMPLX_path = "assets/smpl_model/smplx/SMPLX_NEUTRAL.npz"
64
+ trimesh_path = 'assets/NORMAL_new.obj'
65
+
66
+
67
+ # self.smplh = SMPLH(SMPLH_path, use_pca=False, flat_hand_mean=True)
68
+ # self.smplh.to(f'cuda:{args.gpu}').eval()
69
+
70
+ # self.smpl = SMPL(SMPL_path)
71
+ # self.smpl.to(f'cuda:{args.gpu}').eval()
72
+
73
+ self.smplx = SMPLX(SMPLX_path, use_pca=False, flat_hand_mean=True).eval()
74
+ _device = "mps" if torch.backends.mps.is_available() else "cpu"
75
+ self.smplx.to(_device).eval()
76
+
77
+ self.scene = pyrender.Scene(bg_color=[0.0, 0.0, 0.0, 1.0])
78
+ camera = pyrender.PerspectiveCamera(yfov=np.pi / 3.0)
79
+ camera_pose = look_at(self.eyes[5], self.centers[5], self.ups[5]) # 2
80
+ self.scene.add(camera, pose=camera_pose)
81
+ light = pyrender.DirectionalLight(color=np.ones(3), intensity=3.0)
82
+ self.scene.add(light, pose=camera_pose)
83
+ self.r = pyrender.OffscreenRenderer(self.img_size[0], self.img_size[1])
84
+
85
+
86
+ # self.mesh = trimesh.load(trimesh_path)
87
+ # floor_mesh = pyrender.Mesh.from_trimesh(self.mesh)
88
+ # floor_node = self.scene.add(floor_mesh)
89
+
90
+
91
+ def save_video(self, save_path, color_list):
92
+ # save_path = os.path.join(save_path,'move.mp4')
93
+ f = cv2.VideoWriter_fourcc('m', 'p', '4', 'v')
94
+ videowriter = cv2.VideoWriter(save_path,f,self.fps,self.img_size)
95
+ for i in range(len(color_list)):
96
+ videowriter.write(color_list[i][:,:,::-1])
97
+ videowriter.release()
98
+
99
+ def get_imgs(self, motion):
100
+ meshes = self.motion2mesh(motion)
101
+ imgs = self.render_imgs(meshes)
102
+ return np.concatenate(imgs, axis=1)
103
+
104
+ def motion2mesh(self, motion):
105
+ if args.mode == "smpl":
106
+ output = self.smpl.forward(
107
+ betas = torch.zeros([motion.shape[0], 10]).to(motion.device),
108
+ transl = motion[:,:3],
109
+ global_orient = motion[:,3:6],
110
+ body_pose = torch.cat([motion[:,6:69], motion[:,69:72], motion[:,114:117]], dim=1)
111
+ )
112
+ elif args.mode == "smplh":
113
+ output = self.smplh.forward(
114
+ betas = torch.zeros([motion.shape[0], 10]).to(motion.device),
115
+ # transl = motion[:,:3],
116
+ transl = torch.tensor([[0,0,-1]]).expand(motion.shape[0],-1).to(motion.device) ,
117
+ global_orient = motion[:,3:6],
118
+ body_pose = motion[:,6:69],
119
+ left_hand_pose = motion[:,69:114],
120
+ right_hand_pose = motion[:,114:159],
121
+ )
122
+ elif args.mode == "smplx":
123
+ output = self.smplx.forward(
124
+ betas = torch.zeros([motion.shape[0], 10]).to(motion.device),
125
+ # transl = motion[:,:3],
126
+ transl = motion[:,:3],
127
+ global_orient = motion[:,3:6],
128
+ body_pose = motion[:,6:69],
129
+ jaw_pose = torch.zeros([motion.shape[0], 3]).to(motion),
130
+ leye_pose = torch.zeros([motion.shape[0], 3]).to(motion),
131
+ reye_pose = torch.zeros([motion.shape[0], 3]).to(motion),
132
+ left_hand_pose = motion[:,69:69+45],
133
+ right_hand_pose = motion[:,69+45:],
134
+ expression= torch.zeros([motion.shape[0], 10]).to(motion),
135
+ )
136
+
137
+ meshes = []
138
+ for i in range(output.vertices.shape[0]):
139
+ if args.mode == 'smplh':
140
+ mesh = trimesh.Trimesh(output.vertices[i].cpu(), self.smplh.faces)
141
+ elif args.mode == 'smplx':
142
+ mesh = trimesh.Trimesh(output.vertices[i].cpu(), self.smplx.faces)
143
+ elif args.mode == 'smpl':
144
+ mesh = trimesh.Trimesh(output.vertices[i].cpu(), self.smpl.faces)
145
+ # mesh.export(os.path.join(self.save_path, f'{i}.obj'))
146
+ meshes.append(mesh)
147
+
148
+ return meshes
149
+
150
+
151
+ def render_multi_view(self, meshes, music_file, tab='', eyes=None, centers=None, ups=None, views=1):
152
+ if eyes is not None and centers is not None and ups is not None: # numpy arrays are ambiguous in a bare boolean test
153
+ assert eyes.shape == centers.shape == ups.shape
154
+ else:
155
+ eyes = self.eyes
156
+ centers = self.centers
157
+ ups = self.ups
158
+
159
+ for i in range(views):
160
+ color_list = self.render_single_view(meshes) # render_single_view takes no camera args; the pose is fixed at scene construction
161
+ movie_file = os.path.join(self.save_path, tab + '-' + str(i) + '.mp4')
162
+ output_file = os.path.join(self.save_path, tab + '-' + str(i) + '-music.mp4')
163
+ self.save_video(movie_file, color_list)
164
+ if music_file is not None:
165
+ subprocess.run(['ffmpeg','-i',movie_file,'-i',music_file,'-shortest',output_file])
166
+ else:
167
+ subprocess.run(['ffmpeg','-i',movie_file,output_file])
168
+ # if music_file is not None:
169
+ # subprocess.run(['ffmpeg','-i',movie_file,'-i',music_file,'-shortest',output_file])
170
+ # else:
171
+ # subprocess.run(['ffmpeg','-i',movie_file,output_file])
172
+ os.remove(movie_file)
173
+
174
+
175
+
176
+
177
+ def render_single_view(self, meshes):
178
+ num = len(meshes)
179
+ color_list = []
180
+ for i in tqdm(range(num)):
181
+ mesh_nodes = []
182
+ for mesh in meshes[i]:
183
+ render_mesh = pyrender.Mesh.from_trimesh(mesh)
184
+ mesh_node = self.scene.add(render_mesh)
185
+ mesh_nodes.append(mesh_node)
186
+ color, _ = self.r.render(self.scene, flags=pyrender.RenderFlags.SHADOWS_DIRECTIONAL)
187
+ color = color.copy()
188
+ color_list.append(color)
189
+ for mesh_node in mesh_nodes:
190
+ self.scene.remove_node(mesh_node)
191
+ return color_list
192
+
193
+ def render_imgs(self, meshes):
194
+ colors = []
195
+ for mesh in meshes:
196
+ render_mesh = pyrender.Mesh.from_trimesh(mesh)
197
+ mesh_node = self.scene.add(render_mesh)
198
+ color, _ = self.r.render(self.scene, flags=pyrender.RenderFlags.SHADOWS_DIRECTIONAL)
199
+ colors.append(color)
200
+ self.scene.remove_node(mesh_node)
201
+
202
+
203
+ return colors
204
+ # cv2.imwrite(os.path.join(self.save_path, 'test.jpg'), color[:,:,::-1])
205
+
206
+ def run(self, seq_rot, music_file=None, tab='', save_pt=False):
207
+ if isinstance(seq_rot, np.ndarray):
208
+ seq_rot = torch.tensor(seq_rot, dtype=torch.float32, device="mps" if torch.backends.mps.is_available() else "cpu")
209
+
210
+ if save_pt:
211
+ torch.save(seq_rot.detach().cpu(), os.path.join(self.save_path, tab +'_pose.pt'))
212
+
213
+ B, D = seq_rot.shape
214
+ if args.mode == "smpl":
215
+ print("using smpl!!!")
216
+ output = self.smpl.forward(
217
+ betas = torch.zeros([seq_rot.shape[0], 10]).to(seq_rot.device),
218
+ transl = seq_rot[:,:3],
219
+ global_orient = seq_rot[:,3:6],
220
+ body_pose = torch.cat([seq_rot[:,6:69], seq_rot[:,69:72], seq_rot[:,114:117]], dim=1)
221
+ )
222
+
223
+ elif args.mode == "smplh":
224
+ print("using smplh!!!")
225
+ output = self.smplh.forward(
226
+ betas = torch.zeros([seq_rot.shape[0], 10]).to(seq_rot.device),
227
+ transl = seq_rot[:,:3],
228
+ global_orient = seq_rot[:,3:6],
229
+ body_pose = seq_rot[:,6:69],
230
+ left_hand_pose = seq_rot[:,69:114], # torch.zeros([seq_rot.shape[0], 45]).to(seq_rot.device), # seq_rot[:,69:114],
231
+ right_hand_pose = seq_rot[:,114:], # torch.zeros([seq_rot.shape[0], 45]).to(seq_rot.device), #
232
+ expression = torch.zeros([seq_rot.shape[0], 10]).to(seq_rot.device),
233
+ )
234
+
235
+ elif args.mode == "smplx":
236
+ output = self.smplx.forward(
237
+ betas = torch.zeros([seq_rot.shape[0], 10]).to(seq_rot.device),
238
+ # transl = motion[:,:3],
239
+ transl = seq_rot[:,:3],
240
+ global_orient = seq_rot[:,3:6],
241
+ body_pose = seq_rot[:,6:69],
242
+ jaw_pose = torch.zeros([seq_rot.shape[0], 3]).to(seq_rot),
243
+ leye_pose = torch.zeros([seq_rot.shape[0], 3]).to(seq_rot),
244
+ reye_pose = torch.zeros([seq_rot.shape[0], 3]).to(seq_rot),
245
+ left_hand_pose = seq_rot[:,69:69+45],
246
+ right_hand_pose = seq_rot[:,69+45:],
247
+ expression= torch.zeros([seq_rot.shape[0], 10]).to(seq_rot),
248
+ )
249
+
250
+ N, V, DD = output.vertices.shape # 150, 6890, 3
251
+ vertices = output.vertices.reshape((B, -1, V, DD)) # # 150, 1, 6890, 3
252
+
253
+ meshes = []
254
+ for i in range(B):
255
+ # if int(i) > 20:
256
+ # break
257
+ view = []
258
+ for v in vertices[i]:
259
+ # vertices[:,2] *= -1
260
+ if args.mode == 'smplh':
261
+ mesh = trimesh.Trimesh(output.vertices[i].cpu(), self.smplh.faces)
262
+ elif args.mode == 'smplx':
263
+ mesh = trimesh.Trimesh(output.vertices[i].cpu(), self.smplx.faces)
264
+ elif args.mode == 'smpl':
265
+ mesh = trimesh.Trimesh(output.vertices[i].cpu(), self.smpl.faces)
266
+ # mesh.export(os.path.join(self.save_path, 'test.obj'))
267
+ view.append(mesh)
268
+ meshes.append(view)
269
+
270
+ color_list = self.render_single_view(meshes)
271
+ movie_file = os.path.join(self.save_path, tab + 'tmp.mp4')
272
+ output_file = os.path.join(self.save_path, tab + 'z.mp4')
273
+ self.save_video(movie_file, color_list)
274
+ if music_file is not None:
275
+ subprocess.run(['ffmpeg','-i',movie_file,'-i',music_file,'-shortest',output_file])
276
+ else:
277
+ subprocess.run(['ffmpeg','-i',movie_file,output_file])
278
+ # if music_file is not None:
279
+ # subprocess.run(['ffmpeg','-i',movie_file,'-i',music_file,'-shortest',output_file])
280
+ # else:
281
+ # subprocess.run(['ffmpeg','-i',movie_file,output_file])
282
+ os.remove(movie_file)
283
+
284
+
285
+
286
+
287
+ def look_at(eye, center, up):
288
+ front = eye - center
289
+ front = front / np.linalg.norm(front)
290
+ right = np.cross(up, front)
291
+ right = right/ np.linalg.norm(right)
292
+ up_new = np.cross(front, right)
293
+ camera_pose = np.eye(4)
294
+ camera_pose[:3,:3] = np.stack([right, up_new, front]).transpose()
295
+ camera_pose[:3,3] = eye
296
+ return camera_pose
297
+
298
+
299
+ def motion_data_load_process(motionfile):
300
+ if motionfile.split(".")[-1] == "pkl":
301
+ pkl_data = pickle.load(open(motionfile, "rb"))
302
+ smpl_poses = pkl_data["smpl_poses"]
303
+ modata = np.concatenate((pkl_data["smpl_trans"], smpl_poses), axis=1)
304
+ if modata.shape[1] == 69:
305
+ hand_zeros = np.zeros([modata.shape[0], 90], dtype=np.float32)
306
+ modata = np.concatenate((modata, hand_zeros), axis=1)
307
+ assert modata.shape[1] == 159
308
+ modata[:, 1] = modata[:, 1] + 1.3
309
+ return modata
310
+ elif motionfile.split(".")[-1] == "npy":
311
+ modata = np.load(motionfile)
312
+ print("modata.shape", modata.shape)
313
+ if modata.shape[-1] == 315: # first 3-dim is root translation
314
+ print("modata.shape is:", modata.shape)
315
+ rot6d = torch.from_numpy(modata[:,3:])
316
+ T,C = rot6d.shape
317
+ rot6d = rot6d.reshape(-1,6)
318
+ axis = ax_from_6v(rot6d).view(T,-1).detach().cpu().numpy()
319
+ modata = np.concatenate((modata[:,:3], axis), axis=1)
320
+ print("modata.shape is:", modata.shape)
321
+ elif modata.shape[-1] == 319:
322
+ print("modata.shape is:", modata.shape)
323
+ modata = modata[:,4:]
324
+ rot6d = torch.from_numpy(modata[:,3:])
325
+ T,C = rot6d.shape
326
+ rot6d = rot6d.reshape(-1,6)
327
+ axis = ax_from_6v(rot6d).view(T,-1).detach().cpu().numpy()
328
+ modata = np.concatenate((modata[:,:3], axis), axis=1)
329
+ print("modata.shape is:", modata.shape)
330
+ elif modata.shape[-1] == 168:
331
+ modata = np.concatenate( [modata[:,:21*3+1], modata[:,25*3:]] , axis=1)
332
+ elif modata.shape[-1] == 159:
333
+ print("modata.shape is:", modata.shape)
334
+ print("modata.shape is:", modata.shape)
335
+ elif modata.shape[-1] == 135:
336
+ print("modata.shape is:", modata.shape)
337
+ if len(modata.shape) == 3 and modata.shape[0] ==1:
338
+ modata = modata.squeeze(0)
339
+ rot6d = torch.from_numpy(modata[:,3:])
340
+ T,C = rot6d.shape
341
+ rot6d = rot6d.reshape(-1,6)
342
+ axis = ax_from_6v(rot6d).view(T,-1).detach().cpu().numpy()
343
+ hand_zeros = torch.zeros([T, 90]).to(rot6d).detach().cpu().numpy()
344
+ modata = np.concatenate((modata[:,:3], axis, hand_zeros), axis=1)
345
+ print("modata.shape is:", modata.shape)
346
+ else:
347
+ raise("shape error!")
348
+
349
+ modata[:, 1] = modata[:, 1] + 1.3
350
+ return modata
351
+
352
+ if __name__ == '__main__':
353
+ parser = argparse.ArgumentParser()
354
+ parser.add_argument("--gpu", type=str, default="2")
355
+ parser.add_argument("--modir", type=str, default="")
356
+ parser.add_argument("--mode", type=str, default="smplx", choices=['smpl','smplh','smplx'])
357
+ parser.add_argument("--fps", type=int, default=30)
358
+ parser.add_argument("--save_path", type=str, default=None)
359
+ args = parser.parse_args()
360
+ print(args.gpu)
361
+
362
+
363
+ motion_dir = args.modir
364
+ if args.save_path is not None:
365
+ save_path = args.save_path
366
+ if not os.path.exists(save_path):
367
+ os.makedirs(save_path)
368
+ else:
369
+ save_path = os.path.join(motion_dir, 'video')
370
+ os.makedirs(save_path, exist_ok=True)
371
+
372
+
373
+ music_dir = "experiments/DanceDiffuse_module/debug--0517_Norm_512len_315_transloss/val1640/samples_2023-05-17-20-54-05"
374
+ for file in os.listdir(motion_dir):
375
+ if file[-3:] in ["npy", "pkl"]:
376
+
377
+ # skip files that already have a rendered video
378
+ flag = False
379
+ for exists_file in os.listdir(save_path):
380
+ if file[:-4] in exists_file:
381
+ flag = True
382
+ break
383
+ else:
384
+ flag = False
385
+ if flag:
386
+ print("exist", file)
387
+ continue
388
+
389
+ print(file)
390
+ motion_file = os.path.join(motion_dir, file)
391
+ visualizer = MovieMaker(save_path=save_path)
392
+ modata = motion_data_load_process(motion_file)
393
+ visualizer.run(modata, tab=os.path.basename(motion_file).split(".")[0], music_file=None)
394
+
395
+ print('done')
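A typical invocation, with illustrative paths; the script globs *.npy/*.pkl files from --modir and writes one mp4 per motion into --save_path (or a video/ folder next to the motions). Inputs are expected as 159-dim poses (3 translation + 156 axis-angle); 315/319/135-dim rot6d inputs are converted automatically by motion_data_load_process:

    python render.py --modir generated/motions --mode smplx --fps 30 --save_path output/videos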
smplx_neu_J_1.npy ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa80c6e1b28e9d43470f250a2d168bacf2cb0cc5e8688ff2e48e71f4db13ba6c
3
+ size 788
teaser/teaser.png ADDED

Git LFS Details

  • SHA256: 797034e986aad1b6cd47f78b12b2406ec779315eb205f4211d07c0623f198e5f
  • Pointer size: 132 Bytes
  • Size of remote file: 1.48 MB
test.py ADDED
@@ -0,0 +1,187 @@
1
+ import glob
2
+ import os
3
+ from functools import cmp_to_key
4
+ from pathlib import Path
5
+ import sys
6
+ from tempfile import TemporaryDirectory
7
+ import random
8
+
9
+ # import jukemirlib
10
+ import numpy as np
11
+ import torch
12
+ from tqdm import tqdm
13
+ import librosa
14
+ import librosa as lr
15
+ import soundfile as sf
16
+
17
+ from args import FineDance_parse_test_opt
18
+ from train_seq import EDGE
19
+ # from data.audio_extraction.jukebox_features import extract as juke_extract
20
+
21
+ def slice_audio(audio_file, stride, length, out_dir):
22
+ # stride, length in seconds
23
+ audio, sr = lr.load(audio_file, sr=None)
24
+ file_name = os.path.splitext(os.path.basename(audio_file))[0]
25
+ start_idx = 0
26
+ idx = 0
27
+ window = int(length * sr)
28
+ stride_step = int(stride * sr)
29
+ while start_idx <= len(audio) - window:
30
+ audio_slice = audio[start_idx : start_idx + window]
31
+ sf.write(f"{out_dir}/{file_name}_slice{idx}.wav", audio_slice, sr)
32
+ start_idx += stride_step
33
+ idx += 1
34
+ return idx
35
+
36
+ def extract(fpath):
37
+ FPS = 30
38
+ HOP_LENGTH = 512
39
+ SR = FPS * HOP_LENGTH
40
+ EPS = 1e-6
41
+
42
+ data, _ = librosa.load(fpath, sr=SR)
43
+ envelope = librosa.onset.onset_strength(y=data, sr=SR) # (seq_len,)
44
+ mfcc = librosa.feature.mfcc(y=data, sr=SR, n_mfcc=20).T # (seq_len, 20)
45
+ chroma = librosa.feature.chroma_cens(
46
+ y=data, sr=SR, hop_length=HOP_LENGTH, n_chroma=12
47
+ ).T # (seq_len, 12)
48
+
49
+ peak_idxs = librosa.onset.onset_detect(
50
+ onset_envelope=envelope.flatten(), sr=SR, hop_length=HOP_LENGTH
51
+ )
52
+ peak_onehot = np.zeros_like(envelope, dtype=np.float32)
53
+ peak_onehot[peak_idxs] = 1.0 # (seq_len,)
54
+
55
+ start_bpm = lr.beat.tempo(y=lr.load(fpath)[0])[0]
56
+
57
+ tempo, beat_idxs = librosa.beat.beat_track(
58
+ onset_envelope=envelope,
59
+ sr=SR,
60
+ hop_length=HOP_LENGTH,
61
+ start_bpm=start_bpm,
62
+ tightness=100,
63
+ )
64
+ beat_onehot = np.zeros_like(envelope, dtype=np.float32)
65
+ beat_onehot[beat_idxs] = 1.0 # (seq_len,)
66
+
67
+ audio_feature = np.concatenate(
68
+ [envelope[:, None], mfcc, chroma, peak_onehot[:, None], beat_onehot[:, None]],
69
+ axis=-1,
70
+ )
71
+
72
+ # chop to ensure exact shape
73
+ audio_feature = audio_feature[:4 * FPS]
74
+ return audio_feature
75
+
76
+ # sort filenames that look like songname_slice{number}.ext
77
+ key_func = lambda x: int(os.path.splitext(x)[0].split("_")[-1].split("slice")[-1])
78
+ # test_list = ["063", "132", "143", "036", "098", "198", "130", "012", "211", "193", "179", "065", "137", "161", "092", "120", "037", "109", "204", "144"]
79
+ test_list = ["063", "144"]
80
+
81
+ def stringintcmp_(a, b):
82
+ aa, bb = "".join(a.split("_")[:-1]), "".join(b.split("_")[:-1])
83
+ ka, kb = key_func(a), key_func(b)
84
+ if aa < bb:
85
+ return -1
86
+ if aa > bb:
87
+ return 1
88
+ if ka < kb:
89
+ return -1
90
+ if ka > kb:
91
+ return 1
92
+ return 0
93
+
94
+ stringintkey = cmp_to_key(stringintcmp_)
95
+ stride_ = 60/30
96
+
97
+ def test(opt):
98
+ feature_func = extract
99
+ sample_length = opt.out_length
100
+ sample_size = int(sample_length / stride_) - 1
101
+ temp_dir_list = []
102
+ all_cond = []
103
+ all_filenames = []
104
+ if opt.use_cached_features: # default is false
105
+ print("Using precomputed features")
106
+ # all subdirectories
107
+ dir_list = glob.glob(os.path.join(opt.feature_cache_dir, "*/"))
108
+ for dir in dir_list:
109
+ file_list = sorted(glob.glob(f"{dir}/*.wav"), key=stringintkey)
110
+ juke_file_list = sorted(glob.glob(f"{dir}/*.npy"), key=stringintkey)
111
+ assert len(file_list) == len(juke_file_list)
112
+
113
+ # random chunk after sanity check
114
+ rand_idx = random.randint(0, len(file_list) - sample_size)
115
+ file_list = file_list[rand_idx : rand_idx + sample_size]
116
+ juke_file_list = juke_file_list[rand_idx : rand_idx + sample_size]
117
+ cond_list = [np.load(x) for x in juke_file_list]
118
+ all_filenames.append(file_list)
119
+ all_cond.append(torch.from_numpy(np.array(cond_list)))
120
+ else:
121
+ print("Computing features for input music")
122
+ for wav_file in glob.glob(os.path.join(opt.music_dir, "*.wav")):
123
+ songname = os.path.splitext(os.path.basename(wav_file))[0]
124
+ # create temp folder (or use the cache folder if specified)
125
+ if True: # songname in test_list:
126
+ if opt.cache_features:
127
+ save_dir = os.path.join(opt.feature_cache_dir, songname)
128
+ Path(save_dir).mkdir(parents=True, exist_ok=True)
129
+ dirname = save_dir
130
+ else:
131
+ temp_dir = TemporaryDirectory()
132
+ print("temp_dir is", temp_dir)
133
+ temp_dir_list.append(temp_dir)
134
+ dirname = temp_dir.name
135
+ # slice the audio file
136
+ print(f"Slicing {wav_file}")
137
+ slice_audio(wav_file, 60/30, 120/30, dirname)
138
+ file_list = sorted(glob.glob(f"{dirname}/*.wav"), key=stringintkey)
139
+ # randomly sample a chunk of length at most sample_size
140
+ rand_idx = random.randint(0, len(file_list) - sample_size)
141
+ cond_list = []
142
+ # generate juke representations
143
+ print(f"Computing features for {wav_file}")
144
+ for idx, file in enumerate(tqdm(file_list)):
145
+ # if not caching then only calculate for the interested range
146
+ if (not opt.cache_features) and (not (rand_idx <= idx < rand_idx + sample_size)):
147
+ continue
148
+ # audio = jukemirlib.load_audio(file)
149
+ # reps = jukemirlib.extract(
150
+ # audio, layers=[66], downsample_target_rate=30
151
+ # )[66]
152
+ reps = feature_func(file)[:opt.full_seq_len]
153
+ # save reps
154
+ if opt.cache_features:
155
+ featurename = os.path.splitext(file)[0] + ".npy"
156
+ np.save(featurename, reps)
157
+ # if in the random range, put it into the list of reps we want
158
+ # to actually use for generation
159
+ if rand_idx <= idx < rand_idx + sample_size:
160
+ cond_list.append(reps)
161
+ cond_list = torch.from_numpy(np.array(cond_list))
162
+ all_cond.append(cond_list)
163
+ all_filenames.append(file_list[rand_idx : rand_idx + sample_size])
164
+
165
+ model = EDGE(opt, opt.feature_type, opt.checkpoint)
166
+ model.eval()
167
+
168
+ # directory for optionally saving the dances for eval
169
+ fk_out = None
170
+ if opt.save_motions:
171
+ fk_out = opt.motion_save_dir
172
+
173
+ print("Generating dances")
174
+ for i in range(len(all_cond)):
175
+ data_tuple = None, all_cond[i], all_filenames[i]
176
+ model.render_sample(
177
+ data_tuple, "test", opt.render_dir, render_count=-1, fk_out=fk_out, mode="long", render=not opt.no_render
178
+ )
179
+ print("Done")
180
+ if torch.cuda.is_available():
181
+ torch.cuda.empty_cache()
182
+ for temp_dir in temp_dir_list:
183
+ temp_dir.cleanup()
184
+
185
+ if __name__ == "__main__":
186
+ opt = FineDance_parse_test_opt()
187
+ test(opt)
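The extract function above produces the 35-dimensional baseline music feature used across the project (feature_dim = 35 in train_seq.py). A sketch of the per-frame layout, with a zero array standing in for a real feature matrix:

    import numpy as np

    # columns produced by extract(), in concatenation order:
    #   1  onset-strength envelope
    #   20 MFCCs
    #   12 CENS chroma bins
    #   1  onset-peak one-hot
    #   1  beat one-hot          -> 35 dims per 30 fps frame
    feature = np.zeros((120, 35), dtype=np.float32)  # hypothetical 4 s slice at 30 fps
    envelope = feature[:, 0]
    mfcc = feature[:, 1:21]
    chroma = feature[:, 21:33]
    peak_onehot, beat_onehot = feature[:, 33], feature[:, 34]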
train_seq.py ADDED
@@ -0,0 +1,318 @@
1
+ import multiprocessing
2
+ import os
3
+ from zlib import Z_FULL_FLUSH
4
+ # os.environ["WANDB_API_KEY"] = "your WANDB_API_KEY" #
5
+ # os.environ["WANDB_MODE"] = "online"
6
+ # os.environ["CUDA_VISIBLE_DEVICES"] = "4,5,6,7"
7
+ import pickle
8
+ from functools import partial
9
+ from pathlib import Path
10
+ from args import FineDance_parse_train_opt, save_arguments_to_yaml
11
+ import sys
12
+
13
+ import torch
14
+ import torch.nn.functional as F
15
+ import wandb
16
+ from accelerate import Accelerator, DistributedDataParallelKwargs
17
+ from accelerate.state import AcceleratorState
18
+ from torch.utils.data import DataLoader
19
+ from tqdm import tqdm
20
+
21
+ from dataset.FineDance_dataset import FineDance_Smpl
22
+ from dataset.preprocess import increment_path
23
+ from dataset.preprocess import My_Normalizer as Normalizer # do not use Normalizer
24
+ from model.adan import Adan
25
+ from model.diffusion import GaussianDiffusion
26
+ from model.model import DanceDecoder, SeqModel
27
+ from vis import SMPLX_Skeleton, SMPLSkeleton
28
+
29
+
30
+ def wrap(x):
31
+ return {f"module.{key}": value for key, value in x.items()}
32
+
33
+
34
+ def maybe_wrap(x, num):
35
+ return x if num == 1 else wrap(x)
36
+
37
+
38
+ class EDGE:
39
+ def __init__(
40
+ self,
41
+ opt,
42
+ feature_type,
43
+ checkpoint_path="",
44
+ normalizer=None,
45
+ EMA=True,
46
+ learning_rate=4e-4,
47
+ weight_decay=0.02,
48
+ ):
49
+ self.opt = opt
50
+ ddp_kwargs = DistributedDataParallelKwargs(find_unused_parameters=True)
51
+ self.accelerator = Accelerator(kwargs_handlers=[ddp_kwargs])
52
+ state = AcceleratorState()
53
+ num_processes = state.num_processes
54
+
55
+ self.repr_dim = repr_dim = opt.nfeats
56
+ feature_dim = 35
57
+
58
+ self.horizon = horizon = opt.full_seq_len
59
+
60
+ self.accelerator.wait_for_everyone()
61
+
62
+ self.resume_num = 0
63
+ checkpoint = None
64
+ self.normalizer = None
65
+ if checkpoint_path != "":
66
+ checkpoint = torch.load(
67
+ checkpoint_path, map_location=self.accelerator.device
68
+ )
69
+ self.resume_num = int(os.path.basename(checkpoint_path).split("-")[1].split(".")[0]) # parse the epoch out of "train-{epoch}.pt"
70
+
71
+ model = SeqModel(
72
+ nfeats=repr_dim,
73
+ seq_len=horizon,
74
+ latent_dim=512,
75
+ ff_size=1024,
76
+ num_layers=8,
77
+ num_heads=8,
78
+ dropout=0.1,
79
+ cond_feature_dim=feature_dim,
80
+ activation=F.gelu,
81
+ )
82
+ if opt.nfeats == 139 or opt.nfeats == 135:
83
+ smplx_fk = SMPLSkeleton(device=self.accelerator.device)
84
+ else:
85
+ smplx_fk = SMPLX_Skeleton(device=self.accelerator.device, batch=512000)
86
+ diffusion = GaussianDiffusion(
87
+ model,
88
+ opt,
89
+ horizon,
90
+ repr_dim,
91
+ smplx_model = smplx_fk,
92
+ schedule="cosine",
93
+ n_timestep=1000,
94
+ predict_epsilon=False,
95
+ loss_type="l2",
96
+ use_p2=False,
97
+ cond_drop_prob=0.25,
98
+ guidance_weight=2,
99
+ do_normalize = opt.do_normalize
100
+ )
101
+
102
+ print(
103
+ "Model has {} parameters".format(sum(y.numel() for y in model.parameters()))
104
+ )
105
+
106
+ self.model = self.accelerator.prepare(model)
107
+ self.diffusion = diffusion.to(self.accelerator.device) # why is accelerator.prepare not needed here?
108
+ self.smplx_fk = smplx_fk # to(self.accelerator.device)
109
+ optim = Adan(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
110
+ self.optim = self.accelerator.prepare(optim)
111
+
112
+ if checkpoint_path != "":
113
+ self.model.load_state_dict(
114
+ maybe_wrap(
115
+ checkpoint["ema_state_dict" if EMA else "model_state_dict"],
116
+ num_processes,
117
+ )
118
+ )
119
+
120
+ def eval(self):
121
+ self.diffusion.eval()
122
+
123
+ def train(self):
124
+ self.diffusion.train()
125
+
126
+ def prepare(self, objects):
127
+ return self.accelerator.prepare(*objects)
128
+
129
+ def train_loop(self, opt):
130
+ print("train_dataset = FineDance_Dataset ")
131
+ train_dataset = FineDance_Smpl(
132
+ args=opt, # data/
133
+ istrain=True,
134
+ )
135
+ test_dataset = FineDance_Smpl(
136
+ args=opt,
137
+ istrain=False,
138
+ )
139
+
140
+ num_cpus = multiprocessing.cpu_count()
141
+ print("batchsize=:", opt.batch_size)
142
+ train_data_loader = DataLoader(
143
+ train_dataset,
144
+ batch_size=opt.batch_size,
145
+ shuffle=True,
146
+ num_workers=min(int(num_cpus * 0.5), 40), # num_workers=min(int(num_cpus * 0.75), 32),
147
+ pin_memory=True,
148
+ drop_last=True,
149
+ )
150
+ test_data_loader = DataLoader(
151
+ test_dataset,
152
+ batch_size=opt.batch_size,
153
+ shuffle=True,
154
+ num_workers=2,
155
+ pin_memory=True,
156
+ drop_last=True,
157
+ )
158
+
159
+ train_data_loader = self.accelerator.prepare(train_data_loader)
160
+ # boot up multi-gpu training. test dataloader is only on main process
161
+ load_loop = (
162
+ partial(tqdm, position=1, desc="Batch")
163
+ if self.accelerator.is_main_process
164
+ else lambda x: x
165
+ )
166
+ if self.accelerator.is_main_process:
167
+ save_dir = str(increment_path(Path(opt.project) / opt.exp_name))
168
+ opt.exp_name = save_dir.split("/")[-1]
169
+ wandb.init(project=opt.wandb_pj_name, name=opt.exp_name)
170
+ save_dir = Path(save_dir)
171
+ wdir = save_dir / "weights"
172
+ wdir.mkdir(parents=True, exist_ok=True)
173
+ wandb.save("params.yaml") # 保存wandb配置到文件
174
+ yaml_path = os.path.join(wdir, 'parameters.yaml')
175
+ save_arguments_to_yaml(opt, yaml_path)
176
+
177
+
178
+ self.accelerator.wait_for_everyone()
179
+ for epoch in range(1, opt.epochs + 1):
180
+ print("epoch:", epoch+self.resume_num)
181
+ avg_loss = 0
182
+ avg_vloss = 0
183
+ avg_fkloss = 0
184
+ avg_footloss = 0
185
+
186
+ # train
187
+ self.train()
188
+ for step, (x, cond, filename) in enumerate(
189
+ load_loop(train_data_loader)
190
+ ):
191
+ if opt.nfeats == 139 or opt.nfeats==135:
192
+ x = x[:, :, :139]
193
+
194
+ total_loss, (loss, v_loss, fk_loss, foot_loss) = self.diffusion(
195
+ x, cond, t_override=None
196
+ )
197
+ # print("3")
198
+ self.optim.zero_grad()
199
+ self.accelerator.backward(total_loss)
200
+ self.optim.step()
201
+
202
+ # ema update and train loss update only on main
203
+ if self.accelerator.is_main_process:
204
+ avg_loss += loss.detach().cpu().numpy()
205
+ avg_vloss += v_loss.detach().cpu().numpy()
206
+ avg_fkloss += fk_loss.detach().cpu().numpy()
207
+ avg_footloss += foot_loss.detach().cpu().numpy()
208
+ if step % opt.ema_interval == 0:
209
+ self.diffusion.ema.update_model_average(
210
+ self.diffusion.master_model, self.diffusion.model
211
+ )
212
+
213
+ #-----------------------------------------------------------------------------------------------------------
214
+ # test
215
+ # Save model
216
+
217
+ if ((epoch+self.resume_num) % opt.save_interval) == 0 or epoch<=1:
218
+ # everyone waits here for the val loop to finish ( don't start next train epoch early)
219
+ self.accelerator.wait_for_everyone()
220
+ self.eval() # switch to eval mode before validation sampling
221
+ # save only if on main thread
222
+ if self.accelerator.is_main_process:
223
+ # self.eval()
224
+ # log
225
+ avg_loss /= len(train_data_loader)
226
+ avg_vloss /= len(train_data_loader)
227
+ avg_fkloss /= len(train_data_loader)
228
+ avg_footloss /= len(train_data_loader)
229
+ log_dict = {
230
+ "Train Loss": avg_loss,
231
+ "V Loss": avg_vloss,
232
+ "FK Loss": avg_fkloss,
233
+ "Foot Loss": avg_footloss,
234
+ }
235
+
236
+ wandb.log(log_dict)
237
+
238
+ ckpt = {
239
+ "ema_state_dict": self.diffusion.master_model.state_dict(), # 经过accelerate prepare的模型,在保存时需要unwrap,反之不需要
240
+ "model_state_dict": self.accelerator.unwrap_model(
241
+ self.model
242
+ ).state_dict(),
243
+ "optimizer_state_dict": self.optim.state_dict(),
244
+ "normalizer": self.normalizer,
245
+ }
246
+
247
+ torch.save(ckpt, os.path.join(wdir, f"train-{epoch+self.resume_num}.pt"))
248
+ print(f"[MODEL SAVED at Epoch {epoch+self.resume_num}]")
249
+
250
+ # generate a sample
251
+ render_count = 2
252
+ shape = (render_count, self.horizon, self.opt.nfeats)
253
+ print("Generating Sample")
254
+ # draw a music from the test dataset
255
+ (x, cond, filename) = next(iter(test_data_loader))
256
+ # if opt.do_normalize:
257
+ # x = self.normalizer.normalize(x)
258
+
259
+ if opt.nfeats == 139 or opt.nfeats==135:
260
+ x = x[:, :, :139]
261
+
262
+ cond = cond.to(self.accelerator.device)
263
+ # name_iter = name_iter+1
264
+ self.diffusion.render_sample(
265
+ shape,
266
+ cond[:render_count],
267
+ self.normalizer,
268
+ epoch+self.resume_num,
269
+ render_out = os.path.join(opt.render_dir, "train_" + opt.exp_name), # render out
270
+ fk_out = os.path.join(opt.render_dir, "train_" + opt.exp_name),
271
+ name=filename[:render_count],
272
+ # name = str(epoch) + str(name_iter).zfill(3)
273
+ sound=True,
274
+ )
275
+ #-----------------------------------------------------------------------------------------------------------
276
+
277
+
278
+ if self.accelerator.is_main_process:
279
+ wandb.run.finish()
280
+
281
+ def render_sample(
282
+ self, data_tuple, label, render_dir, render_count=-1, mode='normal', fk_out=None, render=True,
283
+ ):
284
+ _, cond, wavname = data_tuple
285
+ assert len(cond.shape) == 3
286
+ if render_count < 0:
287
+ render_count = len(cond)
288
+ shape = (render_count, self.horizon, self.repr_dim)
289
+ cond = cond.to(self.accelerator.device).float()
290
+ self.diffusion.render_sample(
291
+ shape,
292
+ cond[:render_count],
293
+ self.normalizer,
294
+ label,
295
+ render_dir,
296
+ name=wavname[:render_count],
297
+ sound=True,
298
+ mode=mode,
299
+ fk_out=fk_out,
300
+ render=render
301
+ )
302
+
303
+ def train(opt):
304
+ model = EDGE(opt, opt.feature_type)
305
+ model.train_loop(opt)
306
+
307
+ if __name__ == "__main__":
308
+ opt = FineDance_parse_train_opt()
309
+ command = ' '.join(sys.argv)
310
+ if not os.path.exists(os.path.join(opt.project, opt.exp_name)):
311
+ os.makedirs(os.path.join(opt.project, opt.exp_name), exist_ok=False)
312
+ with open(os.path.join(opt.project, opt.exp_name, 'command.txt'), 'w') as f:
313
+ f.write(command)
314
+
315
+ yaml_path = os.path.join(opt.project, opt.exp_name, 'parameters.yaml')
316
+ save_arguments_to_yaml(opt, yaml_path)
317
+
318
+ train(opt)
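Checkpoints are written as train-{epoch}.pt and the epoch is recovered from that filename when resuming (the resume_num parsing in EDGE.__init__). A minimal round-trip sketch with a hypothetical epoch number:

    import os

    epoch = 1640
    ckpt_name = f"train-{epoch}.pt"  # naming used in train_loop
    resumed = int(os.path.basename(ckpt_name).split("-")[1].split(".")[0])
    assert resumed == epoch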
vis.py ADDED
@@ -0,0 +1,687 @@
1
+ import os
2
+ from pathlib import Path
3
+ import sys
4
+ from tempfile import TemporaryDirectory
5
+
6
+ import librosa as lr
7
+ import matplotlib.animation as animation
8
+ import matplotlib.pyplot as plt
9
+ from mpl_toolkits.mplot3d import axes3d
10
+
11
+ import numpy as np
12
+ import soundfile as sf
13
+ import torch
14
+ from matplotlib import cm
15
+ from matplotlib.colors import ListedColormap
16
+ from pytorch3d.transforms import (axis_angle_to_quaternion, quaternion_apply,
17
+ quaternion_multiply)
18
+ from tqdm import tqdm
19
+ from typing import NewType
20
+ Tensor = NewType('Tensor', torch.Tensor)
21
+ import torch.nn.functional as F
22
+ try:
23
+ import pickle5 as pickle
24
+ except ImportError:
25
+ import pickle
26
+
27
+
28
+ smpl_joints = [
29
+ "root", # 0
30
+ "lhip", # 1
31
+ "rhip", # 2
32
+ "belly", # 3
33
+ "lknee", # 4
34
+ "rknee", # 5
35
+ "spine", # 6
36
+ "lankle",# 7
37
+ "rankle",# 8
38
+ "chest", # 9
39
+ "ltoes", # 10
40
+ "rtoes", # 11
41
+ "neck", # 12
42
+ "linshoulder", # 13
43
+ "rinshoulder", # 14
44
+ "head", # 15
45
+ "lshoulder", # 16
46
+ "rshoulder", # 17
47
+ "lelbow", # 18
48
+ "relbow", # 19
49
+ "lwrist", # 20
50
+ "rwrist", # 21
51
+ "lhand", # 22
52
+ "rhand", # 23
53
+ ]
54
+
55
+ smplh_joints = [
56
+ 'pelvis',
57
+ 'left_hip',
58
+ 'right_hip',
59
+ 'spine1',
60
+ 'left_knee',
61
+ 'right_knee',
62
+ 'spine2',
63
+ 'left_ankle',
64
+ 'right_ankle',
65
+ 'spine3',
66
+ 'left_foot',
67
+ 'right_foot',
68
+ 'neck',
69
+ 'left_collar',
70
+ 'right_collar',
71
+ 'head',
72
+ 'left_shoulder',
73
+ 'right_shoulder',
74
+ 'left_elbow',
75
+ 'right_elbow',
76
+ 'left_wrist',
77
+ 'right_wrist',
78
+ 'left_index1',
79
+ 'left_index2',
80
+ 'left_index3',
81
+ 'left_middle1',
82
+ 'left_middle2',
83
+ 'left_middle3',
84
+ 'left_pinky1',
85
+ 'left_pinky2',
86
+ 'left_pinky3',
87
+ 'left_ring1',
88
+ 'left_ring2',
89
+ 'left_ring3',
90
+ 'left_thumb1',
91
+ 'left_thumb2',
92
+ 'left_thumb3',
93
+ 'right_index1',
94
+ 'right_index2',
95
+ 'right_index3',
96
+ 'right_middle1',
97
+ 'right_middle2',
98
+ 'right_middle3',
99
+ 'right_pinky1',
100
+ 'right_pinky2',
101
+ 'right_pinky3',
102
+ 'right_ring1',
103
+ 'right_ring2',
104
+ 'right_ring3',
105
+ 'right_thumb1',
106
+ 'right_thumb2',
107
+ 'right_thumb3'
108
+ ]
109
+
110
+
111
+ smplx_joints = [
112
+ 'pelvis',
113
+ 'left_hip',
114
+ 'right_hip',
115
+ 'spine1',
116
+ 'left_knee',
117
+ 'right_knee',
118
+ 'spine2',
119
+ 'left_ankle',
120
+ 'right_ankle',
121
+ 'spine3',
122
+ 'left_foot',
123
+ 'right_foot',
124
+ 'neck',
125
+ 'left_collar',
126
+ 'right_collar',
127
+ 'head',
128
+ 'left_shoulder',
129
+ 'right_shoulder',
130
+ 'left_elbow',
131
+ 'right_elbow',
132
+ 'left_wrist',
133
+ 'right_wrist',
134
+ 'jaw',
135
+ 'left_eye_smplhf',
136
+ 'right_eye_smplhf',
137
+ 'left_index1',
138
+ 'left_index2',
139
+ 'left_index3',
140
+ 'left_middle1',
141
+ 'left_middle2',
142
+ 'left_middle3',
143
+ 'left_pinky1',
144
+ 'left_pinky2',
145
+ 'left_pinky3',
146
+ 'left_ring1',
147
+ 'left_ring2',
148
+ 'left_ring3',
149
+ 'left_thumb1',
150
+ 'left_thumb2',
151
+ 'left_thumb3',
152
+ 'right_index1',
153
+ 'right_index2',
154
+ 'right_index3',
155
+ 'right_middle1',
156
+ 'right_middle2',
157
+ 'right_middle3',
158
+ 'right_pinky1',
159
+ 'right_pinky2',
160
+ 'right_pinky3',
161
+ 'right_ring1',
162
+ 'right_ring2',
163
+ 'right_ring3',
164
+ 'right_thumb1',
165
+ 'right_thumb2',
166
+ 'right_thumb3'
167
+ ]
168
+
169
+
170
+ smpl_parents = [
171
+ -1,
172
+ 0,
173
+ 0,
174
+ 0,
175
+ 1,
176
+ 2,
177
+ 3,
178
+ 4,
179
+ 5,
180
+ 6,
181
+ 7,
182
+ 8,
183
+ 9,
184
+ 9,
185
+ 9,
186
+ 12,
187
+ 13,
188
+ 14,
189
+ 16,
190
+ 17,
191
+ 18,
192
+ 19,
193
+ 20,
194
+ 21,
195
+ ]
196
+
197
+ smplh_parents = [-1, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 9, 12, 13, 14,
198
+ 16, 17, 18, 19, 20, 22, 23, 20, 25, 26, 20, 28, 29, 20, 31, 32, 20, 34,
199
+ 35, 21, 37, 38, 21, 40, 41, 21, 43, 44, 21, 46, 47, 21, 49, 50]
200
+
201
+ smplx_parents = [-1, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 9, 12, 13, 14, 16, 17, 18, 19, 15, 15, 15, 20, 25, 26, 20, 28, 29, 20, 31, 32, 20, 34, 35, 20, 37, 38, 21, 40, 41, 21, 43, 44, 21, 46, 47, 21, 49, 50, 21, 52, 53]
202
+
203
+
204
+ smpl_offsets = [
205
+ [0.0, 0.0, 0.0],
206
+ [0.05858135, -0.08228004, -0.01766408],
207
+ [-0.06030973, -0.09051332, -0.01354254],
208
+ [0.00443945, 0.12440352, -0.03838522],
209
+ [0.04345142, -0.38646945, 0.008037],
210
+ [-0.04325663, -0.38368791, -0.00484304],
211
+ [0.00448844, 0.1379564, 0.02682033],
212
+ [-0.01479032, -0.42687458, -0.037428],
213
+ [0.01905555, -0.4200455, -0.03456167],
214
+ [-0.00226458, 0.05603239, 0.00285505],
215
+ [0.04105436, -0.06028581, 0.12204243],
216
+ [-0.03483987, -0.06210566, 0.13032329],
217
+ [-0.0133902, 0.21163553, -0.03346758],
218
+ [0.07170245, 0.11399969, -0.01889817],
219
+ [-0.08295366, 0.11247234, -0.02370739],
220
+ [0.01011321, 0.08893734, 0.05040987],
221
+ [0.12292141, 0.04520509, -0.019046],
222
+ [-0.11322832, 0.04685326, -0.00847207],
223
+ [0.2553319, -0.01564902, -0.02294649],
224
+ [-0.26012748, -0.01436928, -0.03126873],
225
+ [0.26570925, 0.01269811, -0.00737473],
226
+ [-0.26910836, 0.00679372, -0.00602676],
227
+ [0.08669055, -0.01063603, -0.01559429],
228
+ [-0.0887537, -0.00865157, -0.01010708],
229
+ ]
230
+
231
+
232
+ def set_line_data_3d(line, x):
233
+ line.set_data(x[:, :2].T)
234
+ line.set_3d_properties(x[:, 2])
235
+
236
+
237
+ def set_scatter_data_3d(scat, x, c):
238
+ scat.set_offsets(x[:, :2])
239
+ scat.set_3d_properties(x[:, 2], "z")
240
+ scat.set_facecolors([c])
241
+
242
+
243
+ def get_axrange(poses):
244
+ pose = poses[0]
245
+ x_min = pose[:, 0].min()
246
+ x_max = pose[:, 0].max()
247
+
248
+ y_min = pose[:, 1].min()
249
+ y_max = pose[:, 1].max()
250
+
251
+ z_min = pose[:, 2].min()
252
+ z_max = pose[:, 2].max()
253
+
254
+ xdiff = x_max - x_min
255
+ ydiff = y_max - y_min
256
+ zdiff = z_max - z_min
257
+
258
+ biggestdiff = max([xdiff, ydiff, zdiff])
259
+ return biggestdiff
260
+
261
+
262
+ def plot_single_pose(num, poses, lines, ax, axrange, scat, contact, ske_parents):
263
+ pose = poses[num]
264
+ static = contact[num]
265
+ indices = [7, 8, 10, 11]
266
+
267
+ for i, (point, idx) in enumerate(zip(scat, indices)):
268
+ position = pose[idx : idx + 1]
269
+ color = "r" if static[i] else "g"
270
+ set_scatter_data_3d(point, position, color)
271
+
272
+ for i, (p, line) in enumerate(zip(ske_parents, lines)):
273
+ # don't plot root
274
+ if i == 0:
275
+ continue
276
+ # stack to create a line
277
+ data = np.stack((pose[i], pose[p]), axis=0)
278
+ set_line_data_3d(line, data)
279
+
280
+ if num == 0:
281
+ if isinstance(axrange, int):
282
+ axrange = (axrange, axrange, axrange)
283
+ xcenter, ycenter, zcenter = 0, 0, 2.5
284
+ stepx, stepy, stepz = axrange[0] / 2, axrange[1] / 2, axrange[2] / 2
285
+
286
+ x_min, x_max = xcenter - stepx, xcenter + stepx
287
+ y_min, y_max = ycenter - stepy, ycenter + stepy
288
+ z_min, z_max = zcenter - stepz, zcenter + stepz
289
+
290
+ ax.set_xlim(x_min, x_max)
291
+ ax.set_ylim(y_min, y_max)
292
+ ax.set_zlim(z_min, z_max)
293
+
294
+
295
+ def skeleton_render(
296
+ poses,
297
+ epoch=0,
298
+ out="renders",
299
+ name="",
300
+ sound=True,
301
+ stitch=False,
302
+ sound_folder="ood_sliced",
303
+ contact=None,
304
+ render=True,
305
+ smpl_mode="smpl", # 是否渲染双手
306
+ ):
307
+ if render:
308
+ if smpl_mode=="smpl":
309
+ poses = np.concatenate((poses[:, :23, :], np.expand_dims(poses[:, 37, :], axis=1)), axis=1)
310
+ ske_parents = smpl_parents
311
+ elif smpl_mode == "smplx":
312
+ ske_parents = smplx_parents
313
+
314
+ # generate the pose with FK
315
+ Path(out).mkdir(parents=True, exist_ok=True)
316
+ num_steps = poses.shape[0] #
317
+
318
+ fig = plt.figure()
319
+ ax = fig.add_subplot(projection="3d")
320
+
321
+ point = np.array([0, 0, 1])
322
+ normal = np.array([0, 0, 1])
323
+ d = -point.dot(normal)
324
+ xx, yy = np.meshgrid(np.linspace(-1.5, 1.5, 2), np.linspace(-1.5, 1.5, 2))
325
+ z = (-normal[0] * xx - normal[1] * yy - d) * 1.0 / normal[2]
326
+ # plot the plane
327
+ ax.plot_surface(xx, yy, z, zorder=-11, cmap=cm.twilight)
328
+ # Create lines initially without data
329
+ lines = [
330
+ ax.plot([], [], [], zorder=10, linewidth=1.5)[0]
331
+ for _ in ske_parents
332
+ ]
333
+ scat = [
334
+ ax.scatter([], [], [], zorder=10, s=0, cmap=ListedColormap(["r", "g", "b"]))
335
+ for _ in range(4)
336
+ ]
337
+ axrange = 3
338
+
339
+ # create contact labels
340
+ feet = poses[:, (7, 8, 10, 11)]
341
+ feetv = np.zeros(feet.shape[:2])
342
+ feetv[:-1] = np.linalg.norm(feet[1:] - feet[:-1], axis=-1)
343
+ if contact is None:
344
+ contact = feetv < 0.01
345
+ else:
346
+ contact = contact > 0.95
347
+
348
+ # Creating the Animation object
349
+ anim = animation.FuncAnimation(
350
+ fig,
351
+ plot_single_pose,
352
+ num_steps,
353
+ fargs=(poses, lines, ax, axrange, scat, contact, ske_parents),
354
+ interval=1000 // 30,
355
+ )
356
+ if sound:
357
+ # make a temporary directory to save the intermediate gif in
358
+ if render:
359
+ temp_dir = TemporaryDirectory()
360
+ gifname = os.path.join(temp_dir.name, f"{epoch}.gif")
361
+ anim.save(gifname)
362
+
363
+ # stitch wavs
364
+ if stitch:
365
+ assert type(name) == list # must be a list of names to do stitching
366
+ name_ = [os.path.splitext(x)[0] + ".wav" for x in name]
367
+ audio, sr = lr.load(name_[0], sr=None)
368
+ ll, half = len(audio), len(audio) // 2
369
+ total_wav = np.zeros(ll + half * (len(name_) - 1))
370
+ total_wav[:ll] = audio
371
+ idx = ll
372
+ for n_ in name_[1:]:
373
+ audio, sr = lr.load(n_, sr=None)
374
+ total_wav[idx : idx + half] = audio[half:]
375
+ idx += half
376
+ # save a dummy spliced audio
377
+ audioname = f"{temp_dir.name}/tempsound.wav" if render else os.path.join(out, f'{epoch}_{"_".join(os.path.splitext(os.path.basename(name[0]))[0].split("_")[:-1])}.wav')
378
+ sf.write(audioname, total_wav, sr)
379
+ outname = os.path.join(
380
+ out,
381
+ f'{epoch}_{"_".join(os.path.splitext(os.path.basename(name[0]))[0].split("_")[:-1])}.mp4',
382
+ )
383
+ else:
384
+ assert type(name) == str
385
+ assert name != "", "Must provide an audio filename"
386
+ audioname = name
387
+ outname = os.path.join(
388
+ out, f"{epoch}_{os.path.splitext(os.path.basename(name))[0]}.mp4"
389
+ )
390
+ if render:
391
+ print(f"ffmpeg -loglevel error -stream_loop 0 -y -i {gifname} -i {audioname} -shortest -c:v libx264 -crf 26 -c:a aac -q:a 4 {outname}")
392
+ out = os.system(
393
+ f"/home/lrh/Documents/ffmpeg-6.0-amd64-static/ffmpeg -loglevel error -stream_loop 0 -y -i {gifname} -i {audioname} -shortest -c:v libx264 -crf 26 -c:a aac -q:a 4 {outname}"
394
+ )
395
+ else:
396
+ if render:
397
+ # actually save the gif
398
+ path = os.path.normpath(name)
399
+ pathparts = path.split(os.sep)
400
+ gifname = os.path.join(out, f"{pathparts[-1][:-4]}.gif")
401
+ anim.save(gifname, savefig_kwargs={"transparent": True, "facecolor": "none"},)
402
+ plt.close()
403
+
404
+
405
+ class SMPLSkeleton:
406
+ def __init__(
407
+ self, device=None,
408
+ ):
409
+ offsets = smpl_offsets
410
+ parents = smpl_parents
411
+ assert len(offsets) == len(parents)
412
+
413
+ self._offsets = torch.Tensor(offsets) #.to(device)
414
+ self._parents = np.array(parents)
415
+ self._compute_metadata()
416
+
417
+ def _compute_metadata(self):
418
+ self._has_children = np.zeros(len(self._parents)).astype(bool)
419
+ for i, parent in enumerate(self._parents):
420
+ if parent != -1:
421
+ self._has_children[parent] = True
422
+
423
+ self._children = []
424
+ for i, parent in enumerate(self._parents):
425
+ self._children.append([])
426
+ for i, parent in enumerate(self._parents):
427
+ if parent != -1:
428
+ self._children[parent].append(i)
429
+
430
+ def forward(self, rotations, root_positions):
431
+ """
432
+ Perform forward kinematics using the given trajectory and local rotations.
433
+ Arguments (where N = batch size, L = sequence length, J = number of joints):
434
+ -- rotations: (N, L, J, 3) tensor of axis-angle rotations describing the local rotations of each joint.
435
+ -- root_positions: (N, L, 3) tensor describing the root joint positions.
436
+ """
437
+ assert len(rotations.shape) == 4
438
+ assert len(root_positions.shape) == 3
439
+ # transform from axis angle to quaternion
440
+ fk_device = rotations.device
441
+ self._offsets = self._offsets.to(fk_device) # .to() is not in-place, so reassign
442
+ rotations = axis_angle_to_quaternion(rotations)
443
+
444
+ positions_world = []
445
+ rotations_world = []
446
+
447
+ expanded_offsets = self._offsets.expand(
448
+ rotations.shape[0],
449
+ rotations.shape[1],
450
+ self._offsets.shape[0],
451
+ self._offsets.shape[1],
452
+ ).to(fk_device)
453
+
454
+ # Parallelize along the batch and time dimensions
455
+ for i in range(self._offsets.shape[0]):
456
+ if self._parents[i] == -1:
457
+ positions_world.append(root_positions)
458
+ rotations_world.append(rotations[:, :, 0])
459
+ else:
460
+ positions_world.append(
461
+ quaternion_apply(
462
+ rotations_world[self._parents[i]], expanded_offsets[:, :, i]
463
+ )
464
+ + positions_world[self._parents[i]]
465
+ )
466
+ if self._has_children[i]:
467
+ rotations_world.append(
468
+ quaternion_multiply(
469
+ rotations_world[self._parents[i]], rotations[:, :, i]
470
+ )
471
+ )
472
+ else:
473
+ # This joint is a terminal node -> it would be useless to compute the transformation
474
+ rotations_world.append(None)
475
+
476
+ return torch.stack(positions_world, dim=3).permute(0, 1, 3, 2)
477
+ 
+ 
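+ # Usage sketch (illustrative, not from the original file): forward kinematics
+ # for the 24-joint SMPL skeleton, with N clips of L frames each:
+ #   sk = SMPLSkeleton()
+ #   rots = torch.zeros(2, 60, 24, 3)  # per-joint axis-angle rotations (N, L, J, 3)
+ #   root = torch.zeros(2, 60, 3)      # root trajectory (N, L, 3)
+ #   pos = sk.forward(rots, root)      # -> (2, 60, 24, 3) world joint positions
+ 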
+ @torch.no_grad()  # NOTE: on a class this only disables grad during construction, not in later method calls
+ class SMPLX_Skeleton:
+     def __init__(
+         self, device=None, batch=64,
+     ):
+         self.device = device
+         self.parents = smplx_parents
+         # rest-pose joint locations, pre-broadcast to the expected batch size
+         self.J = np.load(os.path.join(os.path.dirname(__file__), "smplx_neu_J_1.npy"))
+         self.J = torch.from_numpy(self.J).to(device).unsqueeze(dim=0).repeat(batch, 1, 1)
+ 
+     def batch_rodrigues(self, rot_vecs: Tensor, epsilon: float = 1e-8) -> Tensor:
+         ''' Calculates the rotation matrices for a batch of rotation vectors
+         Parameters
+         ----------
+         rot_vecs: torch.tensor Nx3
+             array of N axis-angle vectors
+         Returns
+         -------
+         R: torch.tensor Nx3x3
+             The rotation matrices for the given axis-angle parameters
+         '''
+         batch_size = rot_vecs.shape[0]
+         device, dtype = rot_vecs.device, rot_vecs.dtype
+ 
+         angle = torch.norm(rot_vecs + epsilon, dim=1, keepdim=True)
+         rot_dir = rot_vecs / angle
+ 
+         cos = torch.unsqueeze(torch.cos(angle), dim=1)
+         sin = torch.unsqueeze(torch.sin(angle), dim=1)
+ 
+         # Bx1 arrays
+         rx, ry, rz = torch.split(rot_dir, 1, dim=1)
+ 
+         # skew-symmetric cross-product matrix K for each rotation axis
+         zeros = torch.zeros((batch_size, 1), dtype=dtype, device=device)
+         K = torch.cat([zeros, -rz, ry, rz, zeros, -rx, -ry, rx, zeros], dim=1) \
+             .view((batch_size, 3, 3))
+ 
+         # Rodrigues' formula: R = I + sin(theta) K + (1 - cos(theta)) K^2
+         ident = torch.eye(3, dtype=dtype, device=device).unsqueeze(dim=0)
+         rot_mat = ident + sin * K + (1 - cos) * torch.bmm(K, K)
+         return rot_mat
+ 
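+     # Example (illustrative, given skel = SMPLX_Skeleton(device)): a rotation of
+     # pi/2 about the z-axis maps the x-axis onto the y-axis:
+     #   R = skel.batch_rodrigues(torch.tensor([[0.0, 0.0, 1.5708]]))
+     #   R[0] @ torch.tensor([1.0, 0.0, 0.0])  # ~ [0, 1, 0]
+ 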
+     def batch_rigid_transform(self,
+                               rot_mats: Tensor,
+                               joints: Tensor,
+                               parents: Tensor,
+                               dtype=torch.float32
+                               ) -> Tensor:
+         """
+         Applies a batch of rigid transformations to the joints
+ 
+         Parameters
+         ----------
+         rot_mats : torch.tensor BxNx3x3
+             Tensor of rotation matrices
+         joints : torch.tensor BxNx3
+             Locations of joints
+         parents : torch.tensor N
+             The kinematic tree (parent index per joint, root = -1)
+         dtype : torch.dtype, optional:
+             The data type of the created tensors, the default is torch.float32
+ 
+         Returns
+         -------
+         posed_joints : torch.tensor BxNx3
+             The locations of the joints after applying the pose rotations
+         """
+ 
+         joints = torch.unsqueeze(joints, dim=-1)
+ 
+         # joint offsets relative to their parents
+         rel_joints = joints.clone()
+         rel_joints[:, 1:] -= joints[:, parents[1:]]
+ 
+         transforms_mat = self.transform_mat(
+             rot_mats.reshape(-1, 3, 3),
+             rel_joints.reshape(-1, 3, 1)).reshape(-1, joints.shape[1], 4, 4)
+ 
+         # accumulate transforms down the kinematic chain, root first
+         transform_chain = [transforms_mat[:, 0]]
+         for i in range(1, parents.shape[0]):
+             curr_res = torch.matmul(transform_chain[parents[i]],
+                                     transforms_mat[:, i])
+             transform_chain.append(curr_res)
+ 
+         transforms = torch.stack(transform_chain, dim=1)
+ 
+         # The last column of the transformations contains the posed joints
+         posed_joints = transforms[:, :, :3, 3]
+ 
+         return posed_joints  # relative transforms (rel_transforms) are not needed here
+ 
+     def transform_mat(self, R: Tensor, t: Tensor) -> Tensor:
+         ''' Creates a batch of homogeneous transformation matrices
+         Args:
+             - R: Bx3x3 array of a batch of rotation matrices
+             - t: Bx3x1 array of a batch of translation vectors
+         Returns:
+             - T: Bx4x4 transformation matrices
+         '''
+         # No padding left or right, only add an extra row:
+         # T = [[R, t], [0, 0, 0, 1]]
+         return torch.cat([F.pad(R, [0, 0, 0, 1]),
+                           F.pad(t, [0, 0, 0, 1], value=1)], dim=2)
+ 
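+     # Example (illustrative): identity rotation plus translation (1, 2, 3)
+     #   R = torch.eye(3).unsqueeze(0)              # (1, 3, 3)
+     #   t = torch.tensor([[[1.0], [2.0], [3.0]]])  # (1, 3, 1)
+     #   skel.transform_mat(R, t)  # -> [[1,0,0,1],[0,1,0,2],[0,0,1,3],[0,0,0,1]]
+ 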
+     def motion_data_load_process(self, motionfile):
+         if motionfile.split(".")[-1] == "pkl":
+             with open(motionfile, "rb") as f:
+                 pkl_data = pickle.load(f)
+             if "pos" in pkl_data.keys():
+                 local_q_165 = torch.from_numpy(pkl_data["q"]).to(self.device).float()
+                 root_pos = torch.from_numpy(pkl_data["pos"]).to(self.device).float()
+                 # re-center the trajectory on the first frame
+                 root_pos = root_pos[:, :] - root_pos[0, :]
+                 return local_q_165, root_pos
+             else:
+                 smpl_poses = pkl_data["smpl_poses"]
+                 if smpl_poses.shape[0] != 150 and smpl_poses.shape[0] != 300:
+                     # poses stored flattened; reshape to 150 frames
+                     smpl_poses = smpl_poses.reshape(150, -1)
+                 root_pos = pkl_data["smpl_trans"]
+ 
+                 local_q = torch.from_numpy(smpl_poses).to(self.device).float()
+                 root_pos = torch.from_numpy(root_pos).to(self.device).float()
+                 # pad the 156-D SMPL pose to the 165-D SMPL-X layout by inserting
+                 # 9 zeros (3 joints) after the first 66 body parameters
+                 local_q_165 = torch.cat([local_q[:, :66], torch.zeros([local_q.shape[0], 9], device=local_q.device, dtype=torch.float32), local_q[:, 66:]], dim=1).to(self.device).float()
+                 root_pos = root_pos[:, :] - root_pos[0, :]
+                 return local_q_165, root_pos
+         raise ValueError(f"unsupported motion file type: {motionfile}")
+ 
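+     # Usage sketch (illustrative; assumes a FineDance-style .pkl containing either
+     # "q"/"pos" or "smpl_poses"/"smpl_trans"):
+     #   q, pos = skel.motion_data_load_process("012_slice0.pkl")
+     #   q.shape, pos.shape  # -> (T, 165), (T, 3)
+ 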
+     def forward(self, rotations, root_positions):
+         """
+         Perform forward kinematics using the given trajectory and local rotations.
+         Arguments (where N = batch size x sequence length):
+         -- rotations: (N, 156) or (N, 165) axis-angle pose parameters
+         -- root_positions: (N, 3) root joint positions
+         Returns: (N, 55, 3) global joint coordinates
+         """
+         fk_device = rotations.device
+         if rotations.shape[1] == 156:
+             # pad the 156-D pose to the 165-D SMPL-X layout (as in motion_data_load_process)
+             local_q_165 = torch.cat([rotations[:, :66], torch.zeros([rotations.shape[0], 9], device=fk_device, dtype=torch.float32), rotations[:, 66:]], dim=1).to(fk_device).float()
+         elif rotations.shape[1] == 165:
+             local_q_165 = rotations.to(fk_device).float()
+         else:
+             # fail loudly instead of print + sys.exit(0), which signals success
+             raise ValueError(f"unexpected rotations shape: {rotations.shape}")
+ 
+         root_pos = root_positions.to(fk_device).float()
+         assert local_q_165.shape[1] == 165
+ 
+         B, C = local_q_165.shape
+         rot_mats = self.batch_rodrigues(local_q_165.view(-1, 3)).view(
+             [B, -1, 3, 3])
+ 
+         # slice the pre-batched rest-pose joints, or re-broadcast if the batch grew
+         if self.J.shape[0] >= B:
+             J_temp = self.J[:B, :, :]
+         else:
+             J_temp = self.J[:1, :, :].repeat(B, 1, 1)
+             print("warning: self.J batch dim is smaller than batch_size x seq_len; re-broadcasting")
+ 
+         parents = torch.Tensor(self.parents).long()
+         J_transformed = self.batch_rigid_transform(rot_mats, J_temp, parents, dtype=torch.float32)
+         J_transformed += root_pos.unsqueeze(dim=1)
+ 
+         return J_transformed
+ 
+ 
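+ # Shape summary (illustrative): for T frames of pose parameters,
+ #   SMPLX_Skeleton(device, batch=T).forward(q, pos)  # q: (T, 156) or (T, 165), pos: (T, 3) -> (T, 55, 3)
+ 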
+ if __name__ == "__main__":
+     # quick smoke test; the motion file path below is machine-specific
+     device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
+ 
+     smplx_fk = SMPLX_Skeleton(device=device, batch=150)
+     motion_file = "/home/data/lrh/datasets/fine_dance/magicsmpl/sliced/test/dances/012_slice0.pkl"
+     # music_file = "/home/data/lrh/datasets/fine_dance/magicsmpl/sliced/test/wavs/012_slice0.wav"
+     local_q_165, root_pos = smplx_fk.motion_data_load_process(motion_file)
+     print("local_q_165.shape", local_q_165.shape)  # (150, 165)
+     print("root_pos.shape", root_pos.shape)        # (150, 3)
+ 
+     joints = smplx_fk.forward(local_q_165, root_pos).detach().cpu().numpy()
+     print("joints.shape", joints.shape)  # (150, 55, 3)
+ 
+     # skeleton_render(
+     #     joints,
+     #     epoch=f"e{1}_b{1}",
+     #     out="./output/temp",
+     #     name=music_file,
+     #     render=True,
+     #     stitch=False,
+     #     sound=True,
+     #     smpl_mode="smplx"
+     # )
+ 