Jack Wu committed on
Commit
c60109f
·
1 Parent(s): 6cf4573
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .idea/.gitignore +10 -0
  2. .idea/Generate_Audio_for_Video.iml +14 -0
  3. .idea/inspectionProfiles/Project_Default.xml +7 -0
  4. .idea/inspectionProfiles/profiles_settings.xml +6 -0
  5. .idea/modules.xml +8 -0
  6. .idea/vcs.xml +6 -0
  7. HunyuanVideo-Foley/.gitattributes +3 -0
  8. HunyuanVideo-Foley/.gitignore +159 -0
  9. HunyuanVideo-Foley/.pre-commit-config.yaml +38 -0
  10. HunyuanVideo-Foley/DEVELOPMENT.md +187 -0
  11. HunyuanVideo-Foley/INSTALL.md +203 -0
  12. HunyuanVideo-Foley/LICENSE +77 -0
  13. HunyuanVideo-Foley/MANIFEST.in +38 -0
  14. HunyuanVideo-Foley/NOTICE +27 -0
  15. HunyuanVideo-Foley/README.md +519 -0
  16. HunyuanVideo-Foley/build_package.sh +58 -0
  17. HunyuanVideo-Foley/configs/hunyuanvideo-foley-xl.yaml +48 -0
  18. HunyuanVideo-Foley/configs/hunyuanvideo-foley-xxl.yaml +48 -0
  19. HunyuanVideo-Foley/download_test_videos.sh +11 -0
  20. HunyuanVideo-Foley/gradio_app.py +834 -0
  21. HunyuanVideo-Foley/hunyuanvideo_foley/__init__.py +30 -0
  22. HunyuanVideo-Foley/hunyuanvideo_foley/cli.py +141 -0
  23. HunyuanVideo-Foley/hunyuanvideo_foley/constants.py +57 -0
  24. HunyuanVideo-Foley/hunyuanvideo_foley/models/__init__.py +0 -0
  25. HunyuanVideo-Foley/hunyuanvideo_foley/models/dac_vae/__init__.py +16 -0
  26. HunyuanVideo-Foley/hunyuanvideo_foley/models/dac_vae/__main__.py +36 -0
  27. HunyuanVideo-Foley/hunyuanvideo_foley/models/dac_vae/model/__init__.py +4 -0
  28. HunyuanVideo-Foley/hunyuanvideo_foley/models/dac_vae/model/base.py +301 -0
  29. HunyuanVideo-Foley/hunyuanvideo_foley/models/dac_vae/model/dac.py +410 -0
  30. HunyuanVideo-Foley/hunyuanvideo_foley/models/dac_vae/model/discriminator.py +228 -0
  31. HunyuanVideo-Foley/hunyuanvideo_foley/models/dac_vae/nn/__init__.py +3 -0
  32. HunyuanVideo-Foley/hunyuanvideo_foley/models/dac_vae/nn/layers.py +33 -0
  33. HunyuanVideo-Foley/hunyuanvideo_foley/models/dac_vae/nn/loss.py +368 -0
  34. HunyuanVideo-Foley/hunyuanvideo_foley/models/dac_vae/nn/quantize.py +262 -0
  35. HunyuanVideo-Foley/hunyuanvideo_foley/models/dac_vae/nn/vae_utils.py +91 -0
  36. HunyuanVideo-Foley/hunyuanvideo_foley/models/dac_vae/utils/__init__.py +121 -0
  37. HunyuanVideo-Foley/hunyuanvideo_foley/models/dac_vae/utils/decode.py +95 -0
  38. HunyuanVideo-Foley/hunyuanvideo_foley/models/dac_vae/utils/encode.py +94 -0
  39. HunyuanVideo-Foley/hunyuanvideo_foley/models/hifi_foley.py +794 -0
  40. HunyuanVideo-Foley/hunyuanvideo_foley/models/nn/__init__.py +0 -0
  41. HunyuanVideo-Foley/hunyuanvideo_foley/models/nn/activation_layers.py +44 -0
  42. HunyuanVideo-Foley/hunyuanvideo_foley/models/nn/attn_layers.py +546 -0
  43. HunyuanVideo-Foley/hunyuanvideo_foley/models/nn/embed_layers.py +136 -0
  44. HunyuanVideo-Foley/hunyuanvideo_foley/models/nn/mlp_layers.py +149 -0
  45. HunyuanVideo-Foley/hunyuanvideo_foley/models/nn/modulate_layers.py +49 -0
  46. HunyuanVideo-Foley/hunyuanvideo_foley/models/nn/norm_layers.py +70 -0
  47. HunyuanVideo-Foley/hunyuanvideo_foley/models/nn/posemb_layers.py +159 -0
  48. HunyuanVideo-Foley/hunyuanvideo_foley/models/synchformer/__init__.py +1 -0
  49. HunyuanVideo-Foley/hunyuanvideo_foley/models/synchformer/ast_model.py +289 -0
  50. HunyuanVideo-Foley/hunyuanvideo_foley/models/synchformer/compute_desync_score.py +214 -0
.idea/.gitignore ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ # Default ignored files
2
+ /shelf/
3
+ /workspace.xml
4
+ # Ignored default folder with query files
5
+ /queries/
6
+ # Datasource local storage ignored files
7
+ /dataSources/
8
+ /dataSources.local.xml
9
+ # Editor-based HTTP Client requests
10
+ /httpRequests/
.idea/Generate_Audio_for_Video.iml ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <module type="PYTHON_MODULE" version="4">
3
+ <component name="NewModuleRootManager">
4
+ <content url="file://$MODULE_DIR$">
5
+ <excludeFolder url="file://$MODULE_DIR$/.venv" />
6
+ </content>
7
+ <orderEntry type="inheritedJdk" />
8
+ <orderEntry type="sourceFolder" forTests="false" />
9
+ </component>
10
+ <component name="PyDocumentationSettings">
11
+ <option name="format" value="PLAIN" />
12
+ <option name="myDocStringFormat" value="Plain" />
13
+ </component>
14
+ </module>
.idea/inspectionProfiles/Project_Default.xml ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ <component name="InspectionProjectProfileManager">
2
+ <profile version="1.0">
3
+ <option name="myName" value="Project Default" />
4
+ <inspection_tool class="SqlNoDataSourceInspection" enabled="false" level="WARNING" enabled_by_default="false" />
5
+ <inspection_tool class="TodoComment" enabled="false" level="INFORMATION" enabled_by_default="false" />
6
+ </profile>
7
+ </component>
.idea/inspectionProfiles/profiles_settings.xml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ <component name="InspectionProjectProfileManager">
2
+ <settings>
3
+ <option name="USE_PROJECT_PROFILE" value="false" />
4
+ <version value="1.0" />
5
+ </settings>
6
+ </component>
.idea/modules.xml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="ProjectModuleManager">
4
+ <modules>
5
+ <module fileurl="file://$PROJECT_DIR$/.idea/Generate_Audio_for_Video.iml" filepath="$PROJECT_DIR$/.idea/Generate_Audio_for_Video.iml" />
6
+ </modules>
7
+ </component>
8
+ </project>
.idea/vcs.xml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="VcsDirectoryMappings">
4
+ <mapping directory="" vcs="Git" />
5
+ </component>
6
+ </project>
HunyuanVideo-Foley/.gitattributes ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ assets/data_pipeline.png filter=lfs diff=lfs merge=lfs -text
2
+ assets/model_arch.png filter=lfs diff=lfs merge=lfs -text
3
+ *.png filter=lfs diff=lfs merge=lfs -text
HunyuanVideo-Foley/.gitignore ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ pip-wheel-metadata/
24
+ share/python-wheels/
25
+ *.egg-info/
26
+ .installed.cfg
27
+ *.egg
28
+ MANIFEST
29
+
30
+ # PyInstaller
31
+ # Usually these files are written by a python script from a template
32
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
33
+ *.manifest
34
+ *.spec
35
+
36
+ # Installer logs
37
+ pip-log.txt
38
+ pip-delete-this-directory.txt
39
+
40
+ # Unit test / coverage reports
41
+ htmlcov/
42
+ .tox/
43
+ .nox/
44
+ .coverage
45
+ .coverage.*
46
+ .cache
47
+ nosetests.xml
48
+ coverage.xml
49
+ *.cover
50
+ *.py,cover
51
+ .hypothesis/
52
+ .pytest_cache/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ target/
76
+
77
+ # Jupyter Notebook
78
+ .ipynb_checkpoints
79
+
80
+ # IPython
81
+ profile_default/
82
+ ipython_config.py
83
+
84
+ # pyenv
85
+ .python-version
86
+
87
+ # pipenv
88
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
90
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
91
+ # install all needed dependencies.
92
+ #Pipfile.lock
93
+
94
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95
+ __pypackages__/
96
+
97
+ # Celery stuff
98
+ celerybeat-schedule
99
+ celerybeat.pid
100
+
101
+ # SageMath parsed files
102
+ *.sage.py
103
+
104
+ # Environments
105
+ .env
106
+ .venv
107
+ env/
108
+ venv/
109
+ ENV/
110
+ env.bak/
111
+ venv.bak/
112
+
113
+ # Spyder project settings
114
+ .spyderproject
115
+ .spyproject
116
+
117
+ # Rope project settings
118
+ .ropeproject
119
+
120
+ # mkdocs documentation
121
+ /site
122
+
123
+ # mypy
124
+ .mypy_cache/
125
+ .dmypy.json
126
+ dmypy.json
127
+
128
+ # Pyre type checker
129
+ .pyre/
130
+
131
+ # ==========================================
132
+ # Custom settings
133
+ # ==========================================
134
+
135
+ # For MacOS
136
+ .DS_Store
137
+
138
+ # For IDEs
139
+ .idea/
140
+ .vscode/
141
+ pyrightconfig.json
142
+ .cursorignore
143
+
144
+ assets/
145
+ examples/
146
+
147
+ # For global settings
148
+ __*/
149
+ **/my_*
150
+ tmp*.*
151
+ .my*
152
+ # Model checkpoints
153
+ *.pt
154
+ *.ckpt
155
+ *.pth
156
+ *.safetensors
157
+
158
+
159
+ CLAUDE.md
HunyuanVideo-Foley/.pre-commit-config.yaml ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ repos:
2
+ - repo: https://github.com/pre-commit/pre-commit-hooks
3
+ rev: v4.4.0
4
+ hooks:
5
+ - id: trailing-whitespace
6
+ - id: end-of-file-fixer
7
+ - id: check-yaml
8
+ - id: check-added-large-files
9
+ - id: check-merge-conflict
10
+ - id: debug-statements
11
+ - id: check-docstring-first
12
+
13
+ - repo: https://github.com/psf/black
14
+ rev: 23.3.0
15
+ hooks:
16
+ - id: black
17
+ language_version: python3
18
+ args: [--line-length=120]
19
+
20
+ - repo: https://github.com/pycqa/isort
21
+ rev: 5.12.0
22
+ hooks:
23
+ - id: isort
24
+ args: [--profile, black, --line-length=120]
25
+
26
+ - repo: https://github.com/pycqa/flake8
27
+ rev: 6.0.0
28
+ hooks:
29
+ - id: flake8
30
+ args: [--max-line-length=120]
31
+ additional_dependencies: [flake8-docstrings]
32
+
33
+ - repo: https://github.com/pre-commit/mirrors-mypy
34
+ rev: v1.3.0
35
+ hooks:
36
+ - id: mypy
37
+ additional_dependencies: [types-all]
38
+ args: [--ignore-missing-imports]
HunyuanVideo-Foley/DEVELOPMENT.md ADDED
@@ -0,0 +1,187 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Development Guide
2
+
3
+ This document provides guidelines for developing and contributing to the HunyuanVideo-Foley project.
4
+
5
+ ## Code Style and Quality
6
+
7
+ ### Code Formatting
8
+
9
+ We use the following tools to maintain consistent code style:
10
+
11
+ - **Black**: Code formatter with 120 character line length
12
+ - **isort**: Import sorter compatible with Black
13
+ - **flake8**: Linting and style checking
14
+ - **mypy**: Static type checking
15
+
16
+ ### Pre-commit Hooks
17
+
18
+ Install pre-commit hooks to automatically format code before commits:
19
+
20
+ ```bash
21
+ pip install pre-commit
22
+ pre-commit install
23
+ ```
24
+
25
+ ### Manual Code Formatting
26
+
27
+ Format code manually:
28
+
29
+ ```bash
30
+ # Format all Python files
31
+ black --line-length 120 .
32
+
33
+ # Sort imports
34
+ isort --profile black --line-length 120 .
35
+
36
+ # Check code style
37
+ flake8 --max-line-length 120
38
+
39
+ # Type checking
40
+ mypy --ignore-missing-imports .
41
+ ```
42
+
43
+ ## Project Structure
44
+
45
+ ```
46
+ hunyuanvideo_foley/
47
+ ├── models/ # Model implementations
48
+ │ ├── hifi_foley.py # Main model
49
+ │ ├── nn/ # Neural network layers
50
+ │ ├── dac_vae/ # Audio VAE
51
+ │ └── synchformer/ # Synchronization model
52
+ ├── utils/ # Utilities
53
+ │ ├── config_utils.py # Configuration handling
54
+ │ ├── feature_utils.py # Feature extraction
55
+ │ ├── model_utils.py # Model loading/saving
56
+ │ └── media_utils.py # Audio/video processing
57
+ └── constants.py # Project constants
58
+ ```
59
+
60
+ ## Coding Standards
61
+
62
+ ### Error Handling
63
+
64
+ - Use custom exceptions for domain-specific errors
65
+ - Always validate inputs at function boundaries
66
+ - Log errors with appropriate levels (ERROR, WARNING, INFO)
67
+ - Provide helpful error messages to users
68
+
69
+ ### Type Hints
70
+
71
+ - Add type hints to all function parameters and return values
72
+ - Use `Optional[Type]` for nullable parameters
73
+ - Import types from `typing` module
74
+
75
+ ### Documentation
76
+
77
+ - Add docstrings to all public functions and classes
78
+ - Use Google-style docstrings
79
+ - Document parameters, return values, and exceptions
80
+
81
+ ### Example Function
82
+
83
+ ```python
84
+ def process_video(
85
+ video_path: str,
86
+ max_duration: Optional[float] = None
87
+ ) -> Tuple[np.ndarray, float]:
88
+ """
89
+ Process video file and extract frames.
90
+
91
+ Args:
92
+ video_path: Path to input video file
93
+ max_duration: Maximum duration in seconds (optional)
94
+
95
+ Returns:
96
+ Tuple of (frames array, duration in seconds)
97
+
98
+ Raises:
99
+ FileNotFoundError: If video file doesn't exist
100
+ VideoProcessingError: If video processing fails
101
+ """
102
+ if not os.path.exists(video_path):
103
+ raise FileNotFoundError(f"Video file not found: {video_path}")
104
+
105
+ # Implementation here...
106
+ ```
107
+
108
+ ## Testing
109
+
110
+ ### Running Tests
111
+
112
+ ```bash
113
+ # Run all tests
114
+ python -m pytest
115
+
116
+ # Run specific test file
117
+ python -m pytest tests/test_feature_utils.py
118
+
119
+ # Run with coverage
120
+ python -m pytest --cov=hunyuanvideo_foley
121
+ ```
122
+
123
+ ### Writing Tests
124
+
125
+ - Place tests in `tests/` directory
126
+ - Name test files as `test_*.py`
127
+ - Use descriptive test function names
128
+ - Test edge cases and error conditions
129
+
130
+ ## Development Workflow
131
+
132
+ 1. **Setup Environment**
133
+ ```bash
134
+ python -m venv venv
135
+ source venv/bin/activate # Linux/Mac
136
+ # or
137
+ venv\Scripts\activate # Windows
138
+
139
+ pip install -r requirements.txt
140
+ pip install -e .
141
+ ```
142
+
143
+ 2. **Install Development Tools**
144
+ ```bash
145
+ pre-commit install
146
+ ```
147
+
148
+ 3. **Make Changes**
149
+ - Follow the coding standards above
150
+ - Add tests for new functionality
151
+ - Update documentation as needed
152
+
153
+ 4. **Run Quality Checks**
154
+ ```bash
155
+ black --check --line-length 120 .
156
+ isort --check-only --profile black .
157
+ flake8 --max-line-length 120
158
+ mypy --ignore-missing-imports .
159
+ pytest
160
+ ```
161
+
162
+ 5. **Commit Changes**
163
+ ```bash
164
+ git add .
165
+ git commit -m "feat: add new feature"
166
+ ```
167
+
168
+ ## Performance Considerations
169
+
170
+ - Use `torch.no_grad()` for inference-only code
171
+ - Leverage GPU when available
172
+ - Implement batch processing where possible
173
+ - Profile code to identify bottlenecks
174
+
175
+ ## Dependencies
176
+
177
+ - Keep dependencies minimal and well-maintained
178
+ - Pin versions for reproducibility
179
+ - Separate development dependencies from runtime dependencies
180
+ - Document any special installation requirements
181
+
182
+ ## Configuration
183
+
184
+ - Use centralized configuration in `constants.py`
185
+ - Support environment variable overrides
186
+ - Provide sensible defaults for all parameters
187
+ - Validate configuration at startup
HunyuanVideo-Foley/INSTALL.md ADDED
@@ -0,0 +1,203 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 安装指南 - HunyuanVideo-Foley
2
+
3
+ 本文档提供了将 HunyuanVideo-Foley 作为 Python 包安装和使用的详细指南。
4
+
5
+ ## 安装方式
6
+
7
+ ### 方式1:从源码安装(推荐)
8
+
9
+ ```bash
10
+ # 克隆仓库
11
+ git clone https://github.com/Tencent-Hunyuan/HunyuanVideo-Foley
12
+ cd HunyuanVideo-Foley
13
+
14
+ # 安装包(开发模式)
15
+ pip install -e .
16
+
17
+ # 或安装包含所有可选依赖
18
+ pip install -e .[all]
19
+ ```
20
+
21
+ ### 方式2:直接从GitHub安装
22
+
23
+ ```bash
24
+ pip install git+https://github.com/Tencent-Hunyuan/HunyuanVideo-Foley.git
25
+ ```
26
+
27
+ ### 方式3:构建wheel包安装
28
+
29
+ ```bash
30
+ # 在项目根目录下
31
+ python setup.py bdist_wheel
32
+ pip install dist/hunyuanvideo_foley-1.0.0-py3-none-any.whl
33
+ ```
34
+
35
+ ## 特殊依赖安装
36
+
37
+ 由于某些依赖不在PyPI上,需要单独安装:
38
+
39
+ ```bash
40
+ # 安装audiotools(必需)
41
+ pip install git+https://github.com/descriptinc/audiotools
42
+
43
+ # 安装特定版本的transformers(支持SigLIP2)
44
+ pip install git+https://github.com/huggingface/transformers@v4.49.0-SigLIP-2
45
+ ```
46
+
47
+ ## 可选依赖安装
48
+
49
+ ```bash
50
+ # 安装开发依赖
51
+ pip install hunyuanvideo-foley[dev]
52
+
53
+ # 安装测试依赖
54
+ pip install hunyuanvideo-foley[test]
55
+
56
+ # 安装Gradio界面依赖
57
+ pip install hunyuanvideo-foley[gradio]
58
+
59
+ # 安装所有可选依赖
60
+ pip install hunyuanvideo-foley[all]
61
+ ```
62
+
63
+ ## 验证安装
64
+
65
+ ```bash
66
+ # 检查包是否正确安装
67
+ python -c "import hunyuanvideo_foley; print(hunyuanvideo_foley.__version__)"
68
+
69
+ # 检查命令行工具
70
+ hunyuanvideo-foley --help
71
+ ```
72
+
73
+ ## 使用方法
74
+
75
+ ### 1. 作为Python包使用
76
+
77
+ ```python
78
+ import hunyuanvideo_foley as hvf
79
+
80
+ # 加载模型
81
+ model_dict, cfg = hvf.load_model(
82
+ model_path="path/to/model",
83
+ config_path="configs/hunyuanvideo-foley-xxl.yaml"
84
+ )
85
+
86
+ # 处理特征
87
+ visual_feats, text_feats, audio_len = hvf.feature_process(
88
+ video_path="video.mp4",
89
+ prompt="footsteps on gravel",
90
+ model_dict=model_dict,
91
+ cfg=cfg
92
+ )
93
+
94
+ # 生成音频
95
+ audio, sample_rate = hvf.denoise_process(
96
+ visual_feats, text_feats, audio_len,
97
+ model_dict, cfg
98
+ )
99
+ ```
100
+
101
+ ### 2. 使用命令行工具
102
+
103
+ ```bash
104
+ # 单个视频处理
105
+ hunyuanvideo-foley \
106
+ --model_path ./pretrained_models \
107
+ --single_video video.mp4 \
108
+ --single_prompt "footsteps on gravel" \
109
+ --output_dir ./outputs
110
+
111
+ # 批量处理
112
+ hunyuanvideo-foley \
113
+ --model_path ./pretrained_models \
114
+ --csv_path batch_videos.csv \
115
+ --output_dir ./outputs
116
+
117
+ # 启动Gradio界面
118
+ hunyuanvideo-foley --gradio --model_path ./pretrained_models
119
+ ```
120
+
121
+ ### 3. 使用原始脚本(向后兼容)
122
+
123
+ ```bash
124
+ # 使用原始infer.py脚本
125
+ python infer.py --model_path ./pretrained_models --single_video video.mp4 --single_prompt "audio description"
126
+
127
+ # 启动Gradio应用
128
+ export HIFI_FOLEY_MODEL_PATH=./pretrained_models
129
+ python gradio_app.py
130
+ ```
131
+
132
+ ## 开发环境设置
133
+
134
+ 如果你想参与开发:
135
+
136
+ ```bash
137
+ # 克隆项目
138
+ git clone https://github.com/Tencent-Hunyuan/HunyuanVideo-Foley
139
+ cd HunyuanVideo-Foley
140
+
141
+ # 安装开发版本
142
+ pip install -e .[dev]
143
+
144
+ # 安装pre-commit钩子
145
+ pre-commit install
146
+
147
+ # 运行测试
148
+ python -m pytest
149
+
150
+ # 代码格式化
151
+ black --line-length 120 .
152
+ isort --profile black .
153
+
154
+ # 类型检查
155
+ mypy --ignore-missing-imports .
156
+ ```
157
+
158
+ ## 系统要求
159
+
160
+ - **Python**: 3.8+
161
+ - **操作系统**: Linux(主要支持),macOS,Windows
162
+ - **GPU内存**: 推荐 ≥24GB VRAM(如RTX 3090/4090)
163
+ - **CUDA版本**: 12.4 或 11.8(推荐)
164
+
165
+ ## 故障排除
166
+
167
+ ### 常见问题
168
+
169
+ 1. **ImportError: No module named 'audiotools'**
170
+ ```bash
171
+ pip install git+https://github.com/descriptinc/audiotools
172
+ ```
173
+
174
+ 2. **CUDA内存不足**
175
+ - 使用较小的批次大小
176
+ - 确保GPU有足够的VRAM(推荐24GB+)
177
+
178
+ 3. **transformers版本问题**
179
+ ```bash
180
+ pip install git+https://github.com/huggingface/transformers@v4.49.0-SigLIP-2
181
+ ```
182
+
183
+ ### 获取帮助
184
+
185
+ - 查看项目README: [GitHub](https://github.com/Tencent-Hunyuan/HunyuanVideo-Foley)
186
+ - 报告问题: [GitHub Issues](https://github.com/Tencent-Hunyuan/HunyuanVideo-Foley/issues)
187
+ - 论文: [arXiv:2508.16930](https://arxiv.org/abs/2508.16930)
188
+
189
+ ## 模型下载
190
+
191
+ ```bash
192
+ # 使用HuggingFace Hub
193
+ git clone https://huggingface.co/tencent/HunyuanVideo-Foley
194
+
195
+ # 或使用huggingface-cli
196
+ huggingface-cli download tencent/HunyuanVideo-Foley
197
+ ```
198
+
199
+ ## 配置文件
200
+
201
+ 包安装后,配置文件位于:
202
+ - `hunyuanvideo_foley/configs/` 目录
203
+ - 默认配置:`configs/hunyuanvideo-foley-xxl.yaml`
HunyuanVideo-Foley/LICENSE ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT
2
+ Tencent HunyuanVideo-Foley Release Date: August 28, 2025
3
+ THIS LICENSE AGREEMENT DOES NOT APPLY IN THE EUROPEAN UNION, UNITED KINGDOM AND SOUTH KOREA AND IS EXPRESSLY LIMITED TO THE TERRITORY, AS DEFINED BELOW.
4
+ By clicking to agree or by using, reproducing, modifying, distributing, performing or displaying any portion or element of the Tencent Hunyuan Works, including via any Hosted Service, You will be deemed to have recognized and accepted the content of this Agreement, which is effective immediately.
5
+ 1. DEFINITIONS.
6
+ a. “Acceptable Use Policy” shall mean the policy made available by Tencent as set forth in the Exhibit A.
7
+ b. “Agreement” shall mean the terms and conditions for use, reproduction, distribution, modification, performance and displaying of Tencent Hunyuan Works or any portion or element thereof set forth herein.
8
+ c. “Documentation” shall mean the specifications, manuals and documentation for Tencent Hunyuan made publicly available by Tencent.
9
+ d. “Hosted Service” shall mean a hosted service offered via an application programming interface (API), web access, or any other electronic or remote means.
10
+ e. “Licensee,” “You” or “Your” shall mean a natural person or legal entity exercising the rights granted by this Agreement and/or using the Tencent Hunyuan Works for any purpose and in any field of use.
11
+ f. “Materials” shall mean, collectively, Tencent’s proprietary Tencent Hunyuan and Documentation (and any portion thereof) as made available by Tencent under this Agreement.
12
+ g. “Model Derivatives” shall mean all: (i) modifications to Tencent Hunyuan or any Model Derivative of Tencent Hunyuan; (ii) works based on Tencent Hunyuan or any Model Derivative of Tencent Hunyuan; or (iii) any other machine learning model which is created by transfer of patterns of the weights, parameters, operations, or Output of Tencent Hunyuan or any Model Derivative of Tencent Hunyuan, to that model in order to cause that model to perform similarly to Tencent Hunyuan or a Model Derivative of Tencent Hunyuan, including distillation methods, methods that use intermediate data representations, or methods based on the generation of synthetic data Outputs by Tencent Hunyuan or a Model Derivative of Tencent Hunyuan for training that model. For clarity, Outputs by themselves are not deemed Model Derivatives.
13
+ h. “Output” shall mean the information and/or content output of Tencent Hunyuan or a Model Derivative that results from operating or otherwise using Tencent Hunyuan or a Model Derivative, including via a Hosted Service.
14
+ i. “Tencent,” “We” or “Us” shall mean the applicable entity or entities in the Tencent corporate family that own(s) intellectual property or other rights embodied in or utilized by the Materials.
15
+ j. “Tencent Hunyuan” shall mean the large language models, text/image/video/audio/3D generation models, and multimodal large language models and their software and algorithms, including trained model weights, parameters (including optimizer states), machine-learning model code, inference-enabling code, training-enabling code, fine-tuning enabling code and other elements of the foregoing made publicly available by Us, including, without limitation to, Tencent HunyuanVideo-Foley released at [https://github.com/Tencent-Hunyuan/HunyuanVideo-Foley].
16
+ k. “Tencent Hunyuan Works” shall mean: (i) the Materials; (ii) Model Derivatives; and (iii) all derivative works thereof.
17
+ l. “Territory” shall mean the worldwide territory, excluding the territory of the European Union, United Kingdom and South Korea.
18
+ m. “Third Party” or “Third Parties” shall mean individuals or legal entities that are not under common control with Us or You.
19
+ n. “including” shall mean including but not limited to.
20
+ 2. GRANT OF RIGHTS.
21
+ We grant You, for the Territory only, a non-exclusive, non-transferable and royalty-free limited license under Tencent’s intellectual property or other rights owned by Us embodied in or utilized by the Materials to use, reproduce, distribute, create derivative works of (including Model Derivatives), and make modifications to the Materials, only in accordance with the terms of this Agreement and the Acceptable Use Policy, and You must not violate (or encourage or permit anyone else to violate) any term of this Agreement or the Acceptable Use Policy.
22
+ 3. DISTRIBUTION.
23
+ You may, subject to Your compliance with this Agreement, distribute or make available to Third Parties the Tencent Hunyuan Works, exclusively in the Territory, provided that You meet all of the following conditions:
24
+ a. You must provide all such Third Party recipients of the Tencent Hunyuan Works or products or services using them a copy of this Agreement;
25
+ b. You must cause any modified files to carry prominent notices stating that You changed the files;
26
+ c. You are encouraged to: (i) publish at least one technology introduction blogpost or one public statement expressing Your experience of using the Tencent Hunyuan Works; and (ii) mark the products or services developed by using the Tencent Hunyuan Works to indicate that the product/service is “Powered by Tencent Hunyuan”; and
27
+ d. All distributions to Third Parties (other than through a Hosted Service) must be accompanied by a “Notice” text file that contains the following notice: “Tencent Hunyuan is licensed under the Tencent Hunyuan Community License Agreement, Copyright © 2025 Tencent. All Rights Reserved. The trademark rights of “Tencent Hunyuan” are owned by Tencent or its affiliate.”
28
+ You may add Your own copyright statement to Your modifications and, except as set forth in this Section and in Section 5, may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Model Derivatives as a whole, provided Your use, reproduction, modification, distribution, performance and display of the work otherwise complies with the terms and conditions of this Agreement (including as regards the Territory). If You receive Tencent Hunyuan Works from a Licensee as part of an integrated end user product, then this Section 3 of this Agreement will not apply to You.
29
+ 4. ADDITIONAL COMMERCIAL TERMS.
30
+ If, on the Tencent Hunyuan version release date, the monthly active users of all products or services made available by or for Licensee is greater than 100 million monthly active users in the preceding calendar month, You must request a license from Tencent, which Tencent may grant to You in its sole discretion, and You are not authorized to exercise any of the rights under this Agreement unless or until Tencent otherwise expressly grants You such rights.
31
+ 5. RULES OF USE.
32
+ a. Your use of the Tencent Hunyuan Works must comply with applicable laws and regulations (including trade compliance laws and regulations) and adhere to the Acceptable Use Policy for the Tencent Hunyuan Works, which is hereby incorporated by reference into this Agreement. You must include the use restrictions referenced in these Sections 5(a) and 5(b) as an enforceable provision in any agreement (e.g., license agreement, terms of use, etc.) governing the use and/or distribution of Tencent Hunyuan Works and You must provide notice to subsequent users to whom You distribute that Tencent Hunyuan Works are subject to the use restrictions in these Sections 5(a) and 5(b).
33
+ b. You must not use the Tencent Hunyuan Works or any Output or results of the Tencent Hunyuan Works to improve any other AI model (other than Tencent Hunyuan or Model Derivatives thereof).
34
+ c. You must not use, reproduce, modify, distribute, or display the Tencent Hunyuan Works, Output or results of the Tencent Hunyuan Works outside the Territory. Any such use outside the Territory is unlicensed and unauthorized under this Agreement.
35
+ 6. INTELLECTUAL PROPERTY.
36
+ a. Subject to Tencent’s ownership of Tencent Hunyuan Works made by or for Tencent and intellectual property rights therein, conditioned upon Your compliance with the terms and conditions of this Agreement, as between You and Tencent, You will be the owner of any derivative works and modifications of the Materials and any Model Derivatives that are made by or for You.
37
+ b. No trademark licenses are granted under this Agreement, and in connection with the Tencent Hunyuan Works, Licensee may not use any name or mark owned by or associated with Tencent or any of its affiliates, except as required for reasonable and customary use in describing and distributing the Tencent Hunyuan Works. Tencent hereby grants You a license to use “Tencent Hunyuan” (the “Mark”) in the Territory solely as required to comply with the provisions of Section 3(c), provided that You comply with any applicable laws related to trademark protection. All goodwill arising out of Your use of the Mark will inure to the benefit of Tencent.
38
+ c. If You commence a lawsuit or other proceedings (including a cross-claim or counterclaim in a lawsuit) against Us or any person or entity alleging that the Materials or any Output, or any portion of any of the foregoing, infringe any intellectual property or other right owned or licensable by You, then all licenses granted to You under this Agreement shall terminate as of the date such lawsuit or other proceeding is filed. You will defend, indemnify and hold harmless Us from and against any claim by any Third Party arising out of or related to Your or the Third Party’s use or distribution of the Tencent Hunyuan Works.
39
+ d. Tencent claims no rights in Outputs You generate. You and Your users are solely responsible for Outputs and their subsequent uses.
40
+ 7. DISCLAIMERS OF WARRANTY AND LIMITATIONS OF LIABILITY.
41
+ a. We are not obligated to support, update, provide training for, or develop any further version of the Tencent Hunyuan Works or to grant any license thereto.
42
+ b. UNLESS AND ONLY TO THE EXTENT REQUIRED BY APPLICABLE LAW, THE TENCENT HUNYUAN WORKS AND ANY OUTPUT AND RESULTS THEREFROM ARE PROVIDED “AS IS” WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES OF ANY KIND INCLUDING ANY WARRANTIES OF TITLE, MERCHANTABILITY, NONINFRINGEMENT, COURSE OF DEALING, USAGE OF TRADE, OR FITNESS FOR A PARTICULAR PURPOSE. YOU ARE SOLELY RESPONSIBLE FOR DETERMINING THE APPROPRIATENESS OF USING, REPRODUCING, MODIFYING, PERFORMING, DISPLAYING OR DISTRIBUTING ANY OF THE TENCENT HUNYUAN WORKS OR OUTPUTS AND ASSUME ANY AND ALL RISKS ASSOCIATED WITH YOUR OR A THIRD PARTY’S USE OR DISTRIBUTION OF ANY OF THE TENCENT HUNYUAN WORKS OR OUTPUTS AND YOUR EXERCISE OF RIGHTS AND PERMISSIONS UNDER THIS AGREEMENT.
43
+ c. TO THE FULLEST EXTENT PERMITTED BY APPLICABLE LAW, IN NO EVENT SHALL TENCENT OR ITS AFFILIATES BE LIABLE UNDER ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, TORT, NEGLIGENCE, PRODUCTS LIABILITY, OR OTHERWISE, FOR ANY DAMAGES, INCLUDING ANY DIRECT, INDIRECT, SPECIAL, INCIDENTAL, EXEMPLARY, CONSEQUENTIAL OR PUNITIVE DAMAGES, OR LOST PROFITS OF ANY KIND ARISING FROM THIS AGREEMENT OR RELATED TO ANY OF THE TENCENT HUNYUAN WORKS OR OUTPUTS, EVEN IF TENCENT OR ITS AFFILIATES HAVE BEEN ADVISED OF THE POSSIBILITY OF ANY OF THE FOREGOING.
44
+ 8. SURVIVAL AND TERMINATION.
45
+ a. The term of this Agreement shall commence upon Your acceptance of this Agreement or access to the Materials and will continue in full force and effect until terminated in accordance with the terms and conditions herein.
46
+ b. We may terminate this Agreement if You breach any of the terms or conditions of this Agreement. Upon termination of this Agreement, You must promptly delete and cease use of the Tencent Hunyuan Works. Sections 6(a), 6(c), 7 and 9 shall survive the termination of this Agreement.
47
+ 9. GOVERNING LAW AND JURISDICTION.
48
+ a. This Agreement and any dispute arising out of or relating to it will be governed by the laws of the Hong Kong Special Administrative Region of the People’s Republic of China, without regard to conflict of law principles, and the UN Convention on Contracts for the International Sale of Goods does not apply to this Agreement.
49
+ b. Exclusive jurisdiction and venue for any dispute arising out of or relating to this Agreement will be a court of competent jurisdiction in the Hong Kong Special Administrative Region of the People’s Republic of China, and Tencent and Licensee consent to the exclusive jurisdiction of such court with respect to any such dispute.
50
+
51
+ EXHIBIT A
52
+ ACCEPTABLE USE POLICY
53
+
54
+ Tencent reserves the right to update this Acceptable Use Policy from time to time.
55
+ Last modified: November 5, 2024
56
+
57
+ Tencent endeavors to promote safe and fair use of its tools and features, including Tencent Hunyuan. You agree not to use Tencent Hunyuan or Model Derivatives:
58
+ 1. Outside the Territory;
59
+ 2. In any way that violates any applicable national, federal, state, local, international or any other law or regulation;
60
+ 3. To harm Yourself or others;
61
+ 4. To repurpose or distribute output from Tencent Hunyuan or any Model Derivatives to harm Yourself or others;
62
+ 5. To override or circumvent the safety guardrails and safeguards We have put in place;
63
+ 6. For the purpose of exploiting, harming or attempting to exploit or harm minors in any way;
64
+ 7. To generate or disseminate verifiably false information and/or content with the purpose of harming others or influencing elections;
65
+ 8. To generate or facilitate false online engagement, including fake reviews and other means of fake online engagement;
66
+ 9. To intentionally defame, disparage or otherwise harass others;
67
+ 10. To generate and/or disseminate malware (including ransomware) or any other content to be used for the purpose of harming electronic systems;
68
+ 11. To generate or disseminate personal identifiable information with the purpose of harming others;
69
+ 12. To generate or disseminate information (including images, code, posts, articles), and place the information in any public context (including through the use of bot-generated tweets), without expressly and conspicuously identifying that the information and/or content is machine generated;
70
+ 13. To impersonate another individual without consent, authorization, or legal right;
71
+ 14. To make high-stakes automated decisions in domains that affect an individual’s safety, rights or wellbeing (e.g., law enforcement, migration, medicine/health, management of critical infrastructure, safety components of products, essential services, credit, employment, housing, education, social scoring, or insurance);
72
+ 15. In a manner that violates or disrespects the social ethics and moral standards of other countries or regions;
73
+ 16. To perform, facilitate, threaten, incite, plan, promote or encourage violent extremism or terrorism;
74
+ 17. For any use intended to discriminate against or harm individuals or groups based on protected characteristics or categories, online or offline social behavior or known or predicted personal or personality characteristics;
75
+ 18. To intentionally exploit any of the vulnerabilities of a specific group of persons based on their age, social, physical or mental characteristics, in order to materially distort the behavior of a person pertaining to that group in a manner that causes or is likely to cause that person or another person physical or psychological harm;
76
+ 19. For military purposes;
77
+ 20. To engage in the unauthorized or unlicensed practice of any profession including, but not limited to, financial, legal, medical/health, or other professional practices.
HunyuanVideo-Foley/MANIFEST.in ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Include package metadata and documentation
2
+ include README.md
3
+ include LICENSE
4
+ include NOTICE
5
+ include DEVELOPMENT.md
6
+ include CLAUDE.md
7
+ include requirements.txt
8
+ include pyproject.toml
9
+ include pytest.ini
10
+
11
+ # Include configuration files
12
+ include configs/*.yaml
13
+ include configs/*.yml
14
+ recursive-include hunyuanvideo_foley/configs *.yaml *.yml
15
+
16
+ # Include test assets if any
17
+ include assets/*.csv
18
+ include assets/*.txt
19
+ recursive-include assets/test_videos *
20
+
21
+ # Include example scripts
22
+ include *.py
23
+ include *.sh
24
+
25
+ # Include test files
26
+ recursive-include tests *.py
27
+
28
+ # Exclude unnecessary files
29
+ global-exclude *.pyc
30
+ global-exclude *.pyo
31
+ global-exclude *~
32
+ global-exclude .DS_Store
33
+ global-exclude __pycache__
34
+ prune .git
35
+ prune .github
36
+ prune examples/*/outputs
37
+ prune **/__pycache__
38
+ prune **/*.pyc
HunyuanVideo-Foley/NOTICE ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Usage and Legal Notices:
2
+
3
+ Tencent is pleased to support the open source community by making Tencent HunyuanVideo-Foley available.
4
+
5
+ Copyright (C) 2025 Tencent. All rights reserved.
6
+
7
+ Tencent HunyuanVideo-Foley is licensed under TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT, which can be found in this repository called "LICENSE", except for the third-party components listed below. Tencent HunyuanVideo-Foley does not impose any additional limitations beyond what is outlined in the respective licenses of these third-party components. Users must comply with all terms and conditions of original licenses of these third-party components and must ensure that the usage of the third party components adheres to all relevant laws and regulations.
8
+
9
+ For avoidance of doubts, Tencent HunyuanVideo-Foley means the large language models and their software and algorithms, including trained model weights, parameters (including optimizer states), machine-learning model code, inference-enabling code, training-enabling code, fine-tuning enabling code and other elements of the foregoing made publicly available by Tencent in accordance with the TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
10
+
11
+
12
+ Other dependencies and licenses:
13
+
14
+
15
+ Open Source Software Licensed under the MIT License:
16
+ --------------------------------------------------------------------
17
+ 1. syncformer
18
+ Copyright (c) 2024 Vladimir Iashin
19
+
20
+
21
+ Terms of the MIT License:
22
+ --------------------------------------------------------------------
23
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
24
+
25
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
26
+
27
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
HunyuanVideo-Foley/README.md ADDED
@@ -0,0 +1,519 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <div align="center">
2
+
3
+ https://github.com/Tencent-Hunyuan/HunyuanVideo-Foley
4
+
5
+ <img src="assets/logo.png" alt="HunyuanVideo-Foley Logo" width="400">
6
+
7
+ <h4>Multimodal Diffusion with Representation Alignment for High-Fidelity Foley Audio Generation</h4>
8
+
9
+ <p align="center">
10
+ <strong>Professional-grade AI sound effect generation for video content creators</strong>
11
+ </p>
12
+
13
+ <div align="center">
14
+ <a href=https://github.com/Tencent-Hunyuan/HunyuanVideo-Foley target="_blank"><img src=https://img.shields.io/badge/Code-black.svg?logo=github height=22px></a>
15
+ <a href=https://szczesnys.github.io/hunyuanvideo-foley target="_blank"><img src=https://img.shields.io/badge/Page-bb8a2e.svg?logo=github height=22px></a>
16
+ <a href=https://huggingface.co/tencent/HunyuanVideo-Foley target="_blank"><img src=https://img.shields.io/badge/%F0%9F%A4%97%20Models-d96902.svg height=22px></a>
17
+ <a href=https://huggingface.co/spaces/tencent/HunyuanVideo-Foley target="_blank"><img src=https://img.shields.io/badge/%F0%9F%A4%97%20Demo-276cb4.svg height=22px></a>
18
+ <a href=https://arxiv.org/abs/2508.16930 target="_blank"><img src=https://img.shields.io/badge/Report-b5212f.svg?logo=arxiv height=22px></a>
19
+ <a href=https://x.com/TencentHunyuan target="_blank"><img src=https://img.shields.io/badge/Hunyuan-black.svg?logo=x height=22px></a>
20
+ <a href=https://discord.gg/YEyGGn6Bte target="_blank"><img src=https://img.shields.io/badge/Hunyuan-141984.svg?logo=discord height=22px></a>
21
+ </div>
22
+
23
+ </div>
24
+
25
+ ---
26
+
27
+ <div align="center">
28
+
29
+ ### 👥 **Authors**
30
+
31
+ <div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); padding: 20px; border-radius: 15px; margin: 20px 0;">
32
+
33
+ **Sizhe Shan**<sup>1,2*</sup> • **Qiulin Li**<sup>1,3*</sup> • **Yutao Cui**<sup>1</sup> • **Miles Yang**<sup>1</sup> • **Yuehai Wang**<sup>2</sup> • **Qun Yang**<sup>3</sup> • **Jin Zhou**<sup>1†</sup> • **Zhao Zhong**<sup>1</sup>
34
+
35
+ </div>
36
+
37
+ <div style="margin-top: 15px; font-size: 14px; color: #666;">
38
+
39
+ 🏢 <sup>1</sup>**Tencent Hunyuan** • 🎓 <sup>2</sup>**Zhejiang University** • ✈️ <sup>3</sup>**Nanjing University of Aeronautics and Astronautics**
40
+
41
+ *Equal contribution • †Project lead
42
+
43
+ </div>
44
+
45
+ </div>
46
+
47
+
48
+ ---
49
+
50
+ ## 🔥🔥🔥 **News**
51
+
52
+ <div style="background: linear-gradient(135deg, #e3f2fd 0%, #bbdefb 100%); padding: 20px; border-radius: 15px; margin: 20px 0; border-left: 5px solid #2196f3;">
53
+
54
+ - **[2025.9.29]** 🚀 **HunyuanVideo-Foley-XL Model Release** - Release XL-sized model with offload inference support, significantly reducing VRAM requirements.
55
+ - **[2025.8.28]** 🌟 **HunyuanVideo-Foley Open Source Release** - Inference code and model weights publicly available.
56
+
57
+ </div>
58
+
59
+ ---
60
+
61
+ ## 🎥 **Demo & Showcase**
62
+
63
+ <div align="center">
64
+
65
+ > **Experience the magic of AI-generated Foley audio in perfect sync with video content!**
66
+
67
+ <div style="border: 3px solid #4A90E2; border-radius: 15px; padding: 10px; margin: 20px 0; background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);">
68
+
69
+ <video src="https://github.com/user-attachments/assets/d6e1b6fd-6980-4a68-8717-74298d064195" width="80%" controls style="border-radius: 10px; box-shadow: 0 8px 32px rgba(0,0,0,0.1);"> </video>
70
+
71
+ <p><em>🎬 Watch how HunyuanVideo-Foley generates immersive sound effects synchronized with video content</em></p>
72
+
73
+ </div>
74
+
75
+ ---
76
+
77
+ ## 🤝 **Community Contributions**
78
+
79
+ <div style="background: #f8f9fa; padding: 20px; border-radius: 10px; border-left: 4px solid #28a745; margin: 20px 0; color: #333;">
80
+
81
+ **ComfyUI Integration** - Thanks to the amazing community for creating ComfyUI nodes:
82
+
83
+ - **[if-ai/ComfyUI_HunyuanVideoFoley](https://github.com/if-ai/ComfyUI_HunyuanVideoFoley)** - ComfyUI workflow integration, which supports CPU offloading and FP8 quantization
84
+ - **[phazei/ComfyUI-HunyuanVideo-Foley](https://github.com/phazei/ComfyUI-HunyuanVideo-Foley)** - Alternative ComfyUI node implementation, which supports different precision modes
85
+
86
+ </div>
87
+
88
+ <div align="center" style="margin: 20px 0;">
89
+
90
+ **🌟 We encourage and appreciate community contributions that make HunyuanVideo-Foley more accessible!**
91
+
92
+ </div>
93
+
94
+ ---
95
+ ### ✨ **Key Highlights**
96
+
97
+ <table align="center" style="border: none; margin: 20px 0;">
98
+ <tr>
99
+ <td align="center" width="33%">
100
+
101
+ 🎭 **Multi-scenario Sync**
102
+ High-quality audio synchronized with complex video scenes
103
+
104
+ </td>
105
+ <td align="center" width="33%">
106
+
107
+ 🧠 **Multi-modal Balance**
108
+ Perfect harmony between visual and textual information
109
+
110
+ </td>
111
+ <td align="center" width="33%">
112
+
113
+ 🎵 **48kHz Hi-Fi Output**
114
+ Professional-grade audio generation with crystal clarity
115
+
116
+ </td>
117
+ </tr>
118
+ </table>
119
+
120
+ </div>
121
+
122
+ ---
123
+
124
+ ## 📄 **Abstract**
125
+
126
+ <div align="center" style="background: linear-gradient(135deg, #ffeef8 0%, #f0f8ff 100%); padding: 30px; border-radius: 20px; margin: 20px 0; border-left: 5px solid #ff6b9d; color: #333;">
127
+
128
+ **🚀 Tencent Hunyuan** open-sources **HunyuanVideo-Foley**, an end-to-end video sound effect generation model!
129
+
130
+ *A professional-grade AI tool specifically designed for video content creators, widely applicable to diverse scenarios including short video creation, film production, advertising creativity, and game development.*
131
+
132
+ </div>
133
+
134
+ ### 🎯 **Core Highlights**
135
+
136
+ <div style="display: grid; grid-template-columns: 1fr; gap: 15px; margin: 20px 0;">
137
+
138
+ <div style="border-left: 4px solid #4CAF50; padding: 15px; background: #f8f9fa; border-radius: 8px; color: #333;">
139
+
140
+ **🎬 Multi-scenario Audio-Visual Synchronization**
141
+ Supports generating high-quality audio that is synchronized and semantically aligned with complex video scenes, enhancing realism and immersive experience for film/TV and gaming applications.
142
+
143
+ </div>
144
+
145
+ <div style="border-left: 4px solid #2196F3; padding: 15px; background: #f8f9fa; border-radius: 8px; color: #333;">
146
+
147
+ **⚖️ Multi-modal Semantic Balance**
148
+ Intelligently balances visual and textual information analysis, comprehensively orchestrates sound effect elements, avoids one-sided generation, and meets personalized dubbing requirements.
149
+
150
+ </div>
151
+
152
+ <div style="border-left: 4px solid #FF9800; padding: 15px; background: #f8f9fa; border-radius: 8px; color: #333;">
153
+
154
+ **🎵 High-fidelity Audio Output**
155
+ Self-developed 48kHz audio VAE perfectly reconstructs sound effects, music, and vocals, achieving professional-grade audio generation quality.
156
+
157
+ </div>
158
+
159
+ </div>
160
+
161
+ <div align="center" style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 20px; border-radius: 15px; margin: 20px 0; color: #333;">
162
+
163
+ **🏆 SOTA Performance Achieved**
164
+
165
+ *HunyuanVideo-Foley comprehensively leads the field across multiple evaluation benchmarks, achieving new state-of-the-art levels in audio fidelity, visual-semantic alignment, temporal alignment, and distribution matching - surpassing all open-source solutions!*
166
+
167
+ </div>
168
+
169
+ <div align="center">
170
+
171
+ ![Performance Overview](assets/pan_chart.png)
172
+ *📊 Performance comparison across different evaluation metrics - HunyuanVideo-Foley leads in all categories*
173
+
174
+ </div>
175
+
176
+ ---
177
+
178
+ ## 🔧 **Technical Architecture**
179
+
180
+ ### 📊 **Data Pipeline Design**
181
+
182
+ <div align="center" style="margin: 20px 0; color: #333;">
183
+
184
+ ![Data Pipeline](assets/data_pipeline.png)
185
+ *🔄 Comprehensive data processing pipeline for high-quality text-video-audio datasets*
186
+
187
+ </div>
188
+
189
+ <div style="background: #f8f9fa; padding: 20px; border-radius: 10px; border-left: 4px solid #17a2b8; margin: 20px 0;">
190
+
191
+ The **TV2A (Text-Video-to-Audio)** task presents a complex multimodal generation challenge requiring large-scale, high-quality datasets. Our comprehensive data pipeline systematically identifies and excludes unsuitable content to produce robust and generalizable audio generation capabilities.
192
+
193
+ </div>
194
+
195
+ ### 🏗️ **Model Architecture**
196
+
197
+ <div align="center" style="margin: 20px 0; color: #333;">
198
+
199
+ ![Model Architecture](assets/model_arch.png)
200
+ *🧠 HunyuanVideo-Foley hybrid architecture with multimodal and unimodal transformer blocks*
201
+
202
+ </div>
203
+
204
+ <div style="background: #f8f9fa; padding: 20px; border-radius: 10px; border-left: 4px solid #28a745; margin: 20px 0;">
205
+
206
+ **HunyuanVideo-Foley** employs a sophisticated hybrid architecture:
207
+
208
+ - **🔄 Multimodal Transformer Blocks**: Process visual-audio streams simultaneously
209
+ - **🎵 Unimodal Transformer Blocks**: Focus on audio stream refinement
210
+ - **👁️ Visual Encoding**: Pre-trained encoder extracts visual features from video frames
211
+ - **📝 Text Processing**: Semantic features extracted via pre-trained text encoder
212
+ - **🎧 Audio Encoding**: Latent representations with Gaussian noise perturbation
213
+ - **⏰ Temporal Alignment**: Synchformer-based frame-level synchronization with gated modulation
214
+
215
+ </div>
216
+
217
+ ---
218
+
219
+ ## 📈 **Performance Benchmarks**
220
+
221
+ ### 🎬 **MovieGen-Audio-Bench Results**
222
+
223
+ <div align="center">
224
+
225
+ > *Objective and Subjective evaluation results demonstrating superior performance across all metrics*
226
+
227
+ </div>
228
+
229
+ <div style="overflow-x: auto; margin: 20px 0;">
230
+
231
+ | 🏆 **Method** | **PQ** ↑ | **PC** ↓ | **CE** ↑ | **CU** ↑ | **IB** ↑ | **DeSync** ↓ | **CLAP** ↑ | **MOS-Q** ↑ | **MOS-S** ↑ | **MOS-T** ↑ |
232
+ |:-------------:|:--------:|:--------:|:--------:|:--------:|:--------:|:-------------:|:-----------:|:------------:|:------------:|:------------:|
233
+ | FoleyGrafter | 6.27 | 2.72 | 3.34 | 5.68 | 0.17 | 1.29 | 0.14 | 3.36±0.78 | 3.54±0.88 | 3.46±0.95 |
234
+ | V-AURA | 5.82 | 4.30 | 3.63 | 5.11 | 0.23 | 1.38 | 0.14 | 2.55±0.97 | 2.60±1.20 | 2.70±1.37 |
235
+ | Frieren | 5.71 | 2.81 | 3.47 | 5.31 | 0.18 | 1.39 | 0.16 | 2.92±0.95 | 2.76±1.20 | 2.94±1.26 |
236
+ | MMAudio | 6.17 | 2.84 | 3.59 | 5.62 | 0.27 | 0.80 | 0.35 | 3.58±0.84 | 3.63±1.00 | 3.47±1.03 |
237
+ | ThinkSound | 6.04 | 3.73 | 3.81 | 5.59 | 0.18 | 0.91 | 0.20 | 3.20±0.97 | 3.01±1.04 | 3.02±1.08 |
238
+ | **HunyuanVideo-Foley (ours)** | **6.59** | **2.74** | **3.88** | **6.13** | **0.35** | **0.74** | **0.33** | **4.14±0.68** | **4.12±0.77** | **4.15±0.75** |
239
+
240
+ </div>
241
+
242
+
243
+ ### 🎯 **Kling-Audio-Eval Results**
244
+
245
+ <div align="center">
246
+
247
+ > *Comprehensive objective evaluation showcasing state-of-the-art performance*
248
+
249
+ </div>
250
+
251
+ <div style="overflow-x: auto; margin: 20px 0;">
252
+
253
+ | 🏆 **Method** | **FD_PANNs** ↓ | **FD_PASST** ↓ | **KL** ↓ | **IS** ↑ | **PQ** ↑ | **PC** ↓ | **CE** ↑ | **CU** ↑ | **IB** ↑ | **DeSync** ↓ | **CLAP** ↑ |
254
+ |:-------------:|:--------------:|:--------------:|:--------:|:--------:|:--------:|:--------:|:--------:|:--------:|:--------:|:-------------:|:-----------:|
255
+ | FoleyGrafter | 22.30 | 322.63 | 2.47 | 7.08 | 6.05 | 2.91 | 3.28 | 5.44 | 0.22 | 1.23 | 0.22 |
256
+ | V-AURA | 33.15 | 474.56 | 3.24 | 5.80 | 5.69 | 3.98 | 3.13 | 4.83 | 0.25 | 0.86 | 0.13 |
257
+ | Frieren | 16.86 | 293.57 | 2.95 | 7.32 | 5.72 | 2.55 | 2.88 | 5.10 | 0.21 | 0.86 | 0.16 |
258
+ | MMAudio | 9.01 | 205.85 | 2.17 | 9.59 | 5.94 | 2.91 | 3.30 | 5.39 | 0.30 | 0.56 | 0.27 |
259
+ | ThinkSound | 9.92 | 228.68 | 2.39 | 6.86 | 5.78 | 3.23 | 3.12 | 5.11 | 0.22 | 0.67 | 0.22 |
260
+ | **HunyuanVideo-Foley (ours)** | **6.07** | **202.12** | **1.89** | **8.30** | **6.12** | **2.76** | **3.22** | **5.53** | **0.38** | **0.54** | **0.24** |
261
+
262
+ </div>
263
+
264
+ <div align="center" style="background: linear-gradient(135deg, #4CAF50 0%, #45a049 100%); color: white; padding: 15px; border-radius: 10px; margin: 20px 0; color: #333;">
265
+
266
+ **🎉 Outstanding Results!** HunyuanVideo-Foley achieves the best scores across **ALL** evaluation metrics, demonstrating significant improvements in audio quality, synchronization, and semantic alignment.
267
+
268
+ </div>
269
+
270
+
271
+
272
+ ---
273
+
274
+ ## 🚀 **Quick Start**
275
+
276
+ ### 📦 **Installation**
277
+
278
+ <div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 20px; border-radius: 15px; margin: 20px 0; color: #333;">
279
+
280
+ **🔧 System Requirements**
281
+ - **CUDA**: 12.4 or 11.8 recommended
282
+ - **Python**: 3.8+
283
+ - **OS**: Linux (primary support)
284
+ - **VRAM**: 20GB for XXL model (or 12GB with `--enable_offload`), 16GB for XL model (or 8GB with `--enable_offload`)
285
+
286
+ </div>
287
+
288
+ #### **Step 1: Clone Repository**
289
+
290
+ ```bash
291
+ # 📥 Clone the repository
292
+ git clone https://github.com/Tencent-Hunyuan/HunyuanVideo-Foley
293
+ cd HunyuanVideo-Foley
294
+ ```
295
+
296
+ #### **Step 2: Environment Setup**
297
+
298
+ <div style="background: #fff3cd; padding: 15px; border-radius: 8px; border-left: 4px solid #ffc107; margin: 10px 0; color: #333;">
299
+
300
+ 💡 **Tip**: We recommend using [Conda](https://docs.anaconda.com/free/miniconda/index.html) for Python environment management.
301
+
302
+ </div>
303
+
304
+ ```bash
305
+ # 🔧 Install dependencies
306
+ pip install -r requirements.txt
307
+ ```
308
+
309
+ #### **Step 3: Download Pretrained Models**
310
+
311
+ <div style="background: #d1ecf1; padding: 15px; border-radius: 8px; border-left: 4px solid #17a2b8; margin: 10px 0;color: #333;">
312
+
313
+ 🔗 **Download Model weights from Huggingface**
314
+ ```bash
315
+ # using git-lfs
316
+ git clone https://huggingface.co/tencent/HunyuanVideo-Foley
317
+
318
+ # using huggingface-cli
319
+ huggingface-cli download tencent/HunyuanVideo-Foley
320
+ ```
321
+
322
+ <!-- 🔗 **Download Model weights from ModelScope** -->
323
+ <!-- ```bash -->
324
+ <!-- # using git-lfs -->
325
+ <!-- git clone https://huggingface.co/tencent/HunyuanVideo-Foley -->
326
+ <!-- -->
327
+ <!-- # using huggingface-cli -->
328
+ <!-- huggingface-cli download tencent/HunyuanVideo-Foley -->
329
+ <!-- ``` -->
330
+
331
+ </div>
332
+
333
+
334
+ ---
335
+
336
+ ## 💻 **Usage**
337
+
338
+ ### 📊 **Model Specifications**
339
+
340
+ | Model | Checkpoint | VRAM (Normal) | VRAM (Offload) |
341
+ |-------|------------|---------------|----------------|
342
+ | **XXL** *(Default)* | `hunyuanvideo_foley.pth` | 20GB | 12GB |
343
+ | **XL** | `hunyuanvideo_foley_xl.pth` | 16GB | 8GB |
344
+
345
+ ### 🎬 **Single Video Generation**
346
+
347
+ <div style="background: #e8f5e8; padding: 15px; border-radius: 8px; border-left: 4px solid #28a745; margin: 10px 0;color: #333;">
348
+
349
+ Generate Foley audio for a single video file with text description:
350
+
351
+ </div>
352
+
353
+ ```bash
354
+ # Use XXL model (default, best quality)
355
+ python3 infer.py \
356
+ --model_path PRETRAINED_MODEL_PATH_DIR \
357
+ --single_video video_path \
358
+ --single_prompt "audio description" \
359
+ --output_dir OUTPUT_DIR \
360
+ # --enable_offload
361
+
362
+ # Use XL model (memory-friendly)
363
+ python3 infer.py \
364
+ --model_path PRETRAINED_MODEL_PATH_DIR \
365
+ --model_size xl \
366
+ --single_video video_path \
367
+ --single_prompt "audio description" \
368
+ --output_dir OUTPUT_DIR \
369
+ # --enable_offload
370
+ ```
371
+
372
+ ### 📂 **Batch Processing**
373
+
374
+ <div style="background: #fff3e0; padding: 15px; border-radius: 8px; border-left: 4px solid #ff9800; margin: 10px 0;color: #333;">
375
+
376
+ Process multiple videos using a CSV file with video paths and descriptions:
377
+
378
+ </div>
379
+
380
+ ```bash
381
+ # Download sample test videos
382
+ bash ./download_test_videos.sh
383
+
384
+ # Batch processing
385
+ python3 infer.py \
386
+ --model_path PRETRAINED_MODEL_PATH_DIR \
387
+ --csv_path assets/test.csv \
388
+ --output_dir OUTPUT_DIR \
389
+ # --enable_offload
390
+ ```
391
+
392
+ ### 🌐 **Interactive Web Interface**
393
+
394
+ <div style="background: #f3e5f5; padding: 15px; border-radius: 8px; border-left: 4px solid #9c27b0; margin: 10px 0;color: #333;">
395
+
396
+ Launch a user-friendly Gradio web interface for easy interaction:
397
+
398
+ </div>
399
+
400
+ ```bash
401
+ # Launch with XXL model (default)
402
+ export HIFI_FOLEY_MODEL_PATH=PRETRAINED_MODEL_PATH_DIR
403
+ python3 gradio_app.py
404
+
405
+ # Launch with XL model (memory-friendly)
406
+ export HIFI_FOLEY_MODEL_PATH=PRETRAINED_MODEL_PATH_DIR
407
+ MODEL_SIZE=xl python3 gradio_app.py
408
+
409
+ # Optional: Enable offload to reduce memory usage
410
+ ENABLE_OFFLOAD=true python3 gradio_app.py
411
+ ```
412
+
413
+ <div align="center" style="margin: 20px 0; color: #333;">
414
+
415
+ *🚀 Then open your browser and navigate to the provided local URL to start generating Foley audio!*
416
+
417
+ </div>
418
+
419
+ ---
420
+
421
+ ## 📚 **Citation**
422
+
423
+ <div style="background: #f8f9fa; padding: 20px; border-radius: 10px; border-left: 4px solid #6c757d; margin: 20px 0; color: #333;">
424
+
425
+ If you find **HunyuanVideo-Foley** useful for your research, please consider citing our paper:
426
+
427
+ </div>
428
+
429
+ ```bibtex
430
+ @misc{shan2025hunyuanvideofoleymultimodaldiffusionrepresentation,
431
+ title={HunyuanVideo-Foley: Multimodal Diffusion with Representation Alignment for High-Fidelity Foley Audio Generation},
432
+ author={Sizhe Shan and Qiulin Li and Yutao Cui and Miles Yang and Yuehai Wang and Qun Yang and Jin Zhou and Zhao Zhong},
433
+ year={2025},
434
+ eprint={2508.16930},
435
+ archivePrefix={arXiv},
436
+ primaryClass={eess.AS},
437
+ url={https://arxiv.org/abs/2508.16930},
438
+ }
439
+ ```
440
+ ## Star History
441
+
442
+ [![Star History Chart](https://api.star-history.com/svg?repos=Tencent-Hunyuan/HunyuanVideo-Foley&type=Date)](https://www.star-history.com/#Tencent-Hunyuan/HunyuanVideo-Foley&Date)
443
+ ---
444
+
445
+ ## 🙏 **Acknowledgements**
446
+
447
+ <div align="center">
448
+
449
+ **We extend our heartfelt gratitude to the open-source community!**
450
+
451
+ </div>
452
+
453
+ <table align="center" style="width: 100%; border: none; margin: 20px 0;">
454
+ <tr>
455
+ <td align="center" style="width: 33%; padding: 10px; vertical-align: top;">
456
+
457
+ 🎨 **[Stable Diffusion 3](https://huggingface.co/stabilityai/stable-diffusion-3-medium)**
458
+ *Foundation diffusion models*
459
+
460
+ </td>
461
+ <td align="center" style="width: 33%; padding: 10px; vertical-align: top;">
462
+
463
+ ⚡ **[FLUX](https://github.com/black-forest-labs/flux)**
464
+ *Advanced generation techniques*
465
+
466
+ </td>
467
+ <td align="center" style="width: 33%; padding: 10px; vertical-align: top;">
468
+
469
+ 🎵 **[MMAudio](https://github.com/hkchengrex/MMAudio)**
470
+ *Multimodal audio generation*
471
+
472
+ </td>
473
+ </tr>
474
+ <tr>
475
+ <td align="center" style="width: 33%; padding: 10px; vertical-align: top;">
476
+
477
+ 🤗 **[HuggingFace](https://huggingface.co)**
478
+ *Platform & diffusers library*
479
+
480
+ </td>
481
+ <td align="center" style="width: 33%; padding: 10px; vertical-align: top;">
482
+
483
+ 🗜️ **[DAC](https://github.com/descriptinc/descript-audio-codec)**
484
+ *High-Fidelity Audio Compression*
485
+
486
+ </td>
487
+ <td align="center" style="width: 33%; padding: 10px; vertical-align: top;">
488
+
489
+ 🔗 **[Synchformer](https://github.com/v-iashin/Synchformer)**
490
+ *Audio-Visual Synchronization*
491
+
492
+ </td>
493
+ </tr>
494
+ </table>
495
+
496
+ <div align="center" style="background: linear-gradient(135deg, #74b9ff 0%, #0984e3 100%); color: white; padding: 20px; border-radius: 15px; margin: 20px 0;, color: #333;">
497
+
498
+ **🌟 Special thanks to all researchers and developers who contribute to the advancement of AI-generated audio and multimodal learning!**
499
+
500
+ </div>
501
+
502
+
503
+ ---
504
+
505
+ <div align="center" style="margin: 30px 0;">
506
+
507
+ ### 🔗 **Connect with Us**
508
+
509
+ [![GitHub](https://img.shields.io/badge/GitHub-Follow-black?style=for-the-badge&logo=github)](https://github.com/Tencent-Hunyuan)
510
+ [![Twitter](https://img.shields.io/badge/Twitter-Follow-blue?style=for-the-badge&logo=twitter)](https://twitter.com/Tencent)
511
+ [![Hunyuan](https://img.shields.io/badge/Website-HunyuanAI-green?style=for-the-badge&logo=hunyuan)](https://hunyuan.tencent.com/)
512
+
513
+ <p style="color: #666; margin-top: 15px; font-size: 14px;">
514
+
515
+ © 2025 Tencent Hunyuan. All rights reserved. | Made with ❤️ for the AI community
516
+
517
+ </p>
518
+
519
+ </div>
HunyuanVideo-Foley/build_package.sh ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ # 构建 HunyuanVideo-Foley Python 包的脚本
3
+
4
+ set -e # 出现错误时退出
5
+
6
+ echo "🚀 开始构建 HunyuanVideo-Foley Python 包..."
7
+
8
+ # 清理之前的构建文件
9
+ echo "🧹 清理之前的构建文件..."
10
+ rm -rf build/ dist/ *.egg-info/
11
+
12
+ # 检查必要的工具
13
+ echo "🔍 检查构建工具..."
14
+ python -c "import setuptools, wheel; print('✅ setuptools和wheel已安装')" || {
15
+ echo "❌ 请安装构建工具: pip install setuptools wheel"
16
+ exit 1
17
+ }
18
+
19
+ # 检查setup.py
20
+ echo "🔍 验证setup.py配置..."
21
+ python setup.py check --restructuredtext --strict || {
22
+ echo "⚠️ setup.py验证有警告,但继续构建..."
23
+ }
24
+
25
+ # 构建源码分发包
26
+ echo "📦 构建源码分发包..."
27
+ python setup.py sdist
28
+
29
+ # 构建wheel包
30
+ echo "🎡 构建wheel包..."
31
+ python setup.py bdist_wheel
32
+
33
+ # 显示构建结果
34
+ echo "✅ 构建完成!生成的包:"
35
+ ls -la dist/
36
+
37
+ # 验证包
38
+ echo "🔍 验证生成的包..."
39
+ python -m pip check dist/*.whl || echo "⚠️ 包验证有警告"
40
+
41
+ echo ""
42
+ echo "📝 安装说明:"
43
+ echo "# 从wheel文件安装:"
44
+ echo "pip install dist/hunyuanvideo_foley-1.0.0-py3-none-any.whl"
45
+ echo ""
46
+ echo "# 开发模式安装:"
47
+ echo "pip install -e ."
48
+ echo ""
49
+ echo "# 安装所有可选依赖:"
50
+ echo "pip install -e .[all]"
51
+ echo ""
52
+
53
+ echo "⚠️ 注意:某些依赖需要单独安装:"
54
+ echo "pip install git+https://github.com/descriptinc/audiotools"
55
+ echo "pip install git+https://github.com/huggingface/transformers@v4.49.0-SigLIP-2"
56
+
57
+ echo ""
58
+ echo "🎉 构建完成!查看 INSTALL.md 获取详细安装指南。"
HunyuanVideo-Foley/configs/hunyuanvideo-foley-xl.yaml ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model_config:
2
+ model_name: HunyuanVideo-Foley-XL
3
+ model_type: 1d
4
+ model_precision: bf16
5
+ model_kwargs:
6
+ depth_triple_blocks: 12
7
+ depth_single_blocks: 24
8
+ hidden_size: 1408
9
+ num_heads: 11
10
+ mlp_ratio: 4
11
+ mlp_act_type: "gelu_tanh"
12
+ qkv_bias: True
13
+ qk_norm: True
14
+ qk_norm_type: "rms"
15
+ attn_mode: "torch"
16
+ embedder_type: "default"
17
+ interleaved_audio_visual_rope: True
18
+ enable_learnable_empty_visual_feat: True
19
+ sync_modulation: False
20
+ add_sync_feat_to_audio: True
21
+ cross_attention: True
22
+ use_attention_mask: False
23
+ condition_projection: "linear"
24
+ sync_feat_dim: 768 # syncformer 768 dim
25
+ condition_dim: 768 # clap 768 text condition dim (clip-text)
26
+ clip_dim: 768 # siglip2 visual dim
27
+ audio_vae_latent_dim: 128
28
+ audio_frame_rate: 50
29
+ patch_size: 1
30
+ rope_dim_list: null
31
+ rope_theta: 10000
32
+ text_length: 77
33
+ clip_length: 64
34
+ sync_length: 192
35
+ depth_triple_ssl_encoder: null
36
+ depth_single_ssl_encoder: 8
37
+ use_repa_with_audiossl: True
38
+
39
+ diffusion_config:
40
+ denoise_type: "flow"
41
+ flow_path_type: "linear"
42
+ flow_predict_type: "velocity"
43
+ flow_reverse: True
44
+ flow_solver: "euler"
45
+ sample_flow_shift: 1.0
46
+ sample_use_flux_shift: False
47
+ flux_base_shift: 0.5
48
+ flux_max_shift: 1.15
HunyuanVideo-Foley/configs/hunyuanvideo-foley-xxl.yaml ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model_config:
2
+ model_name: HunyuanVideo-Foley-XXL
3
+ model_type: 1d
4
+ model_precision: bf16
5
+ model_kwargs:
6
+ depth_triple_blocks: 18
7
+ depth_single_blocks: 36
8
+ hidden_size: 1536
9
+ num_heads: 12
10
+ mlp_ratio: 4
11
+ mlp_act_type: "gelu_tanh"
12
+ qkv_bias: True
13
+ qk_norm: True
14
+ qk_norm_type: "rms"
15
+ attn_mode: "torch"
16
+ embedder_type: "default"
17
+ interleaved_audio_visual_rope: True
18
+ enable_learnable_empty_visual_feat: True
19
+ sync_modulation: False
20
+ add_sync_feat_to_audio: True
21
+ cross_attention: True
22
+ use_attention_mask: False
23
+ condition_projection: "linear"
24
+ sync_feat_dim: 768 # syncformer 768 dim
25
+ condition_dim: 768 # clap 768 text condition dim (clip-text)
26
+ clip_dim: 768 # siglip2 visual dim
27
+ audio_vae_latent_dim: 128
28
+ audio_frame_rate: 50
29
+ patch_size: 1
30
+ rope_dim_list: null
31
+ rope_theta: 10000
32
+ text_length: 77
33
+ clip_length: 64
34
+ sync_length: 192
35
+ depth_triple_ssl_encoder: null
36
+ depth_single_ssl_encoder: 8
37
+ use_repa_with_audiossl: True
38
+
39
+ diffusion_config:
40
+ denoise_type: "flow"
41
+ flow_path_type: "linear"
42
+ flow_predict_type: "velocity"
43
+ flow_reverse: True
44
+ flow_solver: "euler"
45
+ sample_flow_shift: 1.0
46
+ sample_use_flux_shift: False
47
+ flux_base_shift: 0.5
48
+ flux_max_shift: 1.15
HunyuanVideo-Foley/download_test_videos.sh ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # Download MoviegenAudioBenchSfx 10 videos
4
+ curl -O https://texttoaudio-train-1258344703.cos.ap-guangzhou.myqcloud.com/hunyuanvideo-foley_demo/MovieGenAudioBenchSfx.tar.gz
5
+ tar -xzvf MovieGenAudioBenchSfx.tar.gz -C ./assets
6
+ rm MovieGenAudioBenchSfx.tar.gz
7
+
8
+ # Download gradio example video
9
+ curl -O https://texttoaudio-train-1258344703.cos.ap-guangzhou.myqcloud.com/hunyuanvideo-foley_demo/examples.tar.gz
10
+ tar -xvzf examples.tar.gz
11
+ rm examples.tar.gz
HunyuanVideo-Foley/gradio_app.py ADDED
@@ -0,0 +1,834 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import tempfile
3
+ import gradio as gr
4
+ import torch
5
+ import torchaudio
6
+ from loguru import logger
7
+ from typing import Optional, Tuple
8
+ import random
9
+ import numpy as np
10
+
11
+ from hunyuanvideo_foley.utils.model_utils import load_model
12
+ from hunyuanvideo_foley.utils.feature_utils import feature_process
13
+ from hunyuanvideo_foley.utils.model_utils import denoise_process
14
+ from hunyuanvideo_foley.utils.media_utils import merge_audio_video
15
+
16
+ # Global variables for model storage
17
+ model_dict = None
18
+ cfg = None
19
+ device = None
20
+
21
+ # need to modify the model path
22
+ MODEL_PATH = os.environ.get("HIFI_FOLEY_MODEL_PATH", "./pretrained_models/")
23
+ ENABLE_OFFLOAD = os.environ.get("ENABLE_OFFLOAD", "false").lower() in ("true", "1", "yes")
24
+ MODEL_SIZE = os.environ.get("MODEL_SIZE", "xxl") # default to xxl model
25
+ CONFIG_PATH = os.environ.get("CONFIG_PATH", "")
26
+
27
+ def setup_device(device_str: str = "auto", gpu_id: int = 0) -> torch.device:
28
+ """Setup computing device"""
29
+ if device_str == "auto":
30
+ if torch.cuda.is_available():
31
+ device = torch.device(f"cuda:{gpu_id}")
32
+ logger.info(f"Using CUDA device: {device}")
33
+ elif torch.backends.mps.is_available():
34
+ device = torch.device("mps")
35
+ logger.info("Using MPS device")
36
+ else:
37
+ device = torch.device("cpu")
38
+ logger.info("Using CPU device")
39
+ else:
40
+ if device_str == "cuda":
41
+ device = torch.device(f"cuda:{gpu_id}")
42
+ else:
43
+ device = torch.device(device_str)
44
+ logger.info(f"Using specified device: {device}")
45
+
46
+ return device
47
+
48
+ def auto_load_models() -> str:
49
+ """Automatically load preset models"""
50
+ global model_dict, cfg, device
51
+
52
+ try:
53
+ if not os.path.exists(MODEL_PATH):
54
+ return f"❌ Model directory not found: {MODEL_PATH}"
55
+
56
+ # Use GPU by default
57
+ device = setup_device("auto", 0)
58
+
59
+ # Auto-select config if not specified
60
+ config_path = CONFIG_PATH
61
+ if not config_path:
62
+ config_mapping = {
63
+ "xl": "configs/hunyuanvideo-foley-xl.yaml",
64
+ "xxl": "configs/hunyuanvideo-foley-xxl.yaml"
65
+ }
66
+ config_path = config_mapping.get(MODEL_SIZE, "configs/hunyuanvideo-foley-xxl.yaml")
67
+
68
+ # Load model
69
+ logger.info("Auto-loading model...")
70
+ logger.info(f"Model path: {MODEL_PATH}")
71
+ logger.info(f"Model size: {MODEL_SIZE}")
72
+ logger.info(f"Config path: {config_path}")
73
+ logger.info(f"Offload mode: {'enabled' if ENABLE_OFFLOAD else 'disabled'}")
74
+
75
+ model_dict, cfg = load_model(MODEL_PATH, config_path, device, enable_offload=ENABLE_OFFLOAD, model_size=MODEL_SIZE)
76
+
77
+ logger.info("✅ Model loaded successfully!")
78
+ return "✅ Model loaded successfully!"
79
+
80
+ except Exception as e:
81
+ logger.error(f"Model loading failed: {str(e)}")
82
+ return f"❌ Model loading failed: {str(e)}"
83
+
84
+ def infer_single_video(
85
+ video_file,
86
+ text_prompt: str,
87
+ neg_prompt: str = None,
88
+ guidance_scale: float = 4.5,
89
+ num_inference_steps: int = 50,
90
+ sample_nums: int = 1
91
+ ) -> Tuple[list, str]:
92
+ """Single video inference"""
93
+ global model_dict, cfg, device
94
+
95
+ if model_dict is None or cfg is None:
96
+ return [], "❌ Please load the model first!"
97
+
98
+ if video_file is None:
99
+ return [], "❌ Please upload a video file!"
100
+
101
+ # Allow empty text prompt, use empty string if no prompt provided
102
+ if text_prompt is None:
103
+ text_prompt = ""
104
+ text_prompt = text_prompt.strip()
105
+
106
+ try:
107
+ logger.info(f"Processing video: {video_file}")
108
+ logger.info(f"Text prompt: {text_prompt}")
109
+
110
+ # Feature processing
111
+ visual_feats, text_feats, audio_len_in_s = feature_process(
112
+ video_file,
113
+ text_prompt,
114
+ model_dict,
115
+ cfg,
116
+ neg_prompt=neg_prompt
117
+ )
118
+
119
+ # Denoising process to generate multiple audio samples
120
+ # Note: The model now generates sample_nums audio samples per inference
121
+ # The denoise_process function returns audio with shape [batch_size, channels, samples]
122
+ logger.info(f"Generating {sample_nums} audio samples...")
123
+ audio, sample_rate = denoise_process(
124
+ visual_feats,
125
+ text_feats,
126
+ audio_len_in_s,
127
+ model_dict,
128
+ cfg,
129
+ guidance_scale=guidance_scale,
130
+ num_inference_steps=num_inference_steps,
131
+ batch_size=sample_nums
132
+ )
133
+
134
+ # Create temporary files to save results
135
+ temp_dir = tempfile.mkdtemp()
136
+ video_outputs = []
137
+
138
+ # Process each generated audio sample
139
+ for i in range(sample_nums):
140
+ # Save audio file
141
+ audio_output = os.path.join(temp_dir, f"generated_audio_{i+1}.wav")
142
+ torchaudio.save(audio_output, audio[i], sample_rate)
143
+
144
+ # Merge video and audio
145
+ video_output = os.path.join(temp_dir, f"video_with_audio_{i+1}.mp4")
146
+ merge_audio_video(audio_output, video_file, video_output)
147
+ video_outputs.append(video_output)
148
+
149
+ logger.info(f"Inference completed! Generated {sample_nums} samples.")
150
+ return video_outputs, f"✅ Generated {sample_nums} audio sample(s) successfully!"
151
+
152
+ except Exception as e:
153
+ logger.error(f"Inference failed: {str(e)}")
154
+ return [], f"❌ Inference failed: {str(e)}"
155
+
156
+ def update_video_outputs(video_list, status_msg):
157
+ """Update video outputs based on the number of generated samples"""
158
+ # Initialize all outputs as None
159
+ outputs = [None] * 6
160
+
161
+ # Set values based on generated videos
162
+ for i, video_path in enumerate(video_list[:6]): # Max 6 samples
163
+ outputs[i] = video_path
164
+
165
+ # Return all outputs plus status message
166
+ return tuple(outputs + [status_msg])
167
+
168
+ def create_gradio_interface():
169
+ """Create Gradio interface"""
170
+
171
+ # Custom CSS for beautiful interface with better contrast
172
+ css = """
173
+ .gradio-container {
174
+ font-family: 'Inter', -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif;
175
+ background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
176
+ min-height: 100vh;
177
+ }
178
+
179
+ .main-header {
180
+ text-align: center;
181
+ padding: 2rem 0;
182
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
183
+ border-radius: 20px;
184
+ margin-bottom: 2rem;
185
+ box-shadow: 0 8px 32px rgba(0,0,0,0.15);
186
+ }
187
+
188
+ .main-header h1 {
189
+ color: white;
190
+ font-size: 3rem;
191
+ font-weight: 700;
192
+ margin-bottom: 0.5rem;
193
+ text-shadow: 0 2px 10px rgba(0,0,0,0.3);
194
+ }
195
+
196
+ .main-header p {
197
+ color: rgba(255, 255, 255, 0.95);
198
+ font-size: 1.2rem;
199
+ font-weight: 300;
200
+ }
201
+
202
+ .status-card {
203
+ background: white;
204
+ border-radius: 15px;
205
+ padding: 1rem;
206
+ margin-bottom: 1.5rem;
207
+ border: 1px solid #e1e5e9;
208
+ box-shadow: 0 4px 20px rgba(0,0,0,0.08);
209
+ }
210
+
211
+ .status-card label {
212
+ color: #2d3748 !important;
213
+ font-weight: 600 !important;
214
+ }
215
+
216
+ .usage-guide h3 {
217
+ color: #2d3748 !important;
218
+ font-weight: 600 !important;
219
+ margin-bottom: 0.5rem !important;
220
+ }
221
+
222
+ .usage-guide p {
223
+ color: #4a5568 !important;
224
+ font-size: 1rem !important;
225
+ line-height: 1.6 !important;
226
+ margin: 0.5rem 0 !important;
227
+ }
228
+
229
+ .usage-guide strong {
230
+ color: #1a202c !important;
231
+ font-weight: 700 !important;
232
+ }
233
+
234
+ .usage-guide em {
235
+ color: #1a202c !important;
236
+ font-weight: 700 !important;
237
+ font-style: normal !important;
238
+ }
239
+
240
+ .main-interface {
241
+ margin-bottom: 2rem;
242
+ }
243
+
244
+ .input-section {
245
+ background: white;
246
+ border-radius: 20px;
247
+ padding: 2rem;
248
+ margin-right: 1rem;
249
+ box-shadow: 0 8px 32px rgba(0,0,0,0.1);
250
+ border: 1px solid #e1e5e9;
251
+ }
252
+
253
+ .input-section h3 {
254
+ color: #2d3748 !important;
255
+ font-weight: 600 !important;
256
+ margin-bottom: 1rem !important;
257
+ }
258
+
259
+ .input-section label {
260
+ color: #4a5568 !important;
261
+ font-weight: 500 !important;
262
+ }
263
+
264
+ .output-section {
265
+ background: white;
266
+ border-radius: 20px;
267
+ padding: 2rem;
268
+ margin-left: 1rem;
269
+ box-shadow: 0 8px 32px rgba(0,0,0,0.1);
270
+ border: 1px solid #e1e5e9;
271
+ }
272
+
273
+ .output-section h3 {
274
+ color: #2d3748 !important;
275
+ font-weight: 600 !important;
276
+ margin-bottom: 1rem !important;
277
+ }
278
+
279
+ .output-section label {
280
+ color: #4a5568 !important;
281
+ font-weight: 500 !important;
282
+ }
283
+
284
+ .examples-section h3 {
285
+ color: #2d3748 !important;
286
+ font-weight: 600 !important;
287
+ margin-bottom: 1.5rem !important;
288
+ }
289
+
290
+ .generate-btn {
291
+ background: linear-gradient(45deg, #667eea, #764ba2) !important;
292
+ border: none !important;
293
+ color: white !important;
294
+ font-weight: 600 !important;
295
+ font-size: 1.1rem !important;
296
+ padding: 12px 30px !important;
297
+ border-radius: 25px !important;
298
+ box-shadow: 0 4px 15px rgba(102, 126, 234, 0.4) !important;
299
+ transition: all 0.3s ease !important;
300
+ }
301
+
302
+ .generate-btn:hover {
303
+ transform: translateY(-2px) !important;
304
+ box-shadow: 0 8px 25px rgba(102, 126, 234, 0.6) !important;
305
+ }
306
+
307
+
308
+
309
+ .examples-section {
310
+ background: white;
311
+ border-radius: 20px;
312
+ padding: 2rem;
313
+ margin-top: 2rem;
314
+ box-shadow: 0 8px 32px rgba(0,0,0,0.1);
315
+ border: 1px solid #e1e5e9;
316
+ }
317
+
318
+ .examples-section p {
319
+ color: #4a5568 !important;
320
+ margin-bottom: 1rem !important;
321
+ }
322
+
323
+ .example-row {
324
+ background: #f8fafc;
325
+ border: 1px solid #e2e8f0;
326
+ border-radius: 15px;
327
+ padding: 1.5rem;
328
+ margin: 1rem 0;
329
+ transition: all 0.3s ease;
330
+ align-items: center;
331
+ }
332
+
333
+ .example-row:hover {
334
+ border-color: #667eea;
335
+ transform: translateY(-2px);
336
+ box-shadow: 0 4px 20px rgba(102, 126, 234, 0.15);
337
+ }
338
+
339
+ .example-row .markdown {
340
+ color: #2d3748 !important;
341
+ }
342
+
343
+ .example-row .markdown p {
344
+ color: #2d3748 !important;
345
+ margin: 0.5rem 0 !important;
346
+ line-height: 1.5 !important;
347
+ }
348
+
349
+ .example-row .markdown strong {
350
+ color: #1a202c !important;
351
+ font-weight: 600 !important;
352
+ }
353
+
354
+ /* Example grid layout styles */
355
+ .example-grid-row {
356
+ margin: 1rem 0;
357
+ gap: 1rem;
358
+ }
359
+
360
+ .example-item {
361
+ background: #f8fafc;
362
+ border: 1px solid #e2e8f0;
363
+ border-radius: 15px;
364
+ padding: 1rem;
365
+ transition: all 0.3s ease;
366
+ margin: 0.25rem;
367
+ max-width: 250px;
368
+ margin-left: auto;
369
+ margin-right: auto;
370
+ }
371
+
372
+ .example-item:hover {
373
+ border-color: #667eea;
374
+ transform: translateY(-2px);
375
+ box-shadow: 0 4px 20px rgba(102, 126, 234, 0.15);
376
+ }
377
+
378
+ .example-caption {
379
+ margin: 0.5rem 0 !important;
380
+ min-height: 2.8rem !important;
381
+ display: flex !important;
382
+ align-items: flex-start !important;
383
+ }
384
+
385
+ .example-caption p {
386
+ color: #2d3748 !important;
387
+ font-size: 0.9rem !important;
388
+ line-height: 1.4 !important;
389
+ margin: 0.5rem 0 !important;
390
+ }
391
+
392
+ /* Multi-video gallery styles */
393
+ .additional-samples {
394
+ margin-top: 1rem;
395
+ gap: 0.5rem;
396
+ }
397
+
398
+ .additional-samples .gradio-video {
399
+ border-radius: 10px;
400
+ overflow: hidden;
401
+ }
402
+
403
+ /* Video gallery responsive layout */
404
+ .video-gallery {
405
+ display: grid;
406
+ gap: 1rem;
407
+ margin-top: 1rem;
408
+ }
409
+
410
+ .video-gallery.single {
411
+ grid-template-columns: 1fr;
412
+ }
413
+
414
+ .video-gallery.dual {
415
+ grid-template-columns: 1fr 1fr;
416
+ }
417
+
418
+ .video-gallery.multi {
419
+ grid-template-columns: repeat(2, 1fr);
420
+ grid-template-rows: auto auto auto;
421
+ }
422
+
423
+ .footer-text {
424
+ color: #718096 !important;
425
+ text-align: center;
426
+ padding: 2rem;
427
+ font-size: 0.9rem;
428
+ }
429
+
430
+ /* Video component styling for consistent size */
431
+ .input-section video,
432
+ .output-section video,
433
+ .example-row video {
434
+ width: 100% !important;
435
+ height: 300px !important;
436
+ object-fit: contain !important;
437
+ border-radius: 10px !important;
438
+ background-color: #000 !important;
439
+ }
440
+
441
+ .example-row video {
442
+ height: 150px !important;
443
+ }
444
+
445
+ /* Fix for additional samples video display */
446
+ .additional-samples video {
447
+ height: 150px !important;
448
+ object-fit: contain !important;
449
+ border-radius: 10px !important;
450
+ background-color: #000 !important;
451
+ }
452
+
453
+ .additional-samples .gradio-video {
454
+ border-radius: 10px !important;
455
+ overflow: hidden !important;
456
+ background-color: #000 !important;
457
+ }
458
+
459
+ .additional-samples .gradio-video > div {
460
+ background-color: #000 !important;
461
+ border-radius: 10px !important;
462
+ }
463
+
464
+ /* Video container styling */
465
+ .input-section .video-container,
466
+ .output-section .video-container,
467
+ .example-row .video-container {
468
+ background-color: #000 !important;
469
+ border-radius: 10px !important;
470
+ display: flex !important;
471
+ align-items: center !important;
472
+ justify-content: center !important;
473
+ overflow: hidden !important;
474
+ }
475
+
476
+ /* Ensure proper alignment */
477
+ .example-row {
478
+ display: flex !important;
479
+ align-items: stretch !important;
480
+ }
481
+
482
+ .example-row > div {
483
+ display: flex !important;
484
+ flex-direction: column !important;
485
+ justify-content: center !important;
486
+ }
487
+
488
+ /* Video wrapper for better control */
489
+ .video-wrapper {
490
+ position: relative !important;
491
+ width: 100% !important;
492
+ background: #000 !important;
493
+ border-radius: 10px !important;
494
+ overflow: hidden !important;
495
+ display: flex !important;
496
+ align-items: center !important;
497
+ justify-content: center !important;
498
+ }
499
+ """
500
+
501
+ with gr.Blocks(css=css, title="HunyuanVideo-Foley") as app:
502
+
503
+ # Main header
504
+ with gr.Column(elem_classes=["main-header"]):
505
+ gr.HTML("""
506
+ <h1>🎵 HunyuanVideo-Foley</h1>
507
+ <p>Text-Video-to-Audio Synthesis: Generate realistic audio from video and text descriptions</p>
508
+ """)
509
+
510
+ # Usage Guide
511
+ with gr.Column(elem_classes=["status-card"]):
512
+ gr.Markdown("""
513
+ ### 📋 Quick Start Guide
514
+ **1.** Upload your video file\t**2.** Add optional text description\t**3.** Adjust sample numbers (1-6)\t**4.** Click Generate Audio
515
+
516
+ 💡 For quick start, you can load the prepared examples by clicking the button.
517
+ """, elem_classes=["usage-guide"])
518
+
519
+ # Main inference interface - Input and Results side by side
520
+ with gr.Row(elem_classes=["main-interface"]):
521
+ # Input section
522
+ with gr.Column(scale=1, elem_classes=["input-section"]):
523
+ gr.Markdown("### 📹 Video Input")
524
+
525
+ video_input = gr.Video(
526
+ label="Upload Video",
527
+ info="Supported formats: MP4, AVI, MOV, etc.",
528
+ height=300
529
+ )
530
+
531
+ text_input = gr.Textbox(
532
+ label="🎯 Audio Description (English)",
533
+ placeholder="A person walks on frozen ice",
534
+ lines=3,
535
+ info="Describe the audio you want to generate (optional)"
536
+ )
537
+
538
+ neg_prompt_input = gr.Textbox(
539
+ label="🚫 Negative Prompt",
540
+ placeholder="noisy, harsh",
541
+ lines=2,
542
+ info="Describe what you want to avoid in the generated audio (optional, default: 'noisy, harsh')"
543
+ )
544
+
545
+ with gr.Row():
546
+ guidance_scale = gr.Slider(
547
+ minimum=1.0,
548
+ maximum=10.0,
549
+ value=4.5,
550
+ step=0.1,
551
+ label="🎚️ CFG Scale",
552
+ )
553
+
554
+ inference_steps = gr.Slider(
555
+ minimum=10,
556
+ maximum=100,
557
+ value=50,
558
+ step=5,
559
+ label="⚡ Steps",
560
+ )
561
+
562
+ sample_nums = gr.Slider(
563
+ minimum=1,
564
+ maximum=6,
565
+ value=1,
566
+ step=1,
567
+ label="🎲 Sample Nums",
568
+ )
569
+
570
+ generate_btn = gr.Button(
571
+ "🎵 Generate Audio",
572
+ variant="primary",
573
+ elem_classes=["generate-btn"]
574
+ )
575
+
576
+ # Results section
577
+ with gr.Column(scale=1, elem_classes=["output-section"]):
578
+ gr.Markdown("### 🎥 Generated Results")
579
+
580
+ # Multi-video gallery for displaying multiple generated samples
581
+ with gr.Column():
582
+ # Primary video (Sample 1)
583
+ video_output_1 = gr.Video(
584
+ label="Sample 1",
585
+ height=250,
586
+ visible=True
587
+ )
588
+
589
+ # Additional videos (Samples 2-6) - initially hidden
590
+ with gr.Row(elem_classes=["additional-samples"]):
591
+ with gr.Column(scale=1):
592
+ video_output_2 = gr.Video(
593
+ label="Sample 2",
594
+ height=150,
595
+ visible=False
596
+ )
597
+ video_output_3 = gr.Video(
598
+ label="Sample 3",
599
+ height=150,
600
+ visible=False
601
+ )
602
+ with gr.Column(scale=1):
603
+ video_output_4 = gr.Video(
604
+ label="Sample 4",
605
+ height=150,
606
+ visible=False
607
+ )
608
+ video_output_5 = gr.Video(
609
+ label="Sample 5",
610
+ height=150,
611
+ visible=False
612
+ )
613
+
614
+ # Sample 6 - full width
615
+ video_output_6 = gr.Video(
616
+ label="Sample 6",
617
+ height=150,
618
+ visible=False
619
+ )
620
+
621
+ result_text = gr.Textbox(
622
+ label="Status",
623
+ interactive=False,
624
+ lines=2
625
+ )
626
+
627
+ # Examples section at the bottom
628
+ with gr.Column(elem_classes=["examples-section"]):
629
+ gr.Markdown("### 🌟 Examples")
630
+ gr.Markdown("Click on any example to load it into the interface above")
631
+
632
+ # Define your custom examples here - 8 examples total
633
+ examples_data = [
634
+ # Example 1
635
+ {
636
+ "caption": "A person walks on frozen ice",
637
+ "video_path": "examples/1_video.mp4",
638
+ "result_path": "examples/1_result.mp4"
639
+ },
640
+ # Example 2
641
+ {
642
+ "caption": "With a faint sound as their hands parted, the two embraced, a soft 'mm' escaping between them.",
643
+ "video_path": "examples/2_video.mp4",
644
+ "result_path": "examples/2_result.mp4"
645
+ },
646
+ # Example 3
647
+ {
648
+ "caption": "The sound of the number 3's bouncing footsteps is as light and clear as glass marbles hitting the ground. Each step carries a magical sound.",
649
+ "video_path": "examples/3_video.mp4",
650
+ "result_path": "examples/3_result.mp4"
651
+ },
652
+ # Example 4
653
+ {
654
+ "caption": "gentle gurgling of the stream's current, and music plays in the background which is a beautiful and serene piano solo with a hint of classical charm, evoking a sense of peace and serenity in people's hearts.",
655
+ "video_path": "examples/4_video.mp4",
656
+ "result_path": "examples/4_result.mp4"
657
+ },
658
+ # Example 5 - Add your new examples here
659
+ {
660
+ "caption": "snow crunching under the snowboard's edge.",
661
+ "video_path": "examples/5_video.mp4",
662
+ "result_path": "examples/5_result.mp4"
663
+ },
664
+ # Example 6
665
+ {
666
+ "caption": "The crackling of the fire, the whooshing of the flames, and the occasional crisp popping of charred leaves filled the forest.",
667
+ "video_path": "examples/6_video.mp4",
668
+ "result_path": "examples/6_result.mp4"
669
+ },
670
+ # Example 7
671
+ {
672
+ "caption": "humming of the scooter engine accelerates slowly.",
673
+ "video_path": "examples/7_video.mp4",
674
+ "result_path": "examples/7_result.mp4"
675
+ },
676
+ # Example 8
677
+ {
678
+ "caption": "splash of water and loud thud as person hits the surface.",
679
+ "video_path": "examples/8_video.mp4",
680
+ "result_path": "examples/8_result.mp4"
681
+ }
682
+ ]
683
+
684
+ # Create example grid - 4 examples per row, 2 rows total
685
+ example_buttons = []
686
+ for row in range(2): # 2 rows
687
+ with gr.Row(elem_classes=["example-grid-row"]):
688
+ for col in range(4): # 4 columns
689
+ idx = row * 4 + col
690
+ if idx < len(examples_data):
691
+ example = examples_data[idx]
692
+
693
+ with gr.Column(scale=1, elem_classes=["example-item"]):
694
+ # Video thumbnail
695
+ if os.path.exists(example['video_path']):
696
+ example_video = gr.Video(
697
+ value=example['video_path'],
698
+ label=f"Example {idx+1}",
699
+ interactive=False,
700
+ show_label=True,
701
+ height=180
702
+ )
703
+ else:
704
+ example_video = gr.HTML(f"""
705
+ <div style="background: #f0f0f0; padding: 15px; text-align: center; border-radius: 8px; height: 180px; display: flex; align-items: center; justify-content: center;">
706
+ <div>
707
+ <p style="color: #666; margin: 0; font-size: 12px;">📹 Video not found</p>
708
+ <small style="color: #999; font-size: 10px;">{example['video_path']}</small>
709
+ </div>
710
+ </div>
711
+ """)
712
+
713
+ # Caption (truncated for grid layout)
714
+ caption_preview = example['caption'][:60] + "..." if len(example['caption']) > 60 else example['caption']
715
+ gr.Markdown(f"{caption_preview}", elem_classes=["example-caption"])
716
+
717
+ # Load button
718
+ example_btn = gr.Button(
719
+ f"Load Example {idx+1}",
720
+ variant="secondary",
721
+ size="sm"
722
+ )
723
+ example_buttons.append((example_btn, example))
724
+
725
+ # Event handlers
726
+ def process_inference(video_file, text_prompt, neg_prompt, guidance_scale, inference_steps, sample_nums):
727
+ # Generate videos
728
+ video_list, status_msg = infer_single_video(
729
+ video_file, text_prompt, neg_prompt, guidance_scale, inference_steps, int(sample_nums)
730
+ )
731
+ # Update outputs with proper visibility
732
+ return update_video_outputs(video_list, status_msg)
733
+
734
+ # Add dynamic visibility control based on sample_nums
735
+ def update_visibility(sample_nums):
736
+ sample_nums = int(sample_nums)
737
+ return [
738
+ gr.update(visible=True), # Sample 1 always visible
739
+ gr.update(visible=sample_nums >= 2), # Sample 2
740
+ gr.update(visible=sample_nums >= 3), # Sample 3
741
+ gr.update(visible=sample_nums >= 4), # Sample 4
742
+ gr.update(visible=sample_nums >= 5), # Sample 5
743
+ gr.update(visible=sample_nums >= 6), # Sample 6
744
+ ]
745
+
746
+ # Update visibility when sample_nums changes
747
+ sample_nums.change(
748
+ fn=update_visibility,
749
+ inputs=[sample_nums],
750
+ outputs=[video_output_1, video_output_2, video_output_3, video_output_4, video_output_5, video_output_6]
751
+ )
752
+
753
+ generate_btn.click(
754
+ fn=process_inference,
755
+ inputs=[video_input, text_input, neg_prompt_input, guidance_scale, inference_steps, sample_nums],
756
+ outputs=[
757
+ video_output_1, # Sample 1 value
758
+ video_output_2, # Sample 2 value
759
+ video_output_3, # Sample 3 value
760
+ video_output_4, # Sample 4 value
761
+ video_output_5, # Sample 5 value
762
+ video_output_6, # Sample 6 value
763
+ result_text
764
+ ]
765
+ )
766
+
767
+ # Add click handlers for example buttons
768
+ for btn, example in example_buttons:
769
+ def create_example_handler(ex):
770
+ def handler():
771
+ # Check if files exist, if not, return placeholder message
772
+ if os.path.exists(ex['video_path']):
773
+ video_file = ex['video_path']
774
+ else:
775
+ video_file = None
776
+
777
+ if os.path.exists(ex['result_path']):
778
+ result_video = ex['result_path']
779
+ else:
780
+ result_video = None
781
+
782
+ status_msg = f"✅ Loaded example with caption: {ex['caption'][:50]}..."
783
+ if not video_file:
784
+ status_msg += f"\n⚠️ Video file not found: {ex['video_path']}"
785
+ if not result_video:
786
+ status_msg += f"\n⚠️ Result video not found: {ex['result_path']}"
787
+
788
+ return video_file, ex['caption'], "noisy, harsh", result_video, status_msg
789
+ return handler
790
+
791
+ btn.click(
792
+ fn=create_example_handler(example),
793
+ outputs=[video_input, text_input, neg_prompt_input, video_output_1, result_text]
794
+ )
795
+
796
+ # Footer
797
+ gr.HTML("""
798
+ <div class="footer-text">
799
+ <p>🚀 Powered by HunyuanVideo-Foley | Generate high-quality audio from video and text descriptions</p>
800
+ </div>
801
+ """)
802
+
803
+ return app
804
+
805
+ def set_manual_seed(global_seed):
806
+ random.seed(global_seed)
807
+ np.random.seed(global_seed)
808
+ torch.manual_seed(global_seed)
809
+
810
+ if __name__ == "__main__":
811
+ set_manual_seed(1)
812
+ # Setup logging
813
+ logger.remove()
814
+ logger.add(lambda msg: print(msg, end=''), level="INFO")
815
+
816
+ # Auto-load model
817
+ logger.info("Starting application and loading model...")
818
+ model_load_result = auto_load_models()
819
+ logger.info(model_load_result)
820
+
821
+ # Create and launch Gradio app
822
+ app = create_gradio_interface()
823
+
824
+ # Log completion status
825
+ if "successfully" in model_load_result:
826
+ logger.info("Application ready, model loaded")
827
+
828
+ app.launch(
829
+ server_name="0.0.0.0",
830
+ server_port=8080,
831
+ share=False,
832
+ debug=False,
833
+ show_error=True
834
+ )
HunyuanVideo-Foley/hunyuanvideo_foley/__init__.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ HunyuanVideo-Foley: Multimodal Diffusion with Representation Alignment
3
+ for High-Fidelity Foley Audio Generation
4
+
5
+ This package provides tools for generating high-quality Foley audio effects
6
+ from video content using multimodal diffusion models.
7
+ """
8
+
9
+ __version__ = "1.0.0"
10
+ __author__ = "Tencent Hunyuan Team"
11
+ __email__ = "hunyuan@tencent.com"
12
+
13
+ # Import main components for easy access
14
+ try:
15
+ from .utils.model_utils import load_model, denoise_process
16
+ from .utils.feature_utils import feature_process
17
+ from .utils.media_utils import merge_audio_video
18
+ from .utils.config_utils import AttributeDict
19
+
20
+ __all__ = [
21
+ "__version__",
22
+ "load_model",
23
+ "denoise_process",
24
+ "feature_process",
25
+ "merge_audio_video",
26
+ "AttributeDict"
27
+ ]
28
+ except ImportError:
29
+ # Handle missing dependencies gracefully during installation
30
+ __all__ = ["__version__"]
HunyuanVideo-Foley/hunyuanvideo_foley/cli.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Command Line Interface for HunyuanVideo-Foley
4
+
5
+ Provides command-line access to the main inference functionality.
6
+ """
7
+
8
+ import sys
9
+ import argparse
10
+ from pathlib import Path
11
+
12
def main():
    """Main CLI entry point.

    Parses command-line arguments, validates them, and dispatches to one of
    three modes: Gradio web UI, single-video inference, or CSV batch inference.
    Exits with status 1 on missing dependencies or any runtime error.
    """
    parser = argparse.ArgumentParser(
        description="HunyuanVideo-Foley: Generate Foley audio from video and text",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Single video generation
  hunyuanvideo-foley --model_path ./models --single_video video.mp4 --single_prompt "footsteps on gravel"

  # Batch processing
  hunyuanvideo-foley --model_path ./models --csv_path batch.csv --output_dir ./outputs

  # Start Gradio interface
  hunyuanvideo-foley --gradio --model_path ./models
        """
    )

    parser.add_argument("--model_path", type=str, required=True,
                        help="Path to the pretrained model directory")
    parser.add_argument("--config_path", type=str,
                        default="configs/hunyuanvideo-foley-xxl.yaml",
                        help="Path to the model configuration file")

    # Input options — exactly one of the three modes must be chosen.
    group_input = parser.add_mutually_exclusive_group(required=True)
    group_input.add_argument("--single_video", type=str,
                             help="Path to single video file for processing")
    group_input.add_argument("--csv_path", type=str,
                             help="Path to CSV file with video paths and prompts")
    group_input.add_argument("--gradio", action="store_true",
                             help="Launch Gradio web interface")

    # Generation options
    parser.add_argument("--single_prompt", type=str,
                        help="Text prompt for single video (required with --single_video)")
    parser.add_argument("--output_dir", type=str, default="./outputs",
                        help="Output directory for generated audio files")
    parser.add_argument("--guidance_scale", type=float, default=4.5,
                        help="Guidance scale for generation (default: 4.5)")
    parser.add_argument("--num_inference_steps", type=int, default=50,
                        help="Number of inference steps (default: 50)")
    parser.add_argument("--neg_prompt", type=str,
                        help="Negative prompt to avoid certain audio characteristics")

    # System options
    parser.add_argument("--device", type=str, default="auto",
                        choices=["auto", "cpu", "cuda"],
                        help="Device to use for inference")
    parser.add_argument("--gpu_id", type=int, default=0,
                        help="GPU ID to use (default: 0)")
    parser.add_argument("--seed", type=int, default=42,
                        help="Random seed for reproducible generation")

    args = parser.parse_args()

    # Validate arguments: argparse cannot express "A requires B", so enforce
    # the single_video/single_prompt pairing manually.
    if args.single_video and not args.single_prompt:
        parser.error("--single_prompt is required when using --single_video")

    # Import here to avoid import errors if dependencies are missing
    # (the helpers below import subprocess/os lazily inside themselves).
    try:
        if args.gradio:
            _launch_gradio(args)
        elif args.single_video:
            _process_single_video(args)
        elif args.csv_path:
            _process_batch(args)
    except ImportError as e:
        print(f"Error: Missing required dependencies. Please install with: pip install hunyuanvideo-foley[all]")
        print(f"Import error: {e}")
        sys.exit(1)
    except Exception as e:
        # Broad catch is deliberate at this top-level CLI boundary: report and
        # exit non-zero rather than dump a traceback at the user.
        print(f"Error: {e}")
        sys.exit(1)
87
+
88
+ def _launch_gradio(args):
89
+ """Launch Gradio web interface."""
90
+ import os
91
+ os.environ["HIFI_FOLEY_MODEL_PATH"] = args.model_path
92
+
93
+ # Import and launch gradio app
94
+ import subprocess
95
+ gradio_script = Path(__file__).parent.parent / "gradio_app.py"
96
+ subprocess.run([sys.executable, str(gradio_script)])
97
+
98
+ def _process_single_video(args):
99
+ """Process a single video file."""
100
+ from . import infer
101
+
102
+ print(f"Processing video: {args.single_video}")
103
+ print(f"Prompt: {args.single_prompt}")
104
+
105
+ # This would need to be implemented to match the actual infer.py interface
106
+ # For now, redirect to the original script
107
+ import subprocess
108
+ cmd = [
109
+ sys.executable, "infer.py",
110
+ "--model_path", args.model_path,
111
+ "--config_path", args.config_path,
112
+ "--single_video", args.single_video,
113
+ "--single_prompt", args.single_prompt,
114
+ "--output_dir", args.output_dir,
115
+ "--guidance_scale", str(args.guidance_scale),
116
+ "--num_inference_steps", str(args.num_inference_steps)
117
+ ]
118
+ if args.neg_prompt:
119
+ cmd.extend(["--neg_prompt", args.neg_prompt])
120
+
121
+ subprocess.run(cmd)
122
+
123
+ def _process_batch(args):
124
+ """Process a batch of videos from CSV."""
125
+ import subprocess
126
+ cmd = [
127
+ sys.executable, "infer.py",
128
+ "--model_path", args.model_path,
129
+ "--config_path", args.config_path,
130
+ "--csv_path", args.csv_path,
131
+ "--output_dir", args.output_dir,
132
+ "--guidance_scale", str(args.guidance_scale),
133
+ "--num_inference_steps", str(args.num_inference_steps)
134
+ ]
135
+ if args.neg_prompt:
136
+ cmd.extend(["--neg_prompt", args.neg_prompt])
137
+
138
+ subprocess.run(cmd)
139
+
140
if __name__ == "__main__":
    # Allow running this module directly as a script (python -m ... / python cli.py).
    main()
HunyuanVideo-Foley/hunyuanvideo_foley/constants.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Constants used throughout the HunyuanVideo-Foley project.

Pure data module: defaults, limits, and lookup tables shared by the CLI,
the Gradio app, and the inference utilities. No runtime side effects.
"""

from typing import Dict, List

# Model configuration
DEFAULT_AUDIO_SAMPLE_RATE = 48000  # Hz
DEFAULT_VIDEO_FPS = 25
DEFAULT_AUDIO_CHANNELS = 2  # stereo

# Video processing — accepted input duration bounds (seconds)
MAX_VIDEO_DURATION_SECONDS = 15.0
MIN_VIDEO_DURATION_SECONDS = 1.0

# Audio processing
AUDIO_VAE_LATENT_DIM = 128
AUDIO_FRAME_RATE = 75  # frames per second in latent space

# Visual features: sampling FPS per visual encoder
FPS_VISUAL: Dict[str, int] = {
    "siglip2": 8,
    "synchformer": 25
}

# Model paths (can be overridden by environment variables)
DEFAULT_MODEL_PATH = "./pretrained_models/"
DEFAULT_CONFIG_PATH = "configs/hunyuanvideo-foley-xxl.yaml"

# Inference parameters — defaults plus UI slider bounds
DEFAULT_GUIDANCE_SCALE = 4.5
DEFAULT_NUM_INFERENCE_STEPS = 50
MIN_GUIDANCE_SCALE = 1.0
MAX_GUIDANCE_SCALE = 10.0
MIN_INFERENCE_STEPS = 10
MAX_INFERENCE_STEPS = 100

# Text processing
MAX_TEXT_LENGTH = 100  # maximum prompt length in characters
DEFAULT_NEGATIVE_PROMPT = "noisy, harsh"

# File extensions
SUPPORTED_VIDEO_EXTENSIONS: List[str] = [".mp4", ".avi", ".mov", ".mkv", ".webm"]
SUPPORTED_AUDIO_EXTENSIONS: List[str] = [".wav", ".mp3", ".flac", ".aac"]

# Quality settings — extra ffmpeg arguments per quality tier
AUDIO_QUALITY_SETTINGS: Dict[str, List[str]] = {
    "high": ["-b:a", "192k"],
    "medium": ["-b:a", "128k"],
    "low": ["-b:a", "96k"]
}

# Error messages (user-facing; some are format templates filled at call sites)
ERROR_MESSAGES: Dict[str, str] = {
    "model_not_loaded": "Model is not loaded. Please load the model first.",
    "invalid_video_format": "Unsupported video format. Supported formats: {formats}",
    "video_too_long": f"Video duration exceeds maximum of {MAX_VIDEO_DURATION_SECONDS} seconds",
    "ffmpeg_not_found": "ffmpeg not found. Please install ffmpeg: https://ffmpeg.org/download.html"
}
HunyuanVideo-Foley/hunyuanvideo_foley/models/__init__.py ADDED
File without changes
HunyuanVideo-Foley/hunyuanvideo_foley/models/dac_vae/__init__.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
__version__ = "1.0.0"

# preserved here for legacy reasons
__model_version__ = "latest"

import audiotools

# Register module patterns with audiotools' model serialization machinery:
# INTERN marks code saved/loaded alongside checkpoints, EXTERN marks external
# dependencies. NOTE(review): the "dac.**" pattern refers to the original
# top-level `dac` package name — confirm it still matches now that this code
# lives under hunyuanvideo_foley.models.dac_vae.
audiotools.ml.BaseModel.INTERN += ["dac.**"]
audiotools.ml.BaseModel.EXTERN += ["einops"]


# Re-export the public API of this vendored DAC VAE package.
from . import nn
from . import model
from . import utils
from .model import DAC
from .model import DACFile
HunyuanVideo-Foley/hunyuanvideo_foley/models/dac_vae/__main__.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+
3
+ import argbind
4
+
5
+ from .utils import download
6
+ from .utils.decode import decode
7
+ from .utils.encode import encode
8
+
9
STAGES = ["encode", "decode", "download"]


def run(stage: str):
    """Dispatch and execute one of the CLI stages.

    Parameters
    ----------
    stage : str
        Stage to run; must be one of ``STAGES``.

    Raises
    ------
    ValueError
        If ``stage`` is not a known stage name.
    """
    if stage not in STAGES:
        raise ValueError(f"Unknown command: {stage}. Allowed commands are {STAGES}")

    # Resolve the stage name to the module-level callable of the same name
    # (encode/decode/download imported above) and invoke it. Every stage is
    # called without arguments; argbind supplies configuration via its scope.
    globals()[stage]()
29
+
30
+
31
if __name__ == "__main__":
    # First positional argument selects the stage (encode/decode/download);
    # pop it so argbind only sees that stage's own options.
    group = sys.argv.pop(1)
    args = argbind.parse_args(group=group)

    # argbind.scope makes the parsed arguments visible to the bound stage
    # functions for the duration of the call.
    with argbind.scope(args):
        run(group)
HunyuanVideo-Foley/hunyuanvideo_foley/models/dac_vae/model/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ from .base import CodecMixin
2
+ from .base import DACFile
3
+ from .dac import DAC
4
+ from .discriminator import Discriminator
HunyuanVideo-Foley/hunyuanvideo_foley/models/dac_vae/model/base.py ADDED
@@ -0,0 +1,301 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ from dataclasses import dataclass
3
+ from pathlib import Path
4
+ from typing import Union
5
+
6
+ import numpy as np
7
+ import torch
8
+ import tqdm
9
+ from audiotools import AudioSignal
10
+ from torch import nn
11
+
12
+ SUPPORTED_VERSIONS = ["1.0.0"]
13
+
14
+
15
@dataclass
class DACFile:
    """Container for compressed DAC codes plus the metadata needed to decode.

    Serialized to/from ``.dac`` files via numpy's pickle-backed object format.
    """

    # Quantized codebook indices, shape [B x N x T] (values fit in uint16).
    codes: torch.Tensor

    # Metadata
    chunk_length: int      # latent frames per chunk used during compression
    original_length: int   # sample count of the original audio
    input_db: float        # input loudness; NOTE(review): annotated float but
                           # save() calls .numpy() on it — in practice this is
                           # a torch tensor returned by loudness(). Confirm.
    channels: int
    sample_rate: int       # original (pre-resample) sample rate
    padding: bool          # padding mode active when codes were produced
    dac_version: str

    def save(self, path):
        """Serialize codes + metadata to ``path`` (suffix forced to .dac)."""
        artifacts = {
            # uint16 is sufficient: codebook sizes are <= 65536.
            "codes": self.codes.numpy().astype(np.uint16),
            "metadata": {
                "input_db": self.input_db.numpy().astype(np.float32),
                "original_length": self.original_length,
                "sample_rate": self.sample_rate,
                "chunk_length": self.chunk_length,
                "channels": self.channels,
                "padding": self.padding,
                "dac_version": SUPPORTED_VERSIONS[-1],
            },
        }
        path = Path(path).with_suffix(".dac")
        with open(path, "wb") as f:
            # np.save with a dict payload pickles the object.
            np.save(f, artifacts)
        return path

    @classmethod
    def load(cls, path):
        """Load a ``.dac`` file previously written by :meth:`save`.

        Raises RuntimeError if the file was written by an unsupported codec
        version. NOTE(review): allow_pickle=True executes pickle on file
        contents — only load .dac files from trusted sources.
        """
        artifacts = np.load(path, allow_pickle=True)[()]
        codes = torch.from_numpy(artifacts["codes"].astype(int))
        if artifacts["metadata"].get("dac_version", None) not in SUPPORTED_VERSIONS:
            raise RuntimeError(
                f"Given file {path} can't be loaded with this version of descript-audio-codec."
            )
        return cls(codes=codes, **artifacts["metadata"])
55
+
56
+
57
class CodecMixin:
    """Mixin adding windowed compress/decompress and padding bookkeeping to a codec.

    Expects the host class to provide: ``sample_rate``, ``hop_length``,
    ``delay``, ``device``, ``eval()``, ``modules()``, ``preprocess()``,
    ``encode()``, ``decode()`` and (for decompress) ``quantizer``.
    """

    @property
    def padding(self):
        # Lazily defaults to True the first time it is read.
        if not hasattr(self, "_padding"):
            self._padding = True
        return self._padding

    @padding.setter
    def padding(self, value):
        assert isinstance(value, bool)

        layers = [
            l for l in self.modules() if isinstance(l, (nn.Conv1d, nn.ConvTranspose1d))
        ]

        # Toggle conv padding on/off across the whole model. When disabling,
        # the original padding is stashed on each layer so it can be restored.
        for layer in layers:
            if value:
                if hasattr(layer, "original_padding"):
                    layer.padding = layer.original_padding
            else:
                layer.original_padding = layer.padding
                layer.padding = tuple(0 for _ in range(len(layer.padding)))

        self._padding = value

    def get_delay(self):
        """Return the model's edge delay (samples trimmed per side) under padding=False."""
        # Any number works here, delay is invariant to input length
        l_out = self.get_output_length(0)
        L = l_out

        layers = []
        for layer in self.modules():
            if isinstance(layer, (nn.Conv1d, nn.ConvTranspose1d)):
                layers.append(layer)

        # Walk the conv stack backwards, inverting each layer's length formula
        # to recover the input length that produces l_out.
        for layer in reversed(layers):
            d = layer.dilation[0]
            k = layer.kernel_size[0]
            s = layer.stride[0]

            if isinstance(layer, nn.ConvTranspose1d):
                L = ((L - d * (k - 1) - 1) / s) + 1
            elif isinstance(layer, nn.Conv1d):
                L = (L - 1) * s + d * (k - 1) + 1

            L = math.ceil(L)

        l_in = L

        return (l_in - l_out) // 2

    def get_output_length(self, input_length):
        """Forward-propagate a length through all conv layers with zero padding."""
        L = input_length
        # Calculate output length
        for layer in self.modules():
            if isinstance(layer, (nn.Conv1d, nn.ConvTranspose1d)):
                d = layer.dilation[0]
                k = layer.kernel_size[0]
                s = layer.stride[0]

                if isinstance(layer, nn.Conv1d):
                    L = ((L - d * (k - 1) - 1) / s) + 1
                elif isinstance(layer, nn.ConvTranspose1d):
                    L = (L - 1) * s + d * (k - 1) + 1

                L = math.floor(L)
        return L

    @torch.no_grad()
    def compress(
        self,
        audio_path_or_signal: Union[str, Path, AudioSignal],
        win_duration: float = 1.0,
        verbose: bool = False,
        normalize_db: float = -16,
        n_quantizers: int = None,
    ) -> DACFile:
        """Processes an audio signal from a file or AudioSignal object into
        discrete codes. This function processes the signal in short windows,
        using constant GPU memory.

        Parameters
        ----------
        audio_path_or_signal : Union[str, Path, AudioSignal]
            audio signal to reconstruct
        win_duration : float, optional
            window duration in seconds, by default 1.0
        verbose : bool, optional
            by default False
        normalize_db : float, optional
            normalize db, by default -16
        n_quantizers : int, optional
            number of quantizers passed to encode, by default None (all)

        Returns
        -------
        DACFile
            Object containing compressed codes and metadata
            required for decompression
        """
        audio_signal = audio_path_or_signal
        if isinstance(audio_signal, (str, Path)):
            audio_signal = AudioSignal.load_from_file_with_ffmpeg(str(audio_signal))

        self.eval()
        original_padding = self.padding
        original_device = audio_signal.device

        # Work on a mono copy; channels are folded into the batch dim below.
        audio_signal = audio_signal.clone()
        audio_signal = audio_signal.to_mono()
        original_sr = audio_signal.sample_rate

        resample_fn = audio_signal.resample
        loudness_fn = audio_signal.loudness

        # For very long audio (>= 10 hours; 10*60*60 s — the upstream comment
        # saying "10 minutes" did not match this check), use the ffmpeg versions
        if audio_signal.signal_duration >= 10 * 60 * 60:
            resample_fn = audio_signal.ffmpeg_resample
            loudness_fn = audio_signal.ffmpeg_loudness

        original_length = audio_signal.signal_length
        resample_fn(self.sample_rate)
        # Record input loudness so decompress() can restore it.
        input_db = loudness_fn()

        if normalize_db is not None:
            audio_signal.normalize(normalize_db)
        audio_signal.ensure_max_of_audio()

        # Fold channels into the batch dimension: [B, C, T] -> [B*C, 1, T].
        nb, nac, nt = audio_signal.audio_data.shape
        audio_signal.audio_data = audio_signal.audio_data.reshape(nb * nac, 1, nt)
        win_duration = (
            audio_signal.signal_duration if win_duration is None else win_duration
        )

        if audio_signal.signal_duration <= win_duration:
            # Unchunked compression (used if signal length < win duration)
            self.padding = True
            n_samples = nt
            hop = nt
        else:
            # Chunked inference
            self.padding = False
            # Zero-pad signal on either side by the delay
            audio_signal.zero_pad(self.delay, self.delay)
            n_samples = int(win_duration * self.sample_rate)
            # Round n_samples to nearest hop length multiple
            n_samples = int(math.ceil(n_samples / self.hop_length) * self.hop_length)
            hop = self.get_output_length(n_samples)

        codes = []
        range_fn = range if not verbose else tqdm.trange

        for i in range_fn(0, nt, hop):
            x = audio_signal[..., i : i + n_samples]
            # Pad the last (possibly short) window up to the full window size.
            x = x.zero_pad(0, max(0, n_samples - x.shape[-1]))

            audio_data = x.audio_data.to(self.device)
            audio_data = self.preprocess(audio_data, self.sample_rate)
            _, c, _, _, _ = self.encode(audio_data, n_quantizers)
            codes.append(c.to(original_device))
            chunk_length = c.shape[-1]

        codes = torch.cat(codes, dim=-1)

        dac_file = DACFile(
            codes=codes,
            chunk_length=chunk_length,
            original_length=original_length,
            input_db=input_db,
            channels=nac,
            sample_rate=original_sr,
            padding=self.padding,
            dac_version=SUPPORTED_VERSIONS[-1],
        )

        # NOTE(review): this slice rebinds the local `codes` only, AFTER
        # dac_file was constructed — it has no effect on the returned file.
        # n_quantizers truncation (if intended here) never reaches dac_file.
        if n_quantizers is not None:
            codes = codes[:, :n_quantizers, :]

        self.padding = original_padding
        return dac_file

    @torch.no_grad()
    def decompress(
        self,
        obj: Union[str, Path, DACFile],
        verbose: bool = False,
    ) -> AudioSignal:
        """Reconstruct audio from a given .dac file

        Parameters
        ----------
        obj : Union[str, Path, DACFile]
            .dac file location or corresponding DACFile object.
        verbose : bool, optional
            Prints progress if True, by default False

        Returns
        -------
        AudioSignal
            Object with the reconstructed audio
        """
        self.eval()
        if isinstance(obj, (str, Path)):
            obj = DACFile.load(obj)

        # Restore the padding mode used during compression for the decode pass.
        original_padding = self.padding
        self.padding = obj.padding

        range_fn = range if not verbose else tqdm.trange
        codes = obj.codes
        original_device = codes.device
        chunk_length = obj.chunk_length
        recons = []

        # Decode chunk by chunk to keep GPU memory constant.
        for i in range_fn(0, codes.shape[-1], chunk_length):
            c = codes[..., i : i + chunk_length].to(self.device)
            z = self.quantizer.from_codes(c)[0]
            r = self.decode(z)
            recons.append(r.to(original_device))

        recons = torch.cat(recons, dim=-1)
        recons = AudioSignal(recons, self.sample_rate)

        resample_fn = recons.resample
        loudness_fn = recons.loudness

        # For very long audio (>= 10 hours), use the ffmpeg versions
        if recons.signal_duration >= 10 * 60 * 60:
            resample_fn = recons.ffmpeg_resample
            loudness_fn = recons.ffmpeg_loudness

        # Restore the original loudness recorded at compression time.
        if obj.input_db is not None:
            recons.normalize(obj.input_db)

        resample_fn(obj.sample_rate)

        if obj.original_length is not None:
            # Trim decoder padding back to the exact original length and
            # unfold channels from the batch dimension.
            recons = recons[..., : obj.original_length]
            loudness_fn()
            recons.audio_data = recons.audio_data.reshape(
                -1, obj.channels, obj.original_length
            )
        else:
            loudness_fn()

        self.padding = original_padding
        return recons
HunyuanVideo-Foley/hunyuanvideo_foley/models/dac_vae/model/dac.py ADDED
@@ -0,0 +1,410 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ from typing import List
3
+ from typing import Union
4
+
5
+ import numpy as np
6
+ import torch
7
+ from audiotools import AudioSignal
8
+ from audiotools.ml import BaseModel
9
+ from torch import nn
10
+
11
+ from .base import CodecMixin
12
+ from ..nn.layers import Snake1d
13
+ from ..nn.layers import WNConv1d
14
+ from ..nn.layers import WNConvTranspose1d
15
+ from ..nn.quantize import ResidualVectorQuantize
16
+ from ..nn.vae_utils import DiagonalGaussianDistribution
17
+
18
+
19
def init_weights(m):
    """Weight initializer applied via ``model.apply``.

    Conv1d weights get a truncated normal (std=0.02) and biases are zeroed;
    all other module types pass through untouched.

    Args:
        m: A submodule handed in by ``nn.Module.apply``.
    """
    if isinstance(m, nn.Conv1d):
        nn.init.trunc_normal_(m.weight, std=0.02)
        # Guard against bias-free convolutions: Conv1d(..., bias=False) has
        # m.bias is None, and nn.init.constant_(None, 0) raises.
        if m.bias is not None:
            nn.init.constant_(m.bias, 0)
23
+
24
+
25
class ResidualUnit(nn.Module):
    """Dilated residual block: Snake -> dilated 7-tap conv -> Snake -> 1x1 conv.

    The skip connection is center-cropped to match the branch output when the
    branch shrinks the time axis (padding disabled via CodecMixin).
    """

    def __init__(self, dim: int = 16, dilation: int = 1):
        super().__init__()
        # "Same" padding for a kernel-7 dilated conv.
        pad = ((7 - 1) * dilation) // 2
        self.block = nn.Sequential(
            Snake1d(dim),
            WNConv1d(dim, dim, kernel_size=7, dilation=dilation, padding=pad),
            Snake1d(dim),
            WNConv1d(dim, dim, kernel_size=1),
        )

    def forward(self, x):
        y = self.block(x)
        # Center-crop x so the residual add lines up when padding is off.
        pad = (x.shape[-1] - y.shape[-1]) // 2
        if pad > 0:
            x = x[..., pad:-pad]
        return x + y
42
+
43
+
44
class EncoderBlock(nn.Module):
    """Encoder stage: three dilated residual units then a strided downsampling conv.

    Operates at ``dim // 2`` channels and doubles to ``dim`` while
    downsampling the time axis by ``stride``.
    """

    def __init__(self, dim: int = 16, stride: int = 1):
        super().__init__()
        self.block = nn.Sequential(
            ResidualUnit(dim // 2, dilation=1),
            ResidualUnit(dim // 2, dilation=3),
            ResidualUnit(dim // 2, dilation=9),
            Snake1d(dim // 2),
            WNConv1d(
                dim // 2,
                dim,
                kernel_size=2 * stride,
                stride=stride,
                padding=math.ceil(stride / 2),
            ),
        )

    def forward(self, x):
        return self.block(x)
63
+
64
+
65
class Encoder(nn.Module):
    """Convolutional waveform encoder: [B, 1, T] -> [B, d_latent, T / prod(strides)].

    Channel width doubles at every EncoderBlock while the time axis is
    downsampled by the corresponding stride.
    """

    def __init__(
        self,
        d_model: int = 64,
        strides: list = [2, 4, 8, 8],
        d_latent: int = 64,
    ):
        super().__init__()
        # Create first convolution
        self.block = [WNConv1d(1, d_model, kernel_size=7, padding=3)]

        # Create EncoderBlocks that double channels as they downsample by `stride`
        for stride in strides:
            d_model *= 2
            self.block += [EncoderBlock(d_model, stride=stride)]

        # Create last convolution
        self.block += [
            Snake1d(d_model),
            WNConv1d(d_model, d_latent, kernel_size=3, padding=1),
        ]

        # Wrap block into nn.Sequential
        self.block = nn.Sequential(*self.block)
        # Final channel width after doubling, i.e. d_model * 2**len(strides).
        self.enc_dim = d_model

    def forward(self, x):
        return self.block(x)
93
+
94
+
95
class DecoderBlock(nn.Module):
    """Decoder stage: transposed-conv upsampling by ``stride`` followed by
    three dilated residual units (mirror of EncoderBlock)."""

    def __init__(self, input_dim: int = 16, output_dim: int = 8, stride: int = 1):
        super().__init__()
        self.block = nn.Sequential(
            Snake1d(input_dim),
            WNConvTranspose1d(
                input_dim,
                output_dim,
                kernel_size=2 * stride,
                stride=stride,
                padding=math.ceil(stride / 2),
                # Compensate for odd strides so output length is exactly T*stride.
                output_padding=stride % 2,
            ),
            ResidualUnit(output_dim, dilation=1),
            ResidualUnit(output_dim, dilation=3),
            ResidualUnit(output_dim, dilation=9),
        )

    def forward(self, x):
        return self.block(x)
115
+
116
+
117
class Decoder(nn.Module):
    """Latent-to-waveform decoder: [B, input_channel, T] -> [B, d_out, T * prod(rates)].

    Channel width halves at each upsampling stage; the final Tanh bounds the
    output waveform to [-1, 1].
    """

    def __init__(
        self,
        input_channel,
        channels,
        rates,
        d_out: int = 1,
    ):
        super().__init__()

        # Add first conv layer
        layers = [WNConv1d(input_channel, channels, kernel_size=7, padding=3)]

        # Add upsampling + MRF blocks
        for i, stride in enumerate(rates):
            input_dim = channels // 2**i
            output_dim = channels // 2 ** (i + 1)
            layers += [DecoderBlock(input_dim, output_dim, stride)]

        # Add final conv layer.
        # NOTE(review): output_dim is the value left over from the last loop
        # iteration — this raises NameError if `rates` is empty.
        layers += [
            Snake1d(output_dim),
            WNConv1d(output_dim, d_out, kernel_size=7, padding=3),
            nn.Tanh(),
        ]

        self.model = nn.Sequential(*layers)

    def forward(self, x):
        return self.model(x)
147
+
148
+
149
class DAC(BaseModel, CodecMixin):
    """Descript Audio Codec autoencoder with two bottleneck modes.

    continuous=False (default): residual vector quantization (RVQ) bottleneck
    producing discrete codes. continuous=True: VAE-style diagonal Gaussian
    bottleneck (quant_conv / post_quant_conv) with a KL term instead of codes.
    """

    def __init__(
        self,
        encoder_dim: int = 64,
        # NOTE(review): mutable default lists — safe only because they are
        # never mutated; kept as-is to preserve the checkpointed signature.
        encoder_rates: List[int] = [2, 4, 8, 8],
        latent_dim: int = None,
        decoder_dim: int = 1536,
        decoder_rates: List[int] = [8, 8, 4, 2],
        n_codebooks: int = 9,
        codebook_size: int = 1024,
        codebook_dim: Union[int, list] = 8,
        quantizer_dropout: bool = False,
        sample_rate: int = 44100,
        continuous: bool = False,
    ):
        super().__init__()

        self.encoder_dim = encoder_dim
        self.encoder_rates = encoder_rates
        self.decoder_dim = decoder_dim
        self.decoder_rates = decoder_rates
        self.sample_rate = sample_rate
        self.continuous = continuous

        # Default latent width matches the encoder's final channel count.
        if latent_dim is None:
            latent_dim = encoder_dim * (2 ** len(encoder_rates))

        self.latent_dim = latent_dim

        # Total downsampling factor: samples per latent frame.
        self.hop_length = np.prod(encoder_rates)
        self.encoder = Encoder(encoder_dim, encoder_rates, latent_dim)

        if not continuous:
            # Discrete bottleneck: residual vector quantizer.
            self.n_codebooks = n_codebooks
            self.codebook_size = codebook_size
            self.codebook_dim = codebook_dim
            self.quantizer = ResidualVectorQuantize(
                input_dim=latent_dim,
                n_codebooks=n_codebooks,
                codebook_size=codebook_size,
                codebook_dim=codebook_dim,
                quantizer_dropout=quantizer_dropout,
            )
        else:
            # Continuous bottleneck: 1x1 convs projecting to mean/logvar and back.
            self.quant_conv = torch.nn.Conv1d(latent_dim, 2 * latent_dim, 1)
            self.post_quant_conv = torch.nn.Conv1d(latent_dim, latent_dim, 1)

        self.decoder = Decoder(
            latent_dim,
            decoder_dim,
            decoder_rates,
        )
        # NOTE(review): redundant — sample_rate was already assigned above.
        self.sample_rate = sample_rate
        self.apply(init_weights)

        # Edge delay in samples (see CodecMixin.get_delay), used for chunked compress.
        self.delay = self.get_delay()

    @property
    def dtype(self):
        """Get the dtype of the model parameters."""
        # Return the dtype of the first parameter found
        for param in self.parameters():
            return param.dtype
        return torch.float32  # fallback for a parameterless model

    @property
    def device(self):
        """Get the device of the model parameters."""
        # Return the device of the first parameter found
        for param in self.parameters():
            return param.device
        return torch.device('cpu')  # fallback for a parameterless model

    def preprocess(self, audio_data, sample_rate):
        """Validate the sample rate and right-pad audio to a hop-length multiple."""
        if sample_rate is None:
            sample_rate = self.sample_rate
        # The model is single-rate: resampling must happen before calling it.
        assert sample_rate == self.sample_rate

        length = audio_data.shape[-1]
        right_pad = math.ceil(length / self.hop_length) * self.hop_length - length
        audio_data = nn.functional.pad(audio_data, (0, right_pad))

        return audio_data

    def encode(
        self,
        audio_data: torch.Tensor,
        n_quantizers: int = None,
    ):
        """Encode given audio data and return the bottleneck representation.

        Parameters
        ----------
        audio_data : Tensor[B x 1 x T]
            Audio data to encode
        n_quantizers : int, optional
            Number of quantizers to use, by default None
            If None, all quantizers are used. Ignored when continuous=True.

        Returns
        -------
        tuple
            ``(z, codes, latents, commitment_loss, codebook_loss)``:
            discrete mode — quantized latents [B x D x T], codebook indices
            [B x N x T], projected pre-quantization latents [B x N*D x T],
            and the two VQ losses; continuous mode — z is a
            DiagonalGaussianDistribution and the remaining values are
            ``None, None, 0, 0``.
        """
        z = self.encoder(audio_data)  # [B x D x T]
        if not self.continuous:
            z, codes, latents, commitment_loss, codebook_loss = self.quantizer(z, n_quantizers)
        else:
            z = self.quant_conv(z)  # [B x 2D x T]
            z = DiagonalGaussianDistribution(z)
            codes, latents, commitment_loss, codebook_loss = None, None, 0, 0

        return z, codes, latents, commitment_loss, codebook_loss

    def decode(self, z: torch.Tensor):
        """Decode latents back to audio.

        Parameters
        ----------
        z : Tensor[B x D x T]
            Bottleneck representation (quantized latents in discrete mode,
            a sampled latent in continuous mode).

        Returns
        -------
        Tensor[B x 1 x length]
            Decoded audio waveform.
        """
        if not self.continuous:
            audio = self.decoder(z)
        else:
            # Project the sampled latent back before decoding.
            z = self.post_quant_conv(z)
            audio = self.decoder(z)

        return audio

    def forward(
        self,
        audio_data: torch.Tensor,
        sample_rate: int = None,
        n_quantizers: int = None,
    ):
        """Model forward pass

        Parameters
        ----------
        audio_data : Tensor[B x 1 x T]
            Audio data to encode
        sample_rate : int, optional
            Sample rate of audio data in Hz, by default None
            If None, defaults to `self.sample_rate`
        n_quantizers : int, optional
            Number of quantizers to use, by default None.
            If None, all quantizers are used.

        Returns
        -------
        dict
            Discrete mode keys:
            "z" : Tensor[B x D x T]
                Quantized continuous representation of input
            "codes" : Tensor[B x N x T]
                Codebook indices for each codebook
                (quantized discrete representation of input)
            "latents" : Tensor[B x N*D x T]
                Projected latents (continuous representation of input before quantization)
            "vq/commitment_loss" : Tensor[1]
                Commitment loss to train encoder to predict vectors closer to codebook
                entries
            "vq/codebook_loss" : Tensor[1]
                Codebook loss to update the codebook
            "audio" : Tensor[B x 1 x length]
                Decoded audio data.
            Continuous mode keys: "audio", "z", "kl_loss".
        """
        length = audio_data.shape[-1]
        audio_data = self.preprocess(audio_data, sample_rate)
        if not self.continuous:
            z, codes, latents, commitment_loss, codebook_loss = self.encode(audio_data, n_quantizers)

            # Trim decoder output back to the (pre-padding) input length.
            x = self.decode(z)
            return {
                "audio": x[..., :length],
                "z": z,
                "codes": codes,
                "latents": latents,
                "vq/commitment_loss": commitment_loss,
                "vq/codebook_loss": codebook_loss,
            }
        else:
            # VAE path: sample from the posterior and decode; KL regularizes it.
            posterior, _, _, _, _ = self.encode(audio_data, n_quantizers)
            z = posterior.sample()
            x = self.decode(z)

            kl_loss = posterior.kl()
            kl_loss = kl_loss.mean()

            return {
                "audio": x[..., :length],
                "z": z,
                "kl_loss": kl_loss,
            }
369
+
370
+
371
if __name__ == "__main__":
    # Smoke test: builds a default DAC on CPU, prints a parameter-annotated
    # module tree, measures the receptive field via gradients, and round-trips
    # a random signal through compress/decompress.
    import numpy as np
    from functools import partial

    model = DAC().to("cpu")

    # Annotate every module's repr with its parameter count (in millions).
    for n, m in model.named_modules():
        o = m.extra_repr()
        p = sum([np.prod(p.size()) for p in m.parameters()])
        fn = lambda o, p: o + f" {p/1e6:<.3f}M params."
        setattr(m, "extra_repr", partial(fn, o=o, p=p))
    print(model)
    print("Total # of params: ", sum([np.prod(p.size()) for p in model.parameters()]))

    length = 88200 * 2
    x = torch.randn(1, 1, length).to(model.device)
    x.requires_grad_(True)
    x.retain_grad()

    # Make a forward pass
    out = model(x)["audio"]
    print("Input shape:", x.shape)
    print("Output shape:", out.shape)

    # Create gradient variable: a unit gradient at the center output sample.
    grad = torch.zeros_like(out)
    grad[:, :, grad.shape[-1] // 2] = 1

    # Make a backward pass
    out.backward(grad)

    # Check non-zero values: input positions with non-zero grad are exactly
    # those inside the receptive field of the chosen output sample.
    gradmap = x.grad.squeeze(0)
    gradmap = (gradmap != 0).sum(0)  # sum across features
    rf = (gradmap != 0).sum()

    print(f"Receptive field: {rf.item()}")

    # End-to-end compress/decompress round trip on one minute of noise.
    x = AudioSignal(torch.randn(1, 1, 44100 * 60), 44100)
    model.decompress(model.compress(x, verbose=True), verbose=True)
HunyuanVideo-Foley/hunyuanvideo_foley/models/dac_vae/model/discriminator.py ADDED
@@ -0,0 +1,228 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ from audiotools import AudioSignal
5
+ from audiotools import ml
6
+ from audiotools import STFTParams
7
+ from einops import rearrange
8
+ from torch.nn.utils import weight_norm
9
+
10
+
11
def WNConv1d(*args, **kwargs):
    """Weight-normalized ``nn.Conv1d``, followed by LeakyReLU(0.1) unless
    ``act=False`` is passed."""
    use_act = kwargs.pop("act", True)
    layer = weight_norm(nn.Conv1d(*args, **kwargs))
    return nn.Sequential(layer, nn.LeakyReLU(0.1)) if use_act else layer
17
+
18
+
19
def WNConv2d(*args, **kwargs):
    """Weight-normalized ``nn.Conv2d``, followed by LeakyReLU(0.1) unless
    ``act=False`` is passed."""
    use_act = kwargs.pop("act", True)
    layer = weight_norm(nn.Conv2d(*args, **kwargs))
    return nn.Sequential(layer, nn.LeakyReLU(0.1)) if use_act else layer
25
+
26
+
27
class MPD(nn.Module):
    """Multi-period discriminator: folds the waveform into a 2-D grid whose
    width is ``period`` and runs a stack of 2-D convolutions over it,
    returning every intermediate feature map (last entry is the logits)."""

    def __init__(self, period):
        super().__init__()
        self.period = period
        self.convs = nn.ModuleList(
            [
                WNConv2d(1, 32, (5, 1), (3, 1), padding=(2, 0)),
                WNConv2d(32, 128, (5, 1), (3, 1), padding=(2, 0)),
                WNConv2d(128, 512, (5, 1), (3, 1), padding=(2, 0)),
                WNConv2d(512, 1024, (5, 1), (3, 1), padding=(2, 0)),
                WNConv2d(1024, 1024, (5, 1), 1, padding=(2, 0)),
            ]
        )
        self.conv_post = WNConv2d(
            1024, 1, kernel_size=(3, 1), padding=(1, 0), act=False
        )

    def pad_to_period(self, x):
        # Reflect-pad on the right so the time axis divides evenly by period.
        t = x.shape[-1]
        return F.pad(x, (0, self.period - t % self.period), mode="reflect")

    def forward(self, x):
        feature_maps = []

        x = self.pad_to_period(x)
        # (batch, chan, time) -> (batch, chan, time/period, period)
        x = rearrange(x, "b c (l p) -> b c l p", p=self.period)

        for conv in self.convs:
            x = conv(x)
            feature_maps.append(x)

        x = self.conv_post(x)
        feature_maps.append(x)

        return feature_maps
63
+
64
+
65
class MSD(nn.Module):
    """Multi-scale discriminator: resamples the waveform down by ``rate`` and
    applies a stack of grouped 1-D convolutions, returning every intermediate
    feature map (last entry is the logits)."""

    def __init__(self, rate: int = 1, sample_rate: int = 44100):
        super().__init__()
        self.convs = nn.ModuleList(
            [
                WNConv1d(1, 16, 15, 1, padding=7),
                WNConv1d(16, 64, 41, 4, groups=4, padding=20),
                WNConv1d(64, 256, 41, 4, groups=16, padding=20),
                WNConv1d(256, 1024, 41, 4, groups=64, padding=20),
                WNConv1d(1024, 1024, 41, 4, groups=256, padding=20),
                WNConv1d(1024, 1024, 5, 1, padding=2),
            ]
        )
        self.conv_post = WNConv1d(1024, 1, 3, 1, padding=1, act=False)
        self.sample_rate = sample_rate
        self.rate = rate

    def forward(self, x):
        # Downsample to sample_rate / rate before discriminating.
        signal = AudioSignal(x, self.sample_rate)
        signal.resample(self.sample_rate // self.rate)
        x = signal.audio_data

        feature_maps = []
        for conv in self.convs:
            x = conv(x)
            feature_maps.append(x)
        x = self.conv_post(x)
        feature_maps.append(x)

        return feature_maps
96
+
97
+
98
# Default frequency bands (as fractions of the spectrum) for the MRD.
BANDS = [(0.0, 0.1), (0.1, 0.25), (0.25, 0.5), (0.5, 0.75), (0.75, 1.0)]


class MRD(nn.Module):
    def __init__(
        self,
        window_length: int,
        hop_factor: float = 0.25,
        sample_rate: int = 44100,
        bands: list = BANDS,
    ):
        """Complex multi-band spectrogram discriminator.

        Parameters
        ----------
        window_length : int
            Window length of STFT.
        hop_factor : float, optional
            Hop factor of the STFT, defaults to ``0.25 * window_length``.
        sample_rate : int, optional
            Sampling rate of audio in Hz, by default 44100
        bands : list, optional
            Bands (fractions of the spectrum) to run discriminator over.
        """
        super().__init__()

        self.window_length = window_length
        self.hop_factor = hop_factor
        self.sample_rate = sample_rate
        self.stft_params = STFTParams(
            window_length=window_length,
            hop_length=int(window_length * hop_factor),
            match_stride=True,
        )

        # Convert fractional band edges into frequency-bin index ranges.
        n_fft = window_length // 2 + 1
        self.bands = [(int(lo * n_fft), int(hi * n_fft)) for lo, hi in bands]

        ch = 32
        make_stack = lambda: nn.ModuleList(
            [
                WNConv2d(2, ch, (3, 9), (1, 1), padding=(1, 4)),
                WNConv2d(ch, ch, (3, 9), (1, 2), padding=(1, 4)),
                WNConv2d(ch, ch, (3, 9), (1, 2), padding=(1, 4)),
                WNConv2d(ch, ch, (3, 9), (1, 2), padding=(1, 4)),
                WNConv2d(ch, ch, (3, 3), (1, 1), padding=(1, 1)),
            ]
        )
        # One independent conv stack per frequency band.
        self.band_convs = nn.ModuleList([make_stack() for _ in range(len(self.bands))])
        self.conv_post = WNConv2d(ch, 1, (3, 3), (1, 1), padding=(1, 1), act=False)

    def spectrogram(self, x):
        # Complex STFT as a 2-channel (real/imag) image, then split into bands.
        signal = AudioSignal(x, self.sample_rate, stft_params=self.stft_params)
        spec = torch.view_as_real(signal.stft())
        spec = rearrange(spec, "b 1 f t c -> (b 1) c t f")
        return [spec[..., lo:hi] for lo, hi in self.bands]

    def forward(self, x):
        x_bands = self.spectrogram(x)
        feature_maps = []

        band_outputs = []
        for band, stack in zip(x_bands, self.band_convs):
            for conv in stack:
                band = conv(band)
                feature_maps.append(band)
            band_outputs.append(band)

        out = torch.cat(band_outputs, dim=-1)
        out = self.conv_post(out)
        feature_maps.append(out)

        return feature_maps
173
+
174
+
175
class Discriminator(ml.BaseModel):
    def __init__(
        self,
        rates: list = [],
        periods: list = [2, 3, 5, 7, 11],
        fft_sizes: list = [2048, 1024, 512],
        sample_rate: int = 44100,
        bands: list = BANDS,
    ):
        """Discriminator that combines multiple discriminators.

        Parameters
        ----------
        rates : list, optional
            sampling rates (in Hz) to run MSD at, by default []
            If empty, MSD is not used.
        periods : list, optional
            periods (of samples) to run MPD at, by default [2, 3, 5, 7, 11]
        fft_sizes : list, optional
            Window sizes of the FFT to run MRD at, by default [2048, 1024, 512]
        sample_rate : int, optional
            Sampling rate of audio in Hz, by default 44100
        bands : list, optional
            Bands to run MRD at, by default `BANDS`
        """
        super().__init__()
        self.discriminators = nn.ModuleList(
            [MPD(p) for p in periods]
            + [MSD(r, sample_rate=sample_rate) for r in rates]
            + [MRD(f, sample_rate=sample_rate, bands=bands) for f in fft_sizes]
        )

    def preprocess(self, y):
        # Remove DC offset
        y = y - y.mean(dim=-1, keepdims=True)
        # Peak normalize the volume of input audio
        return 0.8 * y / (y.abs().max(dim=-1, keepdim=True)[0] + 1e-9)

    def forward(self, x):
        x = self.preprocess(x)
        # One list of feature maps per sub-discriminator.
        return [d(x) for d in self.discriminators]
218
+
219
+
220
+ if __name__ == "__main__":
221
+ disc = Discriminator()
222
+ x = torch.zeros(1, 1, 44100)
223
+ results = disc(x)
224
+ for i, result in enumerate(results):
225
+ print(f"disc{i}")
226
+ for i, r in enumerate(result):
227
+ print(r.shape, r.mean(), r.min(), r.max())
228
+ print()
HunyuanVideo-Foley/hunyuanvideo_foley/models/dac_vae/nn/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from . import layers
2
+ from . import loss
3
+ from . import quantize
HunyuanVideo-Foley/hunyuanvideo_foley/models/dac_vae/nn/layers.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+ from einops import rearrange
6
+ from torch.nn.utils import weight_norm
7
+
8
+
9
def WNConv1d(*args, **kwargs):
    """``nn.Conv1d`` wrapped in weight normalization."""
    return weight_norm(nn.Conv1d(*args, **kwargs))
11
+
12
+
13
def WNConvTranspose1d(*args, **kwargs):
    """``nn.ConvTranspose1d`` wrapped in weight normalization."""
    return weight_norm(nn.ConvTranspose1d(*args, **kwargs))
15
+
16
+
17
# Scripting this brings model speed up 1.4x
@torch.jit.script
def snake(x, alpha):
    # Snake activation: x + sin^2(alpha * x) / alpha, applied pointwise.
    # Flatten trailing dims, apply, then restore the original shape.
    original_shape = x.shape
    flat = x.reshape(original_shape[0], original_shape[1], -1)
    # 1e-9 guards against division by zero when alpha underflows.
    flat = flat + (alpha + 1e-9).reciprocal() * torch.sin(alpha * flat).pow(2)
    return flat.reshape(original_shape)
25
+
26
+
27
class Snake1d(nn.Module):
    """Channel-wise Snake activation with a learnable per-channel ``alpha``."""

    def __init__(self, channels):
        super().__init__()
        # One alpha per channel, broadcast over batch and time.
        self.alpha = nn.Parameter(torch.ones(1, channels, 1))

    def forward(self, x):
        return snake(x, self.alpha)
HunyuanVideo-Foley/hunyuanvideo_foley/models/dac_vae/nn/loss.py ADDED
@@ -0,0 +1,368 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import typing
2
+ from typing import List
3
+
4
+ import torch
5
+ import torch.nn.functional as F
6
+ from audiotools import AudioSignal
7
+ from audiotools import STFTParams
8
+ from torch import nn
9
+
10
+
11
class L1Loss(nn.L1Loss):
    """L1 Loss between AudioSignals. Defaults
    to comparing ``audio_data``, but any
    attribute of an AudioSignal can be used.

    Parameters
    ----------
    attribute : str, optional
        Attribute of signal to compare, defaults to ``audio_data``.
    weight : float, optional
        Weight of this loss, defaults to 1.0.

    Implementation copied from: https://github.com/descriptinc/lyrebird-audiotools/blob/961786aa1a9d628cca0c0486e5885a457fe70c1a/audiotools/metrics/distance.py
    """

    def __init__(self, attribute: str = "audio_data", weight: float = 1.0, **kwargs):
        self.attribute = attribute
        self.weight = weight
        super().__init__(**kwargs)

    def forward(self, x: AudioSignal, y: AudioSignal):
        """L1 distance between ``self.attribute`` of estimate ``x`` and
        reference ``y``; raw tensors are compared directly."""
        if not isinstance(x, AudioSignal):
            return super().forward(x, y)
        return super().forward(getattr(x, self.attribute), getattr(y, self.attribute))
49
+
50
+
51
class SISDRLoss(nn.Module):
    """
    Computes the Scale-Invariant Source-to-Distortion Ratio between a batch
    of estimated and reference audio signals or aligned features.

    Parameters
    ----------
    scaling : int, optional
        Whether to use scale-invariant (True) or
        signal-to-noise ratio (False), by default True
    reduction : str, optional
        How to reduce across the batch (either 'mean',
        'sum', or none), by default 'mean'
    zero_mean : int, optional
        Zero mean the references and estimates before
        computing the loss, by default True
    clip_min : int, optional
        The minimum possible loss value. Helps network
        to not focus on making already good examples better, by default None
    weight : float, optional
        Weight of this loss, defaults to 1.0.

    Implementation copied from: https://github.com/descriptinc/lyrebird-audiotools/blob/961786aa1a9d628cca0c0486e5885a457fe70c1a/audiotools/metrics/distance.py
    """

    def __init__(
        self,
        scaling: int = True,
        reduction: str = "mean",
        zero_mean: int = True,
        clip_min: int = None,
        weight: float = 1.0,
    ):
        self.scaling = scaling
        self.reduction = reduction
        self.zero_mean = zero_mean
        self.clip_min = clip_min
        self.weight = weight
        super().__init__()

    def forward(self, x: AudioSignal, y: AudioSignal):
        eps = 1e-8
        # Accept AudioSignals or raw tensors; layout is (nb, nc, nt).
        if isinstance(x, AudioSignal):
            references, estimates = x.audio_data, y.audio_data
        else:
            references, estimates = x, y

        batch = references.shape[0]
        # Flatten channel/time and move samples onto axis 1.
        references = references.reshape(batch, 1, -1).permute(0, 2, 1)
        estimates = estimates.reshape(batch, 1, -1).permute(0, 2, 1)

        if self.zero_mean:
            mean_reference = references.mean(dim=1, keepdim=True)
            mean_estimate = estimates.mean(dim=1, keepdim=True)
        else:
            mean_reference = 0
            mean_estimate = 0

        _references = references - mean_reference
        _estimates = estimates - mean_estimate

        references_projection = (_references**2).sum(dim=-2) + eps
        references_on_estimates = (_estimates * _references).sum(dim=-2) + eps

        # Optimal scaling projects the estimate onto the reference (SI-SDR);
        # without scaling this degenerates to plain SNR.
        if self.scaling:
            scale = (references_on_estimates / references_projection).unsqueeze(1)
        else:
            scale = 1

        e_true = scale * _references
        e_res = _estimates - e_true

        signal = (e_true**2).sum(dim=1)
        noise = (e_res**2).sum(dim=1)
        # Negated so a better ratio yields a lower loss.
        sdr = -10 * torch.log10(signal / noise + eps)

        if self.clip_min is not None:
            sdr = torch.clamp(sdr, min=self.clip_min)

        if self.reduction == "mean":
            sdr = sdr.mean()
        elif self.reduction == "sum":
            sdr = sdr.sum()
        return sdr
140
+
141
+
142
class MultiScaleSTFTLoss(nn.Module):
    """Computes the multi-scale STFT loss from [1].

    Parameters
    ----------
    window_lengths : List[int], optional
        Length of each window of each STFT, by default [2048, 512]
    loss_fn : typing.Callable, optional
        How to compare each loss, by default nn.L1Loss()
    clamp_eps : float, optional
        Clamp on the log magnitude, below, by default 1e-5
    mag_weight : float, optional
        Weight of raw magnitude portion of loss, by default 1.0
    log_weight : float, optional
        Weight of log magnitude portion of loss, by default 1.0
    pow : float, optional
        Power to raise magnitude to before taking log, by default 2.0
    weight : float, optional
        Weight of this loss, by default 1.0
    match_stride : bool, optional
        Whether to match the stride of convolutional layers, by default False

    References
    ----------

    1. Engel, Jesse, Chenjie Gu, and Adam Roberts.
        "DDSP: Differentiable Digital Signal Processing."
        International Conference on Learning Representations. 2019.

    Implementation copied from: https://github.com/descriptinc/lyrebird-audiotools/blob/961786aa1a9d628cca0c0486e5885a457fe70c1a/audiotools/metrics/spectral.py
    """

    def __init__(
        self,
        window_lengths: List[int] = [2048, 512],
        loss_fn: typing.Callable = nn.L1Loss(),
        clamp_eps: float = 1e-5,
        mag_weight: float = 1.0,
        log_weight: float = 1.0,
        pow: float = 2.0,
        weight: float = 1.0,
        match_stride: bool = False,
        window_type: str = None,
    ):
        super().__init__()
        # One STFT configuration per scale; hop is a quarter window.
        self.stft_params = [
            STFTParams(
                window_length=w,
                hop_length=w // 4,
                match_stride=match_stride,
                window_type=window_type,
            )
            for w in window_lengths
        ]
        self.loss_fn = loss_fn
        self.log_weight = log_weight
        self.mag_weight = mag_weight
        self.clamp_eps = clamp_eps
        self.weight = weight
        self.pow = pow

    def forward(self, x: AudioSignal, y: AudioSignal):
        """Multi-scale STFT distance between estimate ``x`` and reference ``y``.

        Returns
        -------
        torch.Tensor
            Sum over scales of weighted log-magnitude and raw-magnitude terms.
        """
        total = 0.0
        for params in self.stft_params:
            # stft() caches the result on the signal as ``magnitude``.
            x.stft(params.window_length, params.hop_length, params.window_type)
            y.stft(params.window_length, params.hop_length, params.window_type)
            log_x = x.magnitude.clamp(self.clamp_eps).pow(self.pow).log10()
            log_y = y.magnitude.clamp(self.clamp_eps).pow(self.pow).log10()
            total = total + self.log_weight * self.loss_fn(log_x, log_y)
            total = total + self.mag_weight * self.loss_fn(x.magnitude, y.magnitude)
        return total
229
+
230
+
231
class MelSpectrogramLoss(nn.Module):
    """Compute distance between mel spectrograms. Can be used
    in a multi-scale way.

    Parameters
    ----------
    n_mels : List[int]
        Number of mels per STFT, by default [150, 80],
    window_lengths : List[int], optional
        Length of each window of each STFT, by default [2048, 512]
    loss_fn : typing.Callable, optional
        How to compare each loss, by default nn.L1Loss()
    clamp_eps : float, optional
        Clamp on the log magnitude, below, by default 1e-5
    mag_weight : float, optional
        Weight of raw magnitude portion of loss, by default 1.0
    log_weight : float, optional
        Weight of log magnitude portion of loss, by default 1.0
    pow : float, optional
        Power to raise magnitude to before taking log, by default 2.0
    weight : float, optional
        Weight of this loss, by default 1.0
    match_stride : bool, optional
        Whether to match the stride of convolutional layers, by default False

    Implementation copied from: https://github.com/descriptinc/lyrebird-audiotools/blob/961786aa1a9d628cca0c0486e5885a457fe70c1a/audiotools/metrics/spectral.py
    """

    def __init__(
        self,
        n_mels: List[int] = [150, 80],
        window_lengths: List[int] = [2048, 512],
        loss_fn: typing.Callable = nn.L1Loss(),
        clamp_eps: float = 1e-5,
        mag_weight: float = 1.0,
        log_weight: float = 1.0,
        pow: float = 2.0,
        weight: float = 1.0,
        match_stride: bool = False,
        mel_fmin: List[float] = [0.0, 0.0],
        mel_fmax: List[float] = [None, None],
        window_type: str = None,
    ):
        super().__init__()
        # One STFT configuration per scale; hop is a quarter window.
        self.stft_params = [
            STFTParams(
                window_length=w,
                hop_length=w // 4,
                match_stride=match_stride,
                window_type=window_type,
            )
            for w in window_lengths
        ]
        self.n_mels = n_mels
        self.loss_fn = loss_fn
        self.clamp_eps = clamp_eps
        self.log_weight = log_weight
        self.mag_weight = mag_weight
        self.weight = weight
        self.mel_fmin = mel_fmin
        self.mel_fmax = mel_fmax
        self.pow = pow

    def forward(self, x: AudioSignal, y: AudioSignal):
        """Mel-spectrogram distance between estimate ``x`` and reference ``y``.

        Returns
        -------
        torch.Tensor
            Sum over scales of weighted log-magnitude and raw-magnitude terms.
        """
        total = 0.0
        for n_mels, fmin, fmax, params in zip(
            self.n_mels, self.mel_fmin, self.mel_fmax, self.stft_params
        ):
            stft_kwargs = {
                "window_length": params.window_length,
                "hop_length": params.hop_length,
                "window_type": params.window_type,
            }
            x_mels = x.mel_spectrogram(n_mels, mel_fmin=fmin, mel_fmax=fmax, **stft_kwargs)
            y_mels = y.mel_spectrogram(n_mels, mel_fmin=fmin, mel_fmax=fmax, **stft_kwargs)

            # Log-magnitude term ...
            total = total + self.log_weight * self.loss_fn(
                x_mels.clamp(self.clamp_eps).pow(self.pow).log10(),
                y_mels.clamp(self.clamp_eps).pow(self.pow).log10(),
            )
            # ... plus raw-magnitude term.
            total = total + self.mag_weight * self.loss_fn(x_mels, y_mels)
        return total
328
+
329
+
330
class GANLoss(nn.Module):
    """
    Computes a discriminator loss, given a discriminator on
    generated waveforms/spectrograms compared to ground truth
    waveforms/spectrograms. Computes the loss for both the
    discriminator and the generator in separate functions.
    """

    def __init__(self, discriminator):
        super().__init__()
        self.discriminator = discriminator

    def forward(self, fake, real):
        # Run both waveforms through every sub-discriminator.
        d_fake = self.discriminator(fake.audio_data)
        d_real = self.discriminator(real.audio_data)
        return d_fake, d_real

    def discriminator_loss(self, fake, real):
        # Least-squares GAN objective; the generator output is detached so
        # only the discriminator receives gradients.
        d_fake, d_real = self.forward(fake.clone().detach(), real)

        loss_d = 0
        for x_fake, x_real in zip(d_fake, d_real):
            loss_d = loss_d + torch.mean(x_fake[-1] ** 2)
            loss_d = loss_d + torch.mean((1 - x_real[-1]) ** 2)
        return loss_d

    def generator_loss(self, fake, real):
        d_fake, d_real = self.forward(fake, real)

        # Adversarial term: push fake logits (last feature map) toward 1.
        loss_g = 0
        for x_fake in d_fake:
            loss_g = loss_g + torch.mean((1 - x_fake[-1]) ** 2)

        # Feature-matching term over all intermediate feature maps.
        loss_feature = 0
        for fake_maps, real_maps in zip(d_fake, d_real):
            for f_map, r_map in zip(fake_maps[:-1], real_maps[:-1]):
                loss_feature = loss_feature + F.l1_loss(f_map, r_map.detach())
        return loss_g, loss_feature
HunyuanVideo-Foley/hunyuanvideo_foley/models/dac_vae/nn/quantize.py ADDED
@@ -0,0 +1,262 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Union
2
+
3
+ import numpy as np
4
+ import torch
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
+ from einops import rearrange
8
+ from torch.nn.utils import weight_norm
9
+
10
+ from .layers import WNConv1d
11
+
12
+
13
class VectorQuantize(nn.Module):
    """
    Implementation of VQ similar to Karpathy's repo:
    https://github.com/karpathy/deep-vector-quantization
    Additionally uses following tricks from Improved VQGAN
    (https://arxiv.org/pdf/2110.04627.pdf):
    1. Factorized codes: Perform nearest neighbor lookup in low-dimensional space
        for improved codebook usage
    2. l2-normalized codes: Converts euclidean distance to cosine similarity which
        improves training stability
    """

    def __init__(self, input_dim: int, codebook_size: int, codebook_dim: int):
        super().__init__()
        self.codebook_size = codebook_size
        self.codebook_dim = codebook_dim

        # Factorized codes: project down into the codebook space and back up.
        self.in_proj = WNConv1d(input_dim, codebook_dim, kernel_size=1)
        self.out_proj = WNConv1d(codebook_dim, input_dim, kernel_size=1)
        self.codebook = nn.Embedding(codebook_size, codebook_dim)

    def forward(self, z):
        """Quantize ``z`` (B x D x T) against a fixed codebook.

        Returns
        -------
        z_q : Tensor[B x D x T]
            Quantized continuous representation of input.
        commitment_loss : Tensor[B]
            Pulls encoder outputs toward their codebook entries.
        codebook_loss : Tensor[B]
            Pulls codebook entries toward encoder outputs.
        indices : Tensor[B x T]
            Selected codebook indices (discrete representation).
        z_e : Tensor[B x codebook_dim x T]
            Projected latents before quantization.
        """
        z_e = self.in_proj(z)
        z_q, indices = self.decode_latents(z_e)

        commitment_loss = F.mse_loss(z_e, z_q.detach(), reduction="none").mean([1, 2])
        codebook_loss = F.mse_loss(z_q, z_e.detach(), reduction="none").mean([1, 2])

        # Straight-through estimator: identity in forward, passes gradients
        # from z_q straight back to z_e in backward.
        z_q = z_e + (z_q - z_e).detach()

        z_q = self.out_proj(z_q)
        return z_q, commitment_loss, codebook_loss, indices, z_e

    def embed_code(self, embed_id):
        return F.embedding(embed_id, self.codebook.weight)

    def decode_code(self, embed_id):
        # (B, T, D) -> (B, D, T)
        return self.embed_code(embed_id).transpose(1, 2)

    def decode_latents(self, latents):
        encodings = rearrange(latents, "b d t -> (b t) d")
        codebook = self.codebook.weight  # codebook: (N x D)

        # L2-normalize both sides so the distance behaves like cosine similarity.
        encodings = F.normalize(encodings)
        codebook = F.normalize(codebook)

        # Squared euclidean distance to every codebook entry.
        dist = (
            encodings.pow(2).sum(1, keepdim=True)
            - 2 * encodings @ codebook.t()
            + codebook.pow(2).sum(1, keepdim=True).t()
        )
        indices = rearrange((-dist).max(1)[1], "(b t) -> b t", b=latents.size(0))
        return self.decode_code(indices), indices
95
+
96
+
97
class ResidualVectorQuantize(nn.Module):
    """
    Introduced in SoundStream: An end2end neural audio codec
    https://arxiv.org/abs/2107.03312
    """

    def __init__(
        self,
        input_dim: int = 512,
        n_codebooks: int = 9,
        codebook_size: int = 1024,
        codebook_dim: Union[int, list] = 8,
        quantizer_dropout: float = 0.0,
    ):
        super().__init__()
        # A scalar codebook_dim applies to every codebook.
        if isinstance(codebook_dim, int):
            codebook_dim = [codebook_dim] * n_codebooks

        self.n_codebooks = n_codebooks
        self.codebook_dim = codebook_dim
        self.codebook_size = codebook_size

        self.quantizers = nn.ModuleList(
            [
                VectorQuantize(input_dim, codebook_size, codebook_dim[i])
                for i in range(n_codebooks)
            ]
        )
        self.quantizer_dropout = quantizer_dropout

    def forward(self, z, n_quantizers: int = None):
        """Quantize ``z`` (B x D x T) with up to ``n_quantizers`` residual codebooks.

        Parameters
        ----------
        z : Tensor[B x D x T]
        n_quantizers : int, optional
            No. of quantizers to use
            (n_quantizers < self.n_codebooks ex: for quantizer dropout)
            Note: if ``self.quantizer_dropout`` is set, this argument is
            ignored in training mode and a random number of quantizers is
            used for a prefix of the batch.

        Returns
        -------
        tuple
            (z_q, codes, latents, commitment_loss, codebook_loss) where
            z_q is Tensor[B x D x T], codes is Tensor[B x N x T], and
            latents is Tensor[B x N*D x T].
        """
        z_q = 0
        residual = z
        commitment_loss = 0
        codebook_loss = 0

        codebook_indices = []
        latents = []

        if n_quantizers is None:
            n_quantizers = self.n_codebooks
        if self.training:
            # Per-sample codebook budget: a random prefix of the batch gets a
            # random (smaller) number of codebooks (quantizer dropout).
            n_quantizers = torch.ones((z.shape[0],)) * self.n_codebooks + 1
            dropout = torch.randint(1, self.n_codebooks + 1, (z.shape[0],))
            n_dropout = int(z.shape[0] * self.quantizer_dropout)
            n_quantizers[:n_dropout] = dropout[:n_dropout]
            n_quantizers = n_quantizers.to(z.device)

        for i, quantizer in enumerate(self.quantizers):
            if self.training is False and i >= n_quantizers:
                break

            z_q_i, commitment_loss_i, codebook_loss_i, indices_i, z_e_i = quantizer(
                residual
            )

            # Mask out samples whose codebook budget is already exhausted.
            mask = (
                torch.full((z.shape[0],), fill_value=i, device=z.device) < n_quantizers
            )
            z_q = z_q + z_q_i * mask[:, None, None]
            residual = residual - z_q_i

            # Sum masked losses across codebooks.
            commitment_loss = commitment_loss + (commitment_loss_i * mask).mean()
            codebook_loss = codebook_loss + (codebook_loss_i * mask).mean()

            codebook_indices.append(indices_i)
            latents.append(z_e_i)

        codes = torch.stack(codebook_indices, dim=1)
        latents = torch.cat(latents, dim=1)

        return z_q, codes, latents, commitment_loss, codebook_loss

    def from_codes(self, codes: torch.Tensor):
        """Given the quantized codes, reconstruct the continuous representation

        Parameters
        ----------
        codes : Tensor[B x N x T]
            Quantized discrete representation of input

        Returns
        -------
        Tensor[B x D x T]
            Quantized continuous representation of input
        """
        z_q = 0.0
        z_p = []
        for i in range(codes.shape[1]):
            z_p_i = self.quantizers[i].decode_code(codes[:, i, :])
            z_p.append(z_p_i)
            z_q = z_q + self.quantizers[i].out_proj(z_p_i)
        return z_q, torch.cat(z_p, dim=1), codes

    def from_latents(self, latents: torch.Tensor):
        """Given the unquantized latents, reconstruct the
        continuous representation after quantization.

        Parameters
        ----------
        latents : Tensor[B x N x T]
            Continuous representation of input after projection

        Returns
        -------
        Tensor[B x D x T]
            Quantized representation of full-projected space
        Tensor[B x D x T]
            Quantized representation of latent space
        """
        z_q = 0
        z_p = []
        codes = []
        # Channel offsets of each quantizer's slice within the stacked latents.
        dims = np.cumsum([0] + [q.codebook_dim for q in self.quantizers])

        # Use as many codebooks as fit into the latents' channel dimension.
        n_codebooks = np.where(dims <= latents.shape[1])[0].max(axis=0, keepdims=True)[
            0
        ]
        for i in range(n_codebooks):
            lo, hi = dims[i], dims[i + 1]
            z_p_i, codes_i = self.quantizers[i].decode_latents(latents[:, lo:hi, :])
            z_p.append(z_p_i)
            codes.append(codes_i)
            z_q = z_q + self.quantizers[i].out_proj(z_p_i)

        return z_q, torch.cat(z_p, dim=1), torch.stack(codes, dim=1)
256
+
257
+
258
+ if __name__ == "__main__":
259
+ rvq = ResidualVectorQuantize(quantizer_dropout=True)
260
+ x = torch.randn(16, 512, 80)
261
+ y = rvq(x)
262
+ print(y["latents"].shape)
HunyuanVideo-Foley/hunyuanvideo_foley/models/dac_vae/nn/vae_utils.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import numpy as np
3
+
4
+
5
class AbstractDistribution:
    """Interface for distributions: subclasses must provide ``sample`` and
    ``mode``."""

    def sample(self):
        raise NotImplementedError()

    def mode(self):
        raise NotImplementedError()
11
+
12
+
13
class DiracDistribution(AbstractDistribution):
    """Degenerate distribution concentrated on a single value: both
    ``sample`` and ``mode`` return it unchanged."""

    def __init__(self, value):
        self.value = value

    def sample(self):
        return self.value

    def mode(self):
        return self.value
22
+
23
+
24
class DiagonalGaussianDistribution(object):
    """Diagonal Gaussian parameterized by a tensor holding ``[mean, logvar]``
    concatenated along dim 1. ``deterministic=True`` collapses it to a Dirac
    at the mean (zero variance)."""

    def __init__(self, parameters, deterministic=False):
        self.parameters = parameters
        self.mean, self.logvar = torch.chunk(parameters, 2, dim=1)
        # Clamp for numerical stability before exponentiating.
        self.logvar = torch.clamp(self.logvar, -30.0, 20.0)
        self.deterministic = deterministic
        self.std = torch.exp(0.5 * self.logvar)
        self.var = torch.exp(self.logvar)
        if self.deterministic:
            # Degenerate case: zero std/var so sample() returns the mean.
            self.var = self.std = torch.zeros_like(self.mean).to(device=self.parameters.device)

    def sample(self):
        # Reparameterized sample: mean + std * eps.
        noise = torch.randn(self.mean.shape).to(device=self.parameters.device)
        return self.mean + self.std * noise

    def kl(self, other=None):
        """KL divergence against a standard normal (``other=None``) or
        another diagonal Gaussian, averaged over dims (1, 2)."""
        if self.deterministic:
            return torch.Tensor([0.0])
        if other is None:
            return 0.5 * torch.mean(
                torch.pow(self.mean, 2) + self.var - 1.0 - self.logvar,
                dim=[1, 2],
            )
        return 0.5 * torch.mean(
            torch.pow(self.mean - other.mean, 2) / other.var
            + self.var / other.var
            - 1.0
            - self.logvar
            + other.logvar,
            dim=[1, 2],
        )

    def nll(self, sample, dims=[1, 2]):
        """Negative log-likelihood of ``sample``, summed over ``dims``."""
        if self.deterministic:
            return torch.Tensor([0.0])
        logtwopi = np.log(2.0 * np.pi)
        return 0.5 * torch.sum(
            logtwopi + self.logvar + torch.pow(sample - self.mean, 2) / self.var,
            dim=dims,
        )

    def mode(self):
        return self.mean
69
+
70
+
71
def normal_kl(mean1, logvar1, mean2, logvar2):
    """
    source: https://github.com/openai/guided-diffusion/blob/27c20a8fab9cb472df5d6bdd6c8d11c8f430b924/guided_diffusion/losses.py#L12
    Compute the KL divergence between two gaussians.
    Shapes are automatically broadcasted, so batches can be compared to
    scalars, among other use cases.
    """
    # Find any tensor argument so scalars can be promoted onto its dtype/device.
    tensor = next(
        (obj for obj in (mean1, logvar1, mean2, logvar2) if isinstance(obj, torch.Tensor)),
        None,
    )
    assert tensor is not None, "at least one argument must be a Tensor"

    # Force variances to be Tensors. Broadcasting helps convert scalars to
    # Tensors, but it does not work for torch.exp().
    logvar1, logvar2 = (
        lv if isinstance(lv, torch.Tensor) else torch.tensor(lv).to(tensor)
        for lv in (logvar1, logvar2)
    )

    return 0.5 * (
        -1.0
        + logvar2
        - logvar1
        + torch.exp(logvar1 - logvar2)
        + ((mean1 - mean2) ** 2) * torch.exp(-logvar2)
    )
HunyuanVideo-Foley/hunyuanvideo_foley/models/dac_vae/utils/__init__.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+
3
+ import argbind
4
+ from audiotools import ml
5
+
6
+ from ..model import DAC
7
# Re-export audiotools' Accelerator so callers can use `utils.Accelerator`.
Accelerator = ml.Accelerator

# (sample_rate, bitrate) -> most recent released model tag for that combo.
__MODEL_LATEST_TAGS__ = {
    ("44khz", "8kbps"): "0.0.1",
    ("24khz", "8kbps"): "0.0.4",
    ("16khz", "8kbps"): "0.0.5",
    ("44khz", "16kbps"): "1.0.0",
}

# (sample_rate, tag, bitrate) -> URL of the released weights file.
__MODEL_URLS__ = {
    (
        "44khz",
        "0.0.1",
        "8kbps",
    ): "https://github.com/descriptinc/descript-audio-codec/releases/download/0.0.1/weights.pth",
    (
        "24khz",
        "0.0.4",
        "8kbps",
    ): "https://github.com/descriptinc/descript-audio-codec/releases/download/0.0.4/weights_24khz.pth",
    (
        "16khz",
        "0.0.5",
        "8kbps",
    ): "https://github.com/descriptinc/descript-audio-codec/releases/download/0.0.5/weights_16khz.pth",
    (
        "44khz",
        "1.0.0",
        "16kbps",
    ): "https://github.com/descriptinc/descript-audio-codec/releases/download/1.0.0/weights_44khz_16kbps.pth",
}
38
+
39
+
40
@argbind.bind(group="download", positional=True, without_prefix=True)
def download(
    model_type: str = "44khz", model_bitrate: str = "8kbps", tag: str = "latest"
):
    """
    Function that downloads the weights file from URL if a local cache is not found.

    Parameters
    ----------
    model_type : str
        The type of model to download. Must be one of "44khz", "24khz", or "16khz". Defaults to "44khz".
    model_bitrate: str
        Bitrate of the model. Must be one of "8kbps", or "16kbps". Defaults to "8kbps".
        Only 44khz model supports 16kbps.
    tag : str
        The tag of the model to download. Defaults to "latest".

    Returns
    -------
    Path
        Directory path required to load model via audiotools.

    Raises
    ------
    ValueError
        If no release matches the requested (model_type, tag, model_bitrate),
        or the HTTP download does not return status 200.
    """
    model_type = model_type.lower()
    tag = tag.lower()

    assert model_type in [
        "44khz",
        "24khz",
        "16khz",
    ], "model_type must be one of '44khz', '24khz', or '16khz'"

    assert model_bitrate in [
        "8kbps",
        "16kbps",
    ], "model_bitrate must be one of '8kbps', or '16kbps'"

    if tag == "latest":
        tag = __MODEL_LATEST_TAGS__[(model_type, model_bitrate)]

    download_link = __MODEL_URLS__.get((model_type, tag, model_bitrate), None)

    if download_link is None:
        raise ValueError(
            f"Could not find model with tag {tag} and model type {model_type}"
        )

    # Weights are cached under ~/.cache/descript/dac and only fetched once.
    local_path = (
        Path.home()
        / ".cache"
        / "descript"
        / "dac"
        / f"weights_{model_type}_{model_bitrate}_{tag}.pth"
    )
    if not local_path.exists():
        local_path.parent.mkdir(parents=True, exist_ok=True)

        # Download the model. Import lazily so `requests` is only needed when
        # an actual download happens.
        import requests

        # requests has no default timeout; without one a stalled connection
        # would hang this call forever.
        response = requests.get(download_link, timeout=60)

        if response.status_code != 200:
            raise ValueError(
                f"Could not download model. Received response code {response.status_code}"
            )
        local_path.write_bytes(response.content)

    return local_path
108
+
109
+
110
def load_model(
    model_type: str = "44khz",
    model_bitrate: str = "8kbps",
    tag: str = "latest",
    load_path: str = None,
):
    """Load a DAC generator from `load_path`, downloading the matching
    released weights first when no path is provided."""
    if not load_path:
        load_path = download(
            model_type=model_type, model_bitrate=model_bitrate, tag=tag
        )
    return DAC.load(load_path)
HunyuanVideo-Foley/hunyuanvideo_foley/models/dac_vae/utils/decode.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import warnings
2
+ from pathlib import Path
3
+
4
+ import argbind
5
+ import numpy as np
6
+ import torch
7
+ from audiotools import AudioSignal
8
+ from tqdm import tqdm
9
+
10
+ from ..model import DACFile
11
+ from . import load_model
12
+
13
+ warnings.filterwarnings("ignore", category=UserWarning)
14
+
15
+
16
@argbind.bind(group="decode", positional=True, without_prefix=True)
@torch.inference_mode()
@torch.no_grad()
def decode(
    input: str,
    output: str = "",
    weights_path: str = "",
    model_tag: str = "latest",
    model_bitrate: str = "8kbps",
    device: str = "cuda",
    model_type: str = "44khz",
    verbose: bool = False,
):
    """Decode audio from codes.

    Parameters
    ----------
    input : str
        Path to input directory or file
    output : str, optional
        Path to output directory, by default "".
        If `input` is a directory, the directory sub-tree relative to `input` is re-created in `output`.
    weights_path : str, optional
        Path to weights file, by default "". If not specified, the weights file will be downloaded from the internet using the
        model_tag and model_type.
    model_tag : str, optional
        Tag of the model to use, by default "latest". Ignored if `weights_path` is specified.
    model_bitrate: str
        Bitrate of the model. Must be one of "8kbps", or "16kbps". Defaults to "8kbps".
    device : str, optional
        Device to use, by default "cuda". If "cpu", the model will be loaded on the CPU.
    model_type : str, optional
        The type of model to use. Must be one of "44khz", "24khz", or "16khz". Defaults to "44khz". Ignored if `weights_path` is specified.
    verbose : bool, optional
        Forwarded to `generator.decompress`, by default False.
    """
    generator = load_model(
        model_type=model_type,
        model_bitrate=model_bitrate,
        tag=model_tag,
        load_path=weights_path,
    )
    generator.to(device)
    generator.eval()

    # Find all .dac files in input directory
    _input = Path(input)
    input_files = list(_input.glob("**/*.dac"))

    # If input is a .dac file, add it to the list
    if _input.suffix == ".dac":
        input_files.append(_input)

    # Create output directory
    output = Path(output)
    output.mkdir(parents=True, exist_ok=True)

    # Iterate the files directly instead of indexing into the list.
    for input_file in tqdm(input_files, desc="Decoding files"):
        # Load file
        artifact = DACFile.load(input_file)

        # Reconstruct audio from codes
        recons = generator.decompress(artifact, verbose=verbose)

        # Compute output path, mirroring the input sub-tree under `output`.
        relative_path = input_file.relative_to(input)
        output_dir = output / relative_path.parent
        if not relative_path.name:
            # `input` was the .dac file itself; keep its own name.
            output_dir = output
            relative_path = input_file
        output_name = relative_path.with_suffix(".wav").name
        output_path = output_dir / output_name
        output_path.parent.mkdir(parents=True, exist_ok=True)

        # Write to file
        recons.write(output_path)
90
+
91
+
92
if __name__ == "__main__":
    # Parse CLI arguments bound via argbind and run decode() inside that scope.
    args = argbind.parse_args()
    with argbind.scope(args):
        decode()
HunyuanVideo-Foley/hunyuanvideo_foley/models/dac_vae/utils/encode.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import warnings
3
+ from pathlib import Path
4
+
5
+ import argbind
6
+ import numpy as np
7
+ import torch
8
+ from audiotools import AudioSignal
9
+ from audiotools.core import util
10
+ from tqdm import tqdm
11
+
12
+ from . import load_model
13
+
14
+ warnings.filterwarnings("ignore", category=UserWarning)
15
+
16
+
17
@argbind.bind(group="encode", positional=True, without_prefix=True)
@torch.inference_mode()
@torch.no_grad()
def encode(
    input: str,
    output: str = "",
    weights_path: str = "",
    model_tag: str = "latest",
    model_bitrate: str = "8kbps",
    n_quantizers: int = None,
    device: str = "cuda",
    model_type: str = "44khz",
    win_duration: float = 5.0,
    verbose: bool = False,
):
    """Encode audio files in input path to .dac format.

    Parameters
    ----------
    input : str
        Path to input audio file or directory
    output : str, optional
        Path to output directory, by default "". If `input` is a directory, the directory sub-tree relative to `input` is re-created in `output`.
    weights_path : str, optional
        Path to weights file, by default "". If not specified, the weights file will be downloaded from the internet using the
        model_tag and model_type.
    model_tag : str, optional
        Tag of the model to use, by default "latest". Ignored if `weights_path` is specified.
    model_bitrate: str
        Bitrate of the model. Must be one of "8kbps", or "16kbps". Defaults to "8kbps".
    n_quantizers : int, optional
        Number of quantizers to use, by default None. If not specified, all the quantizers will be used and the model will compress at maximum bitrate.
    device : str, optional
        Device to use, by default "cuda"
    model_type : str, optional
        The type of model to use. Must be one of "44khz", "24khz", or "16khz". Defaults to "44khz". Ignored if `weights_path` is specified.
    win_duration : float, optional
        Window duration (seconds) passed to `generator.compress`, by default 5.0.
    verbose : bool, optional
        Forwarded to `generator.compress`, by default False.
    """
    generator = load_model(
        model_type=model_type,
        model_bitrate=model_bitrate,
        tag=model_tag,
        load_path=weights_path,
    )
    generator.to(device)
    generator.eval()
    kwargs = {"n_quantizers": n_quantizers}

    # Find all audio files in input path
    input = Path(input)
    audio_files = util.find_audio(input)

    output = Path(output)
    output.mkdir(parents=True, exist_ok=True)

    # Iterate the files directly instead of indexing into the list.
    for audio_file in tqdm(audio_files, desc="Encoding files"):
        # Load file
        signal = AudioSignal(audio_file)

        # Encode audio to .dac format
        artifact = generator.compress(signal, win_duration, verbose=verbose, **kwargs)

        # Compute output path, mirroring the input sub-tree under `output`.
        relative_path = audio_file.relative_to(input)
        output_dir = output / relative_path.parent
        if not relative_path.name:
            # `input` was the audio file itself; keep its own name.
            output_dir = output
            relative_path = audio_file
        output_name = relative_path.with_suffix(".dac").name
        output_path = output_dir / output_name
        output_path.parent.mkdir(parents=True, exist_ok=True)

        artifact.save(output_path)
89
+
90
+
91
if __name__ == "__main__":
    # Parse CLI arguments bound via argbind and run encode() inside that scope.
    args = argbind.parse_args()
    with argbind.scope(args):
        encode()
HunyuanVideo-Foley/hunyuanvideo_foley/models/hifi_foley.py ADDED
@@ -0,0 +1,794 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Tuple, Optional, Union, Dict
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F
6
+ from einops import rearrange
7
+ from einops.layers.torch import Rearrange
8
+ from diffusers.models import ModelMixin
9
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
10
+
11
+ from .nn.activation_layers import SwiGLU, get_activation_layer
12
+ from .nn.attn_layers import apply_rotary_emb, attention
13
+ from .nn.embed_layers import TimestepEmbedder, ConditionProjection, PatchEmbed1D
14
+ from .nn.mlp_layers import MLP, ConvMLP, FinalLayer1D, ChannelLastConv1d
15
+ from .nn.modulate_layers import ModulateDiT, ckpt_wrapper, apply_gate, modulate
16
+ from .nn.norm_layers import get_norm_layer
17
+ from .nn.posemb_layers import get_nd_rotary_pos_embed
18
+
19
def interleave_two_sequences(x1: torch.Tensor, x2: torch.Tensor):
    """Interleave two [B, N, H, C] sequences token-wise into [B, 2*N1, H, C].

    When the two sequences differ in length, x2 is resampled along the token
    axis (nearest-exact interpolation) to match x1 before interleaving, so the
    output alternates x1[0], x2[0], x1[1], x2[1], ...

    Fix: the rank assertion now runs *before* shape unpacking; previously a
    non-4D input raised an opaque unpacking error instead of the assert.
    """
    assert x1.ndim == x2.ndim == 4
    # [B, N1, H, C] & [B, N2, H, C]
    B, N1, H, C = x1.shape
    B, N2, H, C = x2.shape

    if N1 != N2:
        # Resample x2's token axis to length N1 (channels flattened to H*C).
        x2 = x2.view(B, N2, -1).transpose(1, 2)
        x2 = F.interpolate(x2, size=(N1), mode="nearest-exact")
        x2 = x2.transpose(1, 2).view(B, N1, H, C)
    x = torch.stack((x1, x2), dim=2)  # [B, N1, 2, H, C]
    x = x.reshape(B, N1 * 2, H, C)
    return x
32
+
33
def decouple_interleaved_two_sequences(x: torch.Tensor, len1: int, len2: int):
    """Split an interleaved [B, 2*len1, H, C] sequence back into its two
    streams, resampling the second stream to `len2` tokens when needed.

    Inverse of `interleave_two_sequences`.
    """
    B, N, H, C = x.shape
    assert N % 2 == 0 and N // 2 == len1

    # Group tokens into (x1, x2) pairs, then peel the two streams apart.
    pairs = x.reshape(B, -1, 2, H, C)
    x1, x2 = pairs[:, :, 0], pairs[:, :, 1]
    if x2.shape[1] != len2:
        # Bring x2 back to its original token length.
        x2 = x2.view(B, len1, H * C).transpose(1, 2)
        x2 = F.interpolate(x2, size=(len2), mode="nearest-exact")
        x2 = x2.transpose(1, 2).view(B, len2, H, C)
    return x1, x2
45
+
46
class TwoStreamCABlock(nn.Module):
    """Dual-stream transformer block: audio and visual tokens run joint
    self-attention, then both cross-attend to text tokens, then pass through
    per-stream MLPs. Each stage is modulated (shift/scale/gate) by a
    conditioning vector, DiT-style (9 modulation chunks = 3 stages x 3)."""

    def __init__(
        self,
        hidden_size: int,
        num_heads: int,
        mlp_ratio: float,
        mlp_act_type: str = "gelu_tanh",
        qk_norm: bool = True,
        qk_norm_type: str = "rms",
        qkv_bias: bool = False,
        attn_mode: str = "torch",
        reverse: bool = False,
        interleaved_audio_visual_rope: bool = False,
        dtype: Optional[torch.dtype] = None,
        device: Optional[torch.device] = None,
    ):
        factory_kwargs = {"device": device, "dtype": dtype}
        super().__init__()

        self.deterministic = False
        # NOTE(review): `reverse` is stored but not read inside this block —
        # presumably consumed elsewhere; confirm before removing.
        self.reverse = reverse
        self.attn_mode = attn_mode
        self.num_heads = num_heads
        self.hidden_size = hidden_size
        head_dim = hidden_size // num_heads
        mlp_hidden_dim = int(hidden_size * mlp_ratio)

        self.interleaved_audio_visual_rope = interleaved_audio_visual_rope

        # Self attention for audio + visual
        # factor=9: shift/scale/gate for each of the three stages below.
        self.audio_mod = ModulateDiT(hidden_size, factor=9, act_layer=get_activation_layer("silu"), **factory_kwargs)
        self.audio_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, **factory_kwargs)
        self.audio_self_attn_qkv = nn.Linear(hidden_size, hidden_size * 3, bias=qkv_bias, **factory_kwargs)
        qk_norm_layer = get_norm_layer(qk_norm_type)
        self.audio_self_q_norm = (
            qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs) if qk_norm else nn.Identity()
        )
        self.audio_self_k_norm = (
            qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs) if qk_norm else nn.Identity()
        )
        self.audio_self_proj = nn.Linear(hidden_size, hidden_size, bias=qkv_bias, **factory_kwargs)

        # visual cond
        self.v_cond_mod = ModulateDiT(hidden_size, factor=9, act_layer=get_activation_layer("silu"), **factory_kwargs)
        self.v_cond_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, **factory_kwargs)
        self.v_cond_attn_qkv = nn.Linear(hidden_size, hidden_size * 3, bias=qkv_bias, **factory_kwargs)
        self.v_cond_attn_q_norm = (
            qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs) if qk_norm else nn.Identity()
        )
        self.v_cond_attn_k_norm = (
            qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs) if qk_norm else nn.Identity()
        )
        self.v_cond_self_proj = nn.Linear(hidden_size, hidden_size, bias=qkv_bias, **factory_kwargs)

        # NOTE(review): max_text_len is set but not referenced in this block.
        self.max_text_len = 100
        # None -> build_rope_for_text derives the per-axis split from head_dim.
        self.rope_dim_list = None

        # audio and video norm for cross attention with text
        self.audio_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, **factory_kwargs)
        self.v_cond_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, **factory_kwargs)

        # Cross attention: (video_audio) as query, text as key/value
        self.audio_cross_q = nn.Linear(hidden_size, hidden_size, bias=qkv_bias, **factory_kwargs)
        self.v_cond_cross_q = nn.Linear(hidden_size, hidden_size, bias=qkv_bias, **factory_kwargs)
        self.text_cross_kv = nn.Linear(hidden_size, hidden_size * 2, bias=qkv_bias, **factory_kwargs)

        self.audio_cross_q_norm = (
            qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs) if qk_norm else nn.Identity()
        )
        self.v_cond_cross_q_norm = (
            qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs) if qk_norm else nn.Identity()
        )
        self.text_cross_k_norm = (
            qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs) if qk_norm else nn.Identity()
        )
        self.audio_cross_proj = nn.Linear(hidden_size, hidden_size, bias=qkv_bias, **factory_kwargs)
        self.v_cond_cross_proj = nn.Linear(hidden_size, hidden_size, bias=qkv_bias, **factory_kwargs)

        # MLPs
        self.audio_norm3 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, **factory_kwargs)
        self.audio_mlp = MLP(
            hidden_size, mlp_hidden_dim, act_layer=get_activation_layer(mlp_act_type), bias=True, **factory_kwargs
        )

        self.v_cond_norm3 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, **factory_kwargs)
        self.v_cond_mlp = MLP(
            hidden_size, mlp_hidden_dim, act_layer=get_activation_layer(mlp_act_type), bias=True, **factory_kwargs
        )

    def build_rope_for_text(self, text_len, head_dim, rope_dim_list=None):
        """Build 1-D rotary (cos, sin) tables of length `text_len` whose
        per-axis dims sum to `head_dim`."""
        target_ndim = 1  # n-d RoPE
        rope_sizes = [text_len]

        if rope_dim_list is None:
            rope_dim_list = [head_dim // target_ndim for _ in range(target_ndim)]
        assert sum(rope_dim_list) == head_dim, "sum(rope_dim_list) should equal to head_dim of attention layer"

        text_freqs_cos, text_freqs_sin = get_nd_rotary_pos_embed(
            rope_dim_list=rope_dim_list,
            start=rope_sizes,
            theta=10000,
            use_real=True,
            theta_rescale_factor=1.0,
        )
        return text_freqs_cos, text_freqs_sin

    def set_attn_mode(self, new_mode):
        """Switch the attention backend; only "torch" is implemented."""
        if new_mode != "torch":
            raise NotImplementedError(f"Only support 'torch' mode, got {new_mode}.")
        self.attn_mode = new_mode

    def enable_deterministic(self):
        """Request deterministic attention (forwarded to `attention`)."""
        self.deterministic = True

    def disable_deterministic(self):
        """Allow non-deterministic attention kernels."""
        self.deterministic = False

    def forward(
        self,
        audio: torch.Tensor,
        cond: torch.Tensor,
        v_cond: torch.Tensor,
        attn_mask: torch.Tensor,
        vec: torch.Tensor,
        freqs_cis: tuple = None,
        v_freqs_cis: tuple = None,
        sync_vec: torch.Tensor = None,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Run one block: joint (visual+audio) self-attention, cross-attention
        of both streams onto text `cond`, then gated MLPs.

        Returns (audio, cond, v_cond); `cond` is passed through unchanged.
        """
        # Get modulation parameters.
        # When a per-token sync vector is given, it modulates the audio stream
        # instead of the global `vec`.
        if sync_vec is not None:
            assert sync_vec.ndim == 3
            (audio_mod1_shift, audio_mod1_scale, audio_mod1_gate,
             audio_mod2_shift, audio_mod2_scale, audio_mod2_gate,
             audio_mod3_shift, audio_mod3_scale, audio_mod3_gate,
             ) = self.audio_mod(sync_vec).chunk(9, dim=-1)
        else:
            (audio_mod1_shift, audio_mod1_scale, audio_mod1_gate,
             audio_mod2_shift, audio_mod2_scale, audio_mod2_gate,
             audio_mod3_shift, audio_mod3_scale, audio_mod3_gate,
             ) = self.audio_mod(vec).chunk(9, dim=-1)

        (
            v_cond_mod1_shift,
            v_cond_mod1_scale,
            v_cond_mod1_gate,
            v_cond_mod2_shift,
            v_cond_mod2_scale,
            v_cond_mod2_gate,
            v_cond_mod3_shift,
            v_cond_mod3_scale,
            v_cond_mod3_gate,
        ) = self.v_cond_mod(vec).chunk(9, dim=-1)

        # 1. Self Attention for audio + visual
        audio_modulated = self.audio_norm1(audio)
        audio_modulated = modulate(audio_modulated, shift=audio_mod1_shift, scale=audio_mod1_scale)
        audio_qkv = self.audio_self_attn_qkv(audio_modulated)
        audio_q, audio_k, audio_v = rearrange(audio_qkv, "B L (K H D) -> K B L H D", K=3, H=self.num_heads)
        audio_q = self.audio_self_q_norm(audio_q).to(audio_v)
        audio_k = self.audio_self_k_norm(audio_k).to(audio_v)

        # Prepare visual cond for attention
        v_cond_modulated = self.v_cond_norm1(v_cond)
        v_cond_modulated = modulate(v_cond_modulated, shift=v_cond_mod1_shift, scale=v_cond_mod1_scale)
        v_cond_qkv = self.v_cond_attn_qkv(v_cond_modulated)
        v_cond_q, v_cond_k, v_cond_v = rearrange(v_cond_qkv, "B L (K H D) -> K B L H D", K=3, H=self.num_heads)
        v_cond_q = self.v_cond_attn_q_norm(v_cond_q).to(v_cond_v)
        v_cond_k = self.v_cond_attn_k_norm(v_cond_k).to(v_cond_v)

        # Apply RoPE if needed for audio and visual
        if freqs_cis is not None:
            if not self.interleaved_audio_visual_rope:
                audio_qq, audio_kk = apply_rotary_emb(audio_q, audio_k, freqs_cis, head_first=False)
                audio_q, audio_k = audio_qq, audio_kk
            else:
                # Interleave audio/visual tokens so both share one positional
                # grid, rotate, then split them back apart.
                ori_audio_len = audio_q.shape[1]
                ori_v_con_len = v_cond_q.shape[1]
                interleaved_audio_visual_q = interleave_two_sequences(audio_q, v_cond_q)
                interleaved_audio_visual_k = interleave_two_sequences(audio_k, v_cond_k)
                interleaved_audio_visual_qq, interleaved_audio_visual_kk = apply_rotary_emb(
                    interleaved_audio_visual_q, interleaved_audio_visual_k, freqs_cis, head_first=False
                )
                audio_qq, v_cond_qq = decouple_interleaved_two_sequences(
                    interleaved_audio_visual_qq, ori_audio_len, ori_v_con_len
                )
                audio_kk, v_cond_kk = decouple_interleaved_two_sequences(
                    interleaved_audio_visual_kk, ori_audio_len, ori_v_con_len
                )
                audio_q, audio_k = audio_qq, audio_kk
                v_cond_q, v_cond_k = v_cond_qq, v_cond_kk

        # Apply RoPE to visual if needed and not interleaved
        if v_freqs_cis is not None and not self.interleaved_audio_visual_rope:
            v_cond_qq, v_cond_kk = apply_rotary_emb(v_cond_q, v_cond_k, v_freqs_cis, head_first=False)
            v_cond_q, v_cond_k = v_cond_qq, v_cond_kk

        # Concatenate for self-attention: visual tokens first, audio second.
        q = torch.cat((v_cond_q, audio_q), dim=1)
        k = torch.cat((v_cond_k, audio_k), dim=1)
        v = torch.cat((v_cond_v, audio_v), dim=1)

        # Run self-attention
        attn = attention(q, k, v, mode=self.attn_mode, attn_mask=attn_mask, deterministic=self.deterministic)
        v_cond_attn, audio_attn = torch.split(attn, [v_cond.shape[1], audio.shape[1]], dim=1)

        # Apply self-attention output to audio and v_cond (gated residual).
        audio = audio + apply_gate(self.audio_self_proj(audio_attn), gate=audio_mod1_gate)
        v_cond = v_cond + apply_gate(self.v_cond_self_proj(v_cond_attn), gate=v_cond_mod1_gate)

        # 2. Cross Attention: (v_cond, audio) as query, text as key/value
        # audio, v_cond modulation
        audio_modulated = self.audio_norm2(audio)
        audio_modulated = modulate(audio_modulated, shift=audio_mod2_shift, scale=audio_mod2_scale)
        v_cond_modulated = self.v_cond_norm2(v_cond)
        v_cond_modulated = modulate(v_cond_modulated, shift=v_cond_mod2_shift, scale=v_cond_mod2_scale)

        # Prepare audio query
        audio_q = self.audio_cross_q(audio_modulated)
        audio_q = rearrange(audio_q, "B L (H D) -> B L H D", H=self.num_heads)
        audio_q = self.audio_cross_q_norm(audio_q)

        # Prepare v_cond query
        v_cond_q = self.v_cond_cross_q(v_cond_modulated)
        v_cond_q = rearrange(v_cond_q, "B L (H D) -> B L H D", H=self.num_heads)
        v_cond_q = self.v_cond_cross_q_norm(v_cond_q)

        # Prepare text key/value
        text_kv = self.text_cross_kv(cond)
        text_k, text_v = rearrange(text_kv, "B L (K H D) -> K B L H D", K=2, H=self.num_heads)
        text_k = self.text_cross_k_norm(text_k).to(text_v)

        # Apply RoPE to (v_cond, audio) query and text key if needed.
        # apply_rotary_emb takes (q, k); passing the same tensor twice and
        # indexing [0]/[1] rotates a single tensor.
        head_dim = self.hidden_size // self.num_heads
        audio_cross_freqs_cos, audio_cross_freqs_sin = self.build_rope_for_text(audio_q.shape[1], head_dim, rope_dim_list=self.rope_dim_list)
        audio_cross_freqs_cis = (audio_cross_freqs_cos.to(audio_q.device), audio_cross_freqs_sin.to(audio_q.device))
        audio_q = apply_rotary_emb(audio_q, audio_q, audio_cross_freqs_cis, head_first=False)[0]

        v_cond_cross_freqs_cos, v_cond_cross_freqs_sin = self.build_rope_for_text(v_cond_q.shape[1], head_dim, rope_dim_list=self.rope_dim_list)
        v_cond_cross_freqs_cis = (v_cond_cross_freqs_cos.to(v_cond_q.device), v_cond_cross_freqs_sin.to(v_cond_q.device))
        v_cond_q = apply_rotary_emb(v_cond_q, v_cond_q, v_cond_cross_freqs_cis, head_first=False)[0]

        text_len = text_k.shape[1]

        text_freqs_cos, text_freqs_sin = self.build_rope_for_text(text_len, head_dim,
                                                                  rope_dim_list=self.rope_dim_list)
        text_freqs_cis = (text_freqs_cos.to(text_k.device), text_freqs_sin.to(text_k.device))
        text_k = apply_rotary_emb(text_k, text_k, text_freqs_cis, head_first=False)[1]

        # Concat v_cond and audio for cross-attention
        v_cond_audio_q = torch.cat([v_cond_q, audio_q], dim=1)

        # Run cross-attention
        cross_attn = attention(v_cond_audio_q, text_k, text_v, mode=self.attn_mode, deterministic=self.deterministic)
        v_cond_cross_attn, audio_cross_attn = torch.split(cross_attn, [v_cond.shape[1], audio.shape[1]], dim=1)

        # Apply cross-attention output
        audio = audio + apply_gate(self.audio_cross_proj(audio_cross_attn), gate=audio_mod2_gate)
        v_cond = v_cond + apply_gate(self.v_cond_cross_proj(v_cond_cross_attn), gate=v_cond_mod2_gate)

        # 3. Apply MLPs (third modulation stage, gated residual)
        audio = audio + apply_gate(
            self.audio_mlp(modulate(self.audio_norm3(audio), shift=audio_mod3_shift, scale=audio_mod3_scale)),
            gate=audio_mod3_gate,
        )

        # Apply visual MLP
        v_cond = v_cond + apply_gate(
            self.v_cond_mlp(modulate(self.v_cond_norm3(v_cond), shift=v_cond_mod3_shift, scale=v_cond_mod3_scale)),
            gate=v_cond_mod3_gate,
        )

        return audio, cond, v_cond
318
+
319
class SingleStreamBlock(nn.Module):
    """Single-stream transformer block: modulated self-attention (with RoPE)
    followed by a modulated convolutional MLP, both as gated residuals."""

    def __init__(self, hidden_size: int,
                 num_heads: int,
                 mlp_ratio: float,
                 qk_norm_type: str = "rms",
                 dtype: Optional[torch.dtype] = None,
                 device: Optional[torch.device] = None,):
        factory_kwargs = {"device": device, "dtype": dtype}
        super().__init__()

        self.hidden_size = hidden_size
        self.num_heads = num_heads

        # factor=6: shift/scale/gate for the attention and MLP stages.
        self.modulation = ModulateDiT(
            hidden_size=hidden_size,
            factor=6,
            act_layer=get_activation_layer("silu"),
            **factory_kwargs,
        )
        # NOTE(review): linear_qkv omits **factory_kwargs unlike the other
        # layers here — confirm whether that is intentional.
        self.linear_qkv = nn.Linear(hidden_size, hidden_size * 3, bias=True)
        self.linear1 = ChannelLastConv1d(hidden_size, hidden_size, kernel_size=3, padding=1, **factory_kwargs)
        # NOTE(review): hidden_size * mlp_ratio is a float when mlp_ratio is —
        # presumably ConvMLP coerces it; verify.
        self.linear2 = ConvMLP(hidden_size, hidden_size * mlp_ratio, kernel_size=3, padding=1, **factory_kwargs)
        self.norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False)
        self.norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False)
        # NOTE(review): qk_norm_type is accepted but RMSNorm is always used.
        self.q_norm = nn.RMSNorm(hidden_size // num_heads)
        self.k_norm = nn.RMSNorm(hidden_size // num_heads)
        # Splits fused QKV into heads with Q/K/V stacked along the last axis.
        self.rearrange = Rearrange("B L (H D K) -> B H L D K", K=3, H=num_heads)

    def forward(self, x: torch.Tensor, cond: torch.Tensor, freqs_cis: Tuple[torch.Tensor, torch.Tensor] = None):
        """Apply modulated self-attention + MLP to `x` conditioned on `cond`
        ([B, T, D]); `freqs_cis` are the RoPE (cos, sin) tables."""
        assert cond.ndim == 3, "Condition should be in shape of [B, T, D]"
        modulation = self.modulation(cond)
        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = modulation.chunk(6, dim=-1)
        # AdaLN-style modulation of the pre-attention norm.
        x_norm1 = self.norm1(x) * (1 + scale_msa) + shift_msa

        qkv = self.linear_qkv(x_norm1)
        q, k, v = self.rearrange(qkv).chunk(3, dim=-1)
        q = q.squeeze(-1)
        k = k.squeeze(-1)
        v = v.squeeze(-1)

        q = self.q_norm(q)
        k = self.k_norm(k)
        # head_first=True: tensors are [B, H, L, D] after the Rearrange above.
        q, k = apply_rotary_emb(q, k, freqs_cis, head_first=True)

        q = q.contiguous()
        k = k.contiguous()
        v = v.contiguous()
        out = F.scaled_dot_product_attention(q, k, v)
        out = rearrange(out, 'b h n d -> b n (h d)').contiguous()

        # Gated residuals for attention and MLP stages.
        x = x + apply_gate(self.linear1(out), gate=gate_msa)
        x_norm = self.norm2(x) * (1 + scale_mlp) + shift_mlp
        x = x + apply_gate(self.linear2(x_norm), gate=gate_mlp)

        return x
375
+
376
+ class HunyuanVideoFoley(ModelMixin, ConfigMixin):
377
+ @register_to_config
378
+ def __init__(
379
+ self,
380
+ model_config,
381
+ dtype: Optional[torch.dtype] = None,
382
+ device: Optional[torch.device] = None,
383
+ ):
384
+ factory_kwargs = {"device": device, "dtype": dtype}
385
+ super().__init__()
386
+
387
+ model_args = model_config.model_config.model_kwargs
388
+ self.depth_triple_blocks = model_args.get("depth_triple_blocks", 19)
389
+ self.depth_single_blocks = model_args.get("depth_single_blocks", 38)
390
+ # Gradient checkpoint.
391
+ self.gradient_checkpoint = False
392
+ self.gradient_checkpoint_layers = None
393
+ if self.gradient_checkpoint:
394
+ assert self.gradient_checkpoint_layers <= self.depth_triple_blocks + self.depth_single_blocks, (
395
+ f"Gradient checkpoint layers must be less or equal than the depth of the model. "
396
+ f"Got gradient_checkpoint_layers={self.gradient_checkpoint_layers} and depth={self.depth_triple_blocks + self.depth_single_blocks}."
397
+ )
398
+
399
+ self.interleaved_audio_visual_rope = model_args.get("interleaved_audio_visual_rope", False)
400
+
401
+ # Condition projection. Default to linear projection.
402
+ self.condition_projection = model_args.get("condition_projection", "linear")
403
+ self.condition_dim = model_args.get("condition_dim", None)
404
+ self.use_attention_mask = model_args.get("use_attention_mask", False)
405
+
406
+ self.patch_size = model_args.get("patch_size", 1)
407
+ self.visual_in_channels = model_args.get("clip_dim", 768)
408
+ self.audio_vae_latent_dim = model_args.get("audio_vae_latent_dim", 128)
409
+ self.out_channels = self.audio_vae_latent_dim
410
+ self.unpatchify_channels = self.out_channels
411
+ self.reverse = model_args.get("reverse", False)
412
+
413
+ self.num_heads = model_args.get("num_heads", 24)
414
+ self.hidden_size = model_args.get("hidden_size", 3072)
415
+ self.rope_dim_list = model_args.get("rope_dim_list", None)
416
+ self.mlp_ratio = model_args.get("mlp_ratio", 4.0)
417
+ self.mlp_act_type = model_args.get("mlp_act_type", "gelu_tanh")
418
+
419
+ self.qkv_bias = model_args.get("qkv_bias", True)
420
+ self.qk_norm = model_args.get("qk_norm", True)
421
+ self.qk_norm_type = model_args.get("qk_norm_type", "rms")
422
+ self.attn_mode = model_args.get("attn_mode", "torch")
423
+
424
+ self.embedder_type = model_args.get("embedder_type", "default")
425
+
426
+ # sync condition things
427
+ self.sync_modulation = model_args.get("sync_modulation", False)
428
+ self.add_sync_feat_to_audio = model_args.get("add_sync_feat_to_audio", False)
429
+ self.sync_feat_dim = model_args.get("sync_feat_dim", 768)
430
+ self.sync_in_ksz = model_args.get("sync_in_ksz", 1)
431
+
432
+ # condition tokens length
433
+ self.clip_len = model_args.get("clip_length", 64)
434
+ self.sync_len = model_args.get("sync_length", 192)
435
+
436
+ if self.hidden_size % self.num_heads != 0:
437
+ raise ValueError(f"Hidden size {self.hidden_size} must be divisible by num_heads {self.num_heads}")
438
+
439
+ # Build audio patchify layer and visual gated linear projection
440
+ self.patch_size = 1
441
+ self.audio_embedder = PatchEmbed1D(self.patch_size, self.audio_vae_latent_dim, self.hidden_size, **factory_kwargs)
442
+ self.visual_proj = SwiGLU(self.visual_in_channels, hidden_dim=self.hidden_size, out_dim=self.hidden_size)
443
+
444
+ # condition
445
+ if self.condition_projection == "linear":
446
+ self.cond_in = ConditionProjection(
447
+ self.condition_dim, self.hidden_size, get_activation_layer("silu"), **factory_kwargs
448
+ )
449
+ else:
450
+ raise NotImplementedError(f"Unsupported condition_projection: {self.condition_projection}")
451
+
452
+ # time modulation
453
+ self.time_in = TimestepEmbedder(self.hidden_size, get_activation_layer("silu"), **factory_kwargs)
454
+
455
+ # visual sync embedder if needed
456
+ if self.sync_in_ksz == 1:
457
+ sync_in_padding = 0
458
+ elif self.sync_in_ksz == 3:
459
+ sync_in_padding = 1
460
+ else:
461
+ raise ValueError
462
+ if self.sync_modulation or self.add_sync_feat_to_audio:
463
+ self.sync_in = nn.Sequential(
464
+ nn.Linear(self.sync_feat_dim, self.hidden_size),
465
+ nn.SiLU(),
466
+ ConvMLP(self.hidden_size, self.hidden_size * 4, kernel_size=self.sync_in_ksz, padding=sync_in_padding),
467
+ )
468
+ self.sync_pos_emb = nn.Parameter(torch.zeros((1, 1, 8, self.sync_feat_dim)))
469
+
470
+ self.triple_blocks = nn.ModuleList(
471
+ [
472
+ TwoStreamCABlock(
473
+ hidden_size=self.hidden_size,
474
+ num_heads=self.num_heads,
475
+ mlp_ratio=self.mlp_ratio,
476
+ mlp_act_type=self.mlp_act_type,
477
+ qk_norm=self.qk_norm,
478
+ qk_norm_type=self.qk_norm_type,
479
+ qkv_bias=self.qkv_bias,
480
+ attn_mode=self.attn_mode,
481
+ reverse=self.reverse,
482
+ interleaved_audio_visual_rope=self.interleaved_audio_visual_rope,
483
+ **factory_kwargs,
484
+ )
485
+ for _ in range(self.depth_triple_blocks)
486
+ ]
487
+ )
488
+
489
+
490
+ self.single_blocks = nn.ModuleList(
491
+ [
492
+ SingleStreamBlock(
493
+ hidden_size=self.hidden_size,
494
+ num_heads=self.num_heads,
495
+ mlp_ratio=self.mlp_ratio,
496
+ qk_norm_type=self.qk_norm_type,
497
+ **factory_kwargs,
498
+ )
499
+ for _ in range(self.depth_single_blocks)
500
+ ]
501
+ )
502
+
503
+ self.final_layer = FinalLayer1D(
504
+ self.hidden_size, self.patch_size, self.out_channels, get_activation_layer("silu"), **factory_kwargs
505
+ )
506
+ self.unpatchify_channels = self.out_channels
507
+
508
+ self.empty_clip_feat = nn.Parameter(torch.zeros(1, self.visual_in_channels), requires_grad=True)
509
+ self.empty_sync_feat = nn.Parameter(torch.zeros(1, self.sync_feat_dim), requires_grad=True)
510
+ nn.init.constant_(self.empty_clip_feat, 0)
511
+ nn.init.constant_(self.empty_sync_feat, 0)
512
+
513
+ def get_empty_string_sequence(self, bs=None) -> torch.Tensor:
514
+ if bs is None:
515
+ return self.empty_string_feat
516
+ else:
517
+ return self.empty_string_feat.unsqueeze(0).expand(bs, -1, -1)
518
+
519
+ def get_empty_clip_sequence(self, bs=None, len=None) -> torch.Tensor:
520
+ len = len if len is not None else self.clip_len
521
+ if bs is None:
522
+ return self.empty_clip_feat.expand(len, -1) # 15s
523
+ else:
524
+ return self.empty_clip_feat.unsqueeze(0).expand(bs, len, -1) # 15s
525
+
526
+ def get_empty_sync_sequence(self, bs=None, len=None) -> torch.Tensor:
527
+ len = len if len is not None else self.sync_len
528
+ if bs is None:
529
+ return self.empty_sync_feat.expand(len, -1)
530
+ else:
531
+ return self.empty_sync_feat.unsqueeze(0).expand(bs, len, -1)
532
+
533
+ def build_rope_for_audio_visual(self, audio_emb_len, visual_cond_len):
534
+ assert self.patch_size == 1
535
+ # ======================================== Build RoPE for audio tokens ======================================
536
+ target_ndim = 1 # n-d RoPE
537
+ rope_sizes = [audio_emb_len]
538
+ head_dim = self.hidden_size // self.num_heads
539
+ rope_dim_list = self.rope_dim_list
540
+ if rope_dim_list is None:
541
+ rope_dim_list = [head_dim // target_ndim for _ in range(target_ndim)]
542
+ assert sum(rope_dim_list) == head_dim, "sum(rope_dim_list) should equal to head_dim of attention layer"
543
+ freqs_cos, freqs_sin = get_nd_rotary_pos_embed(
544
+ rope_dim_list=rope_dim_list,
545
+ start=rope_sizes,
546
+ theta=10000,
547
+ use_real=True,
548
+ theta_rescale_factor=1.0,
549
+ )
550
+
551
+ # ========================== Build RoPE for clip tokens =========================
552
+ target_ndim = 1 # n-d RoPE
553
+ rope_sizes = [visual_cond_len]
554
+ head_dim = self.hidden_size // self.num_heads
555
+ rope_dim_list = self.rope_dim_list
556
+ if rope_dim_list is None:
557
+ rope_dim_list = [head_dim // target_ndim for _ in range(target_ndim)]
558
+ assert sum(rope_dim_list) == head_dim, "sum(rope_dim_list) should equal to head_dim of attention layer"
559
+ v_freqs_cos, v_freqs_sin = get_nd_rotary_pos_embed(
560
+ rope_dim_list=rope_dim_list,
561
+ start=rope_sizes,
562
+ theta=10000,
563
+ use_real=True,
564
+ theta_rescale_factor=1.0,
565
+ freq_scaling=1.0 * audio_emb_len / visual_cond_len,
566
+ )
567
+ return freqs_cos, freqs_sin, v_freqs_cos, v_freqs_sin
568
+
569
+ def build_rope_for_interleaved_audio_visual(self, total_len):
570
+ assert self.patch_size == 1
571
+ # ========================== Build RoPE for audio tokens ========================
572
+ target_ndim = 1 # n-d RoPE
573
+ rope_sizes = [total_len]
574
+ head_dim = self.hidden_size // self.num_heads
575
+ rope_dim_list = self.rope_dim_list
576
+ if rope_dim_list is None:
577
+ rope_dim_list = [head_dim // target_ndim for _ in range(target_ndim)]
578
+ assert sum(rope_dim_list) == head_dim, "sum(rope_dim_list) should equal to head_dim of attention layer"
579
+ freqs_cos, freqs_sin = get_nd_rotary_pos_embed(
580
+ rope_dim_list=rope_dim_list,
581
+ start=rope_sizes,
582
+ theta=10000,
583
+ use_real=True,
584
+ theta_rescale_factor=1.0,
585
+ )
586
+ return freqs_cos, freqs_sin
587
+
588
+ def set_attn_mode(self, new_mode):
589
+ for block in self.triple_blocks:
590
+ block.set_attn_mode(new_mode)
591
+ for block in self.single_blocks:
592
+ block.set_attn_mode(new_mode)
593
+
594
+ def enable_deterministic(self):
595
+ for block in self.triple_blocks:
596
+ block.enable_deterministic()
597
+ for block in self.single_blocks:
598
+ block.enable_deterministic()
599
+
600
+ def disable_deterministic(self):
601
+ for block in self.triple_blocks:
602
+ block.disable_deterministic()
603
+ for block in self.single_blocks:
604
+ block.disable_deterministic()
605
+
606
    def forward(
        self,
        x: torch.Tensor,
        t: torch.Tensor,  # Should be in range(0, 1000).
        clip_feat: Optional[torch.Tensor] = None,
        cond: torch.Tensor = None,
        audio_mask: Optional[torch.Tensor] = None,
        cond_mask: torch.Tensor = None,
        sync_feat: Optional[torch.Tensor] = None,
        drop_visual: Optional[List[bool]] = None,
        return_dict: bool = True,
    ) -> Union[torch.Tensor, Dict[str, torch.Tensor]]:
        """Denoise audio latents conditioned on text, visual (CLIP) and sync features.

        Args:
            x: audio VAE latents, shape (batch, audio_vae_latent_dim, length).
            t: diffusion timestep per sample (comment above: expected in [0, 1000)).
            clip_feat: visual clip features projected via `self.visual_proj`.
            cond: text/condition embedding fed through `self.cond_in`.
            audio_mask: unused unless provided by callers; see NOTE below.
            cond_mask: padding mask for `cond`; required when `self.use_attention_mask`.
            sync_feat: synchronization features; required when `self.sync_modulation`
                or `self.add_sync_feat_to_audio` is set.
            drop_visual: per-sample booleans selecting rows whose visual/sync
                conditions are replaced by learned "empty" placeholders (CFG drop).
            return_dict: when True, return {"x": audio}; otherwise the tensor.

        NOTE(review): when `drop_visual` is given, `clip_feat` and `sync_feat`
        are modified in place — callers must not reuse those tensors.
        """
        out = {}
        audio = x
        bs, _, ol = x.shape
        # Number of audio tokens after 1-D patchification.
        tl = ol // self.patch_size

        # Prepare learnable empty conditions for visual condition
        if drop_visual is not None:
            clip_feat[drop_visual] = self.get_empty_clip_sequence().to(dtype=clip_feat.dtype)
            sync_feat[drop_visual] = self.get_empty_sync_sequence().to(dtype=sync_feat.dtype)

        # ========================= Prepare time & visual modulation =========================
        vec = self.time_in(t)
        sync_vec = None
        if self.sync_modulation:
            # Per-token modulation: sync features are positionally embedded in
            # groups of 8 frames, projected, then resampled to the audio length.
            assert sync_feat is not None and sync_feat.shape[1] % 8 == 0
            sync_feat = sync_feat.view(bs, int(sync_feat.shape[1] / 8), 8, self.sync_feat_dim) + self.sync_pos_emb
            sync_feat = sync_feat.view(bs, -1, self.sync_feat_dim)  # bs, num_segments * 8, channels
            sync_vec = self.sync_in(sync_feat)  # bs, num_segments * 8, c
            sync_vec = (
                F.interpolate(sync_vec.transpose(1, 2), size=(tl), mode="nearest-exact").contiguous().transpose(1, 2)
            )  # bs, tl, c
            sync_vec = sync_vec + vec.unsqueeze(1)
        elif self.add_sync_feat_to_audio:
            # Additive variant: same resampling, but the result is later added
            # directly to the audio tokens instead of used as modulation.
            assert sync_feat is not None and sync_feat.shape[1] % 8 == 0
            sync_feat = sync_feat.view(bs, sync_feat.shape[1] // 8, 8, self.sync_feat_dim) + self.sync_pos_emb
            sync_feat = sync_feat.view(bs, -1, self.sync_feat_dim)  # bs, num_segments * 8, channels
            sync_feat = self.sync_in(sync_feat)  # bs, num_segments * 8, c
            add_sync_feat_to_audio = (
                F.interpolate(sync_feat.transpose(1, 2), size=(tl), mode="nearest-exact").contiguous().transpose(1, 2)
            )  # bs, tl, c

        # ========================= Get text, audio and video clip embedding =========================
        cond = self.cond_in(cond)
        cond_seq_len = cond.shape[1]

        audio = self.audio_embedder(x)
        audio_seq_len = audio.shape[1]
        v_cond = self.visual_proj(clip_feat)
        v_cond_seq_len = v_cond.shape[1]

        # ========================= Compute attention mask =========================
        attn_mask = None
        if self.use_attention_mask:
            assert cond_mask is not None
            batch_size = audio.shape[0]
            seq_len = cond_seq_len + v_cond_seq_len + audio_seq_len

            # get default audio_mask and v_cond_mask
            # NOTE(review): the `audio_mask` argument is overwritten here —
            # caller-supplied audio masks are ignored in this mode.
            audio_mask = torch.ones((batch_size, audio_seq_len), dtype=torch.bool, device=audio.device)
            v_cond_mask = torch.ones((batch_size, v_cond_seq_len), dtype=torch.bool, device=audio.device)

            # batch_size x seq_len
            concat_mask = torch.cat([cond_mask, v_cond_mask, audio_mask], dim=1)
            # batch_size x 1 x seq_len x seq_len
            attn_mask_1 = concat_mask.view(batch_size, 1, 1, seq_len).repeat(1, 1, seq_len, 1)
            # batch_size x 1 x seq_len x seq_len
            attn_mask_2 = attn_mask_1.transpose(2, 3)
            # batch_size x 1 x seq_len x seq_len, 1 for broadcasting of num_heads
            attn_mask = (attn_mask_1 & attn_mask_2).bool()
            # avoids self-attention weight being NaN for text padding tokens
            attn_mask[:, :, :, 0] = True


        # ========================= Build rope for audio and clip tokens =========================
        if self.interleaved_audio_visual_rope:
            # One shared table spanning both streams (audio + visual interleaved).
            freqs_cos, freqs_sin = self.build_rope_for_interleaved_audio_visual(audio_seq_len * 2)
            v_freqs_cos = v_freqs_sin = None
        else:
            freqs_cos, freqs_sin, v_freqs_cos, v_freqs_sin = self.build_rope_for_audio_visual(
                audio_seq_len, v_cond_seq_len
            )

        # ========================= Pass through DiT blocks =========================
        freqs_cis = (freqs_cos, freqs_sin) if freqs_cos is not None else None
        v_freqs_cis = (v_freqs_cos, v_freqs_sin) if v_freqs_cos is not None else None

        if self.add_sync_feat_to_audio:
            # Sync features are injected before the first triple-stream block.
            add_sync_layer = 0
            assert (
                add_sync_layer < self.depth_triple_blocks
            ), f"The layer to add mel_spectrogram feature and sync feature should in the triple_stream_blocks (n: {self.depth_triple_blocks})."
        # Triple-stream blocks
        for layer_num, block in enumerate(self.triple_blocks):
            if self.add_sync_feat_to_audio and layer_num == add_sync_layer:
                audio = audio + add_sync_feat_to_audio
            triple_block_args = [audio, cond, v_cond, attn_mask, vec, freqs_cis, v_freqs_cis, sync_vec]
            if (
                self.training
                and self.gradient_checkpoint
                and (self.gradient_checkpoint_layers == -1 or layer_num < self.gradient_checkpoint_layers)
            ):
                audio, cond, v_cond = torch.utils.checkpoint.checkpoint(
                    ckpt_wrapper(block), *triple_block_args, use_reentrant=False
                )
            else:
                audio, cond, v_cond = block(*triple_block_args)

        x = audio
        if sync_vec is not None:
            # Extend per-token modulation to cover the text+visual prefix as well.
            vec = vec.unsqueeze(1).repeat(1, cond_seq_len + v_cond_seq_len, 1)
            vec = torch.cat((vec, sync_vec), dim=1)

        # NOTE(review): the visual-branch frequencies of this second rope build
        # are discarded; only the audio table is used for the single-stream blocks.
        freqs_cos, freqs_sin, _, _ = self.build_rope_for_audio_visual(audio_seq_len, v_cond_seq_len)
        if self.add_sync_feat_to_audio:
            # vec is (bs, c) here (sync_modulation and add_sync_feat_to_audio
            # are mutually exclusive branches above), so unsqueeze broadcasts
            # against the (bs, tl, c) sync features.
            vec = add_sync_feat_to_audio + vec.unsqueeze(dim=1)
        if len(self.single_blocks) > 0:
            for layer_num, block in enumerate(self.single_blocks):
                single_block_args = [
                    x,
                    vec,
                    (freqs_cos, freqs_sin),
                ]
                if (
                    self.training
                    and self.gradient_checkpoint
                    and (
                        self.gradient_checkpoint_layers == -1
                        or layer_num + len(self.triple_blocks) < self.gradient_checkpoint_layers
                    )
                ):
                    x = torch.utils.checkpoint.checkpoint(ckpt_wrapper(block), *single_block_args, use_reentrant=False)
                else:
                    x = block(*single_block_args)

        audio = x

        # ========================= Final layer =========================
        if sync_vec is not None:
            vec = sync_vec
        audio = self.final_layer(audio, vec)  # (N, T, patch_size * out_channels)
        audio = self.unpatchify1d(audio, tl)

        if return_dict:
            out["x"] = audio
            return out
        return audio
754
+
755
+ def unpatchify1d(self, x, l):
756
+ # x: (N, L, patch_size * C)
757
+ # audio: (N, C, T), T == L * patch_size
758
+ c = self.unpatchify_channels
759
+ p = self.patch_size
760
+ assert l == x.shape[1]
761
+
762
+ x = x.reshape(shape=(x.shape[0], l, p, c))
763
+ x = torch.einsum("ntpc->nctp", x)
764
+ audio = x.reshape(shape=(x.shape[0], c, l * p))
765
+ return audio
766
+
767
    def params_count(self):
        """Count model parameters, broken down by block family.

        Returns:
            Dict with keys:
              - "triple": attention + MLP parameters of the triple-stream blocks
                (norm/modulation layers are intentionally excluded).
              - "single": the two fused linear layers of each single-stream block.
              - "total": all parameters of the module.
              - "attn+mlp": sum of "triple" and "single".

        NOTE(review): relies on the exact attribute names of TwoStreamCABlock /
        SingleStreamBlock defined elsewhere; update together with those classes.
        """
        counts = {
            "triple": sum(
                [
                    sum(p.numel() for p in block.audio_cross_q.parameters())
                    + sum(p.numel() for p in block.v_cond_cross_q.parameters())
                    + sum(p.numel() for p in block.text_cross_kv.parameters())
                    + sum(p.numel() for p in block.audio_self_attn_qkv.parameters())
                    + sum(p.numel() for p in block.v_cond_attn_qkv.parameters())
                    + sum(p.numel() for p in block.audio_mlp.parameters())
                    + sum(p.numel() for p in block.audio_self_proj.parameters())
                    + sum(p.numel() for p in block.v_cond_self_proj.parameters())
                    + sum(p.numel() for p in block.v_cond_mlp.parameters())
                    for block in self.triple_blocks
                ]
            ),
            "single": sum(
                [
                    sum(p.numel() for p in block.linear1.parameters())
                    + sum(p.numel() for p in block.linear2.parameters())
                    for block in self.single_blocks
                ]
            ),
            "total": sum(p.numel() for p in self.parameters()),
        }

        counts["attn+mlp"] = counts["triple"] + counts["single"]
        return counts
HunyuanVideo-Foley/hunyuanvideo_foley/models/nn/__init__.py ADDED
File without changes
HunyuanVideo-Foley/hunyuanvideo_foley/models/nn/activation_layers.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch.nn as nn
2
+ import torch.nn.functional as F
3
+
4
def get_activation_layer(act_type):
    """Map an activation name to a zero-argument factory for that layer.

    Args:
        act_type: one of "gelu", "gelu_tanh", "relu", "silu".

    Returns:
        A callable that, when invoked with no arguments, constructs the
        corresponding ``nn.Module`` activation.

    Raises:
        ValueError: if ``act_type`` is not a supported name.
    """
    factories = {
        "gelu": lambda: nn.GELU(),
        # Approximate `tanh` requires torch >= 1.13
        "gelu_tanh": lambda: nn.GELU(approximate="tanh"),
        "relu": nn.ReLU,
        "silu": nn.SiLU,
    }
    if act_type not in factories:
        raise ValueError(f"Unknown activation type: {act_type}")
    return factories[act_type]
16
+
17
class SwiGLU(nn.Module):
    """SwiGLU feed-forward block: ``w2(silu(w1(x)) * w3(x))``.

    A gated MLP in which ``w1`` produces the (SiLU-activated) gate, ``w3`` the
    value, and ``w2`` projects the gated product to the output dimension.
    All three projections are bias-free.
    """

    def __init__(
        self,
        dim: int,
        hidden_dim: int,
        out_dim: int,
    ):
        """
        Args:
            dim: input feature dimension.
            hidden_dim: hidden dimension of the gate/value projections.
            out_dim: output feature dimension.
        """
        super().__init__()
        self.w1 = nn.Linear(dim, hidden_dim, bias=False)  # gate projection
        self.w2 = nn.Linear(hidden_dim, out_dim, bias=False)  # output projection
        self.w3 = nn.Linear(dim, hidden_dim, bias=False)  # value projection

    def forward(self, x):
        gate = F.silu(self.w1(x))
        return self.w2(gate * self.w3(x))
HunyuanVideo-Foley/hunyuanvideo_foley/models/nn/attn_layers.py ADDED
@@ -0,0 +1,546 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import importlib.metadata
2
+ import math
3
+ from typing import Tuple, Union
4
+ import torch
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
+ from einops import rearrange
8
+
9
+ try:
10
+ from flash_attn import (
11
+ flash_attn_qkvpacked_func,
12
+ flash_attn_kvpacked_func,
13
+ flash_attn_varlen_kvpacked_func,
14
+ flash_attn_varlen_qkvpacked_func,
15
+ )
16
+ from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input
17
+ except ImportError:
18
+ flash_attn_qkvpacked_func, flash_attn_kvpacked_func, flash_attn_varlen_kvpacked_func = None, None, None
19
+ index_first_axis = None
20
+ from packaging import version
21
+ from transformers.utils.import_utils import _is_package_available
22
+
23
+ from .norm_layers import get_norm_layer
24
+
25
def reshape_for_broadcast(freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]], x: torch.Tensor, head_first=False):
    """
    Reshape frequency tensor(s) so they broadcast against ``x``.

    The RoPE table covers (seq_len, head_dim); all other axes of ``x`` are
    collapsed to size 1 in the returned view.

    Notes:
        When using FlashMHAModified, head_first should be False.
        When using Attention, head_first should be True.

    Args:
        freqs_cis: either a real-valued ``(cos, sin)`` tuple or a single
            complex frequency tensor.
        x: target tensor for broadcasting compatibility.
        head_first: head dimension first (except batch dim) or not.

    Returns:
        The reshaped ``(cos, sin)`` tuple, or a single reshaped tensor when
        ``freqs_cis`` is not a tuple.

    Raises:
        AssertionError: if shapes do not line up with ``x``.
    """
    ndim = x.ndim
    assert 0 <= 1 < ndim

    # The RoPE table must match (seq_len, last_dim) of x; which axis holds
    # seq_len depends on the memory layout.
    if head_first:
        expected = (x.shape[-2], x.shape[-1])
        keep_axes = {ndim - 2, ndim - 1}
    else:
        expected = (x.shape[1], x.shape[-1])
        keep_axes = {1, ndim - 1}
    shape = [d if i in keep_axes else 1 for i, d in enumerate(x.shape)]

    if isinstance(freqs_cis, tuple):
        # freqs_cis: (cos, sin) in real space
        cos, sin = freqs_cis
        assert cos.shape == expected, f"freqs_cis shape {cos.shape} does not match x shape {x.shape}"
        return cos.view(*shape), sin.view(*shape)

    # freqs_cis: values in complex space
    assert freqs_cis.shape == expected, f"freqs_cis shape {freqs_cis.shape} does not match x shape {x.shape}"
    return freqs_cis.view(*shape)
81
+
82
+
83
def rotate_half(x):
    """Rotate interleaved (real, imag) pairs: (a, b) -> (-b, a).

    Treats the last dimension of ``x`` as interleaved complex pairs and
    multiplies each pair by i. Expects a 4-D [B, S, H, D] tensor (``flatten(3)``
    restores the last axis); computation is done in float32.
    """
    pairs = x.float().reshape(*x.shape[:-1], -1, 2)  # [B, S, H, D//2, 2]
    real, imag = pairs.unbind(-1)
    return torch.stack((-imag, real), dim=-1).flatten(3)
86
+
87
+
88
def apply_rotary_emb(
    xq: torch.Tensor,
    xk: torch.Tensor,
    freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]],
    head_first: bool = False,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Apply rotary embeddings to input tensors using the given frequency tensor.

    This function applies rotary embeddings to the given query 'xq' and key 'xk' tensors using the provided
    frequency tensor 'freqs_cis'. The input tensors are reshaped as complex numbers, and the frequency tensor
    is reshaped for broadcasting compatibility. The resulting tensors contain rotary embeddings and are
    returned as real tensors.

    Args:
        xq (torch.Tensor): Query tensor to apply rotary embeddings. [B, S, H, D]
        xk (torch.Tensor): Key tensor to apply rotary embeddings. [B, S, H, D]
        freqs_cis (torch.Tensor or tuple): Precomputed frequency tensor for complex exponential.
            A (cos, sin) tuple selects the real-arithmetic path; a single complex
            tensor selects the complex-multiplication path.
        head_first (bool): head dimension first (except batch dim) or not.

    Returns:
        Tuple[torch.Tensor, torch.Tensor]: Tuple of modified query tensor and key tensor with rotary embeddings.

    """
    xk_out = None
    if isinstance(freqs_cis, tuple):
        cos, sin = reshape_for_broadcast(freqs_cis, xq, head_first)  # [S, D]
        cos, sin = cos.to(xq.device), sin.to(xq.device)
        # Rotation in real arithmetic (computed in float32, cast back):
        # real * cos - imag * sin
        # imag * cos + real * sin
        xq_out = (xq.float() * cos + rotate_half(xq.float()) * sin).type_as(xq)
        xk_out = (xk.float() * cos + rotate_half(xk.float()) * sin).type_as(xk)
    else:
        # view_as_complex will pack [..., D/2, 2](real) to [..., D/2](complex)
        xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))  # [B, S, H, D//2]
        freqs_cis = reshape_for_broadcast(freqs_cis, xq_, head_first).to(xq.device)  # [S, D//2] --> [1, S, 1, D//2]
        # (real, imag) * (cos, sin) = (real * cos - imag * sin, imag * cos + real * sin)
        # view_as_real will expand [..., D/2](complex) to [..., D/2, 2](real)
        xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3).type_as(xq)
        xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))  # [B, S, H, D//2]
        xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3).type_as(xk)

    return xq_out, xk_out
131
+
132
+
133
class BasicAttentionLayer(nn.Module):
    """Base class for attention layers: holds the backend mode ("flash",
    "torch", "vanilla", ...) and a determinism flag that subclasses consult
    when dispatching the attention computation."""

    def __init__(self, attn_mode="flash", deterministic=False):
        super().__init__()
        self.attn_mode = attn_mode
        self.deterministic = deterministic

    def set_attn_mode(self, new_mode):
        """Select a different attention backend at runtime."""
        self.attn_mode = new_mode

    def enable_deterministic(self):
        """Force deterministic attention kernels."""
        self.deterministic = True

    def disable_deterministic(self):
        """Allow non-deterministic (faster) attention kernels."""
        self.deterministic = False
147
+
148
+
149
# Per-mode (pre_attn, post_attn) layout adapters applied around the attention op.
# flash-attn kernels consume [b, s, heads, dim] directly, so those entries are
# identity; torch SDPA and the vanilla matmul path expect [b, heads, s, dim],
# so seq and head axes are swapped before and after.
MEMORY_LAYOUT = {
    "self_flash": (
        lambda x: x,
        lambda x: x,
    ),
    "cross_flash": (
        lambda x: x,
        lambda x: x,
    ),
    "flash_torch_sp": (
        lambda x: x,
        lambda x: x,
    ),
    "torch": (
        lambda x: x.transpose(1, 2),
        lambda x: x.transpose(1, 2),
    ),
    "vanilla": (
        lambda x: x.transpose(1, 2),
        lambda x: x.transpose(1, 2),
    ),
}
171
+
172
+
173
+ # Copyed from https://github.com/huggingface/transformers/blob/b873234cb649a24865021f0d598627ce2b24d34a/src/transformers/modeling_flash_attention_utils.py#L33C1-L57C6
174
+ def _get_unpad_data(attention_mask: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, int]:
175
+ """
176
+ Retrieves indexing data required to repad unpadded (ragged) tensors.
177
+
178
+ Arguments:
179
+ attention_mask (`torch.Tensor`):
180
+ Boolean or int tensor of shape (batch_size, sequence_length), 1 means valid and 0 means not valid.
181
+
182
+ Return:
183
+ indices (`torch.Tensor):
184
+ The indices of non-masked tokens from the flattened input sequence.
185
+ cu_seqlens (`torch.Tensor`):
186
+ The cumulative sequence lengths, used to index into ragged (unpadded) tensors. `cu_seqlens` shape is (batch_size + 1,).
187
+ max_seqlen_in_batch (`int`):
188
+ Maximum sequence length in batch.
189
+ """
190
+ seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
191
+ indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
192
+ max_seqlen_in_batch = seqlens_in_batch.max().item()
193
+ cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
194
+ return (
195
+ indices,
196
+ cu_seqlens,
197
+ max_seqlen_in_batch,
198
+ )
199
+
200
+
201
# Copied from https://github.com/huggingface/transformers/blob/b873234cb649a24865021f0d598627ce2b24d34a/src/transformers/utils/import_utils.py#L822
def is_flash_attn_greater_or_equal(library_version: str):
    """Return True iff flash_attn is installed and its version >= library_version."""
    if not _is_package_available("flash_attn"):
        return False
    installed = version.parse(importlib.metadata.version("flash_attn"))
    return installed >= version.parse(library_version)
207
+
208
+
209
def get_kv_seqlens_with_mask(attn_mask, k, v):
    """Unpad key/value tensors according to ``attn_mask`` and pack them for
    flash-attn varlen kernels.

    Args:
        attn_mask: (b, s1) validity mask for the key/value sequence.
        k, v: (b, s1, heads, dim) key and value tensors.

    Returns:
        Tuple ``(cu_seqlens_k, max_seqlen_k, kv)`` where ``kv`` is the stacked
        unpadded key/value tensor of shape (total_valid, 2, heads, dim).
    """
    indices_k, cu_seqlens_k, max_seqlen_k = _get_unpad_data(attn_mask)
    b, s1, a, d = k.shape
    # Gather only the valid tokens, then stack K and V along a new axis.
    k_flat = index_first_axis(k.reshape(b * s1, a, d), indices_k)
    v_flat = index_first_axis(v.reshape(b * s1, a, d), indices_k)
    packed_kv = torch.stack([k_flat, v_flat], dim=1)
    return cu_seqlens_k, max_seqlen_k, packed_kv
216
+
217
+
218
def get_q_seqlens(q):
    """Build cumulative sequence lengths for a fixed-length (unpadded) query batch.

    Args:
        q: (bs, s, heads, dim) query tensor; every row has the full length ``s``.

    Returns:
        Tuple ``(cu_seqlens_q, max_seqlen_q, q_flat)`` with ``q_flat`` reshaped
        to (bs * s, heads, dim) for flash-attn varlen kernels.
    """
    bs, s, a, d = q.shape
    # Uniform lengths -> cu_seqlens is just [0, s, 2s, ..., bs*s].
    cu_seqlens_q = torch.arange(0, (bs + 1) * s, step=s, dtype=torch.int32, device=q.device)
    return cu_seqlens_q, s, q.reshape(bs * s, a, d)
223
+
224
def flash_attn_no_pad(
    qkv, key_padding_mask, causal=False, dropout_p=0.0, softmax_scale=None
):
    """Run flash-attn varlen self-attention on a padded [b, s, 3, h, d] QKV tensor.

    Unpads the batch with ``key_padding_mask``, calls the packed varlen kernel,
    and repads the result back to [b, s, h, d].
    """
    # adapted from https://github.com/Dao-AILab/flash-attention/blob/13403e81157ba37ca525890f2f0f2137edf75311/flash_attn/flash_attention.py#L27
    batch_size = qkv.shape[0]
    seqlen = qkv.shape[1]
    nheads = qkv.shape[-2]
    x = rearrange(qkv, "b s three h d -> b s (three h d)")
    # unpad_input's return arity differs across flash_attn versions:
    # x_unpad, indices, cu_seqlens, max_s, used_seqlens_in_batch
    # x_unpad, indices, cu_seqlens, max_s
    unpad_results = unpad_input(
        x, key_padding_mask
    )

    if len(unpad_results) == 4:
        x_unpad, indices, cu_seqlens, max_s = unpad_results
    elif len(unpad_results) == 5:
        # newer flash_attn additionally returns used_seqlens_in_batch (unused here)
        x_unpad, indices, cu_seqlens, max_s, used_seqlens_in_batch = unpad_results
    else:
        raise ValueError

    x_unpad = rearrange(x_unpad, "nnz (three h d) -> nnz three h d", three=3, h=nheads)
    output_unpad = flash_attn_varlen_qkvpacked_func(
        x_unpad,
        cu_seqlens,
        max_s,
        dropout_p,
        softmax_scale=softmax_scale,
        causal=causal,
    )
    # Repad to the original (batch, seqlen) layout and split heads back out.
    output = rearrange(
        pad_input(
            rearrange(output_unpad, "nnz h d -> nnz (h d)"), indices, batch_size, seqlen
        ),
        "b s (h d) -> b s h d",
        h=nheads,
    )
    return output
262
+
263
+
264
def attention(
    q,
    k,
    v,
    mode,
    drop_rate=0,
    attn_mask=None,
    cond_mask=None,
    causal=False,
    deterministic=False,
    cu_seqlens=None,
    max_seqlen=None,
    cu_seqlens_k=None,
    max_seqlen_k=None,
    img_seq_len=None,
):
    """
    Perform QKV self attention.

    Args:
        q (torch.Tensor): Query tensor with shape [b, s, a, d], where a is the number of heads.
        k (torch.Tensor): Key tensor with shape [b, s1, a, d]
        v (torch.Tensor): Value tensor with shape [b, s1, a, d]
        mode (str): Attention mode. Choose from 'self_flash', 'cross_flash', 'torch', and 'vanilla'.
        drop_rate (float): Dropout rate in attention map. (default: 0)
        attn_mask (torch.Tensor): Attention mask with shape [b, s1] (cross_attn), or [b, a, s, s1] (torch or vanilla).
            (default: None)
        causal (bool): Whether to use causal attention. (default: False)
        deterministic (bool): Whether to use deterministic attention. (default: False)
        cu_seqlens (torch.Tensor): dtype torch.int32. The cumulative sequence lengths of the sequences in the batch,
            used to index into q.
        max_seqlen (int): The maximum sequence length in the batch of q.
        cu_seqlens_k (torch.Tensor): dtype torch.int32. The cumulative sequence lengths of the sequences in the batch,
            used to index into kv.
        max_seqlen_k (int): The maximum sequence length in the batch of k and v.

    Returns:
        torch.Tensor: Output tensor after self attention with shape [b, s, ad]

    NOTE(review): cond_mask and img_seq_len are currently unused in this body;
    presumably kept for interface compatibility with other attention paths.
    """
    if mode in ["torch", "vanilla", "self_flash", "cross_flash"]:
        # Tuples of segments are concatenated along the sequence axis first.
        if isinstance(q, tuple):
            q = torch.cat(q, dim=1)
        if isinstance(k, tuple):
            k = torch.cat(k, dim=1)
        if isinstance(v, tuple):
            v = torch.cat(v, dim=1)
        # Adapt memory layout to the chosen backend (see MEMORY_LAYOUT).
        pre_attn_layout, post_attn_layout = MEMORY_LAYOUT[mode]
        q = pre_attn_layout(q)
        k = pre_attn_layout(k)
        v = pre_attn_layout(v)

    if "flash" in mode:
        assert (
            flash_attn_qkvpacked_func is not None
        ), "Flash attention is not available. Please install flash_attn first."
        flash_kwargs = dict(dropout_p=drop_rate, causal=causal)
        if deterministic:
            # The `deterministic` kwarg only exists in flash_attn >= 2.4.1.
            if not is_flash_attn_greater_or_equal("2.4.1"):
                raise ValueError(
                    "Flash attention deterministic mode requires flash_attn>=2.4.1. " "Please upgrade flash_attn"
                )
            flash_kwargs["deterministic"] = deterministic

        if mode == "self_flash":
            qkv = torch.stack([q, k, v], dim=2)
            if attn_mask is not None:
                raise ValueError("Self attention does not support attention mask")
            x = flash_attn_qkvpacked_func(qkv, **flash_kwargs)

        elif mode == "cross_flash":
            kv = torch.stack([k, v], dim=2)
            if attn_mask is None:
                x = flash_attn_kvpacked_func(q, kv, **flash_kwargs)
            else:
                # Masked cross attention: unpad K/V with the mask and use the
                # varlen kernel, then restore the [b, s, a, d] shape.
                b, s, a, h = q.shape
                cu_seqlens_q, max_seqlen_q, q = get_q_seqlens(q)
                cu_seqlens_k, max_seqlen_k, kv = get_kv_seqlens_with_mask(attn_mask, k, v)

                attn_output = flash_attn_varlen_kvpacked_func(
                    q,
                    kv,
                    cu_seqlens_q=cu_seqlens_q,
                    cu_seqlens_k=cu_seqlens_k,
                    max_seqlen_q=max_seqlen_q,
                    max_seqlen_k=max_seqlen_k,
                    **flash_kwargs,
                )
                x = attn_output.reshape(b, s, a, h)
    elif mode == 'torch':
        # SDPA accepts bool masks directly; other dtypes must match q's dtype.
        if attn_mask is not None and attn_mask.dtype != torch.bool:
            attn_mask = attn_mask.to(q.dtype)
        x = F.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask, dropout_p=drop_rate, is_causal=causal)

    elif mode == "vanilla":
        # Explicit softmax(QK^T / sqrt(d)) V reference implementation.
        scale_factor = 1 / math.sqrt(q.size(-1))

        b, a, s, _ = q.shape
        s1 = k.size(2)
        attn_bias = torch.zeros(b, a, s, s1, dtype=q.dtype, device=q.device)
        if causal:
            # Only applied to self attention
            assert attn_mask is None, "Causal mask and attn_mask cannot be used together"
            temp_mask = torch.ones(b, a, s, s, dtype=torch.bool, device=q.device).tril(diagonal=0)
            attn_bias.masked_fill_(temp_mask.logical_not(), float("-inf"))
            attn_bias.to(q.dtype)

        if attn_mask is not None:
            if attn_mask.dtype == torch.bool:
                attn_bias.masked_fill_(attn_mask.logical_not(), float("-inf"))
            else:
                attn_bias += attn_mask

        # TODO(jarvizhang): Maybe force q and k to be float32 to avoid numerical overflow
        attn = (q @ k.transpose(-2, -1)) * scale_factor
        attn += attn_bias
        attn = attn.softmax(dim=-1)
        attn = torch.dropout(attn, p=drop_rate, train=True)
        x = attn @ v
    else:
        raise NotImplementedError(f"Unsupported attention mode: {mode}")

    if mode in ["torch", "vanilla", "self_flash", "cross_flash"]:
        # Restore [b, s, a, d] layout and merge the head dimension.
        x = post_attn_layout(x).contiguous()
        b, s, a, d = x.shape
        out = x.reshape(b, s, -1)
    return out
390
+
391
+
392
class SelfAttentionLayer(BasicAttentionLayer):
    """Multi-head self-attention with optional QK-normalization and RoPE.

    Q, K and V are produced by a single fused linear projection; the actual
    attention kernel is selected by the base class' ``attn_mode``.
    """

    def __init__(
        self,
        dim,
        num_heads,
        qkv_bias=True,
        qk_norm=True,
        attn_drop=0,
        proj_drop=0,
        dtype=None,
        device=None,
        norm_type="layer",
        attn_mode="self_flash",
        deterministic=False,
    ) -> None:
        factory_kwargs = {"device": device, "dtype": dtype}
        super().__init__(attn_mode, deterministic)
        assert dim % num_heads == 0, "dim must be divisible by num_heads"
        self.dim = dim
        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        self.attn_drop = attn_drop

        # Flash attention only supports head dims that are multiples of 8, up to 128.
        assert self.head_dim % 8 == 0 and self.head_dim <= 128, "Only support head_dim <= 128 and divisible by 8"

        # Fused QKV projection: one matmul emits queries, keys and values.
        self.Wqkv = nn.Linear(dim, dim * 3, bias=qkv_bias, **factory_kwargs)

        norm_cls = get_norm_layer(norm_type)
        if qk_norm:
            self.q_norm = norm_cls(self.head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs)
            self.k_norm = norm_cls(self.head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs)
        else:
            self.q_norm = nn.Identity()
            self.k_norm = nn.Identity()

        self.out_proj = nn.Linear(dim, dim, bias=qkv_bias, **factory_kwargs)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x, freqs_cis=None, attn_mask=None):
        """
        Args:
            x (torch.Tensor): (batch, seq_len, hidden_dim) (where hidden_dim = num heads * head dim)
            freqs_cis (torch.Tensor, optional): (batch, hidden_dim // 2), RoPE frequencies
            attn_mask (torch.Tensor, optional): (batch, seq_len, seq_len), mask for attention
        """
        batch, seq_len, _ = x.shape

        # Project to packed QKV and split into per-head tensors: [b, s, a, d].
        packed = self.Wqkv(x).view(batch, seq_len, 3, self.num_heads, self.head_dim)
        q, k, v = packed.unbind(dim=2)

        # QK-Norm (identity when disabled).
        q, k = self.q_norm(q), self.k_norm(k)

        # Rotary position embedding, applied to Q and K only.
        if freqs_cis is not None:
            rot_q, rot_k = apply_rotary_emb(q, k, freqs_cis)
            assert (
                rot_q.shape == q.shape and rot_k.shape == k.shape
            ), f"qq: {rot_q.shape}, q: {q.shape}, kk: {rot_k.shape}, k: {k.shape}"
            q, k = rot_q, rot_k

        # Dropout is active only in training mode.
        context = attention(
            q,
            k,
            v,
            drop_rate=self.attn_drop if self.training else 0,
            attn_mask=attn_mask,
            mode=self.attn_mode,
            deterministic=self.deterministic,
        )
        return self.proj_drop(self.out_proj(context))
471
+
472
+
473
class CrossAttentionLayer(BasicAttentionLayer):
    """Multi-head cross-attention: queries from ``x``, keys/values from ``y``.

    Keys and values come from one fused projection of the context stream; the
    attention kernel is selected by the base class' ``attn_mode``.
    """

    def __init__(
        self,
        qdim,
        kdim,
        num_heads,
        qkv_bias=True,
        qk_norm=True,
        attn_drop=0,
        proj_drop=0,
        dtype=None,
        device=None,
        norm_type="layer",
        attn_mode="cross_flash",
        deterministic=False,
    ):
        factory_kwargs = {"device": device, "dtype": dtype}
        super().__init__(attn_mode, deterministic)
        assert qdim % num_heads == 0, "qdim must be divisible by num_heads"
        self.qdim = qdim
        self.kdim = kdim
        self.num_heads = num_heads
        self.head_dim = qdim // num_heads
        self.attn_drop = attn_drop

        # Flash attention only supports head dims that are multiples of 8, up to 128.
        assert self.head_dim % 8 == 0 and self.head_dim <= 128, "Only support head_dim <= 128 and divisible by 8"

        self.q_proj = nn.Linear(qdim, qdim, bias=qkv_bias, **factory_kwargs)
        # Fused key/value projection from the context stream.
        self.kv_proj = nn.Linear(kdim, 2 * qdim, bias=qkv_bias, **factory_kwargs)

        norm_cls = get_norm_layer(norm_type)
        if qk_norm:
            self.q_norm = norm_cls(self.head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs)
            self.k_norm = norm_cls(self.head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs)
        else:
            self.q_norm = nn.Identity()
            self.k_norm = nn.Identity()

        self.out_proj = nn.Linear(qdim, qdim, bias=qkv_bias, **factory_kwargs)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x, y, attn_mask=None):
        """
        Args:
            x (torch.Tensor): (batch, seq_len, hidden_dim) query stream
            y (torch.Tensor): (batch, seq_len1, hidden_dim1) key/value stream
            attn_mask (torch.Tensor): (batch, seq_len1), mask for attention
        """
        batch, q_len, _ = x.shape
        kv_len = y.shape[1]

        q = self.q_proj(x).view(batch, q_len, self.num_heads, self.head_dim)
        packed_kv = self.kv_proj(y).view(batch, kv_len, 2, self.num_heads, self.head_dim)
        k, v = packed_kv.unbind(dim=2)

        # QK-Norm (identity when disabled).
        q, k = self.q_norm(q), self.k_norm(k)

        # Dropout is active only in training mode.
        context = attention(
            q,
            k,
            v,
            attn_mask=attn_mask,
            drop_rate=self.attn_drop if self.training else 0,
            mode=self.attn_mode,
            deterministic=self.deterministic,
        )
        return self.proj_drop(self.out_proj(context))
HunyuanVideo-Foley/hunyuanvideo_foley/models/nn/embed_layers.py ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import torch
3
+ import torch.nn as nn
4
+
5
+ from ...utils.helper import to_2tuple, to_1tuple
6
+
7
class PatchEmbed1D(nn.Module):
    """1D Audio to Patch Embedding

    A convolution based approach to patchifying a 1D audio w/ embedding projection.

    Based on the impl in https://github.com/google-research/vision_transformer

    Hacked together by / Copyright 2020 Ross Wightman
    """

    def __init__(
        self,
        patch_size=1,
        in_chans=768,
        embed_dim=768,
        norm_layer=None,
        flatten=True,
        bias=True,
        dtype=None,
        device=None,
    ):
        factory_kwargs = {"dtype": dtype, "device": device}
        super().__init__()
        patch_size = to_1tuple(patch_size)
        self.patch_size = patch_size
        self.flatten = flatten

        # Non-overlapping patches: stride equals kernel size.
        self.proj = nn.Conv1d(
            in_chans, embed_dim, kernel_size=patch_size, stride=patch_size, bias=bias, **factory_kwargs
        )
        # Xavier init over the flattened (out, in*k) weight view, as in ViT.
        nn.init.xavier_uniform_(self.proj.weight.view(self.proj.weight.size(0), -1))
        if bias:
            nn.init.zeros_(self.proj.bias)

        self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()

    def forward(self, x):
        """Project (B, C, N) audio features to patch tokens.

        Returns (B, N // patch_size, embed_dim) when ``flatten`` is True,
        otherwise (B, embed_dim, N // patch_size).
        """
        # BUGFIX: the original assertion message stated the divisibility
        # relation backwards; it is the token count that must be divisible
        # by the patch size (which is what the condition actually checks).
        assert (
            x.shape[2] % self.patch_size[0] == 0
        ), f"The token number ({x.shape[2]}) of x must be divisible by the patch_size ({self.patch_size[0]})."

        x = self.proj(x)
        if self.flatten:
            x = x.transpose(1, 2)  # BCN -> BNC
        x = self.norm(x)
        return x
53
+
54
+
55
class ConditionProjection(nn.Module):
    """
    Projects condition embeddings. Also handles dropout for classifier-free guidance.

    Adapted from https://github.com/PixArt-alpha/PixArt-alpha/blob/master/diffusion/model/nets/PixArt_blocks.py
    """

    def __init__(self, in_channels, hidden_size, act_layer, dtype=None, device=None):
        factory_kwargs = {'dtype': dtype, 'device': device}
        super().__init__()
        # Attribute names are kept for state-dict compatibility.
        self.linear_1 = nn.Linear(in_features=in_channels, out_features=hidden_size, bias=True, **factory_kwargs)
        self.act_1 = act_layer()
        self.linear_2 = nn.Linear(in_features=hidden_size, out_features=hidden_size, bias=True, **factory_kwargs)

    def forward(self, caption):
        """Two-layer projection: linear -> activation -> linear."""
        return self.linear_2(self.act_1(self.linear_1(caption)))
74
+
75
+
76
def timestep_embedding(t, dim, max_period=10000):
    """
    Create sinusoidal timestep embeddings.

    Args:
        t (torch.Tensor): a 1-D Tensor of N indices, one per batch element. These may be fractional.
        dim (int): the dimension of the output.
        max_period (int): controls the minimum frequency of the embeddings.

    Returns:
        embedding (torch.Tensor): An (N, D) Tensor of positional embeddings.

    .. ref_link: https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
    """
    half = dim // 2
    # Geometric frequency ladder from 1 down toward 1/max_period (exclusive).
    exponent = -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half
    freqs = torch.exp(exponent).to(device=t.device)
    angles = t[:, None].float() * freqs[None]
    embedding = torch.cat([torch.cos(angles), torch.sin(angles)], dim=-1)
    if dim % 2:
        # Pad odd target dims with a single zero column.
        zero_col = torch.zeros_like(embedding[:, :1])
        embedding = torch.cat([embedding, zero_col], dim=-1)
    return embedding
103
+
104
+
105
class TimestepEmbedder(nn.Module):
    """
    Embeds scalar timesteps into vector representations via sinusoidal
    features followed by a two-layer MLP.
    """

    def __init__(
        self,
        hidden_size,
        act_layer,
        frequency_embedding_size=256,
        max_period=10000,
        out_size=None,
        dtype=None,
        device=None,
    ):
        factory_kwargs = {'dtype': dtype, 'device': device}
        super().__init__()
        self.frequency_embedding_size = frequency_embedding_size
        self.max_period = max_period
        out_size = hidden_size if out_size is None else out_size

        # Attribute name `mlp` is kept for state-dict compatibility.
        self.mlp = nn.Sequential(
            nn.Linear(frequency_embedding_size, hidden_size, bias=True, **factory_kwargs),
            act_layer(),
            nn.Linear(hidden_size, out_size, bias=True, **factory_kwargs),
        )
        # Small-std init for both linear layers, as in DiT.
        nn.init.normal_(self.mlp[0].weight, std=0.02)
        nn.init.normal_(self.mlp[2].weight, std=0.02)

    def forward(self, t):
        """Map timesteps ``t`` of shape (N,) to embeddings of shape (N, out_size)."""
        target_dtype = self.mlp[0].weight.dtype
        freq_emb = timestep_embedding(t, self.frequency_embedding_size, self.max_period)
        return self.mlp(freq_emb.type(target_dtype))
HunyuanVideo-Foley/hunyuanvideo_foley/models/nn/mlp_layers.py ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Modified from timm library:
2
+ # https://github.com/huggingface/pytorch-image-models/blob/648aaa41233ba83eb38faf5ba9d415d574823241/timm/layers/mlp.py#L13
3
+
4
+ from functools import partial
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+ import torch.nn.functional as F
9
+
10
+ from .modulate_layers import modulate
11
+ from ...utils.helper import to_2tuple
12
+
13
class MLP(nn.Module):
    """MLP as used in Vision Transformer, MLP-Mixer and related networks"""

    def __init__(
        self,
        in_channels,
        hidden_channels=None,
        out_features=None,
        act_layer=nn.GELU,
        norm_layer=None,
        bias=True,
        drop=0.0,
        use_conv=False,
        device=None,
        dtype=None,
    ):
        factory_kwargs = {"device": device, "dtype": dtype}
        super().__init__()
        out_features = out_features or in_channels
        hidden_channels = hidden_channels or in_channels
        # A scalar bias/drop is broadcast to both layers.
        bias_pair = to_2tuple(bias)
        drop_pair = to_2tuple(drop)
        # 1x1 convolutions act as position-wise linear layers when requested.
        make_layer = partial(nn.Conv2d, kernel_size=1) if use_conv else nn.Linear

        self.fc1 = make_layer(in_channels, hidden_channels, bias=bias_pair[0], **factory_kwargs)
        self.act = act_layer()
        self.drop1 = nn.Dropout(drop_pair[0])
        self.norm = nn.Identity() if norm_layer is None else norm_layer(hidden_channels, **factory_kwargs)
        self.fc2 = make_layer(hidden_channels, out_features, bias=bias_pair[1], **factory_kwargs)
        self.drop2 = nn.Dropout(drop_pair[1])

    def forward(self, x):
        hidden = self.drop1(self.act(self.fc1(x)))
        return self.drop2(self.fc2(self.norm(hidden)))
52
+
53
+
54
+ # copied from https://github.com/black-forest-labs/flux/blob/main/src/flux/modules/layers.py
55
+ # only used when use_vanilla is True
56
# copied from https://github.com/black-forest-labs/flux/blob/main/src/flux/modules/layers.py
# only used when use_vanilla is True
class MLPEmbedder(nn.Module):
    """Two-layer MLP with a SiLU non-linearity (linear -> SiLU -> linear)."""

    def __init__(self, in_dim: int, hidden_dim: int, device=None, dtype=None):
        factory_kwargs = {"device": device, "dtype": dtype}
        super().__init__()
        # Attribute names are kept for state-dict compatibility.
        self.in_layer = nn.Linear(in_dim, hidden_dim, bias=True, **factory_kwargs)
        self.silu = nn.SiLU()
        self.out_layer = nn.Linear(hidden_dim, hidden_dim, bias=True, **factory_kwargs)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        hidden = self.in_layer(x)
        hidden = self.silu(hidden)
        return self.out_layer(hidden)
66
+
67
+
68
class LinearWarpforSingle(nn.Module):
    """Concatenates two sequence tensors along channels and applies one linear layer."""

    def __init__(self, in_dim: int, out_dim: int, bias=True, device=None, dtype=None):
        factory_kwargs = {"device": device, "dtype": dtype}
        super().__init__()
        # `in_dim` must equal the sum of both inputs' channel dims.
        self.fc = nn.Linear(in_dim, out_dim, bias=bias, **factory_kwargs)

    def forward(self, x, y):
        combined = torch.cat((x, y), dim=2)
        return self.fc(combined)
77
+
78
class FinalLayer1D(nn.Module):
    """Final adaLN-modulated projection from hidden states to patch outputs.

    Both the output projection and the modulation head are zero-initialized,
    so the layer initially emits zeros (standard DiT practice).
    """

    def __init__(self, hidden_size, patch_size, out_channels, act_layer, device=None, dtype=None):
        factory_kwargs = {"device": device, "dtype": dtype}
        super().__init__()

        # Plain LayerNorm; scale/shift come from the adaLN head instead of
        # learned affine parameters.
        self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, **factory_kwargs)
        self.linear = nn.Linear(hidden_size, patch_size * out_channels, bias=True, **factory_kwargs)
        nn.init.zeros_(self.linear.weight)
        nn.init.zeros_(self.linear.bias)

        # Simple shift/scale modulation head (no modulate-type distinction here).
        self.adaLN_modulation = nn.Sequential(
            act_layer(), nn.Linear(hidden_size, 2 * hidden_size, bias=True, **factory_kwargs)
        )
        # Zero-initialize the modulation head.
        nn.init.zeros_(self.adaLN_modulation[1].weight)
        nn.init.zeros_(self.adaLN_modulation[1].bias)

    def forward(self, x, c):
        """Modulate normalized ``x`` with conditioning ``c``, then project."""
        shift, scale = self.adaLN_modulation(c).chunk(2, dim=-1)
        modulated = modulate(self.norm_final(x), shift=shift, scale=scale)
        return self.linear(modulated)
102
+
103
+
104
class ChannelLastConv1d(nn.Conv1d):
    """Conv1d wrapper that accepts channel-last input of shape (B, T, C)."""

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # (B, T, C) -> (B, C, T) for the convolution, then back to channel-last.
        out = super().forward(x.transpose(1, 2))
        return out.transpose(1, 2)
111
+
112
+
113
class ConvMLP(nn.Module):
    """SwiGLU-style feed-forward block built from channel-last 1D convolutions."""

    def __init__(
        self,
        dim: int,
        hidden_dim: int,
        multiple_of: int = 256,
        kernel_size: int = 3,
        padding: int = 1,
        device=None,
        dtype=None,
    ):
        """
        Convolutional MLP module.

        Args:
            dim (int): Input dimension.
            hidden_dim (int): Hidden dimension of the feedforward layer.
            multiple_of (int): Value to ensure hidden dimension is a multiple of this value.

        Attributes:
            w1: Gate branch convolution.
            w2: Output projection convolution.
            w3: Value branch convolution.
        """
        factory_kwargs = {"device": device, "dtype": dtype}
        super().__init__()
        # SwiGLU convention: shrink to 2/3, then round up to a multiple.
        hidden_dim = int(2 * hidden_dim / 3)
        hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)

        conv_kwargs = dict(bias=False, kernel_size=kernel_size, padding=padding, **factory_kwargs)
        self.w1 = ChannelLastConv1d(dim, hidden_dim, **conv_kwargs)
        self.w2 = ChannelLastConv1d(hidden_dim, dim, **conv_kwargs)
        self.w3 = ChannelLastConv1d(dim, hidden_dim, **conv_kwargs)

    def forward(self, x):
        gated = F.silu(self.w1(x)) * self.w3(x)
        return self.w2(gated)
HunyuanVideo-Foley/hunyuanvideo_foley/models/nn/modulate_layers.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Callable
2
+ import torch
3
+ import torch.nn as nn
4
+
5
class ModulateDiT(nn.Module):
    """adaLN modulation head: activation followed by a linear expansion.

    The linear layer is zero-initialized so modulation starts as a no-op.
    """

    def __init__(self, hidden_size: int, factor: int, act_layer: Callable, dtype=None, device=None):
        factory_kwargs = {"dtype": dtype, "device": device}
        super().__init__()
        self.act = act_layer()
        self.linear = nn.Linear(hidden_size, factor * hidden_size, bias=True, **factory_kwargs)
        # Zero-initialize the modulation
        nn.init.zeros_(self.linear.weight)
        nn.init.zeros_(self.linear.bias)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        activated = self.act(x)
        return self.linear(activated)
17
+
18
+
19
def modulate(x, shift=None, scale=None):
    """Apply adaLN-style affine modulation: ``x * (1 + scale) + shift``.

    Args:
        x (torch.Tensor): input of shape (B, D) or (B, T, D).
        shift (torch.Tensor, optional): shift of shape (B, D) (broadcast over T)
            or already broadcastable to ``x``.
        scale (torch.Tensor, optional): scale of shape (B, D) (broadcast over T)
            or already broadcastable to ``x``.

    Returns:
        torch.Tensor: modulated tensor with the same shape as ``x``.
    """
    if x.ndim == 3:
        # Broadcast per-sample (B, D) parameters over the sequence dimension.
        # BUGFIX: previously a shift/scale whose ndim was not 2 (e.g. already
        # (B, 1, D)) was replaced with None, silently dropping the modulation.
        if shift is not None and shift.ndim == 2:
            shift = shift.unsqueeze(1)
        if scale is not None and scale.ndim == 2:
            scale = scale.unsqueeze(1)
    if scale is None and shift is None:
        return x
    if shift is None:
        return x * (1 + scale)
    if scale is None:
        return x + shift
    return x * (1 + scale) + shift
31
+
32
+
33
def apply_gate(x, gate=None, tanh=False):
    """Scale ``x`` by a gating tensor, optionally squashing the gate with tanh.

    A (B, D) gate is broadcast over the sequence dimension of a (B, T, D) input.
    Returns ``x`` unchanged when no gate is given.
    """
    if gate is None:
        return x
    if gate.ndim == 2 and x.ndim == 3:
        # Broadcast per-sample gates over the sequence dimension.
        gate = gate.unsqueeze(1)
    return x * (gate.tanh() if tanh else gate)
42
+
43
+
44
def ckpt_wrapper(module):
    """Wrap ``module``'s call in a plain function (positional args only),
    e.g. for use with torch.utils.checkpoint-style APIs."""

    def ckpt_forward(*inputs):
        return module(*inputs)

    return ckpt_forward
HunyuanVideo-Foley/hunyuanvideo_foley/models/nn/norm_layers.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+
4
class RMSNorm(nn.Module):
    """Root-mean-square layer normalization (no mean subtraction).

    Normalizes over the last dimension: ``x / sqrt(mean(x^2) + eps)``,
    optionally scaled by a learnable per-channel weight.
    """

    def __init__(self, dim: int, elementwise_affine=True, eps: float = 1e-6,
                 device=None, dtype=None):
        """
        Args:
            dim (int): The dimension of the input tensor.
            elementwise_affine (bool): Whether to learn a per-channel scale.
            eps (float): Small value added to the denominator for stability.
        """
        factory_kwargs = {'device': device, 'dtype': dtype}
        super().__init__()
        self.eps = eps
        if elementwise_affine:
            # The parameter only exists in the affine variant; forward()
            # detects it via hasattr.
            self.weight = nn.Parameter(torch.ones(dim, **factory_kwargs))

    def _norm(self, x):
        """Divide ``x`` by the RMS of its last dimension."""
        inv_rms = torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
        return x * inv_rms

    def forward(self, x):
        """Normalize ``x`` (computed in float32, cast back to input dtype)."""
        normalized = self._norm(x.float()).type_as(x)
        if hasattr(self, "weight"):
            normalized = normalized * self.weight
        return normalized
53
+
54
+
55
def get_norm_layer(norm_layer):
    """
    Get the normalization layer class for a norm-type name.

    Args:
        norm_layer (str): The type of normalization layer ("layer" or "rms").

    Returns:
        norm_layer (nn.Module): The normalization layer class.

    Raises:
        NotImplementedError: for unknown names.
    """
    # RMSNorm is referenced lazily so the "layer" path has no dependency on it.
    if norm_layer == "rms":
        return RMSNorm
    if norm_layer == "layer":
        return nn.LayerNorm
    raise NotImplementedError(f"Norm layer {norm_layer} is not implemented")
HunyuanVideo-Foley/hunyuanvideo_foley/models/nn/posemb_layers.py ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from typing import Union, Tuple
3
+
4
+
5
+ def _to_tuple(x, dim=2):
6
+ if isinstance(x, int):
7
+ return (x,) * dim
8
+ elif len(x) == dim:
9
+ return x
10
+ else:
11
+ raise ValueError(f"Expected length {dim} or int, but got {x}")
12
+
13
+
14
def get_meshgrid_nd(start, *args, dim=2):
    """
    Get n-D meshgrid with start, stop and num.

    Args:
        start (int or tuple): If len(args) == 0, start is num; If len(args) == 1, start is start, args[0] is stop,
            step is 1; If len(args) == 2, start is start, args[0] is stop, args[1] is num. For n-dim, start/stop/num
            should be int or n-tuple. If n-tuple is provided, the meshgrid will be stacked following the dim order in
            n-tuples.
        *args: See above.
        dim (int): Dimension of the meshgrid. Defaults to 2.

    Returns:
        grid (torch.Tensor): [dim, ...]
    """
    if len(args) == 0:
        # `start` is the grid size; span [0, size) per axis.
        num = _to_tuple(start, dim=dim)
        start = (0,) * dim
        stop = num
    elif len(args) == 1:
        # `start`..`args[0]` with an implicit step of 1.
        start = _to_tuple(start, dim=dim)
        stop = _to_tuple(args[0], dim=dim)
        num = [stop[i] - start[i] for i in range(dim)]
    elif len(args) == 2:
        start = _to_tuple(start, dim=dim)   # Left-Top      e.g. 12,0
        stop = _to_tuple(args[0], dim=dim)  # Right-Bottom  e.g. 20,32
        num = _to_tuple(args[1], dim=dim)   # Target Size   e.g. 32,124
    else:
        raise ValueError(f"len(args) should be 0, 1 or 2, but got {len(args)}")

    # PyTorch equivalent of np.linspace(start[i], stop[i], num[i], endpoint=False).
    axes = [
        torch.linspace(start[i], stop[i], num[i] + 1, dtype=torch.float32)[: num[i]]
        for i in range(dim)
    ]
    mesh = torch.meshgrid(*axes, indexing="ij")  # dim x [W, H, D]
    return torch.stack(mesh, dim=0)  # [dim, W, H, D]
57
+
58
+
59
+ #################################################################################
60
+ # Rotary Positional Embedding Functions #
61
+ #################################################################################
62
+ # https://github.com/meta-llama/llama/blob/be327c427cc5e89cc1d3ab3d3fec4484df771245/llama/model.py#L80
63
+
64
+
65
def get_nd_rotary_pos_embed(
    rope_dim_list, start, *args, theta=10000.0, use_real=False, theta_rescale_factor=1.0, freq_scaling=1.0
):
    """
    This is a n-d version of precompute_freqs_cis, which is a RoPE for tokens with n-d structure.

    Args:
        rope_dim_list (list of int): Dimension of each rope. len(rope_dim_list) should equal to n.
            sum(rope_dim_list) should equal to head_dim of attention layer.
        start (int | tuple of int | list of int): If len(args) == 0, start is num; If len(args) == 1, start is start,
            args[0] is stop, step is 1; If len(args) == 2, start is start, args[0] is stop, args[1] is num.
        *args: See above.
        theta (float): Scaling factor for frequency computation. Defaults to 10000.0.
        use_real (bool): If True, return real part and imaginary part separately. Otherwise, return complex numbers.
        theta_rescale_factor (float): Rescale factor for theta. Defaults to 1.0.
        freq_scaling (float, optional): Frequency rescale factor, proposed in mmaudio. Defaults to 1.0.

    Returns:
        pos_embed (torch.Tensor): [HW, D/2]
    """
    ndim = len(rope_dim_list)
    grid = get_meshgrid_nd(start, *args, dim=ndim)  # [3, W, H, D] / [2, W, H]

    # Each grid axis is encoded with its own share of the head dimension.
    per_axis = [
        get_1d_rotary_pos_embed(
            rope_dim_list[i],
            grid[i].reshape(-1),
            theta,
            use_real=use_real,
            theta_rescale_factor=theta_rescale_factor,
            freq_scaling=freq_scaling,
        )
        for i in range(ndim)
    ]  # each: 2 x [WHD, rope_dim_list[i]] or [WHD, rope_dim_list[i]/2]

    if use_real:
        cos = torch.cat([emb[0] for emb in per_axis], dim=1)  # (WHD, D/2)
        sin = torch.cat([emb[1] for emb in per_axis], dim=1)  # (WHD, D/2)
        return cos, sin
    return torch.cat(per_axis, dim=1)  # (WHD, D/2)
110
+
111
+
112
def get_1d_rotary_pos_embed(
    dim: int,
    pos: Union[torch.FloatTensor, int],
    theta: float = 10000.0,
    use_real: bool = False,
    theta_rescale_factor: float = 1.0,
    freq_scaling: float = 1.0,
) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
    """
    Precompute the frequency tensor for complex exponential (cis) with given dimensions.
    (Note: `cis` means `cos + i * sin`, where i is the imaginary unit.)

    Args:
        dim (int): Dimension of the frequency tensor.
        pos (int or torch.FloatTensor): Position indices for the frequency tensor. [S] or scalar.
        theta (float, optional): Scaling factor for frequency computation. Defaults to 10000.0.
        use_real (bool, optional): If True, return real part and imaginary part separately.
            Otherwise, return complex numbers.
        theta_rescale_factor (float, optional): Rescale factor for theta. Defaults to 1.0.
        freq_scaling (float, optional): Frequency rescale factor, proposed in mmaudio. Defaults to 1.0.

    Returns:
        freqs_cis: Precomputed frequency tensor with complex exponential. [S, D/2]
        freqs_cos, freqs_sin: Precomputed frequency tensor with real and imaginary parts separately. [S, D]
    """
    if isinstance(pos, int):
        pos = torch.arange(pos).float()

    # NTK-aware rescaling (reddit user bloc97): rescale rotary embeddings to
    # longer sequence lengths without fine-tuning.
    # https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/
    if theta_rescale_factor != 1.0:
        theta *= theta_rescale_factor ** (dim / (dim - 1))

    exponents = torch.arange(0, dim, 2)[: (dim // 2)].float() / dim
    freqs = freq_scaling * (1.0 / (theta ** exponents))  # [D/2]
    angles = torch.outer(pos, freqs)  # [S, D/2]

    if not use_real:
        # Unit-magnitude complex rotations: cos + i*sin.
        return torch.polar(torch.ones_like(angles), angles)  # complex64 [S, D/2]

    freqs_cos = angles.cos().repeat_interleave(2, dim=1)  # [S, D]
    freqs_sin = angles.sin().repeat_interleave(2, dim=1)  # [S, D]
    return freqs_cos, freqs_sin
HunyuanVideo-Foley/hunyuanvideo_foley/models/synchformer/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from .synchformer import Synchformer
HunyuanVideo-Foley/hunyuanvideo_foley/models/synchformer/ast_model.py ADDED
@@ -0,0 +1,289 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+
3
+ import torch
4
+ from transformers.modeling_outputs import BaseModelOutputWithPooling
5
+
6
+ from .modeling_ast import ASTForAudioClassification, ASTConfig
7
+ from .motionformer import AveragePooling, BaseEncoderLayer, TemporalTransformerEncoderLayer
8
+ from .utils import check_if_file_exists_else_download
9
+
10
+
11
+ class AST(torch.nn.Module):
12
+ def __init__(
13
+ self,
14
+ extract_features: bool = False,
15
+ ckpt_path: str = None,
16
+ feat_type: str = None,
17
+ max_spec_t: int = None,
18
+ factorize_freq_time: bool = None,
19
+ agg_freq_module: str = None,
20
+ agg_time_module: str = None,
21
+ add_global_repr: bool = True,
22
+ agg_segments_module: str = None,
23
+ max_segments: int = None,
24
+ ) -> None:
25
+ """
26
+ extract_features: if True, then the model will return the features instead of head's output
27
+ ckpt_path: is not a path to a ckpt file, but a name of a model from the HuggingFace model hub.
28
+ feat_type: if extract_features is True, this parameter specifies the type of features to return
29
+ max_spec_t: if specified, then the model (pos emb) will be patched to support this length of spec
30
+ factorize_freq_time: if True, then the model will use a factorized freq/time aggregation
31
+ agg_freq_module: if specified, then the model will use this module for freq aggregation
32
+ agg_time_module: if specified, then the model will use this module for time aggregation
33
+ add_global_repr: if True, adds a global representation to the features (aggregation on segments)
34
+ agg_segments_module: if specified, then the model will use this module for segments aggregation
35
+ max_segments: if specified, the initialization of PE in the global agg module will use this value.
36
+ This should correspond to the max number of segments per video (if None, 16 is used)
37
+ """
38
+ super().__init__()
39
+ self.extract_features = extract_features
40
+ self.ckpt_path = ckpt_path
41
+ self.max_spec_t = max_spec_t
42
+ self.max_segments = max_segments
43
+
44
+ # depending on whether the feat extractor was pre-trained contrastively or not, we need to
45
+ # load the state dict differently.
46
+
47
+ # if ckpt is specified, then load the model from the HuggingFace model hub, otherwise init a new model
48
+ if ckpt_path == "MIT/ast-finetuned-audioset-10-10-0.4593":
49
+ revision = "c1c0c66" # fixing the revision for compatibility (V4.27.4)
50
+ self.config = ASTConfig.from_pretrained(ckpt_path, revision=revision)
51
+ full_model = ASTForAudioClassification.from_pretrained(ckpt_path, revision=revision)
52
+ logging.info(f"Loaded AST from {ckpt_path}")
53
+ else:
54
+ self.config = ASTConfig()
55
+ self.config.num_labels = 527 # 2 by default, audioset has 527 labels
56
+ full_model = ASTForAudioClassification(self.config)
57
+ logging.info("Initialized AST from scratch with the AST AudioSet config")
58
+
59
+ was_pt_on_avclip = ckpt_path is not None and ckpt_path.endswith(".pt")
60
+
61
+ # feature extractor
62
+ self.ast = full_model.audio_spectrogram_transformer
63
+
64
+ if self.extract_features:
65
+ # assign `feat_type` (use default if not specified)
66
+ self.feat_type = "last_hidden_state" if feat_type is None else feat_type
67
+ # define adapters if needed
68
+ self.factorize_freq_time = factorize_freq_time
69
+ # avoiding code duplication (used only if agg_*_module is TransformerEncoderLayer)
70
+ transf_enc_layer_kwargs = dict(
71
+ d_model=self.config.hidden_size,
72
+ nhead=self.config.num_attention_heads,
73
+ dim_feedforward=self.config.intermediate_size,
74
+ activation=torch.nn.GELU(),
75
+ batch_first=True,
76
+ dropout=self.config.attention_probs_dropout_prob,
77
+ layer_norm_eps=1e-6,
78
+ norm_first=True,
79
+ )
80
+ if factorize_freq_time:
81
+ self.feat_type = "last_hidden_state" # this feat_type supports factorization
82
+ # frequency aggreration
83
+ if agg_freq_module == "TransformerEncoderLayer":
84
+ self.freq_attn_agg = FrequencyTransformerEncoderLayer(**transf_enc_layer_kwargs)
85
+ elif agg_freq_module == "AveragePooling":
86
+ self.freq_attn_agg = AveragePooling(
87
+ avg_pattern="BS D f t -> BS D t", then_permute_pattern="BS D t -> BS t D"
88
+ )
89
+ # time aggreration
90
+ if agg_time_module == "TransformerEncoderLayer":
91
+ self.temp_attn_agg = TemporalTransformerEncoderLayer(**transf_enc_layer_kwargs)
92
+ elif agg_time_module == "AveragePooling":
93
+ self.temp_attn_agg = AveragePooling(avg_pattern="BS t D -> BS D")
94
+ elif "Identity" in agg_time_module:
95
+ self.temp_attn_agg = torch.nn.Identity()
96
+ # define a global aggregation layer (aggregarate over segments)
97
+ self.add_global_repr = add_global_repr
98
+ if add_global_repr:
99
+ if agg_segments_module == "TransformerEncoderLayer":
100
+ # we can reuse the same layer as for temporal factorization (B, dim_to_agg, D) -> (B, D)
101
+ # we need to add pos emb (PE) because previously we added the same PE for each segment
102
+ pos_max_len = max_segments if max_segments is not None else 16 # 16 = 10sec//0.64sec + 1
103
+ self.global_attn_agg = TemporalTransformerEncoderLayer(
104
+ add_pos_emb=True,
105
+ pos_emb_drop=self.config.hidden_dropout_prob,
106
+ pos_max_len=pos_max_len,
107
+ **transf_enc_layer_kwargs,
108
+ )
109
+ elif agg_segments_module == "AveragePooling":
110
+ self.global_attn_agg = AveragePooling(avg_pattern="B S D -> B D")
111
+ else:
112
+ self.classifier = full_model.classifier
113
+
114
+ # AST.device fails with AttributeError. This is a workaround
115
+ self.device = full_model.device
116
+
117
+ # pre-trained on 12*101+2=1214 tokens, but we have less (e.g. 12*6+2=74)
118
+ self.patch_position_emb()
119
+
120
+ if was_pt_on_avclip:
121
+ # we need to filter out the state_dict of the AVCLIP model (has both A and V extractors)
122
+ # and keep only the state_dict of the feat extractor
123
+ check_if_file_exists_else_download(self.ckpt_path)
124
+ ckpt = torch.load(ckpt_path, map_location="cpu")
125
+ ckpt_weights = dict()
126
+ for k, v in ckpt["state_dict"].items():
127
+ if k.startswith(("module.a_encoder.", "a_encoder.")):
128
+ k = k.replace("module.", "").replace("a_encoder.", "")
129
+ ckpt_weights[k] = v
130
+ _load_status = self.load_state_dict(ckpt_weights, strict=False)
131
+ if len(_load_status.missing_keys) > 0 or len(_load_status.unexpected_keys) > 0:
132
+ logging.warning(
133
+ f"Loading exact afeat_extractor ckpt from {self.ckpt_path} failed. \n"
134
+ f"Missing keys ({len(_load_status.missing_keys)}): "
135
+ f"{_load_status.missing_keys}, \n"
136
+ f"Unexpected keys ({len(_load_status.unexpected_keys)}): "
137
+ f"{_load_status.unexpected_keys} \n"
138
+ f"temp_attn_agg are expected to be missing if ckpt was pt contrastively."
139
+ )
140
+ else:
141
+ logging.info(f"Loading afeat_extractor ckpt from {self.ckpt_path} succeeded.")
142
+
143
+ # print the number of parameters
144
+ logging.info(f"AST: {sum(p.numel() for p in self.parameters() if p.requires_grad):,}")
145
+
146
+ def forward(
147
+ self, x: torch.Tensor, for_loop: bool = False, cont_mask: torch.Tensor = None, **ast_kwargs
148
+ ) -> torch.Tensor:
149
+ """
150
+ x: (B, S, T, F) where S is number of segments, F is number of (mel) frequency bins,
151
+ ast_kwargs: additional arguments for the AST model
152
+ cont_mask: (B, S, T, F) where 0s are the values to be masked out
153
+ if `for_loop=True`, we use a for loop to extract features for each segment separately.
154
+ if `for_loop=False`, we extract features for all segments at once.
155
+ Using the for loop is slower but more memory efficient, while using all segments at once
156
+ is faster but more memory inefficient.
157
+ Using for loop allows to control the memory footprint by varying the number of videos in a
158
+ batch (batch size) rather than the number of segments in a video.
159
+ """
160
+ B, S, T, F = x.shape
161
+
162
+ if for_loop:
163
+ assert cont_mask is None, "cont_mask is not supported with for_loop=True"
164
+ orig_shape_s = (B, 1, T, F)
165
+ # NOTE: since x is (B, S, T, F), and forward_segments expects (BS, T, F).
166
+ # (B, S, T, F)[:, s] is (B, T, F) or (BS, T, F) if S=1.
167
+ x = torch.cat(
168
+ [self.forward_segments(x[:, s], orig_shape_s, **ast_kwargs).unsqueeze(1) for s in range(S)], dim=1
169
+ )
170
+ else:
171
+ orig_shape = (B, S, T, F)
172
+ x = x.view(B * S, T, F)
173
+ if cont_mask is not None:
174
+ cont_mask = cont_mask.reshape(B * S, T, F)
175
+ # AST expects a tensor of shape (B*S, T, F).
176
+ x = self.forward_segments(x, orig_shape=orig_shape, cont_mask=cont_mask, **ast_kwargs)
177
+ # unpack the segments (using rest dimensions to support different shapes e.g. (BS, D) or (BS, t, D))
178
+ x = x.view(B, S, *x.shape[1:])
179
+ # x now is of shape (B, S, D) or (B, S, t, D) if `self.temp_attn_agg` is `Identity`
180
+
181
+ global_x = None
182
+ if self.extract_features and self.add_global_repr: # lazy execution, throws AttributeError
183
+ assert len(x.shape) == 3, f"Local representation should be (B, S, D) {x.shape}"
184
+ global_x = self.global_attn_agg(x) # (B, D)
185
+
186
+ return x, global_x # x is (B, S, ...), global_x is (B, D) or None
187
+
188
    def forward_segments(self, x, orig_shape: tuple, cont_mask: torch.Tensor = None, **ast_kwargs):
        """x is (BS, T, F), where S is the number of segments; cont_mask is (BS, T, F): 0s to be masked out"""
        # 'pooler_output': (B, D); or 'last_hidden_state': (B, T, D) where T is [CLS, DISTILL, <tokens>]
        # x_mask is (B, T) where 0s are the values to be masked out
        x, x_mask = self.ast(x, cont_mask=cont_mask, **ast_kwargs)

        if self.extract_features:
            # pick the configured representation (pooler / CLS / token sequence) from the AST output
            x = self.get_features_by_type(x)
            if self.factorize_freq_time:
                x = self.restore_freq_temp_dims(x, orig_shape)  # (BS, D, f, t) <- (B*S, T, D)
                if cont_mask is not None:
                    # duplicating the mask for the latent dimension (D) to be compatible with the next func
                    x_mask = x_mask.unsqueeze(-1).expand(-1, -1, self.config.hidden_size)
                    x_mask = self.restore_freq_temp_dims(x_mask, orig_shape)  # (BS, D, f, t) <- (B*S, T, D)
                    # again removing the latent
                    x_mask = x_mask[:, 0, :, :]
                else:
                    x_mask = None
                x = self.freq_attn_agg(x, x_mask)  # (BS, t, D)
                # NOTE(review): `temp_attn_agg` is only defined when factorize_freq_time is True
                # (see __init__), so it is applied inside this branch — confirm against upstream.
                x = self.temp_attn_agg(x)  # (BS, D) or (BS, t, D) if self.temp_attn_agg is Identity
        else:
            # classification path: pooled representation -> class logits
            x = x["pooler_output"]
            x = self.classifier(x)
        return x
212
+
213
+ def get_features_by_type(self, x: BaseModelOutputWithPooling) -> torch.Tensor:
214
+ if self.feat_type == "pooler_output":
215
+ return x["pooler_output"] # (B, D)
216
+ elif self.feat_type == "CLS":
217
+ return x["last_hidden_state"][:, 0, :] # (B, D)
218
+ elif self.feat_type == "last_hidden_state":
219
+ return x["last_hidden_state"] # (B, 2+T, D)
220
+ elif self.feat_type == "last_hidden_state_no_AUX":
221
+ return x["last_hidden_state"][:, 2:, :] # (B, T, D) removing CLS and distill tokens
222
+ else:
223
+ raise ValueError(f"Unknown feature type: {self.feat_type}")
224
+
225
+ def restore_freq_temp_dims(self, feats, orig_shape: tuple):
226
+ """
227
+ feats are of shape (B*S, T, D)
228
+ where T = 2 + f * t (if feat_type == 'last_hidden_state')
229
+ where T = f * t (if feat_type == 'last_hidden_state_no_AUX')
230
+ Our goal is to make them of shape (B*S, f, t, D) where f and t are dimensions after patching.
231
+ From `self.ast.embeddings.patch_embeddings`, it follows that we could reshape feats:
232
+ `feats.transpose(1, 2).view(B*S, D, f, t)`
233
+
234
+ (Similar function is defined in for RGB features in `motionformer.py`)
235
+ """
236
+ B, S, T, F = orig_shape
237
+ D = self.config.hidden_size
238
+
239
+ # num patches in each dimension
240
+ f, t = self.ast.embeddings.get_shape(self.config)
241
+
242
+ if self.feat_type == "last_hidden_state":
243
+ feats = feats[:, 2:, :] # removing CLS and distill tokens
244
+
245
+ feats = feats.permute(0, 2, 1) # (B*S, D, T)
246
+ feats = feats.view(B * S, D, f, t) # (B*S, D, f, t)
247
+
248
+ return feats
249
+
250
+ def patch_position_emb(self):
251
+ if self.max_spec_t is not None:
252
+ self.config.max_length = self.max_spec_t
253
+ f, t = self.ast.embeddings.get_shape(self.config)
254
+ shortened = self.ast.embeddings.position_embeddings[:, : f * t + 2].clone() # +2 for CLS and distill tokens
255
+ self.ast.embeddings.position_embeddings = torch.nn.Parameter(shortened).to(self.device)
256
+
257
    def to(self, device):
        """AST.device fails with AttributeError. This is a workaround.

        Keeps the cached `self.device` attribute in sync with the module's
        actual placement whenever the model is moved.
        """
        self.device = torch.device(device)
        return super().to(device)
261
+
262
+
263
class FrequencyTransformerEncoderLayer(BaseEncoderLayer):
    """Aggregates features along the frequency axis.

    Follows the same logic as spatio-temporal aggregation in the visual
    feature extractor; see `BaseEncoderLayer` in `motionformer.py`.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def forward(self, x: torch.Tensor, x_mask: torch.Tensor = None) -> torch.Tensor:
        """x: (B*S, D, f, t); if specified x_mask (B*S, f, t), 0s are the values to be masked out"""
        batch_seg, dim, n_freq, n_time = x.shape

        # fold time into the batch axis so attention runs over frequency only
        # (.reshape because the permuted tensor is non-contiguous)
        x = x.permute(0, 3, 2, 1).reshape(batch_seg * n_time, n_freq, dim)
        if x_mask is not None:
            x_mask = x_mask.permute(0, 2, 1).reshape(batch_seg * n_time, n_freq)

        # BaseEncoderLayer.forward prepends a CLS token and returns its representation
        x = super().forward(x=x, x_mask=x_mask)  # (B*S*t, D)

        # unfold time back out of the batch axis
        return x.view(batch_seg, n_time, dim)  # (B*S, t, D)
HunyuanVideo-Foley/hunyuanvideo_foley/models/synchformer/compute_desync_score.py ADDED
@@ -0,0 +1,214 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import argparse
import os
import subprocess
from pathlib import Path

import torch
import torchaudio
import torchvision
from omegaconf import OmegaConf

import data_transforms
from .synchformer import Synchformer
from .data_transforms import make_class_grid, quantize_offset
from .utils import check_if_file_exists_else_download, which_ffmpeg
14
+
15
+
16
def prepare_inputs(batch, device):
    """Move the audio and video tensors of a collated batch onto `device`."""
    return batch["audio"].to(device), batch["video"].to(device)
21
+
22
+
23
def get_test_transforms():
    """Assemble the deterministic test-time transform pipeline.

    Video is cropped/segmented into the layout the model was trained on;
    audio is converted to a log-mel spectrogram normalized with AudioSet
    statistics. The order of the steps matters.
    """
    pipeline = [
        data_transforms.EqualifyFromRight(),
        data_transforms.RGBSpatialCrop(input_size=224, is_random=False),
        data_transforms.TemporalCropAndOffset(
            crop_len_sec=5,
            # https://a3s.fi/swift/v1/AUTH_a235c0f452d648828f745589cde1219a/sync/sync_models/24-01-04T16-39-21/cfg-24-01-04T16-39-21.yaml
            max_off_sec=2,
            max_wiggle_sec=0.0,
            do_offset=True,
            offset_type="grid",
            # NOTE(review): "null" here is a string, not None — presumably interpreted downstream; verify
            prob_oos="null",
            grid_size=21,
            segment_size_vframes=16,
            n_segments=14,
            step_size_seg=0.5,
            vfps=25,
        ),
        data_transforms.GenerateMultipleSegments(
            segment_size_vframes=16,
            n_segments=14,
            is_start_random=False,
            step_size_seg=0.5,
        ),
        data_transforms.RGBToHalfToZeroOne(),
        # motionformer normalization
        data_transforms.RGBNormalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
        data_transforms.AudioMelSpectrogram(
            sample_rate=16000,
            win_length=400,  # 25 ms * 16 kHz
            hop_length=160,  # 10 ms * 16 kHz
            n_fft=1024,  # 2^(ceil(log2(window_size * sampling_rate)))
            n_mels=128,  # as in AST
        ),
        data_transforms.AudioLog(),
        data_transforms.PadOrTruncate(max_spec_t=66),
        # AST, pre-trained on AudioSet
        data_transforms.AudioNormalizeAST(mean=-4.2677393, std=4.5689974),
        data_transforms.PermuteStreams(
            einops_order_audio="S F T -> S 1 F T", einops_order_rgb="S T C H W -> S T C H W"  # same
        ),
    ]
    return torchvision.transforms.Compose(pipeline)
65
+
66
+
67
def get_video_and_audio(path, get_meta=False, start_sec=0, end_sec=None):
    """Load RGB frames and a mono audio track from a video file.

    Returns:
        rgb: (Tv, 3, H, W) uint8 in [0, 255]
        audio: (Ta,) — channels averaged to mono
        meta: dict with video fps and audio framerate (legacy layout)
    """
    # (Tv, 3, H, W) [0, 255, uint8]; (Ca, Ta)
    rgb, audio, raw_meta = torchvision.io.read_video(str(path), start_sec, end_sec, "sec", output_format="TCHW")
    assert raw_meta["video_fps"], f"No video fps for {path}"
    # collapse channels to mono: (Ta) <- (Ca, Ta)
    audio = audio.mean(dim=0)
    # FIXME: this is legacy format of `meta` as it used to be loaded by VideoReader.
    legacy_meta = {
        "video": {"fps": [raw_meta["video_fps"]]},
        "audio": {"framerate": [raw_meta["audio_fps"]]},
    }
    return rgb, audio, legacy_meta
80
+
81
+
82
def reencode_video(path, vfps=25, afps=16000, in_size=256):
    """Re-encode `path` to fixed video fps, audio rate and minimum side length.

    Writes the re-encoded .mp4 (and a sibling 16-bit mono .wav) under ./vis/
    and returns the new .mp4 path. Requires ffmpeg on PATH.
    """
    assert which_ffmpeg() != "", "Is ffmpeg installed? Check if the conda environment is activated."
    new_path = Path.cwd() / "vis" / f"{Path(path).stem}_{vfps}fps_{in_size}side_{afps}hz.mp4"
    new_path.parent.mkdir(exist_ok=True)
    new_path = str(new_path)
    # 1) change fps, 2) resize: min(H,W)=in_size (vertical vids are supported),
    # 3) crop to even dimensions (required by common codecs), 4) change audio framerate.
    # Arguments are passed as a list (not a split shell string) so paths with spaces survive;
    # the filter expression strings are identical to the previous shell form.
    vf_expr = (
        f"fps={vfps},scale=iw*{in_size}/'min(iw,ih)':ih*{in_size}/'min(iw,ih)',"
        "crop='trunc(iw/2)'*2:'trunc(ih/2)'*2"
    )
    subprocess.call([
        which_ffmpeg(),
        "-hide_banner", "-loglevel", "panic",  # no info/error printing
        "-y", "-i", str(path),
        "-vf", vf_expr,
        "-ar", str(afps),
        new_path,
    ])
    # extract a 16-bit mono wav next to the mp4
    subprocess.call([
        which_ffmpeg(),
        "-hide_banner", "-loglevel", "panic",
        "-y", "-i", new_path,
        "-acodec", "pcm_s16le", "-ac", "1",
        new_path.replace(".mp4", ".wav"),
    ])
    return new_path
103
+
104
+
105
def decode_single_video_prediction(off_logits, grid, item):
    """Print the ground-truth offset and the top-k predicted offsets; return per-class probs."""
    label = item["targets"]["offset_label"].item()
    print("Ground Truth offset (sec):", f"{label:.2f} ({quantize_offset(grid, label)[-1].item()})")
    print()
    print("Prediction Results:")
    off_probs = torch.softmax(off_logits, dim=-1)
    k = min(off_probs.shape[-1], 5)
    topk_logits, topk_preds = torch.topk(off_logits, k)
    # we only support a single video per call
    assert len(topk_logits) == 1, "batch is larger than 1"
    # squeeze the batch dimension before reporting
    off_logits, off_probs = off_logits[0], off_probs[0]
    for target_hat in topk_preds[0]:
        print(f'p={off_probs[target_hat]:.4f} ({off_logits[target_hat]:.4f}), "{grid[target_hat]:.2f}" ({target_hat})')
    return off_probs
122
+
123
+
124
def main(args):
    """Estimate audio-visual desynchronization for a single video.

    Re-encodes the input to the expected formats if needed, applies the
    test-time transforms, runs the module-level `synchformer` model, and
    prints the predicted offset distribution.
    """
    # target formats the model was trained with
    vfps = 25
    afps = 16000
    in_size = 256
    # making the offset class grid similar to the one used in transforms,
    # refer to the used one: https://a3s.fi/swift/v1/AUTH_a235c0f452d648828f745589cde1219a/sync/sync_models/24-01-04T16-39-21/cfg-24-01-04T16-39-21.yaml
    max_off_sec = 2
    num_cls = 21

    # checking if the provided video has the correct frame rates
    print(f"Using video: {args.vid_path}")
    v, _, info = torchvision.io.read_video(args.vid_path, pts_unit="sec")
    _, H, W, _ = v.shape
    if info["video_fps"] != vfps or info["audio_fps"] != afps or min(H, W) != in_size:
        print(f'Reencoding. vfps: {info["video_fps"]} -> {vfps};', end=" ")
        print(f'afps: {info["audio_fps"]} -> {afps};', end=" ")
        print(f"{(H, W)} -> min(H, W)={in_size}")
        args.vid_path = reencode_video(args.vid_path, vfps, afps, in_size)
    else:
        print(f'Skipping reencoding. vfps: {info["video_fps"]}; afps: {info["audio_fps"]}; min(H, W)={in_size}')

    device = torch.device(args.device)

    # load visual and audio streams
    # rgb: (Tv, 3, H, W) in [0, 225], audio: (Ta,) in [-1, 1]
    rgb, audio, meta = get_video_and_audio(args.vid_path, get_meta=True)

    # making an item (dict) to apply transformations
    # NOTE: here is how it works:
    # For instance, if the model is trained on 5sec clips, the provided video is 9sec, and `v_start_i_sec=1.3`
    # the transform will crop out a 5sec-clip from 1.3 to 6.3 seconds and shift the start of the audio
    # track by `args.offset_sec` seconds. It means that if `offset_sec` > 0, the audio will
    # start by `offset_sec` earlier than the rgb track.
    # It is a good idea to use something in [-`max_off_sec`, `max_off_sec`] (-2, +2) seconds (see `grid`)
    item = dict(
        video=rgb,
        audio=audio,
        meta=meta,
        path=args.vid_path,
        split="test",
        targets={
            "v_start_i_sec": args.v_start_i_sec,
            "offset_sec": args.offset_sec,
        },
    )

    grid = make_class_grid(-max_off_sec, max_off_sec, num_cls)
    if not (min(grid) <= item["targets"]["offset_sec"] <= max(grid)):
        print(f'WARNING: offset_sec={item["targets"]["offset_sec"]} is outside the trained grid: {grid}')

    # applying the test-time transform
    item = get_test_transforms()(item)

    # prepare inputs for inference
    batch = torch.utils.data.default_collate([item])
    aud, vid = prepare_inputs(batch, device)

    # TODO:
    # sanity check: we will take the input to the `model` and recontruct make a video from it.
    # Use this check to make sure the input makes sense (audio should be ok but shifted as you specified)
    # reconstruct_video_from_input(aud, vid, batch['meta'], args.vid_path, args.v_start_i_sec, args.offset_sec,
    #                              vfps, afps)

    # forward pass
    # NOTE(review): `synchformer` is a module-level global created in the __main__ block —
    # it must be defined before main() is called from elsewhere.
    with torch.set_grad_enabled(False):
        with torch.autocast("cuda", enabled=True):
            _, logits = synchformer(vid, aud)

    # simply prints the results of the prediction
    decode_single_video_prediction(logits, grid, item)
194
+
195
+
196
+ if __name__ == "__main__":
197
+ parser = argparse.ArgumentParser()
198
+ parser.add_argument("--exp_name", required=True, help="In a format: xx-xx-xxTxx-xx-xx")
199
+ parser.add_argument("--vid_path", required=True, help="A path to .mp4 video")
200
+ parser.add_argument("--offset_sec", type=float, default=0.0)
201
+ parser.add_argument("--v_start_i_sec", type=float, default=0.0)
202
+ parser.add_argument("--device", default="cuda:0")
203
+ args = parser.parse_args()
204
+
205
+ synchformer = Synchformer().cuda().eval()
206
+ synchformer.load_state_dict(
207
+ torch.load(
208
+ os.environ.get("SYNCHFORMER_WEIGHTS", f"weights/synchformer.pth"),
209
+ weights_only=True,
210
+ map_location="cpu",
211
+ )
212
+ )
213
+
214
+ main(args)