Jack Wu committed on
Commit
22b2290
·
1 Parent(s): 3886668

Remove non-inference files from all three model folders

Browse files

Keep only what is imported at runtime by app.py:
- TARO: remove dataset.py, infer.py, loss.py, train.py, train.sh, preprocess/, README.md
- MMAudio: remove train.py, batch_eval.py, eval_onsets.py, demo.py, gradio_demo.py,
config/, docs/, sets/, training/, README.md, LICENSE, .gitignore
- HunyuanFoley: remove infer.py, gradio_app.py, tests/, assets/, build_package.sh,
download_test_videos.sh, DEVELOPMENT.md, INSTALL.md, LICENSE, MANIFEST.in,
NOTICE, pytest.ini, README.md, .gitattributes, .gitignore, .pre-commit-config.yaml

Update .gitignore to permanently exclude all of the above.

This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitignore +47 -0
  2. HunyuanVideo-Foley/.gitattributes +0 -3
  3. HunyuanVideo-Foley/.gitignore +0 -159
  4. HunyuanVideo-Foley/.pre-commit-config.yaml +0 -38
  5. HunyuanVideo-Foley/DEVELOPMENT.md +0 -187
  6. HunyuanVideo-Foley/INSTALL.md +0 -203
  7. HunyuanVideo-Foley/LICENSE +0 -77
  8. HunyuanVideo-Foley/MANIFEST.in +0 -38
  9. HunyuanVideo-Foley/NOTICE +0 -27
  10. HunyuanVideo-Foley/README.md +0 -519
  11. HunyuanVideo-Foley/build_package.sh +0 -58
  12. HunyuanVideo-Foley/download_test_videos.sh +0 -11
  13. HunyuanVideo-Foley/gradio_app.py +0 -834
  14. HunyuanVideo-Foley/infer.py +0 -304
  15. HunyuanVideo-Foley/pytest.ini +0 -11
  16. HunyuanVideo-Foley/tests/__init__.py +0 -1
  17. HunyuanVideo-Foley/tests/test_config_utils.py +0 -89
  18. HunyuanVideo-Foley/tests/test_media_utils.py +0 -82
  19. MMAudio/.gitignore +0 -146
  20. MMAudio/LICENSE +0 -21
  21. MMAudio/README.md +0 -198
  22. MMAudio/batch_eval.py +0 -110
  23. MMAudio/config/__init__.py +0 -0
  24. MMAudio/config/base_config.yaml +0 -62
  25. MMAudio/config/data/base.yaml +0 -70
  26. MMAudio/config/eval_config.yaml +0 -17
  27. MMAudio/config/eval_data/base.yaml +0 -22
  28. MMAudio/config/hydra/job_logging/custom-eval.yaml +0 -32
  29. MMAudio/config/hydra/job_logging/custom-no-rank.yaml +0 -32
  30. MMAudio/config/hydra/job_logging/custom-simplest.yaml +0 -26
  31. MMAudio/config/hydra/job_logging/custom.yaml +0 -33
  32. MMAudio/config/train_config.yaml +0 -41
  33. MMAudio/demo.py +0 -141
  34. MMAudio/docs/EVAL.md +0 -23
  35. MMAudio/docs/MODELS.md +0 -50
  36. MMAudio/docs/TRAINING.md +0 -184
  37. MMAudio/docs/demo.html +0 -81
  38. MMAudio/docs/images/icon.png +0 -0
  39. MMAudio/docs/index.html +0 -156
  40. MMAudio/docs/style.css +0 -78
  41. MMAudio/docs/style_videos.css +0 -52
  42. MMAudio/docs/video_gen.html +0 -254
  43. MMAudio/docs/video_main.html +0 -98
  44. MMAudio/docs/video_vgg.html +0 -452
  45. MMAudio/eval_onsets.py +0 -141
  46. MMAudio/gradio_demo.py +0 -343
  47. MMAudio/sets/vgg-test.tsv +0 -0
  48. MMAudio/sets/vgg-train.tsv +0 -0
  49. MMAudio/sets/vgg-val.tsv +0 -2049
  50. MMAudio/train.py +0 -209
.gitignore CHANGED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ---- TARO: training / preprocessing only ----
2
+ TARO/dataset.py
3
+ TARO/infer.py
4
+ TARO/loss.py
5
+ TARO/train.py
6
+ TARO/train.sh
7
+ TARO/preprocess/
8
+ TARO/README.md
9
+
10
+ # ---- MMAudio: training / eval / docs only ----
11
+ MMAudio/batch_eval.py
12
+ MMAudio/eval_onsets.py
13
+ MMAudio/train.py
14
+ MMAudio/demo.py
15
+ MMAudio/gradio_demo.py
16
+ MMAudio/config/
17
+ MMAudio/docs/
18
+ MMAudio/sets/
19
+ MMAudio/training/
20
+ MMAudio/README.md
21
+ MMAudio/.gitignore
22
+ MMAudio/LICENSE
23
+
24
+ # ---- HunyuanVideo-Foley: standalone scripts / build / test / docs only ----
25
+ HunyuanVideo-Foley/.gitattributes
26
+ HunyuanVideo-Foley/.gitignore
27
+ HunyuanVideo-Foley/.pre-commit-config.yaml
28
+ HunyuanVideo-Foley/assets/
29
+ HunyuanVideo-Foley/build_package.sh
30
+ HunyuanVideo-Foley/download_test_videos.sh
31
+ HunyuanVideo-Foley/gradio_app.py
32
+ HunyuanVideo-Foley/infer.py
33
+ HunyuanVideo-Foley/DEVELOPMENT.md
34
+ HunyuanVideo-Foley/INSTALL.md
35
+ HunyuanVideo-Foley/LICENSE
36
+ HunyuanVideo-Foley/MANIFEST.in
37
+ HunyuanVideo-Foley/NOTICE
38
+ HunyuanVideo-Foley/pytest.ini
39
+ HunyuanVideo-Foley/README.md
40
+ HunyuanVideo-Foley/tests/
41
+
42
+ # ---- Python / IDE ----
43
+ __pycache__/
44
+ *.pyc
45
+ .venv/
46
+ .DS_Store
47
+ .idea/
HunyuanVideo-Foley/.gitattributes DELETED
@@ -1,3 +0,0 @@
1
- assets/data_pipeline.png filter=lfs diff=lfs merge=lfs -text
2
- assets/model_arch.png filter=lfs diff=lfs merge=lfs -text
3
- *.png filter=lfs diff=lfs merge=lfs -text
 
 
 
 
HunyuanVideo-Foley/.gitignore DELETED
@@ -1,159 +0,0 @@
1
- # Byte-compiled / optimized / DLL files
2
- __pycache__/
3
- *.py[cod]
4
- *$py.class
5
-
6
- # C extensions
7
- *.so
8
-
9
- # Distribution / packaging
10
- .Python
11
- build/
12
- develop-eggs/
13
- dist/
14
- downloads/
15
- eggs/
16
- .eggs/
17
- lib/
18
- lib64/
19
- parts/
20
- sdist/
21
- var/
22
- wheels/
23
- pip-wheel-metadata/
24
- share/python-wheels/
25
- *.egg-info/
26
- .installed.cfg
27
- *.egg
28
- MANIFEST
29
-
30
- # PyInstaller
31
- # Usually these files are written by a python script from a template
32
- # before PyInstaller builds the exe, so as to inject date/other infos into it.
33
- *.manifest
34
- *.spec
35
-
36
- # Installer logs
37
- pip-log.txt
38
- pip-delete-this-directory.txt
39
-
40
- # Unit test / coverage reports
41
- htmlcov/
42
- .tox/
43
- .nox/
44
- .coverage
45
- .coverage.*
46
- .cache
47
- nosetests.xml
48
- coverage.xml
49
- *.cover
50
- *.py,cover
51
- .hypothesis/
52
- .pytest_cache/
53
-
54
- # Translations
55
- *.mo
56
- *.pot
57
-
58
- # Django stuff:
59
- *.log
60
- local_settings.py
61
- db.sqlite3
62
- db.sqlite3-journal
63
-
64
- # Flask stuff:
65
- instance/
66
- .webassets-cache
67
-
68
- # Scrapy stuff:
69
- .scrapy
70
-
71
- # Sphinx documentation
72
- docs/_build/
73
-
74
- # PyBuilder
75
- target/
76
-
77
- # Jupyter Notebook
78
- .ipynb_checkpoints
79
-
80
- # IPython
81
- profile_default/
82
- ipython_config.py
83
-
84
- # pyenv
85
- .python-version
86
-
87
- # pipenv
88
- # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89
- # However, in case of collaboration, if having platform-specific dependencies or dependencies
90
- # having no cross-platform support, pipenv may install dependencies that don't work, or not
91
- # install all needed dependencies.
92
- #Pipfile.lock
93
-
94
- # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95
- __pypackages__/
96
-
97
- # Celery stuff
98
- celerybeat-schedule
99
- celerybeat.pid
100
-
101
- # SageMath parsed files
102
- *.sage.py
103
-
104
- # Environments
105
- .env
106
- .venv
107
- env/
108
- venv/
109
- ENV/
110
- env.bak/
111
- venv.bak/
112
-
113
- # Spyder project settings
114
- .spyderproject
115
- .spyproject
116
-
117
- # Rope project settings
118
- .ropeproject
119
-
120
- # mkdocs documentation
121
- /site
122
-
123
- # mypy
124
- .mypy_cache/
125
- .dmypy.json
126
- dmypy.json
127
-
128
- # Pyre type checker
129
- .pyre/
130
-
131
- # ==========================================
132
- # Custom settings
133
- # ==========================================
134
-
135
- # For MacOS
136
- .DS_Store
137
-
138
- # For IDEs
139
- .idea/
140
- .vscode/
141
- pyrightconfig.json
142
- .cursorignore
143
-
144
- assets/
145
- examples/
146
-
147
- # For global settings
148
- __*/
149
- **/my_*
150
- tmp*.*
151
- .my*
152
- # Model checkpoints
153
- *.pt
154
- *.ckpt
155
- *.pth
156
- *.safetensors
157
-
158
-
159
- CLAUDE.md
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
HunyuanVideo-Foley/.pre-commit-config.yaml DELETED
@@ -1,38 +0,0 @@
1
- repos:
2
- - repo: https://github.com/pre-commit/pre-commit-hooks
3
- rev: v4.4.0
4
- hooks:
5
- - id: trailing-whitespace
6
- - id: end-of-file-fixer
7
- - id: check-yaml
8
- - id: check-added-large-files
9
- - id: check-merge-conflict
10
- - id: debug-statements
11
- - id: check-docstring-first
12
-
13
- - repo: https://github.com/psf/black
14
- rev: 23.3.0
15
- hooks:
16
- - id: black
17
- language_version: python3
18
- args: [--line-length=120]
19
-
20
- - repo: https://github.com/pycqa/isort
21
- rev: 5.12.0
22
- hooks:
23
- - id: isort
24
- args: [--profile, black, --line-length=120]
25
-
26
- - repo: https://github.com/pycqa/flake8
27
- rev: 6.0.0
28
- hooks:
29
- - id: flake8
30
- args: [--max-line-length=120]
31
- additional_dependencies: [flake8-docstrings]
32
-
33
- - repo: https://github.com/pre-commit/mirrors-mypy
34
- rev: v1.3.0
35
- hooks:
36
- - id: mypy
37
- additional_dependencies: [types-all]
38
- args: [--ignore-missing-imports]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
HunyuanVideo-Foley/DEVELOPMENT.md DELETED
@@ -1,187 +0,0 @@
1
- # Development Guide
2
-
3
- This document provides guidelines for developing and contributing to the HunyuanVideo-Foley project.
4
-
5
- ## Code Style and Quality
6
-
7
- ### Code Formatting
8
-
9
- We use the following tools to maintain consistent code style:
10
-
11
- - **Black**: Code formatter with 120 character line length
12
- - **isort**: Import sorter compatible with Black
13
- - **flake8**: Linting and style checking
14
- - **mypy**: Static type checking
15
-
16
- ### Pre-commit Hooks
17
-
18
- Install pre-commit hooks to automatically format code before commits:
19
-
20
- ```bash
21
- pip install pre-commit
22
- pre-commit install
23
- ```
24
-
25
- ### Manual Code Formatting
26
-
27
- Format code manually:
28
-
29
- ```bash
30
- # Format all Python files
31
- black --line-length 120 .
32
-
33
- # Sort imports
34
- isort --profile black --line-length 120 .
35
-
36
- # Check code style
37
- flake8 --max-line-length 120
38
-
39
- # Type checking
40
- mypy --ignore-missing-imports .
41
- ```
42
-
43
- ## Project Structure
44
-
45
- ```
46
- hunyuanvideo_foley/
47
- ├── models/ # Model implementations
48
- │ ├── hifi_foley.py # Main model
49
- │ ├── nn/ # Neural network layers
50
- │ ├── dac_vae/ # Audio VAE
51
- │ └── synchformer/ # Synchronization model
52
- ├── utils/ # Utilities
53
- │ ├── config_utils.py # Configuration handling
54
- │ ├── feature_utils.py # Feature extraction
55
- │ ├── model_utils.py # Model loading/saving
56
- │ └── media_utils.py # Audio/video processing
57
- └── constants.py # Project constants
58
- ```
59
-
60
- ## Coding Standards
61
-
62
- ### Error Handling
63
-
64
- - Use custom exceptions for domain-specific errors
65
- - Always validate inputs at function boundaries
66
- - Log errors with appropriate levels (ERROR, WARNING, INFO)
67
- - Provide helpful error messages to users
68
-
69
- ### Type Hints
70
-
71
- - Add type hints to all function parameters and return values
72
- - Use `Optional[Type]` for nullable parameters
73
- - Import types from `typing` module
74
-
75
- ### Documentation
76
-
77
- - Add docstrings to all public functions and classes
78
- - Use Google-style docstrings
79
- - Document parameters, return values, and exceptions
80
-
81
- ### Example Function
82
-
83
- ```python
84
- def process_video(
85
- video_path: str,
86
- max_duration: Optional[float] = None
87
- ) -> Tuple[np.ndarray, float]:
88
- """
89
- Process video file and extract frames.
90
-
91
- Args:
92
- video_path: Path to input video file
93
- max_duration: Maximum duration in seconds (optional)
94
-
95
- Returns:
96
- Tuple of (frames array, duration in seconds)
97
-
98
- Raises:
99
- FileNotFoundError: If video file doesn't exist
100
- VideoProcessingError: If video processing fails
101
- """
102
- if not os.path.exists(video_path):
103
- raise FileNotFoundError(f"Video file not found: {video_path}")
104
-
105
- # Implementation here...
106
- ```
107
-
108
- ## Testing
109
-
110
- ### Running Tests
111
-
112
- ```bash
113
- # Run all tests
114
- python -m pytest
115
-
116
- # Run specific test file
117
- python -m pytest tests/test_config_utils.py
118
-
119
- # Run with coverage
120
- python -m pytest --cov=hunyuanvideo_foley
121
- ```
122
-
123
- ### Writing Tests
124
-
125
- - Place tests in `tests/` directory
126
- - Name test files as `test_*.py`
127
- - Use descriptive test function names
128
- - Test edge cases and error conditions
129
-
130
- ## Development Workflow
131
-
132
- 1. **Setup Environment**
133
- ```bash
134
- python -m venv venv
135
- source venv/bin/activate # Linux/Mac
136
- # or
137
- venv\Scripts\activate # Windows
138
-
139
- pip install -r requirements.txt
140
- pip install -e .
141
- ```
142
-
143
- 2. **Install Development Tools**
144
- ```bash
145
- pre-commit install
146
- ```
147
-
148
- 3. **Make Changes**
149
- - Follow the coding standards above
150
- - Add tests for new functionality
151
- - Update documentation as needed
152
-
153
- 4. **Run Quality Checks**
154
- ```bash
155
- black --check --line-length 120 .
156
- isort --check-only --profile black --line-length 120 .
157
- flake8 --max-line-length 120
158
- mypy --ignore-missing-imports .
159
- pytest
160
- ```
161
-
162
- 5. **Commit Changes**
163
- ```bash
164
- git add .
165
- git commit -m "feat: add new feature"
166
- ```
167
-
168
- ## Performance Considerations
169
-
170
- - Use `torch.no_grad()` for inference-only code
171
- - Leverage GPU when available
172
- - Implement batch processing where possible
173
- - Profile code to identify bottlenecks
174
-
175
- ## Dependencies
176
-
177
- - Keep dependencies minimal and well-maintained
178
- - Pin versions for reproducibility
179
- - Separate development dependencies from runtime dependencies
180
- - Document any special installation requirements
181
-
182
- ## Configuration
183
-
184
- - Use centralized configuration in `constants.py`
185
- - Support environment variable overrides
186
- - Provide sensible defaults for all parameters
187
- - Validate configuration at startup
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
HunyuanVideo-Foley/INSTALL.md DELETED
@@ -1,203 +0,0 @@
1
- # 安装指南 - HunyuanVideo-Foley
2
-
3
- 本文档提供了将 HunyuanVideo-Foley 作为 Python 包安装和使用的详细指南。
4
-
5
- ## 安装方式
6
-
7
- ### 方式1:从源码安装(推荐)
8
-
9
- ```bash
10
- # 克隆仓库
11
- git clone https://github.com/Tencent-Hunyuan/HunyuanVideo-Foley
12
- cd HunyuanVideo-Foley
13
-
14
- # 安装包(开发模式)
15
- pip install -e .
16
-
17
- # 或安装包含所有可选依赖
18
- pip install -e .[all]
19
- ```
20
-
21
- ### 方式2:直接从GitHub安装
22
-
23
- ```bash
24
- pip install git+https://github.com/Tencent-Hunyuan/HunyuanVideo-Foley.git
25
- ```
26
-
27
- ### 方式3:构建wheel包安装
28
-
29
- ```bash
30
- # 在项目根目录下
31
- python setup.py bdist_wheel
32
- pip install dist/hunyuanvideo_foley-1.0.0-py3-none-any.whl
33
- ```
34
-
35
- ## 特殊依赖安装
36
-
37
- 由于某些依赖不在PyPI上,需要单独安装:
38
-
39
- ```bash
40
- # 安装audiotools(必需)
41
- pip install git+https://github.com/descriptinc/audiotools
42
-
43
- # 安装特定版本的transformers(支持SigLIP2)
44
- pip install git+https://github.com/huggingface/transformers@v4.49.0-SigLIP-2
45
- ```
46
-
47
- ## 可选依赖安装
48
-
49
- ```bash
50
- # 安装开发依赖
51
- pip install hunyuanvideo-foley[dev]
52
-
53
- # 安装测试依赖
54
- pip install hunyuanvideo-foley[test]
55
-
56
- # 安装Gradio界面依赖
57
- pip install hunyuanvideo-foley[gradio]
58
-
59
- # 安装所有可选依赖
60
- pip install hunyuanvideo-foley[all]
61
- ```
62
-
63
- ## 验证安装
64
-
65
- ```bash
66
- # 检查包是否正确安装
67
- python -c "import hunyuanvideo_foley; print(hunyuanvideo_foley.__version__)"
68
-
69
- # 检查命令行工具
70
- hunyuanvideo-foley --help
71
- ```
72
-
73
- ## 使用方法
74
-
75
- ### 1. 作为Python包使用
76
-
77
- ```python
78
- import hunyuanvideo_foley as hvf
79
-
80
- # 加载模型
81
- model_dict, cfg = hvf.load_model(
82
- model_path="path/to/model",
83
- config_path="configs/hunyuanvideo-foley-xxl.yaml"
84
- )
85
-
86
- # 处理特征
87
- visual_feats, text_feats, audio_len = hvf.feature_process(
88
- video_path="video.mp4",
89
- prompt="footsteps on gravel",
90
- model_dict=model_dict,
91
- cfg=cfg
92
- )
93
-
94
- # 生成音频
95
- audio, sample_rate = hvf.denoise_process(
96
- visual_feats, text_feats, audio_len,
97
- model_dict, cfg
98
- )
99
- ```
100
-
101
- ### 2. 使用命令行工具
102
-
103
- ```bash
104
- # 单个视频处理
105
- hunyuanvideo-foley \
106
- --model_path ./pretrained_models \
107
- --single_video video.mp4 \
108
- --single_prompt "footsteps on gravel" \
109
- --output_dir ./outputs
110
-
111
- # 批量处理
112
- hunyuanvideo-foley \
113
- --model_path ./pretrained_models \
114
- --csv_path batch_videos.csv \
115
- --output_dir ./outputs
116
-
117
- # 启动Gradio界面
118
- hunyuanvideo-foley --gradio --model_path ./pretrained_models
119
- ```
120
-
121
- ### 3. 使用原始脚本(向后兼容)
122
-
123
- ```bash
124
- # 使用原始infer.py脚本
125
- python infer.py --model_path ./pretrained_models --single_video video.mp4 --single_prompt "audio description"
126
-
127
- # 启动Gradio应用
128
- export HIFI_FOLEY_MODEL_PATH=./pretrained_models
129
- python gradio_app.py
130
- ```
131
-
132
- ## 开发环境设置
133
-
134
- 如果你想参与开发:
135
-
136
- ```bash
137
- # 克隆项目
138
- git clone https://github.com/Tencent-Hunyuan/HunyuanVideo-Foley
139
- cd HunyuanVideo-Foley
140
-
141
- # 安装开发版本
142
- pip install -e .[dev]
143
-
144
- # 安装pre-commit钩子
145
- pre-commit install
146
-
147
- # 运行测试
148
- python -m pytest
149
-
150
- # 代码格式化
151
- black --line-length 120 .
152
- isort --profile black --line-length 120 .
153
-
154
- # 类型检查
155
- mypy --ignore-missing-imports .
156
- ```
157
-
158
- ## 系统要求
159
-
160
- - **Python**: 3.8+
161
- - **操作系统**: Linux(主要支持),macOS,Windows
162
- - **GPU内存**: 推荐 ≥24GB VRAM(如RTX 3090/4090)
163
- - **CUDA版本**: 12.4 或 11.8(推荐)
164
-
165
- ## 故障排除
166
-
167
- ### 常见问题
168
-
169
- 1. **ImportError: No module named 'audiotools'**
170
- ```bash
171
- pip install git+https://github.com/descriptinc/audiotools
172
- ```
173
-
174
- 2. **CUDA内存不足**
175
- - 使用较小的批次大小
176
- - 确保GPU有足够的VRAM(推荐24GB+)
177
-
178
- 3. **transformers版本问题**
179
- ```bash
180
- pip install git+https://github.com/huggingface/transformers@v4.49.0-SigLIP-2
181
- ```
182
-
183
- ### 获取帮助
184
-
185
- - 查看项目README: [GitHub](https://github.com/Tencent-Hunyuan/HunyuanVideo-Foley)
186
- - 报告问题: [GitHub Issues](https://github.com/Tencent-Hunyuan/HunyuanVideo-Foley/issues)
187
- - 论文: [arXiv:2508.16930](https://arxiv.org/abs/2508.16930)
188
-
189
- ## 模型下载
190
-
191
- ```bash
192
- # 使用HuggingFace Hub
193
- git clone https://huggingface.co/tencent/HunyuanVideo-Foley
194
-
195
- # 或使用huggingface-cli
196
- huggingface-cli download tencent/HunyuanVideo-Foley
197
- ```
198
-
199
- ## 配置文件
200
-
201
- 包安装后,配置文件位于:
202
- - `hunyuanvideo_foley/configs/` 目录
203
- - 默认配置:`configs/hunyuanvideo-foley-xxl.yaml`
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
HunyuanVideo-Foley/LICENSE DELETED
@@ -1,77 +0,0 @@
1
- TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT
2
- Tencent HunyuanVideo-Foley Release Date: August 28, 2025
3
- THIS LICENSE AGREEMENT DOES NOT APPLY IN THE EUROPEAN UNION, UNITED KINGDOM AND SOUTH KOREA AND IS EXPRESSLY LIMITED TO THE TERRITORY, AS DEFINED BELOW.
4
- By clicking to agree or by using, reproducing, modifying, distributing, performing or displaying any portion or element of the Tencent Hunyuan Works, including via any Hosted Service, You will be deemed to have recognized and accepted the content of this Agreement, which is effective immediately.
5
- 1. DEFINITIONS.
6
- a. “Acceptable Use Policy” shall mean the policy made available by Tencent as set forth in the Exhibit A.
7
- b. “Agreement” shall mean the terms and conditions for use, reproduction, distribution, modification, performance and displaying of Tencent Hunyuan Works or any portion or element thereof set forth herein.
8
- c. “Documentation” shall mean the specifications, manuals and documentation for Tencent Hunyuan made publicly available by Tencent.
9
- d. “Hosted Service” shall mean a hosted service offered via an application programming interface (API), web access, or any other electronic or remote means.
10
- e. “Licensee,” “You” or “Your” shall mean a natural person or legal entity exercising the rights granted by this Agreement and/or using the Tencent Hunyuan Works for any purpose and in any field of use.
11
- f. “Materials” shall mean, collectively, Tencent’s proprietary Tencent Hunyuan and Documentation (and any portion thereof) as made available by Tencent under this Agreement.
12
- g. “Model Derivatives” shall mean all: (i) modifications to Tencent Hunyuan or any Model Derivative of Tencent Hunyuan; (ii) works based on Tencent Hunyuan or any Model Derivative of Tencent Hunyuan; or (iii) any other machine learning model which is created by transfer of patterns of the weights, parameters, operations, or Output of Tencent Hunyuan or any Model Derivative of Tencent Hunyuan, to that model in order to cause that model to perform similarly to Tencent Hunyuan or a Model Derivative of Tencent Hunyuan, including distillation methods, methods that use intermediate data representations, or methods based on the generation of synthetic data Outputs by Tencent Hunyuan or a Model Derivative of Tencent Hunyuan for training that model. For clarity, Outputs by themselves are not deemed Model Derivatives.
13
- h. “Output” shall mean the information and/or content output of Tencent Hunyuan or a Model Derivative that results from operating or otherwise using Tencent Hunyuan or a Model Derivative, including via a Hosted Service.
14
- i. “Tencent,” “We” or “Us” shall mean the applicable entity or entities in the Tencent corporate family that own(s) intellectual property or other rights embodied in or utilized by the Materials.
15
- j. “Tencent Hunyuan” shall mean the large language models, text/image/video/audio/3D generation models, and multimodal large language models and their software and algorithms, including trained model weights, parameters (including optimizer states), machine-learning model code, inference-enabling code, training-enabling code, fine-tuning enabling code and other elements of the foregoing made publicly available by Us, including, without limitation to, Tencent HunyuanVideo-Foley released at [https://github.com/Tencent-Hunyuan/HunyuanVideo-Foley].
16
- k. “Tencent Hunyuan Works” shall mean: (i) the Materials; (ii) Model Derivatives; and (iii) all derivative works thereof.
17
- l. “Territory” shall mean the worldwide territory, excluding the territory of the European Union, United Kingdom and South Korea.
18
- m. “Third Party” or “Third Parties” shall mean individuals or legal entities that are not under common control with Us or You.
19
- n. “including” shall mean including but not limited to.
20
- 2. GRANT OF RIGHTS.
21
- We grant You, for the Territory only, a non-exclusive, non-transferable and royalty-free limited license under Tencent’s intellectual property or other rights owned by Us embodied in or utilized by the Materials to use, reproduce, distribute, create derivative works of (including Model Derivatives), and make modifications to the Materials, only in accordance with the terms of this Agreement and the Acceptable Use Policy, and You must not violate (or encourage or permit anyone else to violate) any term of this Agreement or the Acceptable Use Policy.
22
- 3. DISTRIBUTION.
23
- You may, subject to Your compliance with this Agreement, distribute or make available to Third Parties the Tencent Hunyuan Works, exclusively in the Territory, provided that You meet all of the following conditions:
24
- a. You must provide all such Third Party recipients of the Tencent Hunyuan Works or products or services using them a copy of this Agreement;
25
- b. You must cause any modified files to carry prominent notices stating that You changed the files;
26
- c. You are encouraged to: (i) publish at least one technology introduction blogpost or one public statement expressing Your experience of using the Tencent Hunyuan Works; and (ii) mark the products or services developed by using the Tencent Hunyuan Works to indicate that the product/service is “Powered by Tencent Hunyuan”; and
27
- d. All distributions to Third Parties (other than through a Hosted Service) must be accompanied by a “Notice” text file that contains the following notice: “Tencent Hunyuan is licensed under the Tencent Hunyuan Community License Agreement, Copyright © 2025 Tencent. All Rights Reserved. The trademark rights of “Tencent Hunyuan” are owned by Tencent or its affiliate.”
28
- You may add Your own copyright statement to Your modifications and, except as set forth in this Section and in Section 5, may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Model Derivatives as a whole, provided Your use, reproduction, modification, distribution, performance and display of the work otherwise complies with the terms and conditions of this Agreement (including as regards the Territory). If You receive Tencent Hunyuan Works from a Licensee as part of an integrated end user product, then this Section 3 of this Agreement will not apply to You.
29
- 4. ADDITIONAL COMMERCIAL TERMS.
30
- If, on the Tencent Hunyuan version release date, the monthly active users of all products or services made available by or for Licensee is greater than 100 million monthly active users in the preceding calendar month, You must request a license from Tencent, which Tencent may grant to You in its sole discretion, and You are not authorized to exercise any of the rights under this Agreement unless or until Tencent otherwise expressly grants You such rights.
31
- 5. RULES OF USE.
32
- a. Your use of the Tencent Hunyuan Works must comply with applicable laws and regulations (including trade compliance laws and regulations) and adhere to the Acceptable Use Policy for the Tencent Hunyuan Works, which is hereby incorporated by reference into this Agreement. You must include the use restrictions referenced in these Sections 5(a) and 5(b) as an enforceable provision in any agreement (e.g., license agreement, terms of use, etc.) governing the use and/or distribution of Tencent Hunyuan Works and You must provide notice to subsequent users to whom You distribute that Tencent Hunyuan Works are subject to the use restrictions in these Sections 5(a) and 5(b).
33
- b. You must not use the Tencent Hunyuan Works or any Output or results of the Tencent Hunyuan Works to improve any other AI model (other than Tencent Hunyuan or Model Derivatives thereof).
34
- c. You must not use, reproduce, modify, distribute, or display the Tencent Hunyuan Works, Output or results of the Tencent Hunyuan Works outside the Territory. Any such use outside the Territory is unlicensed and unauthorized under this Agreement.
35
- 6. INTELLECTUAL PROPERTY.
36
- a. Subject to Tencent’s ownership of Tencent Hunyuan Works made by or for Tencent and intellectual property rights therein, conditioned upon Your compliance with the terms and conditions of this Agreement, as between You and Tencent, You will be the owner of any derivative works and modifications of the Materials and any Model Derivatives that are made by or for You.
37
- b. No trademark licenses are granted under this Agreement, and in connection with the Tencent Hunyuan Works, Licensee may not use any name or mark owned by or associated with Tencent or any of its affiliates, except as required for reasonable and customary use in describing and distributing the Tencent Hunyuan Works. Tencent hereby grants You a license to use “Tencent Hunyuan” (the “Mark”) in the Territory solely as required to comply with the provisions of Section 3(c), provided that You comply with any applicable laws related to trademark protection. All goodwill arising out of Your use of the Mark will inure to the benefit of Tencent.
38
- c. If You commence a lawsuit or other proceedings (including a cross-claim or counterclaim in a lawsuit) against Us or any person or entity alleging that the Materials or any Output, or any portion of any of the foregoing, infringe any intellectual property or other right owned or licensable by You, then all licenses granted to You under this Agreement shall terminate as of the date such lawsuit or other proceeding is filed. You will defend, indemnify and hold harmless Us from and against any claim by any Third Party arising out of or related to Your or the Third Party’s use or distribution of the Tencent Hunyuan Works.
39
- d. Tencent claims no rights in Outputs You generate. You and Your users are solely responsible for Outputs and their subsequent uses.
40
- 7. DISCLAIMERS OF WARRANTY AND LIMITATIONS OF LIABILITY.
41
- a. We are not obligated to support, update, provide training for, or develop any further version of the Tencent Hunyuan Works or to grant any license thereto.
42
- b. UNLESS AND ONLY TO THE EXTENT REQUIRED BY APPLICABLE LAW, THE TENCENT HUNYUAN WORKS AND ANY OUTPUT AND RESULTS THEREFROM ARE PROVIDED “AS IS” WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES OF ANY KIND INCLUDING ANY WARRANTIES OF TITLE, MERCHANTABILITY, NONINFRINGEMENT, COURSE OF DEALING, USAGE OF TRADE, OR FITNESS FOR A PARTICULAR PURPOSE. YOU ARE SOLELY RESPONSIBLE FOR DETERMINING THE APPROPRIATENESS OF USING, REPRODUCING, MODIFYING, PERFORMING, DISPLAYING OR DISTRIBUTING ANY OF THE TENCENT HUNYUAN WORKS OR OUTPUTS AND ASSUME ANY AND ALL RISKS ASSOCIATED WITH YOUR OR A THIRD PARTY’S USE OR DISTRIBUTION OF ANY OF THE TENCENT HUNYUAN WORKS OR OUTPUTS AND YOUR EXERCISE OF RIGHTS AND PERMISSIONS UNDER THIS AGREEMENT.
43
- c. TO THE FULLEST EXTENT PERMITTED BY APPLICABLE LAW, IN NO EVENT SHALL TENCENT OR ITS AFFILIATES BE LIABLE UNDER ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, TORT, NEGLIGENCE, PRODUCTS LIABILITY, OR OTHERWISE, FOR ANY DAMAGES, INCLUDING ANY DIRECT, INDIRECT, SPECIAL, INCIDENTAL, EXEMPLARY, CONSEQUENTIAL OR PUNITIVE DAMAGES, OR LOST PROFITS OF ANY KIND ARISING FROM THIS AGREEMENT OR RELATED TO ANY OF THE TENCENT HUNYUAN WORKS OR OUTPUTS, EVEN IF TENCENT OR ITS AFFILIATES HAVE BEEN ADVISED OF THE POSSIBILITY OF ANY OF THE FOREGOING.
44
- 8. SURVIVAL AND TERMINATION.
45
- a. The term of this Agreement shall commence upon Your acceptance of this Agreement or access to the Materials and will continue in full force and effect until terminated in accordance with the terms and conditions herein.
46
- b. We may terminate this Agreement if You breach any of the terms or conditions of this Agreement. Upon termination of this Agreement, You must promptly delete and cease use of the Tencent Hunyuan Works. Sections 6(a), 6(c), 7 and 9 shall survive the termination of this Agreement.
47
- 9. GOVERNING LAW AND JURISDICTION.
48
- a. This Agreement and any dispute arising out of or relating to it will be governed by the laws of the Hong Kong Special Administrative Region of the People’s Republic of China, without regard to conflict of law principles, and the UN Convention on Contracts for the International Sale of Goods does not apply to this Agreement.
49
- b. Exclusive jurisdiction and venue for any dispute arising out of or relating to this Agreement will be a court of competent jurisdiction in the Hong Kong Special Administrative Region of the People’s Republic of China, and Tencent and Licensee consent to the exclusive jurisdiction of such court with respect to any such dispute.
50
-
51
- EXHIBIT A
52
- ACCEPTABLE USE POLICY
53
-
54
- Tencent reserves the right to update this Acceptable Use Policy from time to time.
55
- Last modified: November 5, 2024
56
-
57
- Tencent endeavors to promote safe and fair use of its tools and features, including Tencent Hunyuan. You agree not to use Tencent Hunyuan or Model Derivatives:
58
- 1. Outside the Territory;
59
- 2. In any way that violates any applicable national, federal, state, local, international or any other law or regulation;
60
- 3. To harm Yourself or others;
61
- 4. To repurpose or distribute output from Tencent Hunyuan or any Model Derivatives to harm Yourself or others;
62
- 5. To override or circumvent the safety guardrails and safeguards We have put in place;
63
- 6. For the purpose of exploiting, harming or attempting to exploit or harm minors in any way;
64
- 7. To generate or disseminate verifiably false information and/or content with the purpose of harming others or influencing elections;
65
- 8. To generate or facilitate false online engagement, including fake reviews and other means of fake online engagement;
66
- 9. To intentionally defame, disparage or otherwise harass others;
67
- 10. To generate and/or disseminate malware (including ransomware) or any other content to be used for the purpose of harming electronic systems;
68
- 11. To generate or disseminate personal identifiable information with the purpose of harming others;
69
- 12. To generate or disseminate information (including images, code, posts, articles), and place the information in any public context (including –through the use of bot generated tweets), without expressly and conspicuously identifying that the information and/or content is machine generated;
70
- 13. To impersonate another individual without consent, authorization, or legal right;
71
- 14. To make high-stakes automated decisions in domains that affect an individual’s safety, rights or wellbeing (e.g., law enforcement, migration, medicine/health, management of critical infrastructure, safety components of products, essential services, credit, employment, housing, education, social scoring, or insurance);
72
- 15. In a manner that violates or disrespects the social ethics and moral standards of other countries or regions;
73
- 16. To perform, facilitate, threaten, incite, plan, promote or encourage violent extremism or terrorism;
74
- 17. For any use intended to discriminate against or harm individuals or groups based on protected characteristics or categories, online or offline social behavior or known or predicted personal or personality characteristics;
75
- 18. To intentionally exploit any of the vulnerabilities of a specific group of persons based on their age, social, physical or mental characteristics, in order to materially distort the behavior of a person pertaining to that group in a manner that causes or is likely to cause that person or another person physical or psychological harm;
76
- 19. For military purposes;
77
- 20. To engage in the unauthorized or unlicensed practice of any profession including, but not limited to, financial, legal, medical/health, or other professional practices.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
HunyuanVideo-Foley/MANIFEST.in DELETED
@@ -1,38 +0,0 @@
1
- # Include package metadata and documentation
2
- include README.md
3
- include LICENSE
4
- include NOTICE
5
- include DEVELOPMENT.md
6
- include CLAUDE.md
7
- include requirements.txt
8
- include pyproject.toml
9
- include pytest.ini
10
-
11
- # Include configuration files
12
- include configs/*.yaml
13
- include configs/*.yml
14
- recursive-include hunyuanvideo_foley/configs *.yaml *.yml
15
-
16
- # Include test assets if any
17
- include assets/*.csv
18
- include assets/*.txt
19
- recursive-include assets/test_videos *
20
-
21
- # Include example scripts
22
- include *.py
23
- include *.sh
24
-
25
- # Include test files
26
- recursive-include tests *.py
27
-
28
- # Exclude unnecessary files
29
- global-exclude *.pyc
30
- global-exclude *.pyo
31
- global-exclude *~
32
- global-exclude .DS_Store
33
- global-exclude __pycache__
34
- prune .git
35
- prune .github
36
- prune examples/*/outputs
37
- prune **/__pycache__
38
- prune **/*.pyc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
HunyuanVideo-Foley/NOTICE DELETED
@@ -1,27 +0,0 @@
1
- Usage and Legal Notices:
2
-
3
- Tencent is pleased to support the open source community by making Tencent HunyuanVideo-Foley available.
4
-
5
- Copyright (C) 2025 Tencent. All rights reserved.
6
-
7
- Tencent HunyuanVideo-Foley is licensed under TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT, which can be found in this repository called "LICENSE", except for the third-party components listed below. Tencent HunyuanVideo-Foley does not impose any additional limitations beyond what is outlined in the respective licenses of these third-party components. Users must comply with all terms and conditions of original licenses of these third-party components and must ensure that the usage of the third party components adheres to all relevant laws and regulations.
8
-
9
- For avoidance of doubts, Tencent HunyuanVideo-Foley means the large language models and their software and algorithms, including trained model weights, parameters (including optimizer states), machine-learning model code, inference-enabling code, training-enabling code, fine-tuning enabling code and other elements of the foregoing made publicly available by Tencent in accordance with the TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
10
-
11
-
12
- Other dependencies and licenses:
13
-
14
-
15
- Open Source Software Licensed under the MIT License:
16
- --------------------------------------------------------------------
17
- 1. Synchformer
18
- Copyright (c) 2024 Vladimir Iashin
19
-
20
-
21
- Terms of the MIT License:
22
- --------------------------------------------------------------------
23
- Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
24
-
25
- The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
26
-
27
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
HunyuanVideo-Foley/README.md DELETED
@@ -1,519 +0,0 @@
1
- <div align="center">
2
-
3
- https://github.com/Tencent-Hunyuan/HunyuanVideo-Foley
4
-
5
- <img src="assets/logo.png" alt="HunyuanVideo-Foley Logo" width="400">
6
-
7
- <h4>Multimodal Diffusion with Representation Alignment for High-Fidelity Foley Audio Generation</h4>
8
-
9
- <p align="center">
10
- <strong>Professional-grade AI sound effect generation for video content creators</strong>
11
- </p>
12
-
13
- <div align="center">
14
- <a href=https://github.com/Tencent-Hunyuan/HunyuanVideo-Foley target="_blank"><img src=https://img.shields.io/badge/Code-black.svg?logo=github height=22px></a>
15
- <a href=https://szczesnys.github.io/hunyuanvideo-foley target="_blank"><img src=https://img.shields.io/badge/Page-bb8a2e.svg?logo=github height=22px></a>
16
- <a href=https://huggingface.co/tencent/HunyuanVideo-Foley target="_blank"><img src=https://img.shields.io/badge/%F0%9F%A4%97%20Models-d96902.svg height=22px></a>
17
- <a href=https://huggingface.co/spaces/tencent/HunyuanVideo-Foley target="_blank"><img src=https://img.shields.io/badge/%F0%9F%A4%97%20Demo-276cb4.svg height=22px></a>
18
- <a href=https://arxiv.org/abs/2508.16930 target="_blank"><img src=https://img.shields.io/badge/Report-b5212f.svg?logo=arxiv height=22px></a>
19
- <a href=https://x.com/TencentHunyuan target="_blank"><img src=https://img.shields.io/badge/Hunyuan-black.svg?logo=x height=22px></a>
20
- <a href=https://discord.gg/YEyGGn6Bte target="_blank"><img src=https://img.shields.io/badge/Hunyuan-141984.svg?logo=discord height=22px></a>
21
- </div>
22
-
23
- </div>
24
-
25
- ---
26
-
27
- <div align="center">
28
-
29
- ### 👥 **Authors**
30
-
31
- <div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); padding: 20px; border-radius: 15px; margin: 20px 0;">
32
-
33
- **Sizhe Shan**<sup>1,2*</sup> • **Qiulin Li**<sup>1,3*</sup> • **Yutao Cui**<sup>1</sup> • **Miles Yang**<sup>1</sup> • **Yuehai Wang**<sup>2</sup> • **Qun Yang**<sup>3</sup> • **Jin Zhou**<sup>1†</sup> • **Zhao Zhong**<sup>1</sup>
34
-
35
- </div>
36
-
37
- <div style="margin-top: 15px; font-size: 14px; color: #666;">
38
-
39
- 🏢 <sup>1</sup>**Tencent Hunyuan** • 🎓 <sup>2</sup>**Zhejiang University** • ✈️ <sup>3</sup>**Nanjing University of Aeronautics and Astronautics**
40
-
41
- *Equal contribution • †Project lead
42
-
43
- </div>
44
-
45
- </div>
46
-
47
-
48
- ---
49
-
50
- ## 🔥🔥🔥 **News**
51
-
52
- <div style="background: linear-gradient(135deg, #e3f2fd 0%, #bbdefb 100%); padding: 20px; border-radius: 15px; margin: 20px 0; border-left: 5px solid #2196f3;">
53
-
54
- - **[2025.9.29]** 🚀 **HunyuanVideo-Foley-XL Model Release** - Release XL-sized model with offload inference support, significantly reducing VRAM requirements.
55
- - **[2025.8.28]** 🌟 **HunyuanVideo-Foley Open Source Release** - Inference code and model weights publicly available.
56
-
57
- </div>
58
-
59
- ---
60
-
61
- ## 🎥 **Demo & Showcase**
62
-
63
- <div align="center">
64
-
65
- > **Experience the magic of AI-generated Foley audio in perfect sync with video content!**
66
-
67
- <div style="border: 3px solid #4A90E2; border-radius: 15px; padding: 10px; margin: 20px 0; background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);">
68
-
69
- <video src="https://github.com/user-attachments/assets/d6e1b6fd-6980-4a68-8717-74298d064195" width="80%" controls style="border-radius: 10px; box-shadow: 0 8px 32px rgba(0,0,0,0.1);"> </video>
70
-
71
- <p><em>🎬 Watch how HunyuanVideo-Foley generates immersive sound effects synchronized with video content</em></p>
72
-
73
- </div>
74
-
75
- ---
76
-
77
- ## 🤝 **Community Contributions**
78
-
79
- <div style="background: #f8f9fa; padding: 20px; border-radius: 10px; border-left: 4px solid #28a745; margin: 20px 0; color: #333;">
80
-
81
- **ComfyUI Integration** - Thanks to the amazing community for creating ComfyUI nodes:
82
-
83
- - **[if-ai/ComfyUI_HunyuanVideoFoley](https://github.com/if-ai/ComfyUI_HunyuanVideoFoley)** - ComfyUI workflow integration which supports cpu offloading and FP8 quantization
84
- - **[phazei/ComfyUI-HunyuanVideo-Foley](https://github.com/phazei/ComfyUI-HunyuanVideo-Foley)** - Alternative ComfyUI node implementation which supports different precision modes
85
-
86
- </div>
87
-
88
- <div align="center" style="margin: 20px 0;">
89
-
90
- **🌟 We encourage and appreciate community contributions that make HunyuanVideo-Foley more accessible!**
91
-
92
- </div>
93
-
94
- ---
95
- ### ✨ **Key Highlights**
96
-
97
- <table align="center" style="border: none; margin: 20px 0;">
98
- <tr>
99
- <td align="center" width="33%">
100
-
101
- 🎭 **Multi-scenario Sync**
102
- High-quality audio synchronized with complex video scenes
103
-
104
- </td>
105
- <td align="center" width="33%">
106
-
107
- 🧠 **Multi-modal Balance**
108
- Perfect harmony between visual and textual information
109
-
110
- </td>
111
- <td align="center" width="33%">
112
-
113
- 🎵 **48kHz Hi-Fi Output**
114
- Professional-grade audio generation with crystal clarity
115
-
116
- </td>
117
- </tr>
118
- </table>
119
-
120
- </div>
121
-
122
- ---
123
-
124
- ## 📄 **Abstract**
125
-
126
- <div align="center" style="background: linear-gradient(135deg, #ffeef8 0%, #f0f8ff 100%); padding: 30px; border-radius: 20px; margin: 20px 0; border-left: 5px solid #ff6b9d; color: #333;">
127
-
128
- **🚀 Tencent Hunyuan** open-sources **HunyuanVideo-Foley**, an end-to-end video sound effect generation model!
129
-
130
- *A professional-grade AI tool specifically designed for video content creators, widely applicable to diverse scenarios including short video creation, film production, advertising creativity, and game development.*
131
-
132
- </div>
133
-
134
- ### 🎯 **Core Highlights**
135
-
136
- <div style="display: grid; grid-template-columns: 1fr; gap: 15px; margin: 20px 0;">
137
-
138
- <div style="border-left: 4px solid #4CAF50; padding: 15px; background: #f8f9fa; border-radius: 8px; color: #333;">
139
-
140
- **🎬 Multi-scenario Audio-Visual Synchronization**
141
- Supports generating high-quality audio that is synchronized and semantically aligned with complex video scenes, enhancing realism and immersive experience for film/TV and gaming applications.
142
-
143
- </div>
144
-
145
- <div style="border-left: 4px solid #2196F3; padding: 15px; background: #f8f9fa; border-radius: 8px; color: #333;">
146
-
147
- **⚖️ Multi-modal Semantic Balance**
148
- Intelligently balances visual and textual information analysis, comprehensively orchestrates sound effect elements, avoids one-sided generation, and meets personalized dubbing requirements.
149
-
150
- </div>
151
-
152
- <div style="border-left: 4px solid #FF9800; padding: 15px; background: #f8f9fa; border-radius: 8px; color: #333;">
153
-
154
- **🎵 High-fidelity Audio Output**
155
- Self-developed 48kHz audio VAE perfectly reconstructs sound effects, music, and vocals, achieving professional-grade audio generation quality.
156
-
157
- </div>
158
-
159
- </div>
160
-
161
- <div align="center" style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); padding: 20px; border-radius: 15px; margin: 20px 0; color: #333;">
162
-
163
- **🏆 SOTA Performance Achieved**
164
-
165
- *HunyuanVideo-Foley comprehensively leads the field across multiple evaluation benchmarks, achieving new state-of-the-art levels in audio fidelity, visual-semantic alignment, temporal alignment, and distribution matching - surpassing all open-source solutions!*
166
-
167
- </div>
168
-
169
- <div align="center">
170
-
171
- ![Performance Overview](assets/pan_chart.png)
172
- *📊 Performance comparison across different evaluation metrics - HunyuanVideo-Foley leads in all categories*
173
-
174
- </div>
175
-
176
- ---
177
-
178
- ## 🔧 **Technical Architecture**
179
-
180
- ### 📊 **Data Pipeline Design**
181
-
182
- <div align="center" style="margin: 20px 0; color: #333;">
183
-
184
- ![Data Pipeline](assets/data_pipeline.png)
185
- *🔄 Comprehensive data processing pipeline for high-quality text-video-audio datasets*
186
-
187
- </div>
188
-
189
- <div style="background: #f8f9fa; padding: 20px; border-radius: 10px; border-left: 4px solid #17a2b8; margin: 20px 0;">
190
-
191
- The **TV2A (Text-Video-to-Audio)** task presents a complex multimodal generation challenge requiring large-scale, high-quality datasets. Our comprehensive data pipeline systematically identifies and excludes unsuitable content to produce robust and generalizable audio generation capabilities.
192
-
193
- </div>
194
-
195
- ### 🏗️ **Model Architecture**
196
-
197
- <div align="center" style="margin: 20px 0; color: #333;">
198
-
199
- ![Model Architecture](assets/model_arch.png)
200
- *🧠 HunyuanVideo-Foley hybrid architecture with multimodal and unimodal transformer blocks*
201
-
202
- </div>
203
-
204
- <div style="background: #f8f9fa; padding: 20px; border-radius: 10px; border-left: 4px solid #28a745; margin: 20px 0;">
205
-
206
- **HunyuanVideo-Foley** employs a sophisticated hybrid architecture:
207
-
208
- - **🔄 Multimodal Transformer Blocks**: Process visual-audio streams simultaneously
209
- - **🎵 Unimodal Transformer Blocks**: Focus on audio stream refinement
210
- - **👁️ Visual Encoding**: Pre-trained encoder extracts visual features from video frames
211
- - **📝 Text Processing**: Semantic features extracted via pre-trained text encoder
212
- - **🎧 Audio Encoding**: Latent representations with Gaussian noise perturbation
213
- - **⏰ Temporal Alignment**: Synchformer-based frame-level synchronization with gated modulation
214
-
215
- </div>
216
-
217
- ---
218
-
219
- ## 📈 **Performance Benchmarks**
220
-
221
- ### 🎬 **MovieGen-Audio-Bench Results**
222
-
223
- <div align="center">
224
-
225
- > *Objective and Subjective evaluation results demonstrating superior performance across all metrics*
226
-
227
- </div>
228
-
229
- <div style="overflow-x: auto; margin: 20px 0;">
230
-
231
- | 🏆 **Method** | **PQ** ↑ | **PC** ↓ | **CE** ↑ | **CU** ↑ | **IB** ↑ | **DeSync** ↓ | **CLAP** ↑ | **MOS-Q** ↑ | **MOS-S** ↑ | **MOS-T** ↑ |
232
- |:-------------:|:--------:|:--------:|:--------:|:--------:|:--------:|:-------------:|:-----------:|:------------:|:------------:|:------------:|
233
- | FoleyCrafter | 6.27 | 2.72 | 3.34 | 5.68 | 0.17 | 1.29 | 0.14 | 3.36±0.78 | 3.54±0.88 | 3.46±0.95 |
234
- | V-AURA | 5.82 | 4.30 | 3.63 | 5.11 | 0.23 | 1.38 | 0.14 | 2.55±0.97 | 2.60±1.20 | 2.70±1.37 |
235
- | Frieren | 5.71 | 2.81 | 3.47 | 5.31 | 0.18 | 1.39 | 0.16 | 2.92±0.95 | 2.76±1.20 | 2.94±1.26 |
236
- | MMAudio | 6.17 | 2.84 | 3.59 | 5.62 | 0.27 | 0.80 | 0.35 | 3.58±0.84 | 3.63±1.00 | 3.47±1.03 |
237
- | ThinkSound | 6.04 | 3.73 | 3.81 | 5.59 | 0.18 | 0.91 | 0.20 | 3.20±0.97 | 3.01±1.04 | 3.02±1.08 |
238
- | **HunyuanVideo-Foley (ours)** | **6.59** | **2.74** | **3.88** | **6.13** | **0.35** | **0.74** | **0.33** | **4.14±0.68** | **4.12±0.77** | **4.15±0.75** |
239
-
240
- </div>
241
-
242
-
243
- ### 🎯 **Kling-Audio-Eval Results**
244
-
245
- <div align="center">
246
-
247
- > *Comprehensive objective evaluation showcasing state-of-the-art performance*
248
-
249
- </div>
250
-
251
- <div style="overflow-x: auto; margin: 20px 0;">
252
-
253
- | 🏆 **Method** | **FD_PANNs** ↓ | **FD_PASST** ↓ | **KL** ↓ | **IS** ↑ | **PQ** ↑ | **PC** ↓ | **CE** ↑ | **CU** ↑ | **IB** ↑ | **DeSync** ↓ | **CLAP** ↑ |
254
- |:-------------:|:--------------:|:--------------:|:--------:|:--------:|:--------:|:--------:|:--------:|:--------:|:--------:|:-------------:|:-----------:|
255
- | FoleyCrafter | 22.30 | 322.63 | 2.47 | 7.08 | 6.05 | 2.91 | 3.28 | 5.44 | 0.22 | 1.23 | 0.22 |
256
- | V-AURA | 33.15 | 474.56 | 3.24 | 5.80 | 5.69 | 3.98 | 3.13 | 4.83 | 0.25 | 0.86 | 0.13 |
257
- | Frieren | 16.86 | 293.57 | 2.95 | 7.32 | 5.72 | 2.55 | 2.88 | 5.10 | 0.21 | 0.86 | 0.16 |
258
- | MMAudio | 9.01 | 205.85 | 2.17 | 9.59 | 5.94 | 2.91 | 3.30 | 5.39 | 0.30 | 0.56 | 0.27 |
259
- | ThinkSound | 9.92 | 228.68 | 2.39 | 6.86 | 5.78 | 3.23 | 3.12 | 5.11 | 0.22 | 0.67 | 0.22 |
260
- | **HunyuanVideo-Foley (ours)** | **6.07** | **202.12** | **1.89** | **8.30** | **6.12** | **2.76** | **3.22** | **5.53** | **0.38** | **0.54** | **0.24** |
261
-
262
- </div>
263
-
264
- <div align="center" style="background: linear-gradient(135deg, #4CAF50 0%, #45a049 100%); padding: 15px; border-radius: 10px; margin: 20px 0; color: #333;">
265
-
266
- **🎉 Outstanding Results!** HunyuanVideo-Foley achieves the best scores on the majority of evaluation metrics, demonstrating significant improvements in audio quality, synchronization, and semantic alignment.
267
-
268
- </div>
269
-
270
-
271
-
272
- ---
273
-
274
- ## 🚀 **Quick Start**
275
-
276
- ### 📦 **Installation**
277
-
278
- <div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 20px; border-radius: 15px; margin: 20px 0; color: #333;">
279
-
280
- **🔧 System Requirements**
281
- - **CUDA**: 12.4 or 11.8 recommended
282
- - **Python**: 3.8+
283
- - **OS**: Linux (primary support)
284
- - **VRAM**: 20GB for XXL model (or 12GB with `--enable_offload`), 16GB for XL model (or 8GB with `--enable_offload`)
285
-
286
- </div>
287
-
288
- #### **Step 1: Clone Repository**
289
-
290
- ```bash
291
- # 📥 Clone the repository
292
- git clone https://github.com/Tencent-Hunyuan/HunyuanVideo-Foley
293
- cd HunyuanVideo-Foley
294
- ```
295
-
296
- #### **Step 2: Environment Setup**
297
-
298
- <div style="background: #fff3cd; padding: 15px; border-radius: 8px; border-left: 4px solid #ffc107; margin: 10px 0; color: #333;">
299
-
300
- 💡 **Tip**: We recommend using [Conda](https://docs.anaconda.com/free/miniconda/index.html) for Python environment management.
301
-
302
- </div>
303
-
304
- ```bash
305
- # 🔧 Install dependencies
306
- pip install -r requirements.txt
307
- ```
308
-
309
- #### **Step 3: Download Pretrained Models**
310
-
311
- <div style="background: #d1ecf1; padding: 15px; border-radius: 8px; border-left: 4px solid #17a2b8; margin: 10px 0;color: #333;">
312
-
313
- 🔗 **Download Model weights from Huggingface**
314
- ```bash
315
- # using git-lfs
316
- git clone https://huggingface.co/tencent/HunyuanVideo-Foley
317
-
318
- # using huggingface-cli
319
- huggingface-cli download tencent/HunyuanVideo-Foley
320
- ```
321
-
322
- <!-- 🔗 **Download Model weights from ModelScope** -->
323
- <!-- ```bash -->
324
- <!-- # using git-lfs -->
325
- <!-- git clone https://huggingface.co/tencent/HunyuanVideo-Foley -->
326
- <!-- -->
327
- <!-- # using huggingface-cli -->
328
- <!-- huggingface-cli download tencent/HunyuanVideo-Foley -->
329
- <!-- ``` -->
330
-
331
- </div>
332
-
333
-
334
- ---
335
-
336
- ## 💻 **Usage**
337
-
338
- ### 📊 **Model Specifications**
339
-
340
- | Model | Checkpoint | VRAM (Normal) | VRAM (Offload) |
341
- |-------|------------|---------------|----------------|
342
- | **XXL** *(Default)* | `hunyuanvideo_foley.pth` | 20GB | 12GB |
343
- | **XL** | `hunyuanvideo_foley_xl.pth` | 16GB | 8GB |
344
-
345
- ### 🎬 **Single Video Generation**
346
-
347
- <div style="background: #e8f5e8; padding: 15px; border-radius: 8px; border-left: 4px solid #28a745; margin: 10px 0;color: #333;">
348
-
349
- Generate Foley audio for a single video file with text description:
350
-
351
- </div>
352
-
353
- ```bash
354
- # Use XXL model (default, best quality)
355
- python3 infer.py \
356
- --model_path PRETRAINED_MODEL_PATH_DIR \
357
- --single_video video_path \
358
- --single_prompt "audio description" \
359
- --output_dir OUTPUT_DIR \
360
- # --enable_offload
361
-
362
- # Use XL model (memory-friendly)
363
- python3 infer.py \
364
- --model_path PRETRAINED_MODEL_PATH_DIR \
365
- --model_size xl \
366
- --single_video video_path \
367
- --single_prompt "audio description" \
368
- --output_dir OUTPUT_DIR \
369
- # --enable_offload
370
- ```
371
-
372
- ### 📂 **Batch Processing**
373
-
374
- <div style="background: #fff3e0; padding: 15px; border-radius: 8px; border-left: 4px solid #ff9800; margin: 10px 0;color: #333;">
375
-
376
- Process multiple videos using a CSV file with video paths and descriptions:
377
-
378
- </div>
379
-
380
- ```bash
381
- # Download sample test videos
382
- bash ./download_test_videos.sh
383
-
384
- # Batch processing
385
- python3 infer.py \
386
- --model_path PRETRAINED_MODEL_PATH_DIR \
387
- --csv_path assets/test.csv \
388
- --output_dir OUTPUT_DIR \
389
- # --enable_offload
390
- ```
391
-
392
- ### 🌐 **Interactive Web Interface**
393
-
394
- <div style="background: #f3e5f5; padding: 15px; border-radius: 8px; border-left: 4px solid #9c27b0; margin: 10px 0;color: #333;">
395
-
396
- Launch a user-friendly Gradio web interface for easy interaction:
397
-
398
- </div>
399
-
400
- ```bash
401
- # Launch with XXL model (default)
402
- export HIFI_FOLEY_MODEL_PATH=PRETRAINED_MODEL_PATH_DIR
403
- python3 gradio_app.py
404
-
405
- # Launch with XL model (memory-friendly)
406
- export HIFI_FOLEY_MODEL_PATH=PRETRAINED_MODEL_PATH_DIR
407
- MODEL_SIZE=xl python3 gradio_app.py
408
-
409
- # Optional: Enable offload to reduce memory usage
410
- ENABLE_OFFLOAD=true python3 gradio_app.py
411
- ```
412
-
413
- <div align="center" style="margin: 20px 0; color: #333;">
414
-
415
- *🚀 Then open your browser and navigate to the provided local URL to start generating Foley audio!*
416
-
417
- </div>
418
-
419
- ---
420
-
421
- ## 📚 **Citation**
422
-
423
- <div style="background: #f8f9fa; padding: 20px; border-radius: 10px; border-left: 4px solid #6c757d; margin: 20px 0; color: #333;">
424
-
425
- If you find **HunyuanVideo-Foley** useful for your research, please consider citing our paper:
426
-
427
- </div>
428
-
429
- ```bibtex
430
- @misc{shan2025hunyuanvideofoleymultimodaldiffusionrepresentation,
431
- title={HunyuanVideo-Foley: Multimodal Diffusion with Representation Alignment for High-Fidelity Foley Audio Generation},
432
- author={Sizhe Shan and Qiulin Li and Yutao Cui and Miles Yang and Yuehai Wang and Qun Yang and Jin Zhou and Zhao Zhong},
433
- year={2025},
434
- eprint={2508.16930},
435
- archivePrefix={arXiv},
436
- primaryClass={eess.AS},
437
- url={https://arxiv.org/abs/2508.16930},
438
- }
439
- ```
440
- ## Star History
441
-
442
- [![Star History Chart](https://api.star-history.com/svg?repos=Tencent-Hunyuan/HunyuanVideo-Foley&type=Date)](https://www.star-history.com/#Tencent-Hunyuan/HunyuanVideo-Foley&Date)
443
- ---
444
-
445
- ## 🙏 **Acknowledgements**
446
-
447
- <div align="center">
448
-
449
- **We extend our heartfelt gratitude to the open-source community!**
450
-
451
- </div>
452
-
453
- <table align="center" style="width: 100%; border: none; margin: 20px 0;">
454
- <tr>
455
- <td align="center" style="width: 33%; padding: 10px; vertical-align: top;">
456
-
457
- 🎨 **[Stable Diffusion 3](https://huggingface.co/stabilityai/stable-diffusion-3-medium)**
458
- *Foundation diffusion models*
459
-
460
- </td>
461
- <td align="center" style="width: 33%; padding: 10px; vertical-align: top;">
462
-
463
- ⚡ **[FLUX](https://github.com/black-forest-labs/flux)**
464
- *Advanced generation techniques*
465
-
466
- </td>
467
- <td align="center" style="width: 33%; padding: 10px; vertical-align: top;">
468
-
469
- 🎵 **[MMAudio](https://github.com/hkchengrex/MMAudio)**
470
- *Multimodal audio generation*
471
-
472
- </td>
473
- </tr>
474
- <tr>
475
- <td align="center" style="width: 33%; padding: 10px; vertical-align: top;">
476
-
477
- 🤗 **[HuggingFace](https://huggingface.co)**
478
- *Platform & diffusers library*
479
-
480
- </td>
481
- <td align="center" style="width: 33%; padding: 10px; vertical-align: top;">
482
-
483
- 🗜️ **[DAC](https://github.com/descriptinc/descript-audio-codec)**
484
- *High-Fidelity Audio Compression*
485
-
486
- </td>
487
- <td align="center" style="width: 33%; padding: 10px; vertical-align: top;">
488
-
489
- 🔗 **[Synchformer](https://github.com/v-iashin/Synchformer)**
490
- *Audio-Visual Synchronization*
491
-
492
- </td>
493
- </tr>
494
- </table>
495
-
496
- <div align="center" style="background: linear-gradient(135deg, #74b9ff 0%, #0984e3 100%); color: white; padding: 20px; border-radius: 15px; margin: 20px 0; color: #333;">
497
-
498
- **🌟 Special thanks to all researchers and developers who contribute to the advancement of AI-generated audio and multimodal learning!**
499
-
500
- </div>
501
-
502
-
503
- ---
504
-
505
- <div align="center" style="margin: 30px 0;">
506
-
507
- ### 🔗 **Connect with Us**
508
-
509
- [![GitHub](https://img.shields.io/badge/GitHub-Follow-black?style=for-the-badge&logo=github)](https://github.com/Tencent-Hunyuan)
510
- [![Twitter](https://img.shields.io/badge/Twitter-Follow-blue?style=for-the-badge&logo=twitter)](https://twitter.com/Tencent)
511
- [![Hunyuan](https://img.shields.io/badge/Website-HunyuanAI-green?style=for-the-badge&logo=hunyuan)](https://hunyuan.tencent.com/)
512
-
513
- <p style="color: #666; margin-top: 15px; font-size: 14px;">
514
-
515
- © 2025 Tencent Hunyuan. All rights reserved. | Made with ❤️ for the AI community
516
-
517
- </p>
518
-
519
- </div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
HunyuanVideo-Foley/build_package.sh DELETED
@@ -1,58 +0,0 @@
1
- #!/bin/bash
2
- # 构建 HunyuanVideo-Foley Python 包的脚本
3
-
4
- set -e # 出现错误时退出
5
-
6
- echo "🚀 开始构建 HunyuanVideo-Foley Python 包..."
7
-
8
- # 清理之前的构建文件
9
- echo "🧹 清理之前的构建文件..."
10
- rm -rf build/ dist/ *.egg-info/
11
-
12
- # 检查必要的工具
13
- echo "🔍 检查构建工具..."
14
- python -c "import setuptools, wheel; print('✅ setuptools和wheel已安装')" || {
15
- echo "❌ 请安装构建工具: pip install setuptools wheel"
16
- exit 1
17
- }
18
-
19
- # 检查setup.py
20
- echo "🔍 验证setup.py配置..."
21
- python setup.py check --restructuredtext --strict || {
22
- echo "⚠️ setup.py验证有警告,但继续构建..."
23
- }
24
-
25
- # 构建源码分发包
26
- echo "📦 构建源码分发包..."
27
- python setup.py sdist
28
-
29
- # 构建wheel包
30
- echo "🎡 构建wheel包..."
31
- python setup.py bdist_wheel
32
-
33
- # 显示构建结果
34
- echo "✅ 构建完成!生成的包:"
35
- ls -la dist/
36
-
37
- # 验证包
38
- echo "🔍 验证生成的包..."
39
- python -m pip check dist/*.whl || echo "⚠️ 包验证有警告"
40
-
41
- echo ""
42
- echo "📝 安装说明:"
43
- echo "# 从wheel文件安装:"
44
- echo "pip install dist/hunyuanvideo_foley-1.0.0-py3-none-any.whl"
45
- echo ""
46
- echo "# 开发模式安装:"
47
- echo "pip install -e ."
48
- echo ""
49
- echo "# 安装所有可选依赖:"
50
- echo "pip install -e .[all]"
51
- echo ""
52
-
53
- echo "⚠️ 注意:某些依赖需要单独安装:"
54
- echo "pip install git+https://github.com/descriptinc/audiotools"
55
- echo "pip install git+https://github.com/huggingface/transformers@v4.49.0-SigLIP-2"
56
-
57
- echo ""
58
- echo "🎉 构建完成!查看 INSTALL.md 获取详细安装指南。"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
HunyuanVideo-Foley/download_test_videos.sh DELETED
@@ -1,11 +0,0 @@
1
- #!/bin/bash
2
-
3
- # Download the 10 MovieGenAudioBenchSfx test videos
4
- curl -O https://texttoaudio-train-1258344703.cos.ap-guangzhou.myqcloud.com/hunyuanvideo-foley_demo/MovieGenAudioBenchSfx.tar.gz
5
- tar -xzvf MovieGenAudioBenchSfx.tar.gz -C ./assets
6
- rm MovieGenAudioBenchSfx.tar.gz
7
-
8
- # Download gradio example video
9
- curl -O https://texttoaudio-train-1258344703.cos.ap-guangzhou.myqcloud.com/hunyuanvideo-foley_demo/examples.tar.gz
10
- tar -xvzf examples.tar.gz
11
- rm examples.tar.gz
 
 
 
 
 
 
 
 
 
 
 
 
HunyuanVideo-Foley/gradio_app.py DELETED
@@ -1,834 +0,0 @@
1
- import os
2
- import tempfile
3
- import gradio as gr
4
- import torch
5
- import torchaudio
6
- from loguru import logger
7
- from typing import Optional, Tuple
8
- import random
9
- import numpy as np
10
-
11
- from hunyuanvideo_foley.utils.model_utils import load_model
12
- from hunyuanvideo_foley.utils.feature_utils import feature_process
13
- from hunyuanvideo_foley.utils.model_utils import denoise_process
14
- from hunyuanvideo_foley.utils.media_utils import merge_audio_video
15
-
16
- # Global variables for model storage
17
- model_dict = None
18
- cfg = None
19
- device = None
20
-
21
- # Model directory: set the HIFI_FOLEY_MODEL_PATH environment variable to override the default below
22
- MODEL_PATH = os.environ.get("HIFI_FOLEY_MODEL_PATH", "./pretrained_models/")
23
- ENABLE_OFFLOAD = os.environ.get("ENABLE_OFFLOAD", "false").lower() in ("true", "1", "yes")
24
- MODEL_SIZE = os.environ.get("MODEL_SIZE", "xxl") # default to xxl model
25
- CONFIG_PATH = os.environ.get("CONFIG_PATH", "")
26
-
27
- def setup_device(device_str: str = "auto", gpu_id: int = 0) -> torch.device:
28
- """Setup computing device"""
29
- if device_str == "auto":
30
- if torch.cuda.is_available():
31
- device = torch.device(f"cuda:{gpu_id}")
32
- logger.info(f"Using CUDA device: {device}")
33
- elif torch.backends.mps.is_available():
34
- device = torch.device("mps")
35
- logger.info("Using MPS device")
36
- else:
37
- device = torch.device("cpu")
38
- logger.info("Using CPU device")
39
- else:
40
- if device_str == "cuda":
41
- device = torch.device(f"cuda:{gpu_id}")
42
- else:
43
- device = torch.device(device_str)
44
- logger.info(f"Using specified device: {device}")
45
-
46
- return device
47
-
48
- def auto_load_models() -> str:
49
- """Automatically load preset models"""
50
- global model_dict, cfg, device
51
-
52
- try:
53
- if not os.path.exists(MODEL_PATH):
54
- return f"❌ Model directory not found: {MODEL_PATH}"
55
-
56
- # Auto-select the device: CUDA (GPU 0) if available, otherwise MPS, otherwise CPU
57
- device = setup_device("auto", 0)
58
-
59
- # Auto-select config if not specified
60
- config_path = CONFIG_PATH
61
- if not config_path:
62
- config_mapping = {
63
- "xl": "configs/hunyuanvideo-foley-xl.yaml",
64
- "xxl": "configs/hunyuanvideo-foley-xxl.yaml"
65
- }
66
- config_path = config_mapping.get(MODEL_SIZE, "configs/hunyuanvideo-foley-xxl.yaml")
67
-
68
- # Load model
69
- logger.info("Auto-loading model...")
70
- logger.info(f"Model path: {MODEL_PATH}")
71
- logger.info(f"Model size: {MODEL_SIZE}")
72
- logger.info(f"Config path: {config_path}")
73
- logger.info(f"Offload mode: {'enabled' if ENABLE_OFFLOAD else 'disabled'}")
74
-
75
- model_dict, cfg = load_model(MODEL_PATH, config_path, device, enable_offload=ENABLE_OFFLOAD, model_size=MODEL_SIZE)
76
-
77
- logger.info("✅ Model loaded successfully!")
78
- return "✅ Model loaded successfully!"
79
-
80
- except Exception as e:
81
- logger.error(f"Model loading failed: {str(e)}")
82
- return f"❌ Model loading failed: {str(e)}"
83
-
84
- def infer_single_video(
85
- video_file,
86
- text_prompt: str,
87
- neg_prompt: str = None,
88
- guidance_scale: float = 4.5,
89
- num_inference_steps: int = 50,
90
- sample_nums: int = 1
91
- ) -> Tuple[list, str]:
92
- """Single video inference"""
93
- global model_dict, cfg, device
94
-
95
- if model_dict is None or cfg is None:
96
- return [], "❌ Please load the model first!"
97
-
98
- if video_file is None:
99
- return [], "❌ Please upload a video file!"
100
-
101
- # Allow empty text prompt, use empty string if no prompt provided
102
- if text_prompt is None:
103
- text_prompt = ""
104
- text_prompt = text_prompt.strip()
105
-
106
- try:
107
- logger.info(f"Processing video: {video_file}")
108
- logger.info(f"Text prompt: {text_prompt}")
109
-
110
- # Feature processing
111
- visual_feats, text_feats, audio_len_in_s = feature_process(
112
- video_file,
113
- text_prompt,
114
- model_dict,
115
- cfg,
116
- neg_prompt=neg_prompt
117
- )
118
-
119
- # Denoising process to generate multiple audio samples
120
- # Note: The model now generates sample_nums audio samples per inference
121
- # The denoise_process function returns audio with shape [batch_size, channels, samples]
122
- logger.info(f"Generating {sample_nums} audio samples...")
123
- audio, sample_rate = denoise_process(
124
- visual_feats,
125
- text_feats,
126
- audio_len_in_s,
127
- model_dict,
128
- cfg,
129
- guidance_scale=guidance_scale,
130
- num_inference_steps=num_inference_steps,
131
- batch_size=sample_nums
132
- )
133
-
134
- # Create temporary files to save results
135
- temp_dir = tempfile.mkdtemp()
136
- video_outputs = []
137
-
138
- # Process each generated audio sample
139
- for i in range(sample_nums):
140
- # Save audio file
141
- audio_output = os.path.join(temp_dir, f"generated_audio_{i+1}.wav")
142
- torchaudio.save(audio_output, audio[i], sample_rate)
143
-
144
- # Merge video and audio
145
- video_output = os.path.join(temp_dir, f"video_with_audio_{i+1}.mp4")
146
- merge_audio_video(audio_output, video_file, video_output)
147
- video_outputs.append(video_output)
148
-
149
- logger.info(f"Inference completed! Generated {sample_nums} samples.")
150
- return video_outputs, f"✅ Generated {sample_nums} audio sample(s) successfully!"
151
-
152
- except Exception as e:
153
- logger.error(f"Inference failed: {str(e)}")
154
- return [], f"❌ Inference failed: {str(e)}"
155
-
156
- def update_video_outputs(video_list, status_msg):
157
- """Update video outputs based on the number of generated samples"""
158
- # Initialize all outputs as None
159
- outputs = [None] * 6
160
-
161
- # Set values based on generated videos
162
- for i, video_path in enumerate(video_list[:6]): # Max 6 samples
163
- outputs[i] = video_path
164
-
165
- # Return all outputs plus status message
166
- return tuple(outputs + [status_msg])
167
-
168
- def create_gradio_interface():
169
- """Create Gradio interface"""
170
-
171
- # Custom CSS for beautiful interface with better contrast
172
- css = """
173
- .gradio-container {
174
- font-family: 'Inter', -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif;
175
- background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
176
- min-height: 100vh;
177
- }
178
-
179
- .main-header {
180
- text-align: center;
181
- padding: 2rem 0;
182
- background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
183
- border-radius: 20px;
184
- margin-bottom: 2rem;
185
- box-shadow: 0 8px 32px rgba(0,0,0,0.15);
186
- }
187
-
188
- .main-header h1 {
189
- color: white;
190
- font-size: 3rem;
191
- font-weight: 700;
192
- margin-bottom: 0.5rem;
193
- text-shadow: 0 2px 10px rgba(0,0,0,0.3);
194
- }
195
-
196
- .main-header p {
197
- color: rgba(255, 255, 255, 0.95);
198
- font-size: 1.2rem;
199
- font-weight: 300;
200
- }
201
-
202
- .status-card {
203
- background: white;
204
- border-radius: 15px;
205
- padding: 1rem;
206
- margin-bottom: 1.5rem;
207
- border: 1px solid #e1e5e9;
208
- box-shadow: 0 4px 20px rgba(0,0,0,0.08);
209
- }
210
-
211
- .status-card label {
212
- color: #2d3748 !important;
213
- font-weight: 600 !important;
214
- }
215
-
216
- .usage-guide h3 {
217
- color: #2d3748 !important;
218
- font-weight: 600 !important;
219
- margin-bottom: 0.5rem !important;
220
- }
221
-
222
- .usage-guide p {
223
- color: #4a5568 !important;
224
- font-size: 1rem !important;
225
- line-height: 1.6 !important;
226
- margin: 0.5rem 0 !important;
227
- }
228
-
229
- .usage-guide strong {
230
- color: #1a202c !important;
231
- font-weight: 700 !important;
232
- }
233
-
234
- .usage-guide em {
235
- color: #1a202c !important;
236
- font-weight: 700 !important;
237
- font-style: normal !important;
238
- }
239
-
240
- .main-interface {
241
- margin-bottom: 2rem;
242
- }
243
-
244
- .input-section {
245
- background: white;
246
- border-radius: 20px;
247
- padding: 2rem;
248
- margin-right: 1rem;
249
- box-shadow: 0 8px 32px rgba(0,0,0,0.1);
250
- border: 1px solid #e1e5e9;
251
- }
252
-
253
- .input-section h3 {
254
- color: #2d3748 !important;
255
- font-weight: 600 !important;
256
- margin-bottom: 1rem !important;
257
- }
258
-
259
- .input-section label {
260
- color: #4a5568 !important;
261
- font-weight: 500 !important;
262
- }
263
-
264
- .output-section {
265
- background: white;
266
- border-radius: 20px;
267
- padding: 2rem;
268
- margin-left: 1rem;
269
- box-shadow: 0 8px 32px rgba(0,0,0,0.1);
270
- border: 1px solid #e1e5e9;
271
- }
272
-
273
- .output-section h3 {
274
- color: #2d3748 !important;
275
- font-weight: 600 !important;
276
- margin-bottom: 1rem !important;
277
- }
278
-
279
- .output-section label {
280
- color: #4a5568 !important;
281
- font-weight: 500 !important;
282
- }
283
-
284
- .examples-section h3 {
285
- color: #2d3748 !important;
286
- font-weight: 600 !important;
287
- margin-bottom: 1.5rem !important;
288
- }
289
-
290
- .generate-btn {
291
- background: linear-gradient(45deg, #667eea, #764ba2) !important;
292
- border: none !important;
293
- color: white !important;
294
- font-weight: 600 !important;
295
- font-size: 1.1rem !important;
296
- padding: 12px 30px !important;
297
- border-radius: 25px !important;
298
- box-shadow: 0 4px 15px rgba(102, 126, 234, 0.4) !important;
299
- transition: all 0.3s ease !important;
300
- }
301
-
302
- .generate-btn:hover {
303
- transform: translateY(-2px) !important;
304
- box-shadow: 0 8px 25px rgba(102, 126, 234, 0.6) !important;
305
- }
306
-
307
-
308
-
309
- .examples-section {
310
- background: white;
311
- border-radius: 20px;
312
- padding: 2rem;
313
- margin-top: 2rem;
314
- box-shadow: 0 8px 32px rgba(0,0,0,0.1);
315
- border: 1px solid #e1e5e9;
316
- }
317
-
318
- .examples-section p {
319
- color: #4a5568 !important;
320
- margin-bottom: 1rem !important;
321
- }
322
-
323
- .example-row {
324
- background: #f8fafc;
325
- border: 1px solid #e2e8f0;
326
- border-radius: 15px;
327
- padding: 1.5rem;
328
- margin: 1rem 0;
329
- transition: all 0.3s ease;
330
- align-items: center;
331
- }
332
-
333
- .example-row:hover {
334
- border-color: #667eea;
335
- transform: translateY(-2px);
336
- box-shadow: 0 4px 20px rgba(102, 126, 234, 0.15);
337
- }
338
-
339
- .example-row .markdown {
340
- color: #2d3748 !important;
341
- }
342
-
343
- .example-row .markdown p {
344
- color: #2d3748 !important;
345
- margin: 0.5rem 0 !important;
346
- line-height: 1.5 !important;
347
- }
348
-
349
- .example-row .markdown strong {
350
- color: #1a202c !important;
351
- font-weight: 600 !important;
352
- }
353
-
354
- /* Example grid layout styles */
355
- .example-grid-row {
356
- margin: 1rem 0;
357
- gap: 1rem;
358
- }
359
-
360
- .example-item {
361
- background: #f8fafc;
362
- border: 1px solid #e2e8f0;
363
- border-radius: 15px;
364
- padding: 1rem;
365
- transition: all 0.3s ease;
366
- margin: 0.25rem;
367
- max-width: 250px;
368
- margin-left: auto;
369
- margin-right: auto;
370
- }
371
-
372
- .example-item:hover {
373
- border-color: #667eea;
374
- transform: translateY(-2px);
375
- box-shadow: 0 4px 20px rgba(102, 126, 234, 0.15);
376
- }
377
-
378
- .example-caption {
379
- margin: 0.5rem 0 !important;
380
- min-height: 2.8rem !important;
381
- display: flex !important;
382
- align-items: flex-start !important;
383
- }
384
-
385
- .example-caption p {
386
- color: #2d3748 !important;
387
- font-size: 0.9rem !important;
388
- line-height: 1.4 !important;
389
- margin: 0.5rem 0 !important;
390
- }
391
-
392
- /* Multi-video gallery styles */
393
- .additional-samples {
394
- margin-top: 1rem;
395
- gap: 0.5rem;
396
- }
397
-
398
- .additional-samples .gradio-video {
399
- border-radius: 10px;
400
- overflow: hidden;
401
- }
402
-
403
- /* Video gallery responsive layout */
404
- .video-gallery {
405
- display: grid;
406
- gap: 1rem;
407
- margin-top: 1rem;
408
- }
409
-
410
- .video-gallery.single {
411
- grid-template-columns: 1fr;
412
- }
413
-
414
- .video-gallery.dual {
415
- grid-template-columns: 1fr 1fr;
416
- }
417
-
418
- .video-gallery.multi {
419
- grid-template-columns: repeat(2, 1fr);
420
- grid-template-rows: auto auto auto;
421
- }
422
-
423
- .footer-text {
424
- color: #718096 !important;
425
- text-align: center;
426
- padding: 2rem;
427
- font-size: 0.9rem;
428
- }
429
-
430
- /* Video component styling for consistent size */
431
- .input-section video,
432
- .output-section video,
433
- .example-row video {
434
- width: 100% !important;
435
- height: 300px !important;
436
- object-fit: contain !important;
437
- border-radius: 10px !important;
438
- background-color: #000 !important;
439
- }
440
-
441
- .example-row video {
442
- height: 150px !important;
443
- }
444
-
445
- /* Fix for additional samples video display */
446
- .additional-samples video {
447
- height: 150px !important;
448
- object-fit: contain !important;
449
- border-radius: 10px !important;
450
- background-color: #000 !important;
451
- }
452
-
453
- .additional-samples .gradio-video {
454
- border-radius: 10px !important;
455
- overflow: hidden !important;
456
- background-color: #000 !important;
457
- }
458
-
459
- .additional-samples .gradio-video > div {
460
- background-color: #000 !important;
461
- border-radius: 10px !important;
462
- }
463
-
464
- /* Video container styling */
465
- .input-section .video-container,
466
- .output-section .video-container,
467
- .example-row .video-container {
468
- background-color: #000 !important;
469
- border-radius: 10px !important;
470
- display: flex !important;
471
- align-items: center !important;
472
- justify-content: center !important;
473
- overflow: hidden !important;
474
- }
475
-
476
- /* Ensure proper alignment */
477
- .example-row {
478
- display: flex !important;
479
- align-items: stretch !important;
480
- }
481
-
482
- .example-row > div {
483
- display: flex !important;
484
- flex-direction: column !important;
485
- justify-content: center !important;
486
- }
487
-
488
- /* Video wrapper for better control */
489
- .video-wrapper {
490
- position: relative !important;
491
- width: 100% !important;
492
- background: #000 !important;
493
- border-radius: 10px !important;
494
- overflow: hidden !important;
495
- display: flex !important;
496
- align-items: center !important;
497
- justify-content: center !important;
498
- }
499
- """
500
-
501
- with gr.Blocks(css=css, title="HunyuanVideo-Foley") as app:
502
-
503
- # Main header
504
- with gr.Column(elem_classes=["main-header"]):
505
- gr.HTML("""
506
- <h1>🎵 HunyuanVideo-Foley</h1>
507
- <p>Text-Video-to-Audio Synthesis: Generate realistic audio from video and text descriptions</p>
508
- """)
509
-
510
- # Usage Guide
511
- with gr.Column(elem_classes=["status-card"]):
512
- gr.Markdown("""
513
- ### 📋 Quick Start Guide
514
- **1.** Upload your video file\t**2.** Add optional text description\t**3.** Adjust sample numbers (1-6)\t**4.** Click Generate Audio
515
-
516
- 💡 For quick start, you can load the prepared examples by clicking the button.
517
- """, elem_classes=["usage-guide"])
518
-
519
- # Main inference interface - Input and Results side by side
520
- with gr.Row(elem_classes=["main-interface"]):
521
- # Input section
522
- with gr.Column(scale=1, elem_classes=["input-section"]):
523
- gr.Markdown("### 📹 Video Input")
524
-
525
- video_input = gr.Video(
526
- label="Upload Video",
527
- info="Supported formats: MP4, AVI, MOV, etc.",
528
- height=300
529
- )
530
-
531
- text_input = gr.Textbox(
532
- label="🎯 Audio Description (English)",
533
- placeholder="A person walks on frozen ice",
534
- lines=3,
535
- info="Describe the audio you want to generate (optional)"
536
- )
537
-
538
- neg_prompt_input = gr.Textbox(
539
- label="🚫 Negative Prompt",
540
- placeholder="noisy, harsh",
541
- lines=2,
542
- info="Describe what you want to avoid in the generated audio (optional, default: 'noisy, harsh')"
543
- )
544
-
545
- with gr.Row():
546
- guidance_scale = gr.Slider(
547
- minimum=1.0,
548
- maximum=10.0,
549
- value=4.5,
550
- step=0.1,
551
- label="🎚️ CFG Scale",
552
- )
553
-
554
- inference_steps = gr.Slider(
555
- minimum=10,
556
- maximum=100,
557
- value=50,
558
- step=5,
559
- label="⚡ Steps",
560
- )
561
-
562
- sample_nums = gr.Slider(
563
- minimum=1,
564
- maximum=6,
565
- value=1,
566
- step=1,
567
- label="🎲 Sample Nums",
568
- )
569
-
570
- generate_btn = gr.Button(
571
- "🎵 Generate Audio",
572
- variant="primary",
573
- elem_classes=["generate-btn"]
574
- )
575
-
576
- # Results section
577
- with gr.Column(scale=1, elem_classes=["output-section"]):
578
- gr.Markdown("### 🎥 Generated Results")
579
-
580
- # Multi-video gallery for displaying multiple generated samples
581
- with gr.Column():
582
- # Primary video (Sample 1)
583
- video_output_1 = gr.Video(
584
- label="Sample 1",
585
- height=250,
586
- visible=True
587
- )
588
-
589
- # Additional videos (Samples 2-6) - initially hidden
590
- with gr.Row(elem_classes=["additional-samples"]):
591
- with gr.Column(scale=1):
592
- video_output_2 = gr.Video(
593
- label="Sample 2",
594
- height=150,
595
- visible=False
596
- )
597
- video_output_3 = gr.Video(
598
- label="Sample 3",
599
- height=150,
600
- visible=False
601
- )
602
- with gr.Column(scale=1):
603
- video_output_4 = gr.Video(
604
- label="Sample 4",
605
- height=150,
606
- visible=False
607
- )
608
- video_output_5 = gr.Video(
609
- label="Sample 5",
610
- height=150,
611
- visible=False
612
- )
613
-
614
- # Sample 6 - full width
615
- video_output_6 = gr.Video(
616
- label="Sample 6",
617
- height=150,
618
- visible=False
619
- )
620
-
621
- result_text = gr.Textbox(
622
- label="Status",
623
- interactive=False,
624
- lines=2
625
- )
626
-
627
- # Examples section at the bottom
628
- with gr.Column(elem_classes=["examples-section"]):
629
- gr.Markdown("### 🌟 Examples")
630
- gr.Markdown("Click on any example to load it into the interface above")
631
-
632
- # Define your custom examples here - 8 examples total
633
- examples_data = [
634
- # Example 1
635
- {
636
- "caption": "A person walks on frozen ice",
637
- "video_path": "examples/1_video.mp4",
638
- "result_path": "examples/1_result.mp4"
639
- },
640
- # Example 2
641
- {
642
- "caption": "With a faint sound as their hands parted, the two embraced, a soft 'mm' escaping between them.",
643
- "video_path": "examples/2_video.mp4",
644
- "result_path": "examples/2_result.mp4"
645
- },
646
- # Example 3
647
- {
648
- "caption": "The sound of the number 3's bouncing footsteps is as light and clear as glass marbles hitting the ground. Each step carries a magical sound.",
649
- "video_path": "examples/3_video.mp4",
650
- "result_path": "examples/3_result.mp4"
651
- },
652
- # Example 4
653
- {
654
- "caption": "gentle gurgling of the stream's current, and music plays in the background which is a beautiful and serene piano solo with a hint of classical charm, evoking a sense of peace and serenity in people's hearts.",
655
- "video_path": "examples/4_video.mp4",
656
- "result_path": "examples/4_result.mp4"
657
- },
658
- # Example 5 - Add your new examples here
659
- {
660
- "caption": "snow crunching under the snowboard's edge.",
661
- "video_path": "examples/5_video.mp4",
662
- "result_path": "examples/5_result.mp4"
663
- },
664
- # Example 6
665
- {
666
- "caption": "The crackling of the fire, the whooshing of the flames, and the occasional crisp popping of charred leaves filled the forest.",
667
- "video_path": "examples/6_video.mp4",
668
- "result_path": "examples/6_result.mp4"
669
- },
670
- # Example 7
671
- {
672
- "caption": "humming of the scooter engine accelerates slowly.",
673
- "video_path": "examples/7_video.mp4",
674
- "result_path": "examples/7_result.mp4"
675
- },
676
- # Example 8
677
- {
678
- "caption": "splash of water and loud thud as person hits the surface.",
679
- "video_path": "examples/8_video.mp4",
680
- "result_path": "examples/8_result.mp4"
681
- }
682
- ]
683
-
684
- # Create example grid - 4 examples per row, 2 rows total
685
- example_buttons = []
686
- for row in range(2): # 2 rows
687
- with gr.Row(elem_classes=["example-grid-row"]):
688
- for col in range(4): # 4 columns
689
- idx = row * 4 + col
690
- if idx < len(examples_data):
691
- example = examples_data[idx]
692
-
693
- with gr.Column(scale=1, elem_classes=["example-item"]):
694
- # Video thumbnail
695
- if os.path.exists(example['video_path']):
696
- example_video = gr.Video(
697
- value=example['video_path'],
698
- label=f"Example {idx+1}",
699
- interactive=False,
700
- show_label=True,
701
- height=180
702
- )
703
- else:
704
- example_video = gr.HTML(f"""
705
- <div style="background: #f0f0f0; padding: 15px; text-align: center; border-radius: 8px; height: 180px; display: flex; align-items: center; justify-content: center;">
706
- <div>
707
- <p style="color: #666; margin: 0; font-size: 12px;">📹 Video not found</p>
708
- <small style="color: #999; font-size: 10px;">{example['video_path']}</small>
709
- </div>
710
- </div>
711
- """)
712
-
713
- # Caption (truncated for grid layout)
714
- caption_preview = example['caption'][:60] + "..." if len(example['caption']) > 60 else example['caption']
715
- gr.Markdown(f"{caption_preview}", elem_classes=["example-caption"])
716
-
717
- # Load button
718
- example_btn = gr.Button(
719
- f"Load Example {idx+1}",
720
- variant="secondary",
721
- size="sm"
722
- )
723
- example_buttons.append((example_btn, example))
724
-
725
- # Event handlers
726
- def process_inference(video_file, text_prompt, neg_prompt, guidance_scale, inference_steps, sample_nums):
727
- # Generate videos
728
- video_list, status_msg = infer_single_video(
729
- video_file, text_prompt, neg_prompt, guidance_scale, inference_steps, int(sample_nums)
730
- )
731
- # Update outputs with proper visibility
732
- return update_video_outputs(video_list, status_msg)
733
-
734
- # Add dynamic visibility control based on sample_nums
735
- def update_visibility(sample_nums):
736
- sample_nums = int(sample_nums)
737
- return [
738
- gr.update(visible=True), # Sample 1 always visible
739
- gr.update(visible=sample_nums >= 2), # Sample 2
740
- gr.update(visible=sample_nums >= 3), # Sample 3
741
- gr.update(visible=sample_nums >= 4), # Sample 4
742
- gr.update(visible=sample_nums >= 5), # Sample 5
743
- gr.update(visible=sample_nums >= 6), # Sample 6
744
- ]
745
-
746
- # Update visibility when sample_nums changes
747
- sample_nums.change(
748
- fn=update_visibility,
749
- inputs=[sample_nums],
750
- outputs=[video_output_1, video_output_2, video_output_3, video_output_4, video_output_5, video_output_6]
751
- )
752
-
753
- generate_btn.click(
754
- fn=process_inference,
755
- inputs=[video_input, text_input, neg_prompt_input, guidance_scale, inference_steps, sample_nums],
756
- outputs=[
757
- video_output_1, # Sample 1 value
758
- video_output_2, # Sample 2 value
759
- video_output_3, # Sample 3 value
760
- video_output_4, # Sample 4 value
761
- video_output_5, # Sample 5 value
762
- video_output_6, # Sample 6 value
763
- result_text
764
- ]
765
- )
766
-
767
- # Add click handlers for example buttons
768
- for btn, example in example_buttons:
769
- def create_example_handler(ex):
770
- def handler():
771
- # Check if files exist, if not, return placeholder message
772
- if os.path.exists(ex['video_path']):
773
- video_file = ex['video_path']
774
- else:
775
- video_file = None
776
-
777
- if os.path.exists(ex['result_path']):
778
- result_video = ex['result_path']
779
- else:
780
- result_video = None
781
-
782
- status_msg = f"✅ Loaded example with caption: {ex['caption'][:50]}..."
783
- if not video_file:
784
- status_msg += f"\n⚠️ Video file not found: {ex['video_path']}"
785
- if not result_video:
786
- status_msg += f"\n⚠️ Result video not found: {ex['result_path']}"
787
-
788
- return video_file, ex['caption'], "noisy, harsh", result_video, status_msg
789
- return handler
790
-
791
- btn.click(
792
- fn=create_example_handler(example),
793
- outputs=[video_input, text_input, neg_prompt_input, video_output_1, result_text]
794
- )
795
-
796
- # Footer
797
- gr.HTML("""
798
- <div class="footer-text">
799
- <p>🚀 Powered by HunyuanVideo-Foley | Generate high-quality audio from video and text descriptions</p>
800
- </div>
801
- """)
802
-
803
- return app
804
-
805
- def set_manual_seed(global_seed):
806
- random.seed(global_seed)
807
- np.random.seed(global_seed)
808
- torch.manual_seed(global_seed)
809
-
810
- if __name__ == "__main__":
811
- set_manual_seed(1)
812
- # Setup logging
813
- logger.remove()
814
- logger.add(lambda msg: print(msg, end=''), level="INFO")
815
-
816
- # Auto-load model
817
- logger.info("Starting application and loading model...")
818
- model_load_result = auto_load_models()
819
- logger.info(model_load_result)
820
-
821
- # Create and launch Gradio app
822
- app = create_gradio_interface()
823
-
824
- # Log completion status
825
- if "successfully" in model_load_result:
826
- logger.info("Application ready, model loaded")
827
-
828
- app.launch(
829
- server_name="0.0.0.0",
830
- server_port=8080,
831
- share=False,
832
- debug=False,
833
- show_error=True
834
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
HunyuanVideo-Foley/infer.py DELETED
@@ -1,304 +0,0 @@
1
- import os
2
- import argparse
3
- import random
4
- import numpy as np
5
- import torch
6
- import pandas as pd
7
- import torchaudio
8
- from loguru import logger
9
- from hunyuanvideo_foley.utils.model_utils import load_model
10
- from hunyuanvideo_foley.utils.feature_utils import feature_process
11
- from hunyuanvideo_foley.utils.model_utils import denoise_process
12
- from hunyuanvideo_foley.utils.media_utils import merge_audio_video
13
-
14
- def set_manual_seed(global_seed):
15
- random.seed(global_seed)
16
- np.random.seed(global_seed)
17
- torch.manual_seed(global_seed)
18
-
19
- def infer(video_path, prompt, model_dict, cfg, guidance_scale=4.5, num_inference_steps=50, neg_prompt=None):
20
- visual_feats, text_feats, audio_len_in_s = feature_process(
21
- video_path,
22
- prompt,
23
- model_dict,
24
- cfg,
25
- neg_prompt=neg_prompt
26
- )
27
-
28
- audio, sample_rate = denoise_process(
29
- visual_feats,
30
- text_feats,
31
- audio_len_in_s,
32
- model_dict,
33
- cfg,
34
- guidance_scale=guidance_scale,
35
- num_inference_steps=num_inference_steps
36
- )
37
- return audio[0], sample_rate
38
-
39
-
40
- def generate_audio(model_dict, cfg, csv_path, output_dir, guidance_scale=4.5, num_inference_steps=50, neg_prompt=None):
41
-
42
- os.makedirs(output_dir, exist_ok=True)
43
- test_df = pd.read_csv(csv_path)
44
-
45
- for index, row in test_df.iterrows():
46
- video_path = row['video']
47
- prompt = row['prompt']
48
-
49
- logger.info(f"Processing video: {video_path}")
50
- logger.info(f"Prompt: {prompt}")
51
-
52
- output_audio_path = os.path.join(output_dir, f"{index:04d}.wav")
53
- output_video_path = os.path.join(output_dir, f"{index:04d}.mp4")
54
-
55
- if not os.path.exists(output_audio_path) or not os.path.exists(output_video_path):
56
- audio, sample_rate = infer(video_path, prompt, model_dict, cfg, guidance_scale=guidance_scale, num_inference_steps=num_inference_steps, neg_prompt=neg_prompt)
57
- torchaudio.save(output_audio_path, audio, sample_rate)
58
-
59
- merge_audio_video(output_audio_path, video_path, output_video_path)
60
-
61
- logger.info(f"All audio files saved to {output_dir}")
62
-
63
-
64
- def parse_args():
65
- parser = argparse.ArgumentParser(
66
- description="HunyuanVideo-Foley: Generate audio from video and text prompts",
67
- formatter_class=argparse.ArgumentDefaultsHelpFormatter
68
- )
69
-
70
- parser.add_argument(
71
- "--model_path",
72
- type=str,
73
- required=True,
74
- help="Path to the pretrained model dir"
75
- )
76
- parser.add_argument(
77
- "--config_path",
78
- type=str,
79
- help="Path to the configuration file (.yaml file). If not specified, will be inferred from model_size"
80
- )
81
- parser.add_argument(
82
- "--model_size",
83
- type=str,
84
- choices=["xl", "xxl"],
85
- default="xxl",
86
- help="Model size (xl/xxl). Auto-selects config and model file (default: xxl)"
87
- )
88
-
89
- input_group = parser.add_mutually_exclusive_group(required=True)
90
- input_group.add_argument(
91
- "--csv_path",
92
- type=str,
93
- help="Path to CSV file containing video paths and text prompts (columns: 'video', 'text')"
94
- )
95
- input_group.add_argument(
96
- "--single_video",
97
- type=str,
98
- help="Path to a single video file for inference"
99
- )
100
- parser.add_argument(
101
- "--single_prompt",
102
- type=str,
103
- help="Text prompt for single video (required when using --single_video)"
104
- )
105
- parser.add_argument(
106
- "--neg_prompt",
107
- type=str,
108
- default=None,
109
- help="Negative prompt to avoid during generation (default: 'noisy, harsh')"
110
- )
111
-
112
- parser.add_argument(
113
- "--output_dir",
114
- type=str,
115
- required=True,
116
- help="Directory to save generated audio and video files"
117
- )
118
-
119
- parser.add_argument(
120
- "--guidance_scale",
121
- type=float,
122
- default=4.5,
123
- help="Guidance scale for classifier-free guidance (higher = more text adherence)"
124
- )
125
- parser.add_argument(
126
- "--num_inference_steps",
127
- type=int,
128
- default=50,
129
- help="Number of denoising steps for diffusion sampling"
130
- )
131
- parser.add_argument(
132
- "--audio_length",
133
- type=float,
134
- default=None,
135
- help="Maximum audio length in seconds (default: video length)"
136
- )
137
-
138
- parser.add_argument(
139
- "--device",
140
- type=str,
141
- default="auto",
142
- choices=["auto", "cpu", "cuda", "mps"],
143
- help="Device to use for inference"
144
- )
145
- parser.add_argument(
146
- "--gpu_id",
147
- type=int,
148
- default=0,
149
- help="GPU ID to use when device is cuda"
150
- )
151
-
152
- parser.add_argument(
153
- "--batch_size",
154
- type=int,
155
- default=1,
156
- help="Batch size for processing multiple videos"
157
- )
158
- parser.add_argument(
159
- "--skip_existing",
160
- action="store_true",
161
- help="Skip processing if output files already exist"
162
- )
163
- parser.add_argument(
164
- "--save_video",
165
- action="store_true",
166
- default=True,
167
- help="Save video with generated audio merged"
168
- )
169
- parser.add_argument(
170
- "--log_level",
171
- type=str,
172
- default="INFO",
173
- choices=["DEBUG", "INFO", "WARNING", "ERROR"],
174
- help="Logging level"
175
- )
176
- parser.add_argument(
177
- "--enable_offload",
178
- action="store_true",
179
- help="Enable model offloading to reduce peak memory usage (good for small VRAM GPUs)"
180
- )
181
-
182
- args = parser.parse_args()
183
-
184
- if args.single_video and not args.single_prompt:
185
- parser.error("--single_prompt is required when using --single_video")
186
-
187
- # 如果指定了model_size,自动推断config_path和model文件
188
- if args.model_size:
189
- config_mapping = {
190
- "xl": "configs/hunyuanvideo-foley-xl.yaml",
191
- "xxl": "configs/hunyuanvideo-foley-xxl.yaml"
192
- }
193
-
194
- if not args.config_path:
195
- args.config_path = config_mapping[args.model_size]
196
- logger.info(f"Auto-selected config for {args.model_size} model: {args.config_path}")
197
- elif not args.config_path:
198
- args.model_size = "xxl"
199
- args.config_path = "configs/hunyuanvideo-foley-xxl.yaml"
200
- logger.info(f"Using default {args.model_size} model: {args.config_path}")
201
-
202
- return args
203
-
204
-
205
- def setup_device(device_str, gpu_id=0):
206
- if device_str == "auto":
207
- if torch.cuda.is_available():
208
- device = torch.device(f"cuda:{gpu_id}")
209
- logger.info(f"Using CUDA device: {device}")
210
- elif torch.backends.mps.is_available():
211
- device = torch.device("mps")
212
- logger.info("Using MPS device")
213
- else:
214
- device = torch.device("cpu")
215
- logger.info("Using CPU device")
216
- else:
217
- if device_str == "cuda":
218
- device = torch.device(f"cuda:{gpu_id}")
219
- else:
220
- device = torch.device(device_str)
221
- logger.info(f"Using specified device: {device}")
222
-
223
- return device
224
-
225
-
226
- def process_single_video(video_path, prompt, model_dict, cfg, output_dir, args):
227
- logger.info(f"Processing single video: {video_path}")
228
- logger.info(f"Text prompt: {prompt}")
229
-
230
- video_name = os.path.splitext(os.path.basename(video_path))[0]
231
- output_audio_path = os.path.join(output_dir, f"{video_name}_generated.wav")
232
- output_video_path = os.path.join(output_dir, f"{video_name}_with_audio.mp4")
233
-
234
- if args.skip_existing and os.path.exists(output_audio_path):
235
- logger.info(f"Skipping existing audio file: {output_audio_path}")
236
- if args.save_video and os.path.exists(output_video_path):
237
- logger.info(f"Skipping existing video file: {output_video_path}")
238
- return
239
-
240
- audio, sample_rate = infer(
241
- video_path, prompt, model_dict, cfg,
242
- guidance_scale=args.guidance_scale,
243
- num_inference_steps=args.num_inference_steps,
244
- neg_prompt=args.neg_prompt
245
- )
246
-
247
- torchaudio.save(output_audio_path, audio, sample_rate)
248
- logger.info(f"Audio saved to: {output_audio_path}")
249
-
250
- if args.save_video:
251
- merge_audio_video(output_audio_path, video_path, output_video_path)
252
- logger.info(f"Video with audio saved to: {output_video_path}")
253
-
254
- def main():
255
- set_manual_seed(1)
256
- args = parse_args()
257
-
258
- logger.remove()
259
- logger.add(lambda msg: print(msg, end=''), level=args.log_level)
260
-
261
- device = setup_device(args.device, args.gpu_id)
262
-
263
- if not os.path.exists(args.model_path):
264
- logger.error(f"Model file not found: {args.model_path}")
265
- exit(1)
266
- if not os.path.exists(args.config_path):
267
- logger.error(f"Config file not found: {args.config_path}")
268
- exit(1)
269
-
270
- if args.csv_path:
271
- if not os.path.exists(args.csv_path):
272
- logger.error(f"CSV file not found: {args.csv_path}")
273
- exit(1)
274
- elif args.single_video:
275
- if not os.path.exists(args.single_video):
276
- logger.error(f"Video file not found: {args.single_video}")
277
- exit(1)
278
-
279
- os.makedirs(args.output_dir, exist_ok=True)
280
- logger.info(f"Output directory: {args.output_dir}")
281
-
282
- logger.info("Loading models...")
283
- model_dict, cfg = load_model(args.model_path, args.config_path, device, enable_offload=args.enable_offload, model_size=args.model_size)
284
-
285
- if args.single_video:
286
- process_single_video(
287
- args.single_video, args.single_prompt,
288
- model_dict, cfg, args.output_dir, args
289
- )
290
- else:
291
- generate_audio(
292
- model_dict, cfg,
293
- args.csv_path, args.output_dir,
294
- guidance_scale=args.guidance_scale,
295
- num_inference_steps=args.num_inference_steps,
296
- neg_prompt=args.neg_prompt
297
- )
298
-
299
- logger.info("Processing completed!")
300
-
301
-
302
-
303
- if __name__ == "__main__":
304
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
HunyuanVideo-Foley/pytest.ini DELETED
@@ -1,11 +0,0 @@
1
- [tool:pytest]
2
- testpaths = tests
3
- python_files = test_*.py
4
- python_functions = test_*
5
- addopts =
6
- --verbose
7
- --tb=short
8
- --strict-markers
9
- --disable-warnings
10
- markers =
11
- slow: marks tests as slow (deselect with '-m "not slow"')
 
 
 
 
 
 
 
 
 
 
 
 
HunyuanVideo-Foley/tests/__init__.py DELETED
@@ -1 +0,0 @@
1
- # Test suite for HunyuanVideo-Foley
 
 
HunyuanVideo-Foley/tests/test_config_utils.py DELETED
@@ -1,89 +0,0 @@
1
- """Tests for configuration utilities."""
2
-
3
- import pytest
4
- import tempfile
5
- import yaml
6
- from pathlib import Path
7
-
8
- from hunyuanvideo_foley.utils.config_utils import AttributeDict, load_yaml
9
-
10
-
11
- class TestAttributeDict:
12
- """Test cases for AttributeDict class."""
13
-
14
- def test_dict_access(self):
15
- """Test dictionary-style access."""
16
- data = {"key1": "value1", "key2": {"nested": "value2"}}
17
- attr_dict = AttributeDict(data)
18
-
19
- assert attr_dict["key1"] == "value1"
20
- assert attr_dict["key2"]["nested"] == "value2"
21
-
22
- def test_attribute_access(self):
23
- """Test attribute-style access."""
24
- data = {"key1": "value1", "key2": {"nested": "value2"}}
25
- attr_dict = AttributeDict(data)
26
-
27
- assert attr_dict.key1 == "value1"
28
- assert attr_dict.key2.nested == "value2"
29
-
30
- def test_list_handling(self):
31
- """Test list data handling."""
32
- data = [1, 2, {"nested": "value"}]
33
- attr_dict = AttributeDict(data)
34
-
35
- assert attr_dict[0] == 1
36
- assert attr_dict[2].nested == "value"
37
-
38
- def test_keys_method(self):
39
- """Test keys() method."""
40
- data = {"key1": "value1", "key2": "value2"}
41
- attr_dict = AttributeDict(data)
42
-
43
- keys = list(attr_dict.keys())
44
- assert "key1" in keys
45
- assert "key2" in keys
46
-
47
- def test_get_method(self):
48
- """Test get() method."""
49
- data = {"key1": "value1"}
50
- attr_dict = AttributeDict(data)
51
-
52
- assert attr_dict.get("key1") == "value1"
53
- assert attr_dict.get("nonexistent", "default") == "default"
54
-
55
-
56
- class TestLoadYaml:
57
- """Test cases for load_yaml function."""
58
-
59
- def test_load_valid_yaml(self):
60
- """Test loading valid YAML file."""
61
- data = {"model": {"name": "test_model", "params": {"lr": 0.001}}}
62
-
63
- with tempfile.NamedTemporaryFile(mode='w', suffix='.yaml', delete=False) as f:
64
- yaml.dump(data, f)
65
- yaml_path = f.name
66
-
67
- try:
68
- result = load_yaml(yaml_path)
69
- assert result.model.name == "test_model"
70
- assert result.model.params.lr == 0.001
71
- finally:
72
- Path(yaml_path).unlink()
73
-
74
- def test_load_nonexistent_file(self):
75
- """Test loading non-existent file."""
76
- with pytest.raises(FileNotFoundError):
77
- load_yaml("nonexistent.yaml")
78
-
79
- def test_load_invalid_yaml(self):
80
- """Test loading invalid YAML file."""
81
- with tempfile.NamedTemporaryFile(mode='w', suffix='.yaml', delete=False) as f:
82
- f.write("invalid: yaml: content: [\n") # Invalid YAML
83
- yaml_path = f.name
84
-
85
- try:
86
- with pytest.raises(yaml.YAMLError):
87
- load_yaml(yaml_path)
88
- finally:
89
- Path(yaml_path).unlink()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
HunyuanVideo-Foley/tests/test_media_utils.py DELETED
@@ -1,82 +0,0 @@
1
- """Tests for media utilities."""
2
-
3
- import pytest
4
- import tempfile
5
- import os
6
- from unittest.mock import patch, MagicMock
7
-
8
- from hunyuanvideo_foley.utils.media_utils import merge_audio_video, MediaProcessingError
9
-
10
-
11
- class TestMergeAudioVideo:
12
- """Test cases for merge_audio_video function."""
13
-
14
- def test_invalid_audio_path(self):
15
- """Test with non-existent audio file."""
16
- with pytest.raises(MediaProcessingError, match="Audio file not found"):
17
- merge_audio_video("nonexistent.wav", "video.mp4", "output.mp4")
18
-
19
- def test_invalid_video_path(self):
20
- """Test with non-existent video file."""
21
- with tempfile.NamedTemporaryFile(suffix='.wav') as audio_file:
22
- with pytest.raises(MediaProcessingError, match="Video file not found"):
23
- merge_audio_video(audio_file.name, "nonexistent.mp4", "output.mp4")
24
-
25
- @patch('subprocess.Popen')
26
- def test_successful_merge(self, mock_popen):
27
- """Test successful merge operation."""
28
- # Create temporary files
29
- with tempfile.NamedTemporaryFile(suffix='.wav') as audio_file, \
30
- tempfile.NamedTemporaryFile(suffix='.mp4') as video_file, \
31
- tempfile.NamedTemporaryFile(suffix='.mp4', delete=False) as output_file:
32
-
33
- # Mock successful subprocess
34
- mock_process = MagicMock()
35
- mock_process.returncode = 0
36
- mock_process.communicate.return_value = ("", "")
37
- mock_popen.return_value = mock_process
38
-
39
- result = merge_audio_video(
40
- audio_file.name,
41
- video_file.name,
42
- output_file.name
43
- )
44
-
45
- assert result == output_file.name
46
- mock_popen.assert_called_once()
47
-
48
- # Cleanup
49
- os.unlink(output_file.name)
50
-
51
- @patch('subprocess.Popen')
52
- def test_ffmpeg_failure(self, mock_popen):
53
- """Test ffmpeg failure handling."""
54
- # Create temporary files
55
- with tempfile.NamedTemporaryFile(suffix='.wav') as audio_file, \
56
- tempfile.NamedTemporaryFile(suffix='.mp4') as video_file:
57
-
58
- # Mock failed subprocess
59
- mock_process = MagicMock()
60
- mock_process.returncode = 1
61
- mock_process.communicate.return_value = ("", "FFmpeg error")
62
- mock_popen.return_value = mock_process
63
-
64
- with pytest.raises(MediaProcessingError, match="FFmpeg failed"):
65
- merge_audio_video(
66
- audio_file.name,
67
- video_file.name,
68
- "output.mp4"
69
- )
70
-
71
- @patch('subprocess.Popen', side_effect=FileNotFoundError)
72
- def test_ffmpeg_not_found(self, mock_popen):
73
- """Test ffmpeg not found error."""
74
- with tempfile.NamedTemporaryFile(suffix='.wav') as audio_file, \
75
- tempfile.NamedTemporaryFile(suffix='.mp4') as video_file:
76
-
77
- with pytest.raises(FileNotFoundError, match="ffmpeg not found"):
78
- merge_audio_video(
79
- audio_file.name,
80
- video_file.name,
81
- "output.mp4"
82
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
MMAudio/.gitignore DELETED
@@ -1,146 +0,0 @@
1
- run_*.sh
2
- log/
3
- saves
4
- saves/
5
- weights/
6
- weights
7
- output/
8
- output
9
- pretrained/
10
- workspace
11
- workspace/
12
- ext_weights/
13
- ext_weights
14
- .checkpoints/
15
- .vscode/
16
- training/example_output/
17
-
18
- # Byte-compiled / optimized / DLL files
19
- __pycache__/
20
- *.py[cod]
21
- *$py.class
22
-
23
- # C extensions
24
- *.so
25
-
26
- # Distribution / packaging
27
- .Python
28
- build/
29
- develop-eggs/
30
- dist/
31
- downloads/
32
- eggs/
33
- .eggs/
34
- lib/
35
- lib64/
36
- parts/
37
- sdist/
38
- var/
39
- wheels/
40
- pip-wheel-metadata/
41
- share/python-wheels/
42
- *.egg-info/
43
- .installed.cfg
44
- *.egg
45
- MANIFEST
46
-
47
- # PyInstaller
48
- # Usually these files are written by a python script from a template
49
- # before PyInstaller builds the exe, so as to inject date/other infos into it.
50
- *.manifest
51
- *.spec
52
-
53
- # Installer logs
54
- pip-log.txt
55
- pip-delete-this-directory.txt
56
-
57
- # Unit test / coverage reports
58
- htmlcov/
59
- .tox/
60
- .nox/
61
- .coverage
62
- .coverage.*
63
- .cache
64
- nosetests.xml
65
- coverage.xml
66
- *.cover
67
- *.py,cover
68
- .hypothesis/
69
- .pytest_cache/
70
-
71
- # Translations
72
- *.mo
73
- *.pot
74
-
75
- # Django stuff:
76
- *.log
77
- local_settings.py
78
- db.sqlite3
79
- db.sqlite3-journal
80
-
81
- # Flask stuff:
82
- instance/
83
- .webassets-cache
84
-
85
- # Scrapy stuff:
86
- .scrapy
87
-
88
- # Sphinx documentation
89
- docs/_build/
90
-
91
- # PyBuilder
92
- target/
93
-
94
- # Jupyter Notebook
95
- .ipynb_checkpoints
96
-
97
- # IPython
98
- profile_default/
99
- ipython_config.py
100
-
101
- # pyenv
102
- .python-version
103
-
104
- # pipenv
105
- # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
106
- # However, in case of collaboration, if having platform-specific dependencies or dependencies
107
- # having no cross-platform support, pipenv may install dependencies that don't work, or not
108
- # install all needed dependencies.
109
- #Pipfile.lock
110
-
111
- # PEP 582; used by e.g. github.com/David-OConnor/pyflow
112
- __pypackages__/
113
-
114
- # Celery stuff
115
- celerybeat-schedule
116
- celerybeat.pid
117
-
118
- # SageMath parsed files
119
- *.sage.py
120
-
121
- # Environments
122
- .env
123
- .venv
124
- env/
125
- venv/
126
- ENV/
127
- env.bak/
128
- venv.bak/
129
-
130
- # Spyder project settings
131
- .spyderproject
132
- .spyproject
133
-
134
- # Rope project settings
135
- .ropeproject
136
-
137
- # mkdocs documentation
138
- /site
139
-
140
- # mypy
141
- .mypy_cache/
142
- .dmypy.json
143
- dmypy.json
144
-
145
- # Pyre type checker
146
- .pyre/
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
MMAudio/LICENSE DELETED
@@ -1,21 +0,0 @@
1
- MIT License
2
-
3
- Copyright (c) 2024 Sony Research Inc.
4
-
5
- Permission is hereby granted, free of charge, to any person obtaining a copy
6
- of this software and associated documentation files (the "Software"), to deal
7
- in the Software without restriction, including without limitation the rights
8
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
- copies of the Software, and to permit persons to whom the Software is
10
- furnished to do so, subject to the following conditions:
11
-
12
- The above copyright notice and this permission notice shall be included in all
13
- copies or substantial portions of the Software.
14
-
15
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
- SOFTWARE.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
MMAudio/README.md DELETED
@@ -1,198 +0,0 @@
1
- <div align="center">
2
-
3
- https://github.com/hkchengrex/MMAudio
4
-
5
- <p align="center">
6
- <h2>MMAudio</h2>
7
- <a href="https://arxiv.org/abs/2412.15322">Paper</a> | <a href="https://hkchengrex.github.io/MMAudio">Webpage</a> | <a href="https://huggingface.co/hkchengrex/MMAudio/tree/main">Models</a> | <a href="https://huggingface.co/spaces/hkchengrex/MMAudio"> Huggingface Demo</a> | <a href="https://colab.research.google.com/drive/1TAaXCY2-kPk4xE4PwKB3EqFbSnkUuzZ8?usp=sharing">Colab Demo</a> | <a href="https://replicate.com/zsxkib/mmaudio">Replicate Demo</a>
8
- </p>
9
- </div>
10
-
11
- ## [Taming Multimodal Joint Training for High-Quality Video-to-Audio Synthesis](https://hkchengrex.github.io/MMAudio)
12
-
13
- [Ho Kei Cheng](https://hkchengrex.github.io/), [Masato Ishii](https://scholar.google.co.jp/citations?user=RRIO1CcAAAAJ), [Akio Hayakawa](https://scholar.google.com/citations?user=sXAjHFIAAAAJ), [Takashi Shibuya](https://scholar.google.com/citations?user=XCRO260AAAAJ), [Alexander Schwing](https://www.alexander-schwing.de/), [Yuki Mitsufuji](https://www.yukimitsufuji.com/)
14
-
15
- University of Illinois Urbana-Champaign, Sony AI, and Sony Group Corporation
16
-
17
- CVPR 2025
18
-
19
- ## Highlight
20
-
21
- MMAudio generates synchronized audio given video and/or text inputs.
22
- Our key innovation is multimodal joint training which allows training on a wide range of audio-visual and audio-text datasets.
23
- Moreover, a synchronization module aligns the generated audio with the video frames.
24
-
25
- Check out this fun video:
26
-
27
- [![Does Your Voice Match Your Face?](https://img.youtube.com/vi/SLz3NWLyHxg/0.jpg)](https://youtu.be/SLz3NWLyHxg)
28
-
29
- [[Does Your Voice Match Your Face? https://youtu.be/SLz3NWLyHxg]](https://youtu.be/SLz3NWLyHxg)
30
-
31
- ## Results
32
-
33
- (All audio from our algorithm MMAudio)
34
-
35
- Videos from Sora:
36
-
37
- https://github.com/user-attachments/assets/82afd192-0cee-48a1-86ca-bd39b8c8f330
38
-
39
- Videos from Veo 2:
40
-
41
- https://github.com/user-attachments/assets/8a11419e-fee2-46e0-9e67-dfb03c48d00e
42
-
43
- Videos from MovieGen/Hunyuan Video/VGGSound:
44
-
45
- https://github.com/user-attachments/assets/29230d4e-21c1-4cf8-a221-c28f2af6d0ca
46
-
47
- For more results, visit https://hkchengrex.com/MMAudio/video_main.html.
48
-
49
-
50
- ## Installation
51
-
52
- We have only tested this on Ubuntu.
53
-
54
- ### Prerequisites
55
-
56
- We recommend using a [miniforge](https://github.com/conda-forge/miniforge) environment.
57
-
58
- - Python 3.9+
59
- - PyTorch **2.5.1+** and corresponding torchvision/torchaudio (pick your CUDA version https://pytorch.org/, pip install recommended)
60
- <!-- - ffmpeg<7 ([this is required by torchaudio](https://pytorch.org/audio/master/installation.html#optional-dependencies), you can install it in a miniforge environment with `conda install -c conda-forge 'ffmpeg<7'`) -->
61
-
62
- **1. Install prerequisite if not yet met:**
63
-
64
- ```bash
65
- pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 --upgrade
66
- ```
67
-
68
- (Or any other CUDA versions that your GPUs/driver support)
69
-
70
- <!-- ```
71
- conda install -c conda-forge 'ffmpeg<7'
72
- ```
73
- (Optional, if you use miniforge and don't already have the appropriate ffmpeg) -->
74
-
75
- **2. Clone our repository:**
76
-
77
- ```bash
78
- git clone https://github.com/hkchengrex/MMAudio.git
79
- ```
80
-
81
- **3. Install with pip (install pytorch first before attempting this!):**
82
-
83
- ```bash
84
- cd MMAudio
85
- pip install -e .
86
- ```
87
-
88
- (If you encounter the File "setup.py" not found error, upgrade your pip with pip install --upgrade pip)
89
-
90
-
91
- **Pretrained models:**
92
-
93
- The models will be downloaded automatically when you run the demo script. MD5 checksums are provided in `mmaudio/utils/download_utils.py`.
94
- The models are also available at https://huggingface.co/hkchengrex/MMAudio/tree/main
95
- See [MODELS.md](docs/MODELS.md) for more details.
96
-
97
- ## Demo
98
-
99
- By default, these scripts use the `large_44k_v2` model.
100
- In our experiments, inference only takes around 6GB of GPU memory (in 16-bit mode) which should fit in most modern GPUs.
101
-
102
- ### Command-line interface
103
-
104
- With `demo.py`
105
-
106
- ```bash
107
- python demo.py --duration=8 --video=<path to video> --prompt "your prompt"
108
- ```
109
-
110
- The output (audio in `.flac` format, and video in `.mp4` format) will be saved in `./output`.
111
- See the file for more options.
112
- Simply omit the `--video` option for text-to-audio synthesis.
113
- The default output (and training) duration is 8 seconds. Longer/shorter durations could also work, but a large deviation from the training duration may result in a lower quality.
114
-
115
- ### Gradio interface
116
-
117
- Supports video-to-audio and text-to-audio synthesis.
118
- You can also try experimental image-to-audio synthesis which duplicates the input image to a video for processing. This might be interesting to some but it is not something MMAudio has been trained for.
119
- Use [port forwarding](https://unix.stackexchange.com/questions/115897/whats-ssh-port-forwarding-and-whats-the-difference-between-ssh-local-and-remot) (e.g., `ssh -L 7860:localhost:7860 server`) if necessary. The default port is `7860` which you can specify with `--port`.
120
-
121
- ```bash
122
- python gradio_demo.py
123
- ```
124
-
125
- ### FAQ
126
-
127
- 1. Video processing
128
- - Processing higher-resolution videos takes longer due to encoding and decoding (which can take >95% of the processing time!), but it does not improve the quality of results.
129
- - The CLIP encoder resizes input frames to 384×384 pixels.
130
- - Synchformer resizes the shorter edge to 224 pixels and applies a center crop, focusing only on the central square of each frame.
131
- 2. Frame rates
132
- - The CLIP model operates at 8 FPS, while Synchformer works at 25 FPS.
133
- - Frame rate conversion happens on-the-fly via the video reader.
134
- - For input videos with a frame rate below 25 FPS, frames will be duplicated to match the required rate.
135
- 3. Failure cases
136
- As with most models of this type, failures can occur, and the reasons are not always clear. Below are some known failure modes. If you notice a failure mode or believe there’s a bug, feel free to open an issue in the repository.
137
- 4. Performance variations
138
- We notice that there can be subtle performance variations across different hardware and software environments. Some of the reasons include using/not using `torch.compile`, video reader library/backend, inference precision, batch sizes, random seeds, etc. We (will) provide pre-computed results on standard benchmarks for reference. Results obtained from this codebase should be similar but might not be exactly the same.
139
-
140
- ### Known limitations
141
-
142
- 1. The model sometimes generates unintelligible human speech-like sounds
143
- 2. The model sometimes generates background music (without explicit training, it would not be high quality)
144
- 3. The model struggles with unfamiliar concepts, e.g., it can generate "gunfires" but not "RPG firing".
145
-
146
- We believe all three of these limitations can be addressed with more high-quality training data.
147
-
148
- ## Training
149
-
150
- See [TRAINING.md](docs/TRAINING.md).
151
-
152
- ## Evaluation
153
-
154
- See [EVAL.md](docs/EVAL.md).
155
-
156
- ## Training Datasets
157
-
158
- MMAudio was trained on several datasets, including [AudioSet](https://research.google.com/audioset/), [Freesound](https://github.com/LAION-AI/audio-dataset/blob/main/laion-audio-630k/README.md), [VGGSound](https://www.robots.ox.ac.uk/~vgg/data/vggsound/), [AudioCaps](https://audiocaps.github.io/), and [WavCaps](https://github.com/XinhaoMei/WavCaps). These datasets are subject to specific licenses, which can be accessed on their respective websites. We do not guarantee that the pre-trained models are suitable for commercial use. Please use them at your own risk.
159
-
160
- ## Update Logs
161
-
162
- - 2025-03-09: Uploaded the corrected tsv files. See [TRAINING.md](docs/TRAINING.md).
163
- - 2025-02-27: Disabled the GradScaler by default to improve training stability. See #49.
164
- - 2024-12-23: Added training and batch evaluation scripts.
165
- - 2024-12-14: Removed the `ffmpeg<7` requirement for the demos by replacing `torio.io.StreamingMediaDecoder` with `pyav` for reading frames. The read frames are also cached, so we are not reading the same frames again during reconstruction. This should speed things up and make installation less of a hassle.
166
- - 2024-12-13: Improved for-loop processing in CLIP/Sync feature extraction by introducing a batch size multiplier. We can approximately use 40x batch size for CLIP/Sync without using more memory, thereby speeding up processing. Removed VAE encoder during inference -- we don't need it.
167
- - 2024-12-11: Replaced `torio.io.StreamingMediaDecoder` with `pyav` for reading framerate when reconstructing the input video. `torio.io.StreamingMediaDecoder` does not work reliably in huggingface ZeroGPU's environment, and I suspect that it might not work in some other environments as well.
168
-
169
- ## Citation
170
-
171
- ```bibtex
172
- @inproceedings{cheng2025taming,
173
- title={{MMAudio}: Taming Multimodal Joint Training for High-Quality Video-to-Audio Synthesis},
174
- author={Cheng, Ho Kei and Ishii, Masato and Hayakawa, Akio and Shibuya, Takashi and Schwing, Alexander and Mitsufuji, Yuki},
175
- booktitle={CVPR},
176
- year={2025}
177
- }
178
- ```
179
-
180
- ## Relevant Repositories
181
-
182
- - [av-benchmark](https://github.com/hkchengrex/av-benchmark) for benchmarking results.
183
-
184
- ## License
185
- - The code in this repository is released under the MIT license as found in the [LICENSE file](LICENSE)
186
- - The checkpoints are released on Hugging Face under the CC-BY-NC 4.0 license as found at [https://creativecommons.org/licenses/by-nc/4.0/](https://creativecommons.org/licenses/by-nc/4.0/).
187
-
188
- ## Disclaimer
189
-
190
- We have no affiliation with and have no knowledge of the party behind the domain "mmaudio.net".
191
-
192
- ## Acknowledgement
193
-
194
- Many thanks to:
195
- - [Make-An-Audio 2](https://github.com/bytedance/Make-An-Audio-2) for the 16kHz BigVGAN pretrained model and the VAE architecture
196
- - [BigVGAN](https://github.com/NVIDIA/BigVGAN)
197
- - [Synchformer](https://github.com/v-iashin/Synchformer)
198
- - [EDM2](https://github.com/NVlabs/edm2) for the magnitude-preserving VAE network architecture
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
MMAudio/batch_eval.py DELETED
@@ -1,110 +0,0 @@
1
- import logging
2
- import os
3
- from pathlib import Path
4
-
5
- import hydra
6
- import torch
7
- import torch.distributed as distributed
8
- import torchaudio
9
- from hydra.core.hydra_config import HydraConfig
10
- from omegaconf import DictConfig
11
- from tqdm import tqdm
12
-
13
- from mmaudio.data.data_setup import setup_eval_dataset
14
- from mmaudio.eval_utils import ModelConfig, all_model_cfg, generate
15
- from mmaudio.model.flow_matching import FlowMatching
16
- from mmaudio.model.networks import MMAudio, get_my_mmaudio
17
- from mmaudio.model.utils.features_utils import FeaturesUtils
18
-
19
- torch.backends.cuda.matmul.allow_tf32 = True
20
- torch.backends.cudnn.allow_tf32 = True
21
-
22
- local_rank = int(os.environ['LOCAL_RANK'])
23
- world_size = int(os.environ['WORLD_SIZE'])
24
- log = logging.getLogger()
25
-
26
-
27
- @torch.inference_mode()
28
- @hydra.main(version_base='1.3.2', config_path='config', config_name='eval_config.yaml')
29
- def main(cfg: DictConfig):
30
- device = 'cuda'
31
- torch.cuda.set_device(local_rank)
32
-
33
- if cfg.model not in all_model_cfg:
34
- raise ValueError(f'Unknown model variant: {cfg.model}')
35
- model: ModelConfig = all_model_cfg[cfg.model]
36
- model.download_if_needed()
37
- seq_cfg = model.seq_cfg
38
-
39
- run_dir = Path(HydraConfig.get().run.dir)
40
- if cfg.output_name is None:
41
- output_dir = run_dir / cfg.dataset
42
- else:
43
- output_dir = run_dir / f'{cfg.dataset}-{cfg.output_name}'
44
- output_dir.mkdir(parents=True, exist_ok=True)
45
-
46
- # load a pretrained model
47
- seq_cfg.duration = cfg.duration_s
48
- net: MMAudio = get_my_mmaudio(cfg.model).to(device).eval()
49
- net.load_weights(torch.load(model.model_path, map_location=device, weights_only=True))
50
- log.info(f'Loaded weights from {model.model_path}')
51
- net.update_seq_lengths(seq_cfg.latent_seq_len, seq_cfg.clip_seq_len, seq_cfg.sync_seq_len)
52
- log.info(f'Latent seq len: {seq_cfg.latent_seq_len}')
53
- log.info(f'Clip seq len: {seq_cfg.clip_seq_len}')
54
- log.info(f'Sync seq len: {seq_cfg.sync_seq_len}')
55
-
56
- # misc setup
57
- rng = torch.Generator(device=device)
58
- rng.manual_seed(cfg.seed)
59
- fm = FlowMatching(cfg.sampling.min_sigma,
60
- inference_mode=cfg.sampling.method,
61
- num_steps=cfg.sampling.num_steps)
62
-
63
- feature_utils = FeaturesUtils(tod_vae_ckpt=model.vae_path,
64
- synchformer_ckpt=model.synchformer_ckpt,
65
- enable_conditions=True,
66
- mode=model.mode,
67
- bigvgan_vocoder_ckpt=model.bigvgan_16k_path,
68
- need_vae_encoder=False)
69
- feature_utils = feature_utils.to(device).eval()
70
-
71
- if cfg.compile:
72
- net.preprocess_conditions = torch.compile(net.preprocess_conditions)
73
- net.predict_flow = torch.compile(net.predict_flow)
74
- feature_utils.compile()
75
-
76
- dataset, loader = setup_eval_dataset(cfg.dataset, cfg)
77
-
78
- with torch.amp.autocast(enabled=cfg.amp, dtype=torch.bfloat16, device_type=device):
79
- for batch in tqdm(loader):
80
- audios = generate(batch.get('clip_video', None),
81
- batch.get('sync_video', None),
82
- batch.get('caption', None),
83
- feature_utils=feature_utils,
84
- net=net,
85
- fm=fm,
86
- rng=rng,
87
- cfg_strength=cfg.cfg_strength,
88
- clip_batch_size_multiplier=64,
89
- sync_batch_size_multiplier=64)
90
- audios = audios.float().cpu()
91
- names = batch['name']
92
- for audio, name in zip(audios, names):
93
- torchaudio.save(output_dir / f'{name}.flac', audio, seq_cfg.sampling_rate)
94
-
95
-
96
- def distributed_setup():
97
- distributed.init_process_group(backend="nccl")
98
- local_rank = distributed.get_rank()
99
- world_size = distributed.get_world_size()
100
- log.info(f'Initialized: local_rank={local_rank}, world_size={world_size}')
101
- return local_rank, world_size
102
-
103
-
104
- if __name__ == '__main__':
105
- distributed_setup()
106
-
107
- main()
108
-
109
- # clean-up
110
- distributed.destroy_process_group()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
MMAudio/config/__init__.py DELETED
File without changes
MMAudio/config/base_config.yaml DELETED
@@ -1,62 +0,0 @@
1
- defaults:
2
- - data: base
3
- - eval_data: base
4
- - override hydra/job_logging: custom-simplest
5
- - _self_
6
-
7
- hydra:
8
- run:
9
- dir: ./output/${exp_id}
10
- output_subdir: ${now:%Y-%m-%d_%H-%M-%S}-hydra
11
-
12
- enable_email: False
13
-
14
- model: small_16k
15
-
16
- exp_id: default
17
- debug: False
18
- cudnn_benchmark: True
19
- compile: True
20
- amp: True
21
- weights: null
22
- checkpoint: null
23
- seed: 14159265
24
- num_workers: 10 # per-GPU
25
- pin_memory: False # set to True if your system can handle it, i.e., have enough memory
26
-
27
- # NOTE: This DOES NOT affect the model during inference in any way
28
- # they are just for the dataloader to fill in the missing data in multi-modal loading
29
- # to change the sequence length for the model, see networks.py
30
- data_dim:
31
- text_seq_len: 77
32
- clip_dim: 1024
33
- sync_dim: 768
34
- text_dim: 1024
35
-
36
- # ema configuration
37
- ema:
38
- enable: True
39
- sigma_rels: [0.05, 0.1]
40
- update_every: 1
41
- checkpoint_every: 5_000
42
- checkpoint_folder: ${hydra:run.dir}/ema_ckpts
43
- default_output_sigma: 0.05
44
-
45
-
46
- # sampling
47
- sampling:
48
- mean: 0.0
49
- scale: 1.0
50
- min_sigma: 0.0
51
- method: euler
52
- num_steps: 25
53
-
54
- # classifier-free guidance
55
- null_condition_probability: 0.1
56
- cfg_strength: 4.5
57
-
58
- # checkpoint paths to external modules
59
- vae_16k_ckpt: ./ext_weights/v1-16.pth
60
- vae_44k_ckpt: ./ext_weights/v1-44.pth
61
- bigvgan_vocoder_ckpt: ./ext_weights/best_netG.pt
62
- synchformer_ckpt: ./ext_weights/synchformer_state_dict.pth
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
MMAudio/config/data/base.yaml DELETED
@@ -1,70 +0,0 @@
1
- VGGSound:
2
- root: ../data/video
3
- subset_name: sets/vgg3-train.tsv
4
- fps: 8
5
- height: 384
6
- width: 384
7
- sample_duration_sec: 8.0
8
-
9
- VGGSound_test:
10
- root: ../data/video
11
- subset_name: sets/vgg3-test.tsv
12
- fps: 8
13
- height: 384
14
- width: 384
15
- sample_duration_sec: 8.0
16
-
17
- VGGSound_val:
18
- root: ../data/video
19
- subset_name: sets/vgg3-val.tsv
20
- fps: 8
21
- height: 384
22
- width: 384
23
- sample_duration_sec: 8.0
24
-
25
- ExtractedVGG:
26
- tsv: ../data/v1-16-memmap/vgg-train.tsv
27
- memmap_dir: ../data/v1-16-memmap/vgg-train
28
-
29
- ExtractedVGG_test:
30
- tag: test
31
- gt_cache: ../data/eval-cache/vggsound-test
32
- output_subdir: null
33
- tsv: ../data/v1-16-memmap/vgg-test.tsv
34
- memmap_dir: ../data/v1-16-memmap/vgg-test
35
-
36
- ExtractedVGG_val:
37
- tag: val
38
- gt_cache: ../data/eval-cache/vggsound-val
39
- output_subdir: val
40
- tsv: ../data/v1-16-memmap/vgg-val.tsv
41
- memmap_dir: ../data/v1-16-memmap/vgg-val
42
-
43
- AudioCaps:
44
- tsv: ../data/v1-16-memmap/audiocaps.tsv
45
- memmap_dir: ../data/v1-16-memmap/audiocaps
46
-
47
- AudioSetSL:
48
- tsv: ../data/v1-16-memmap/audioset_sl.tsv
49
- memmap_dir: ../data/v1-16-memmap/audioset_sl
50
-
51
- BBCSound:
52
- tsv: ../data/v1-16-memmap/bbcsound.tsv
53
- memmap_dir: ../data/v1-16-memmap/bbcsound
54
-
55
- FreeSound:
56
- tsv: ../data/v1-16-memmap/freesound.tsv
57
- memmap_dir: ../data/v1-16-memmap/freesound
58
-
59
- Clotho:
60
- tsv: ../data/v1-16-memmap/clotho.tsv
61
- memmap_dir: ../data/v1-16-memmap/clotho
62
-
63
- Example_video:
64
- tsv: ./training/example_output/memmap/vgg-example.tsv
65
- memmap_dir: ./training/example_output/memmap/vgg-example
66
-
67
- Example_audio:
68
- tsv: ./training/example_output/memmap/audio-example.tsv
69
- memmap_dir: ./training/example_output/memmap/audio-example
70
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
MMAudio/config/eval_config.yaml DELETED
@@ -1,17 +0,0 @@
1
- defaults:
2
- - base_config
3
- - override hydra/job_logging: custom-simplest
4
- - _self_
5
-
6
- hydra:
7
- run:
8
- dir: ./output/${exp_id}
9
- output_subdir: eval-${now:%Y-%m-%d_%H-%M-%S}-hydra
10
-
11
- exp_id: ${model}
12
- dataset: audiocaps
13
- duration_s: 8.0
14
-
15
- # for inference, this is the per-GPU batch size
16
- batch_size: 16
17
- output_name: null
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
MMAudio/config/eval_data/base.yaml DELETED
@@ -1,22 +0,0 @@
1
- AudioCaps:
2
- audio_path: ../data/AudioCaps-test-audioldm-ver
3
- # a csv file, with a header row of 'name' and 'caption'
4
- # name should match the audio file name without extension
5
- # Can be downloaded here: https://github.com/hkchengrex/MMAudio/releases/download/v0.1/AudioCaps_audioldm_data.csv
6
- csv_path: ../data/AudioCaps-test-audioldm-ver/data.csv
7
-
8
- AudioCaps_full:
9
- audio_path: ../data/AudioCaps-test-full-ver
10
- # a csv file, with a header row of 'name' and 'caption'
11
- # name should match the audio file name without extension
12
- # Can be downloaded here: https://github.com/hkchengrex/MMAudio/releases/download/v0.1/AudioCaps_full_data.csv
13
- csv_path: ../data/AudioCaps-test-full-ver/data.csv
14
-
15
- MovieGen:
16
- video_path: ../data/MovieGen/MovieGenAudioBenchSfx/video_with_audio
17
- jsonl_path: ../data/MovieGen/MovieGenAudioBenchSfx/metadata
18
-
19
- VGGSound:
20
- video_path: ../data/test-videos
21
- # from the officially released csv file
22
- csv_path: ../data/vggsound.csv
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
MMAudio/config/hydra/job_logging/custom-eval.yaml DELETED
@@ -1,32 +0,0 @@
1
- # python logging configuration for tasks
2
- version: 1
3
- formatters:
4
- simple:
5
- format: '[%(asctime)s][%(levelname)s][r${oc.env:LOCAL_RANK}] - %(message)s'
6
- datefmt: '%Y-%m-%d %H:%M:%S'
7
- colorlog:
8
- '()': 'colorlog.ColoredFormatter'
9
- format: '[%(cyan)s%(asctime)s%(reset)s][%(log_color)s%(levelname)s%(reset)s] - %(message)s'
10
- datefmt: '%Y-%m-%d %H:%M:%S'
11
- log_colors:
12
- DEBUG: purple
13
- INFO: green
14
- WARNING: yellow
15
- ERROR: red
16
- CRITICAL: red
17
- handlers:
18
- console:
19
- class: logging.StreamHandler
20
- formatter: colorlog
21
- stream: ext://sys.stdout
22
- file:
23
- class: logging.FileHandler
24
- formatter: simple
25
- # absolute file path
26
- filename: ${hydra.runtime.output_dir}/eval-${now:%Y-%m-%d_%H-%M-%S}-rank${oc.env:LOCAL_RANK}.log
27
- mode: w
28
- root:
29
- level: INFO
30
- handlers: [console, file]
31
-
32
- disable_existing_loggers: false
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
MMAudio/config/hydra/job_logging/custom-no-rank.yaml DELETED
@@ -1,32 +0,0 @@
1
- # python logging configuration for tasks
2
- version: 1
3
- formatters:
4
- simple:
5
- format: '[%(asctime)s][%(levelname)s] - %(message)s'
6
- datefmt: '%Y-%m-%d %H:%M:%S'
7
- colorlog:
8
- '()': 'colorlog.ColoredFormatter'
9
- format: '[%(cyan)s%(asctime)s%(reset)s][%(log_color)s%(levelname)s%(reset)s] - %(message)s'
10
- datefmt: '%Y-%m-%d %H:%M:%S'
11
- log_colors:
12
- DEBUG: purple
13
- INFO: green
14
- WARNING: yellow
15
- ERROR: red
16
- CRITICAL: red
17
- handlers:
18
- console:
19
- class: logging.StreamHandler
20
- formatter: colorlog
21
- stream: ext://sys.stdout
22
- file:
23
- class: logging.FileHandler
24
- formatter: simple
25
- # absolute file path
26
- filename: ${hydra.runtime.output_dir}/${now:%Y-%m-%d_%H-%M-%S}-eval.log
27
- mode: w
28
- root:
29
- level: INFO
30
- handlers: [console, file]
31
-
32
- disable_existing_loggers: false
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
MMAudio/config/hydra/job_logging/custom-simplest.yaml DELETED
@@ -1,26 +0,0 @@
1
- # python logging configuration for tasks
2
- version: 1
3
- formatters:
4
- simple:
5
- format: '[%(asctime)s][%(levelname)s] - %(message)s'
6
- datefmt: '%Y-%m-%d %H:%M:%S'
7
- colorlog:
8
- '()': 'colorlog.ColoredFormatter'
9
- format: '[%(cyan)s%(asctime)s%(reset)s][%(log_color)s%(levelname)s%(reset)s] - %(message)s'
10
- datefmt: '%Y-%m-%d %H:%M:%S'
11
- log_colors:
12
- DEBUG: purple
13
- INFO: green
14
- WARNING: yellow
15
- ERROR: red
16
- CRITICAL: red
17
- handlers:
18
- console:
19
- class: logging.StreamHandler
20
- formatter: colorlog
21
- stream: ext://sys.stdout
22
- root:
23
- level: INFO
24
- handlers: [console]
25
-
26
- disable_existing_loggers: false
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
MMAudio/config/hydra/job_logging/custom.yaml DELETED
@@ -1,33 +0,0 @@
1
- # @package hydra.job_logging
2
- # python logging configuration for tasks
3
- version: 1
4
- formatters:
5
- simple:
6
- format: '[%(asctime)s][%(levelname)s][r${oc.env:LOCAL_RANK}] - %(message)s'
7
- datefmt: '%Y-%m-%d %H:%M:%S'
8
- colorlog:
9
- '()': 'colorlog.ColoredFormatter'
10
- format: '[%(cyan)s%(asctime)s%(reset)s][%(blue)sr${oc.env:LOCAL_RANK}%(reset)s][%(log_color)s%(levelname)s%(reset)s] - %(message)s'
11
- datefmt: '%Y-%m-%d %H:%M:%S'
12
- log_colors:
13
- DEBUG: purple
14
- INFO: green
15
- WARNING: yellow
16
- ERROR: red
17
- CRITICAL: red
18
- handlers:
19
- console:
20
- class: logging.StreamHandler
21
- formatter: colorlog
22
- stream: ext://sys.stdout
23
- file:
24
- class: logging.FileHandler
25
- formatter: simple
26
- # absolute file path
27
- filename: ${hydra.runtime.output_dir}/train-${now:%Y-%m-%d_%H-%M-%S}-rank${oc.env:LOCAL_RANK}.log
28
- mode: w
29
- root:
30
- level: INFO
31
- handlers: [console, file]
32
-
33
- disable_existing_loggers: false
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
MMAudio/config/train_config.yaml DELETED
@@ -1,41 +0,0 @@
1
- defaults:
2
- - base_config
3
- - override data: base
4
- - override hydra/job_logging: custom
5
- - _self_
6
-
7
- hydra:
8
- run:
9
- dir: ./output/${exp_id}
10
- output_subdir: train-${now:%Y-%m-%d_%H-%M-%S}-hydra
11
-
12
- ema:
13
- start: 0
14
-
15
- mini_train: False
16
- example_train: False
17
- enable_grad_scaler: False
18
- vgg_oversample_rate: 5
19
-
20
- log_text_interval: 200
21
- log_extra_interval: 20_000
22
- val_interval: 5_000
23
- eval_interval: 20_000
24
- save_eval_interval: 40_000
25
- save_weights_interval: 10_000
26
- save_checkpoint_interval: 10_000
27
- save_copy_iterations: []
28
-
29
- batch_size: 512
30
- eval_batch_size: 256 # per-GPU
31
-
32
- num_iterations: 300_000
33
- learning_rate: 1.0e-4
34
- linear_warmup_steps: 1_000
35
-
36
- lr_schedule: step
37
- lr_schedule_steps: [240_000, 270_000]
38
- lr_schedule_gamma: 0.1
39
-
40
- clip_grad_norm: 1.0
41
- weight_decay: 1.0e-6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
MMAudio/demo.py DELETED
@@ -1,141 +0,0 @@
1
- import logging
2
- from argparse import ArgumentParser
3
- from pathlib import Path
4
-
5
- import torch
6
- import torchaudio
7
-
8
- from mmaudio.eval_utils import (ModelConfig, all_model_cfg, generate, load_video, make_video,
9
- setup_eval_logging)
10
- from mmaudio.model.flow_matching import FlowMatching
11
- from mmaudio.model.networks import MMAudio, get_my_mmaudio
12
- from mmaudio.model.utils.features_utils import FeaturesUtils
13
-
14
- torch.backends.cuda.matmul.allow_tf32 = True
15
- torch.backends.cudnn.allow_tf32 = True
16
-
17
- log = logging.getLogger()
18
-
19
-
20
- @torch.inference_mode()
21
- def main():
22
- setup_eval_logging()
23
-
24
- parser = ArgumentParser()
25
- parser.add_argument('--variant',
26
- type=str,
27
- default='large_44k_v2',
28
- help='small_16k, small_44k, medium_44k, large_44k, large_44k_v2')
29
- parser.add_argument('--video', type=Path, help='Path to the video file')
30
- parser.add_argument('--prompt', type=str, help='Input prompt', default='')
31
- parser.add_argument('--negative_prompt', type=str, help='Negative prompt', default='')
32
- parser.add_argument('--duration', type=float, default=8.0)
33
- parser.add_argument('--cfg_strength', type=float, default=4.5)
34
- parser.add_argument('--num_steps', type=int, default=25)
35
-
36
- parser.add_argument('--mask_away_clip', action='store_true')
37
-
38
- parser.add_argument('--output', type=Path, help='Output directory', default='./output')
39
- parser.add_argument('--seed', type=int, help='Random seed', default=42)
40
- parser.add_argument('--skip_video_composite', action='store_true')
41
- parser.add_argument('--full_precision', action='store_true')
42
-
43
- args = parser.parse_args()
44
-
45
- if args.variant not in all_model_cfg:
46
- raise ValueError(f'Unknown model variant: {args.variant}')
47
- model: ModelConfig = all_model_cfg[args.variant]
48
- model.download_if_needed()
49
- seq_cfg = model.seq_cfg
50
-
51
- if args.video:
52
- video_path: Path = Path(args.video).expanduser()
53
- else:
54
- video_path = None
55
- prompt: str = args.prompt
56
- negative_prompt: str = args.negative_prompt
57
- output_dir: str = args.output.expanduser()
58
- seed: int = args.seed
59
- num_steps: int = args.num_steps
60
- duration: float = args.duration
61
- cfg_strength: float = args.cfg_strength
62
- skip_video_composite: bool = args.skip_video_composite
63
- mask_away_clip: bool = args.mask_away_clip
64
-
65
- device = 'cpu'
66
- if torch.cuda.is_available():
67
- device = 'cuda'
68
- elif torch.backends.mps.is_available():
69
- device = 'mps'
70
- else:
71
- log.warning('CUDA/MPS are not available, running on CPU')
72
- dtype = torch.float32 if args.full_precision else torch.bfloat16
73
-
74
- output_dir.mkdir(parents=True, exist_ok=True)
75
-
76
- # load a pretrained model
77
- net: MMAudio = get_my_mmaudio(model.model_name).to(device, dtype).eval()
78
- net.load_weights(torch.load(model.model_path, map_location=device, weights_only=True))
79
- log.info(f'Loaded weights from {model.model_path}')
80
-
81
- # misc setup
82
- rng = torch.Generator(device=device)
83
- rng.manual_seed(seed)
84
- fm = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=num_steps)
85
-
86
- feature_utils = FeaturesUtils(tod_vae_ckpt=model.vae_path,
87
- synchformer_ckpt=model.synchformer_ckpt,
88
- enable_conditions=True,
89
- mode=model.mode,
90
- bigvgan_vocoder_ckpt=model.bigvgan_16k_path,
91
- need_vae_encoder=False)
92
- feature_utils = feature_utils.to(device, dtype).eval()
93
-
94
- if video_path is not None:
95
- log.info(f'Using video {video_path}')
96
- video_info = load_video(video_path, duration)
97
- clip_frames = video_info.clip_frames
98
- sync_frames = video_info.sync_frames
99
- duration = video_info.duration_sec
100
- if mask_away_clip:
101
- clip_frames = None
102
- else:
103
- clip_frames = clip_frames.unsqueeze(0)
104
- sync_frames = sync_frames.unsqueeze(0)
105
- else:
106
- log.info('No video provided -- text-to-audio mode')
107
- clip_frames = sync_frames = None
108
-
109
- seq_cfg.duration = duration
110
- net.update_seq_lengths(seq_cfg.latent_seq_len, seq_cfg.clip_seq_len, seq_cfg.sync_seq_len)
111
-
112
- log.info(f'Prompt: {prompt}')
113
- log.info(f'Negative prompt: {negative_prompt}')
114
-
115
- audios = generate(clip_frames,
116
- sync_frames, [prompt],
117
- negative_text=[negative_prompt],
118
- feature_utils=feature_utils,
119
- net=net,
120
- fm=fm,
121
- rng=rng,
122
- cfg_strength=cfg_strength)
123
- audio = audios.float().cpu()[0]
124
- if video_path is not None:
125
- save_path = output_dir / f'{video_path.stem}.flac'
126
- else:
127
- safe_filename = prompt.replace(' ', '_').replace('/', '_').replace('.', '')
128
- save_path = output_dir / f'{safe_filename}.flac'
129
- torchaudio.save(save_path, audio, seq_cfg.sampling_rate)
130
-
131
- log.info(f'Audio saved to {save_path}')
132
- if video_path is not None and not skip_video_composite:
133
- video_save_path = output_dir / f'{video_path.stem}.mp4'
134
- make_video(video_info, video_save_path, audio, sampling_rate=seq_cfg.sampling_rate)
135
- log.info(f'Video saved to {output_dir / video_save_path}')
136
-
137
- log.info('Memory usage: %.2f GB', torch.cuda.max_memory_allocated() / (2**30))
138
-
139
-
140
- if __name__ == '__main__':
141
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
MMAudio/docs/EVAL.md DELETED
@@ -1,23 +0,0 @@
1
- # Evaluation
2
-
3
- ## Batch Evaluation
4
-
5
- To evaluate the model on a dataset, use the `batch_eval.py` script. It is significantly more efficient in large-scale evaluation compared to `demo.py`, supporting batched inference, multi-GPU inference, torch compilation, and skipping video compositions.
6
-
7
- An example of running this script with four GPUs is as follows:
8
-
9
- ```bash
10
- OMP_NUM_THREADS=4 torchrun --standalone --nproc_per_node=4 batch_eval.py duration_s=8 dataset=vggsound model=small_16k num_workers=8
11
- ```
12
-
13
- You may need to update the data paths in `config/eval_data/base.yaml`.
14
- More configuration options can be found in `config/base_config.yaml` and `config/eval_config.yaml`.
15
- You might also want to change the dataset definition if you are not evaluating on VGGSound: https://github.com/hkchengrex/MMAudio/blob/main/mmaudio/data/eval/video_dataset.py
16
-
17
- ## Precomputed Results
18
-
19
- Precomputed results for VGGSound, AudioCaps, and MovieGen are available here: https://huggingface.co/datasets/hkchengrex/MMAudio-precomputed-results
20
-
21
- ## Obtaining Quantitative Metrics
22
-
23
- Our evaluation code is available here: https://github.com/hkchengrex/av-benchmark
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
MMAudio/docs/MODELS.md DELETED
@@ -1,50 +0,0 @@
1
- # Pretrained models
2
-
3
- The models will be downloaded automatically when you run the demo script. MD5 checksums are provided in `mmaudio/utils/download_utils.py`.
4
- The models are also available at https://huggingface.co/hkchengrex/MMAudio/tree/main
5
-
6
- | Model | Download link | File size |
7
- | -------- | ------- | ------- |
8
- | Flow prediction network, small 16kHz | <a href="https://huggingface.co/hkchengrex/MMAudio/resolve/main/weights/mmaudio_small_16k.pth" download="mmaudio_small_16k.pth">mmaudio_small_16k.pth</a> | 601M |
9
- | Flow prediction network, small 44.1kHz | <a href="https://huggingface.co/hkchengrex/MMAudio/resolve/main/weights/mmaudio_small_44k.pth" download="mmaudio_small_44k.pth">mmaudio_small_44k.pth</a> | 601M |
10
- | Flow prediction network, medium 44.1kHz | <a href="https://huggingface.co/hkchengrex/MMAudio/resolve/main/weights/mmaudio_medium_44k.pth" download="mmaudio_medium_44k.pth">mmaudio_medium_44k.pth</a> | 2.4G |
11
- | Flow prediction network, large 44.1kHz | <a href="https://huggingface.co/hkchengrex/MMAudio/resolve/main/weights/mmaudio_large_44k.pth" download="mmaudio_large_44k.pth">mmaudio_large_44k.pth</a> | 3.9G |
12
- | Flow prediction network, large 44.1kHz, v2 **(recommended)** | <a href="https://huggingface.co/hkchengrex/MMAudio/resolve/main/weights/mmaudio_large_44k_v2.pth" download="mmaudio_large_44k_v2.pth">mmaudio_large_44k_v2.pth</a> | 3.9G |
13
- | 16kHz VAE | <a href="https://github.com/hkchengrex/MMAudio/releases/download/v0.1/v1-16.pth">v1-16.pth</a> | 655M |
14
- | 16kHz BigVGAN vocoder (from Make-An-Audio 2) |<a href="https://github.com/hkchengrex/MMAudio/releases/download/v0.1/best_netG.pt">best_netG.pt</a> | 429M |
15
- | 44.1kHz VAE |<a href="https://github.com/hkchengrex/MMAudio/releases/download/v0.1/v1-44.pth">v1-44.pth</a> | 1.2G |
16
- | Synchformer visual encoder |<a href="https://github.com/hkchengrex/MMAudio/releases/download/v0.1/synchformer_state_dict.pth">synchformer_state_dict.pth</a> | 907M |
17
-
18
- To run the model, you need four components: a flow prediction network, visual feature extractors (Synchformer and CLIP, CLIP will be downloaded automatically), a VAE, and a vocoder. VAEs and vocoders are specific to the sampling rate (16kHz or 44.1kHz) and not model sizes.
19
- The 44.1kHz vocoder will be downloaded automatically.
20
- The `_v2` model performs worse in benchmarking (e.g., in Fréchet distance), but, in my experience, generalizes better to new data.
21
-
22
- The expected directory structure (full):
23
-
24
- ```bash
25
- MMAudio
26
- ├── ext_weights
27
- │ ├── best_netG.pt
28
- │ ├── synchformer_state_dict.pth
29
- │ ├── v1-16.pth
30
- │ └── v1-44.pth
31
- ├── weights
32
- │ ├── mmaudio_small_16k.pth
33
- │ ├── mmaudio_small_44k.pth
34
- │ ├── mmaudio_medium_44k.pth
35
- │ ├── mmaudio_large_44k.pth
36
- │ └── mmaudio_large_44k_v2.pth
37
- └── ...
38
- ```
39
-
40
- The expected directory structure (minimal, for the recommended model only):
41
-
42
- ```bash
43
- MMAudio
44
- ├── ext_weights
45
- │ ├── synchformer_state_dict.pth
46
- │ └── v1-44.pth
47
- ├── weights
48
- │ └── mmaudio_large_44k_v2.pth
49
- └── ...
50
- ```
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
MMAudio/docs/TRAINING.md DELETED
@@ -1,184 +0,0 @@
1
- # Training
2
-
3
- ## Overview
4
-
5
- We have put a large emphasis on making training as fast as possible.
6
- Consequently, some pre-processing steps are required.
7
-
8
- Namely, before starting any training, we
9
-
10
- 1. Obtain training data as videos, audios, and captions.
11
- 2. Encode training audios into spectrograms and then with VAE into mean/std
12
- 3. Extract CLIP and synchronization features from videos
13
- 4. Extract CLIP features from text (captions)
14
- 5. Encode all extracted features into [MemoryMappedTensors](https://pytorch.org/tensordict/main/reference/generated/tensordict.MemoryMappedTensor.html) with [TensorDict](https://pytorch.org/tensordict/main/reference/tensordict.html)
15
-
16
- **NOTE:** for maximum training speed (e.g., when training the base model with 2*H100s), you would need around 3~5 GB/s of random read speed. Spinning disks would not be able to catch up and most consumer-grade SSDs would struggle. In my experience, the best bet is to have a large enough system memory such that the OS can cache the data. This way, the data is read from RAM instead of disk.
17
-
18
- The current training script does not support `_v2` training.
19
-
20
- ## Recommended Hardware Configuration
21
-
22
- These are what I recommend for a smooth and efficient training experience. These are not minimum requirements.
23
-
24
- - Single-node machine. We did not implement multi-node training
25
- - GPUs: for the small model, two 80G-H100s or above; for the large model, eight 80G-H100s or above
26
- - System memory: for 16kHz training, 600GB+; for 44kHz training, 700GB+
27
- - Storage: >2TB of fast NVMe storage. If you have enough system memory, OS caching will help and the storage does not need to be as fast.
28
-
29
- ## Prerequisites
30
-
31
- 1. Install [av-benchmark](https://github.com/hkchengrex/av-benchmark). We use this library to automatically evaluate on the validation set during training, and on the test set after training.
32
- 2. Extract features for evaluation using [av-benchmark](https://github.com/hkchengrex/av-benchmark) for the validation and test set as a [validation cache](https://github.com/hkchengrex/MMAudio/blob/34bf089fdd2e457cd5ef33be96c0e1c8a0412476/config/data/base.yaml#L38) and a [test cache](https://github.com/hkchengrex/MMAudio/blob/34bf089fdd2e457cd5ef33be96c0e1c8a0412476/config/data/base.yaml#L31). You can also download the precomputed evaluation cache [here](https://huggingface.co/datasets/hkchengrex/MMAudio-precomputed-results/tree/main).
33
-
34
- 3. You will need ffmpeg to extract frames from videos. Note that `torchaudio` imposes a maximum version limit (`ffmpeg<7`). You can install it as follows:
35
-
36
- ```bash
37
- conda install -c conda-forge 'ffmpeg<7'
38
- ```
39
-
40
- 4. Download the training datasets. We used [VGGSound](https://arxiv.org/abs/2004.14368), [AudioCaps](https://audiocaps.github.io/), [WavCaps](https://arxiv.org/abs/2303.17395), and [Clotho](https://arxiv.org/abs/1910.09387) (paper to be updated). Note that the audio files in the huggingface release of WavCaps have been downsampled to 32kHz. To the best of our ability, we located the original (high-sampling-rate) audio files and used them instead to prevent artifacts during 44.1kHz training. We did not use the "SoundBible" portion of WavCaps, since it is a small set with many short audio clips unsuitable for our training.
41
-
42
- 5. Download the corresponding VAE (`v1-16.pth` for 16kHz training, and `v1-44.pth` for 44.1kHz training), vocoder models (`best_netG.pt` for 16kHz training; the vocoder for 44.1kHz training will be downloaded automatically), the [empty string encoding](https://github.com/hkchengrex/MMAudio/releases/download/v0.1/empty_string.pth), and Synchformer weights from [MODELS.md](https://github.com/hkchengrex/MMAudio/blob/main/docs/MODELS.md), and place them in `ext_weights/`.
43
-
44
- ### Helpful links for downloading the datasets
45
-
46
- We cannot redistribute the datasets for copyright reasons, but we do find some links helpful and they might be helpful to you as well.
47
-
48
- - https://huggingface.co/datasets/Meranti/CLAP_freesound
49
- - https://huggingface.co/datasets/agkphysics/AudioSet
50
- - https://sound-effects.bbcrewind.co.uk/
51
-
52
- For certain sources of VGGSound, you might notice desynchronization between the audio and the video. This happens because the video keyframes do not always align with the start of the audio, and what happens during playback is player-dependent. We used PyTorch's decoder, which can correctly handle these cases.
53
-
54
- ## Preparing Audio-Video-Text Features
55
-
56
- We have prepared some example data in `training/example_videos`.
57
- `training/extract_video_training_latents.py` extracts audio, video, and text features and saves them as a `TensorDict` with a `.tsv` file containing metadata to `output_dir`.
58
-
59
- To run this script, use the `torchrun` utility:
60
-
61
- ```bash
62
- torchrun --standalone training/extract_video_training_latents.py
63
- ```
64
-
65
- You can run this script with multiple GPUs (with `--nproc_per_node=<n>` after `--standalone` and before the script name) to speed up extraction.
66
- Modify the definitions near the top of the script to switch between 16kHz/44.1kHz extraction.
67
- Change the data path definitions in `data_cfg` if necessary.
68
-
69
- Arguments:
70
-
71
- - `latent_dir` -- where intermediate latent outputs are saved. It is safe to delete this directory afterwards.
72
- - `output_dir` -- where TensorDict and the metadata file are saved.
73
-
74
- Outputs produced in `output_dir`:
75
-
76
- 1. A directory named `vgg-{split}` (i.e., in the TensorDict format), containing
77
- a. `mean.memmap` mean values predicted by the VAE encoder (number of videos X sequence length X channel size)
78
- b. `std.memmap` standard deviation values predicted by the VAE encoder (number of videos X sequence length X channel size)
79
- c. `text_features.memmap` text features extracted from CLIP (number of videos X 77 (sequence length) X 1024)
80
- d. `clip_features.memmap` clip features extracted from CLIP (number of videos X 64 (8 fps) X 1024)
81
- e. `sync_features.memmap` synchronization features extracted from Synchformer (number of videos X 192 (24 fps) X 768)
82
- f. `meta.json` that contains the metadata for the above memory mappings
83
- 2. A tab-separated values file named `vgg-{split}.tsv` that contains two columns: `id` containing video file names without extension, and `label` containing corresponding text labels (i.e., captions)
84
-
85
- ## Preparing Audio-Text Features
86
-
87
- We have prepared some example data in `training/example_audios`.
88
-
89
- 1. Run `training/partition_clips.py` to partition each audio file into clips (by finding start and end points; we do not save the partitioned audio onto the disk to save disk space)
90
- 2. Run `training/extract_audio_training_latents.py` to extract each clip's audio and text features and save them as a `TensorDict` with a `.tsv` file containing metadata to `output_dir`.
91
-
92
- ### Partitioning the audio files
93
-
94
- Run
95
-
96
- ```bash
97
- python training/partition_clips.py
98
- ```
99
-
100
- Arguments:
101
-
102
- - `data_dir` -- path to a directory containing the audio files (`.flac` or `.wav`)
103
- - `output_dir` -- path to the output `.csv` file
104
- - `start` -- optional; useful when you need to run multiple processes to speed up processing -- this defines the beginning of the chunk to be processed
105
- - `end` -- optional; useful when you need to run multiple processes to speed up processing -- this defines the end of the chunk to be processed
106
-
107
- ### Extracting audio and text features
108
-
109
- Run
110
-
111
- ```bash
112
- torchrun --standalone training/extract_audio_training_latents.py
113
- ```
114
-
115
- You can run this with multiple GPUs (with `--nproc_per_node=<n>`) to speed up extraction.
116
- Modify the definitions near the top of the script to switch between 16kHz/44.1kHz extraction.
117
-
118
- Arguments:
119
-
120
- - `data_dir` -- path to a directory containing the audio files (`.flac` or `.wav`), same as the previous step
121
- - `captions_tsv` -- path to the captions file, a tab-separated values (tsv) file at least with columns `id` and `caption`
122
- - `clips_tsv` -- path to the clips file, generated in the last step
123
- - `latent_dir` -- where intermediate latent outputs are saved. It is safe to delete this directory afterwards.
124
- - `output_dir` -- where TensorDict and the metadata file are saved.
125
-
126
- Outputs produced in `output_dir`:
127
-
128
- 1. A directory named `{basename(output_dir)}` (i.e., in the TensorDict format), containing
129
- a. `mean.memmap` mean values predicted by the VAE encoder (number of audios X sequence length X channel size)
130
- b. `std.memmap` standard deviation values predicted by the VAE encoder (number of audios X sequence length X channel size)
131
- c. `text_features.memmap` text features extracted from CLIP (number of audios X 77 (sequence length) X 1024)
132
- d. `meta.json` that contains the metadata for the above memory mappings
133
- 2. A tab-separated values file named `{basename(output_dir)}.tsv` that contains two columns: `id` containing audio file names without extension, and `label` containing corresponding text labels (i.e., captions)
134
-
135
- ### Reference tsv files (with overlaps removed as mentioned in the paper)
136
-
137
- The reference tsv files can be found [here](https://github.com/hkchengrex/MMAudio/releases/tag/v0.1).
138
-
139
- Note that these reference tsv files are the **outputs** of `extract_audio_training_latents.py`, which means the `id` column might contain duplicate entries (one per clip). You can still use it as the `captions_tsv` input though -- the script will handle duplicates gracefully.
140
- Among these reference tsv files, `audioset_sl.tsv`, `bbcsound.tsv`, and `freesound.tsv` are subsets that are parts of WavCaps. These subsets might be smaller than the original datasets.
141
- The Clotho data contains both the development set and the validation set.
142
-
143
- **Update (Mar 9, 2025)**:
144
- We have updated the reference tsv files with a corrected set. The previous tsv files contained some (<1%) corrupted captions (i.e., a mismatch between audio and caption; see https://github.com/hkchengrex/MMAudio/issues/56). The tsv files for VGGSound are unaffected. The reason for this error is unknown, and I cannot reproduce it in the latest version of the code. Our pre-trained models were trained with the **uncorrected** tsv files. For future training, I recommend using the corrected tsv files.
145
-
146
- The error statistics are as follows:
147
-
148
- - AudioCaps (170/43824), 0.39%
149
- - Freesound: (1670/180636), 0.92%
150
- - AudioSet: (290/100776), 0.29%
151
- - BBCSound: (3/29975), 0.01%
152
- - Clotho: (8/24332), 0.03%
153
-
154
- ## Training on Extracted Features
155
-
156
- We use Distributed Data Parallel (DDP) for training.
157
- First, specify the data path in `config/data/base.yaml`. If you used the default parameters in the scripts above to extract features for the example data, the `Example_video` and `Example_audio` items should already be correct.
158
-
159
- To run training on the example data, use the following command:
160
-
161
- ```bash
162
- OMP_NUM_THREADS=4 torchrun --standalone --nproc_per_node=1 train.py exp_id=debug compile=False debug=True example_train=True batch_size=1
163
- ```
164
-
165
- This will not train a useful model, but it will check if everything is set up correctly.
166
-
167
- For full training on the base model with two GPUs, use the following command:
168
-
169
- ```bash
170
- OMP_NUM_THREADS=4 torchrun --standalone --nproc_per_node=2 train.py exp_id=exp_1 model=small_16k
171
- ```
172
-
173
- Any outputs from training will be stored in `output/<exp_id>`.
174
-
175
- More configuration options can be found in `config/base_config.yaml` and `config/train_config.yaml`.
176
- For the medium and large models, specify `vgg_oversample_rate` to be `3` to reduce overfitting.
177
-
178
- ## Checkpoints
179
-
180
- Model checkpoints, including optimizer states and the latest EMA weights, are available here: https://huggingface.co/hkchengrex/MMAudio
181
-
182
- ---
183
-
184
- Godspeed!
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
MMAudio/docs/demo.html DELETED
@@ -1,81 +0,0 @@
1
- <!DOCTYPE html>
2
- <html lang="en">
3
- <head>
4
- <!-- Google tag (gtag.js) -->
5
- <script async src="https://www.googletagmanager.com/gtag/js?id=G-0JKBJ3WRJZ"></script>
6
- <script>
7
- window.dataLayer = window.dataLayer || [];
8
- function gtag(){dataLayer.push(arguments);}
9
- gtag('js', new Date());
10
- gtag('config', 'G-0JKBJ3WRJZ');
11
- </script>
12
-
13
- <link href='https://fonts.googleapis.com/css?family=Source+Sans+Pro' rel='stylesheet' type='text/css'>
14
- <meta charset="UTF-8">
15
- <title>MMAudio</title>
16
-
17
- <link rel="icon" type="image/png" href="images/icon.png">
18
-
19
- <meta name="viewport" content="width=device-width, initial-scale=1">
20
- <!-- CSS only -->
21
- <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.0.1/dist/css/bootstrap.min.css" rel="stylesheet"
22
- integrity="sha384-+0n0xVW2eSR5OomGNYDnhzAbDsOXxcvSN1TPprVMTNDbiYZCxYbOOl7+AMvyTG2x" crossorigin="anonymous">
23
- <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.7.1/jquery.min.js"></script>
24
-
25
- <link rel="stylesheet" href="style_videos.css">
26
- </head>
27
- <body>
28
-
29
- <div id="moviegen_all">
30
- <h2 id="moviegen" style="text-align: center;">Supplementary Videos</h2>
31
-
32
-
33
- <div class="row g-1">
34
- <div class="col-12 col-md-4">
35
- <div class="video-header" style="font-size: large;">Golf; ground-truth</div>
36
- <div class="video-container">
37
- <iframe src="https://youtube.com/embed/1hwSu42kkho"></iframe>
38
- </div>
39
- </div>
40
- <div class="col-12 col-md-4">
41
- <div class="video-header" style="font-size: large;">Golf; FoleyCrafter</div>
42
- <div class="video-container">
43
- <iframe src="https://youtube.com/embed/Lfsx8mOPcJo"></iframe>
44
- </div>
45
- </div>
46
- <div class="col-12 col-md-4">
47
- <div class="video-header" style="font-size: large;">Golf; Ours</div>
48
- <div class="video-container">
49
- <iframe src="https://youtube.com/embed/kZibDoDCNxI"></iframe>
50
- </div>
51
- </div>
52
- </div>
53
- <br>
54
-
55
- <div class="row g-1">
56
- <div class="col-12 col-md-4">
57
- <div class="video-header" style="font-size: large;">Waves; Ours</div>
58
- <div class="video-container">
59
- <iframe src="https://youtube.com/embed/7zQzDEuFnfI"></iframe>
60
- </div>
61
- </div>
62
- <div class="col-12 col-md-4">
63
- <div class="video-header" style="font-size: large;">Featured MMAudio</div>
64
- <div class="video-container">
65
- <iframe src="https://youtube.com/embed/SLz3NWLyHxg"></iframe>
66
- </div>
67
- </div>
68
- <div class="col-12 col-md-4">
69
- <div class="video-header" style="font-size: large;">Failure case</div>
70
- <div class="video-container">
71
- <iframe src="https://youtube.com/embed/nx0CyrDu70Y"></iframe>
72
- </div>
73
- </div>
74
- </div>
75
- <br>
76
-
77
-
78
- </div>
79
-
80
- </body>
81
- </html>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
MMAudio/docs/images/icon.png DELETED
Binary file (163 Bytes)
 
MMAudio/docs/index.html DELETED
@@ -1,156 +0,0 @@
1
- <!DOCTYPE html>
2
- <html lang="en">
3
- <head>
4
- <!-- Google tag (gtag.js) -->
5
- <script async src="https://www.googletagmanager.com/gtag/js?id=G-0JKBJ3WRJZ"></script>
6
- <script>
7
- window.dataLayer = window.dataLayer || [];
8
- function gtag(){dataLayer.push(arguments);}
9
- gtag('js', new Date());
10
- gtag('config', 'G-0JKBJ3WRJZ');
11
- </script>
12
-
13
- <link rel="preconnect" href="https://fonts.googleapis.com">
14
- <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
15
- <link href="https://fonts.googleapis.com/css2?family=Source+Sans+3&display=swap" rel="stylesheet">
16
- <meta charset="UTF-8">
17
- <title>MMAudio</title>
18
-
19
- <link rel="icon" type="image/png" href="images/icon.png">
20
-
21
- <meta name="viewport" content="width=device-width, initial-scale=1">
22
- <!-- CSS only -->
23
- <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.0.1/dist/css/bootstrap.min.css" rel="stylesheet"
24
- integrity="sha384-+0n0xVW2eSR5OomGNYDnhzAbDsOXxcvSN1TPprVMTNDbiYZCxYbOOl7+AMvyTG2x" crossorigin="anonymous">
25
- <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
26
-
27
- <link rel="stylesheet" href="style.css">
28
- </head>
29
- <body>
30
-
31
- <body>
32
- <br><br><br><br>
33
- <div class="container">
34
- <div class="row text-center" style="font-size:38px">
35
- <div class="col strong">
36
- MMAudio: Taming Multimodal Joint Training for High-Quality Video-to-Audio Synthesis
37
- </div>
38
- </div>
39
-
40
- <br>
41
- <div class="row text-center" style="font-size:28px">
42
- <div class="col">
43
- CVPR 2025
44
- </div>
45
- </div>
46
- <br>
47
-
48
- <div class="h-100 row text-center heavy justify-content-md-center" style="font-size:22px;">
49
- <div class="col-sm-auto px-lg-2">
50
- <a href="https://hkchengrex.github.io/">Ho Kei Cheng<sup>1</sup></a>
51
- </div>
52
- <div class="col-sm-auto px-lg-2">
53
- <nobr><a href="https://scholar.google.co.jp/citations?user=RRIO1CcAAAAJ">Masato Ishii<sup>2</sup></a></nobr>
54
- </div>
55
- <div class="col-sm-auto px-lg-2">
56
- <nobr><a href="https://scholar.google.com/citations?user=sXAjHFIAAAAJ">Akio Hayakawa<sup>2</sup></a></nobr>
57
- </div>
58
- <div class="col-sm-auto px-lg-2">
59
- <nobr><a href="https://scholar.google.com/citations?user=XCRO260AAAAJ">Takashi Shibuya<sup>2</sup></a></nobr>
60
- </div>
61
- <div class="col-sm-auto px-lg-2">
62
- <nobr><a href="https://www.alexander-schwing.de/">Alexander Schwing<sup>1</sup></a></nobr>
63
- </div>
64
- <div class="col-sm-auto px-lg-2" >
65
- <nobr><a href="https://www.yukimitsufuji.com/">Yuki Mitsufuji<sup>2,3</sup></a></nobr>
66
- </div>
67
- </div>
68
-
69
- <div class="h-100 row text-center heavy justify-content-md-center" style="font-size:22px;">
70
- <div class="col-sm-auto px-lg-2">
71
- <sup>1</sup>University of Illinois Urbana-Champaign
72
- </div>
73
- <div class="col-sm-auto px-lg-2">
74
- <sup>2</sup>Sony AI
75
- </div>
76
- <div class="col-sm-auto px-lg-2">
77
- <sup>3</sup>Sony Group Corporation
78
- </div>
79
- </div>
80
-
81
- <br>
82
-
83
- <br>
84
-
85
- <div class="h-100 row text-center justify-content-md-center" style="font-size:20px;">
86
- <div class="col-sm-2">
87
- <a href="https://arxiv.org/abs/2412.15322">[Paper]</a>
88
- </div>
89
- <div class="col-sm-2">
90
- <a href="https://github.com/hkchengrex/MMAudio">[Code]</a>
91
- </div>
92
- <div class="col-sm-3">
93
- <a href="https://huggingface.co/spaces/hkchengrex/MMAudio">[Huggingface Demo]</a>
94
- </div>
95
- <div class="col-sm-2">
96
- <a href="https://colab.research.google.com/drive/1TAaXCY2-kPk4xE4PwKB3EqFbSnkUuzZ8?usp=sharing">[Colab Demo]</a>
97
- </div>
98
- <div class="col-sm-3">
99
- <a href="https://replicate.com/zsxkib/mmaudio">[Replicate Demo]</a>
100
- </div>
101
- </div>
102
-
103
- <br>
104
-
105
- <hr>
106
-
107
- <div class="row" style="font-size:32px">
108
- <div class="col strong">
109
- TL;DR
110
- </div>
111
- </div>
112
- <br>
113
- <div class="row">
114
- <div class="col">
115
- <p class="light" style="text-align: left;">
116
- MMAudio generates synchronized audio given video and/or text inputs.
117
- </p>
118
-
119
- <p>
120
- Check out this fun video!
121
- <div class="video-container" style="text-align: center;">
122
- <iframe src="https://youtube.com/embed/SLz3NWLyHxg"></iframe>
123
- </div>
124
- </p>
125
- </div>
126
- </div>
127
-
128
- <br>
129
- <hr>
130
- <br>
131
-
132
- <div class="row" style="font-size:32px">
133
- <div class="col strong">
134
- Demo
135
- </div>
136
- </div>
137
- <br>
138
- <div class="row" style="font-size:48px">
139
- <div class="col strong text-center">
140
- <a href="video_main.html" style="text-decoration: underline;">&lt;More results&gt;</a>
141
- </div>
142
- </div>
143
- <br>
144
- <div class="video-container" style="text-align: center;">
145
- <iframe src="https://youtube.com/embed/YElewUT2M4M"></iframe>
146
- </div>
147
-
148
- <br>
149
-
150
- <br><br>
151
- <br><br>
152
-
153
- </div>
154
-
155
- </body>
156
- </html>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
MMAudio/docs/style.css DELETED
@@ -1,78 +0,0 @@
1
- body {
2
- font-family: 'Source Sans 3', sans-serif;
3
- font-size: 18px;
4
- margin-left: auto;
5
- margin-right: auto;
6
- font-weight: 400;
7
- height: 100%;
8
- max-width: 1000px;
9
- }
10
-
11
- table {
12
- width: 100%;
13
- border-collapse: collapse;
14
- }
15
- th, td {
16
- border: 1px solid #ddd;
17
- padding: 8px;
18
- text-align: center;
19
- }
20
- th {
21
- background-color: #f2f2f2;
22
- }
23
- video {
24
- width: 100%;
25
- height: auto;
26
- }
27
- p {
28
- font-size: 28px;
29
- }
30
- h2 {
31
- font-size: 36px;
32
- }
33
-
34
- .strong {
35
- font-weight: 700;
36
- }
37
-
38
- .light {
39
- font-weight: 100;
40
- }
41
-
42
- .heavy {
43
- font-weight: 900;
44
- }
45
-
46
- .column {
47
- float: left;
48
- }
49
-
50
- a:link,
51
- a:visited {
52
- color: #05538f;
53
- text-decoration: none;
54
- }
55
-
56
- a:hover {
57
- color: #63cbdd;
58
- }
59
-
60
- hr {
61
- border: 0;
62
- height: 1px;
63
- background-image: linear-gradient(to right, rgba(0, 0, 0, 0), rgba(0, 0, 0, 0.75), rgba(0, 0, 0, 0));
64
- }
65
-
66
- .video-container {
67
- position: relative;
68
- padding-bottom: 56.25%; /* 16:9 */
69
- height: 0;
70
- }
71
-
72
- .video-container iframe {
73
- position: absolute;
74
- top: 0;
75
- left: 0;
76
- width: 100%;
77
- height: 100%;
78
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
MMAudio/docs/style_videos.css DELETED
@@ -1,52 +0,0 @@
1
- body {
2
- font-family: 'Source Sans 3', sans-serif;
3
- font-size: 1.5vh;
4
- font-weight: 400;
5
- }
6
-
7
- table {
8
- width: 100%;
9
- border-collapse: collapse;
10
- }
11
- th, td {
12
- border: 1px solid #ddd;
13
- padding: 8px;
14
- text-align: center;
15
- }
16
- th {
17
- background-color: #f2f2f2;
18
- }
19
- video {
20
- width: 100%;
21
- height: auto;
22
- }
23
- p {
24
- font-size: 1.5vh;
25
- font-weight: bold;
26
- }
27
- h2 {
28
- font-size: 2vh;
29
- font-weight: bold;
30
- }
31
-
32
- .video-container {
33
- position: relative;
34
- padding-bottom: 56.25%; /* 16:9 */
35
- height: 0;
36
- }
37
-
38
- .video-container iframe {
39
- position: absolute;
40
- top: 0;
41
- left: 0;
42
- width: 100%;
43
- height: 100%;
44
- }
45
-
46
- .video-header {
47
- background-color: #f2f2f2;
48
- text-align: center;
49
- font-size: 1.5vh;
50
- font-weight: bold;
51
- padding: 8px;
52
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
MMAudio/docs/video_gen.html DELETED
@@ -1,254 +0,0 @@
1
- <!DOCTYPE html>
2
- <html lang="en">
3
- <head>
4
- <!-- Google tag (gtag.js) -->
5
- <script async src="https://www.googletagmanager.com/gtag/js?id=G-0JKBJ3WRJZ"></script>
6
- <script>
7
- window.dataLayer = window.dataLayer || [];
8
- function gtag(){dataLayer.push(arguments);}
9
- gtag('js', new Date());
10
- gtag('config', 'G-0JKBJ3WRJZ');
11
- </script>
12
-
13
- <link href='https://fonts.googleapis.com/css?family=Source+Sans+Pro' rel='stylesheet' type='text/css'>
14
- <meta charset="UTF-8">
15
- <title>MMAudio</title>
16
-
17
- <link rel="icon" type="image/png" href="images/icon.png">
18
-
19
- <meta name="viewport" content="width=device-width, initial-scale=1">
20
- <!-- CSS only -->
21
- <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.0.1/dist/css/bootstrap.min.css" rel="stylesheet"
22
- integrity="sha384-+0n0xVW2eSR5OomGNYDnhzAbDsOXxcvSN1TPprVMTNDbiYZCxYbOOl7+AMvyTG2x" crossorigin="anonymous">
23
- <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.7.1/jquery.min.js"></script>
24
-
25
- <link rel="stylesheet" href="style_videos.css">
26
- </head>
27
- <body>
28
-
29
- <div id="moviegen_all">
30
- <h2 id="moviegen" style="text-align: center;">Comparisons with Movie Gen Audio on Videos Generated by MovieGen</h2>
31
- <p id="moviegen1" style="overflow: hidden;">
32
- Example 1: Ice cracking with sharp snapping sound, and metal tool scraping against the ice surface.
33
- <span style="float: right;"><a href="#index">Back to index</a></span>
34
- </p>
35
-
36
- <div class="row g-1">
37
- <div class="col-sm-6">
38
- <div class="video-header">Movie Gen Audio</div>
39
- <div class="video-container">
40
- <iframe src="https://youtube.com/embed/d7Lb0ihtGcE"></iframe>
41
- </div>
42
- </div>
43
- <div class="col-sm-6">
44
- <div class="video-header">Ours</div>
45
- <div class="video-container">
46
- <iframe src="https://youtube.com/embed/F4JoJ2r2m8U"></iframe>
47
- </div>
48
- </div>
49
- </div>
50
- <br>
51
-
52
- <!-- <p id="moviegen2">Example 2: Rhythmic splashing and lapping of water. <span style="float:right;"><a href="#index">Back to index</a></span> </p>
53
-
54
- <table>
55
- <thead>
56
- <tr>
57
- <th>Movie Gen Audio</th>
58
- <th>Ours</th>
59
- </tr>
60
- </thead>
61
- <tbody>
62
- <tr>
63
- <td width="50%">
64
- <div class="video-container">
65
- <iframe src="https://youtube.com/embed/5gQNPK99CIk"></iframe>
66
- </div>
67
- </td>
68
- <td width="50%">
69
- <div class="video-container">
70
- <iframe src="https://youtube.com/embed/AbwnTzG-BpA"></iframe>
71
- </div>
72
- </td>
73
- </tr>
74
- </tbody>
75
- </table> -->
76
-
77
- <p id="moviegen2" style="overflow: hidden;">
78
- Example 2: Rhythmic splashing and lapping of water.
79
- <span style="float:right;"><a href="#index">Back to index</a></span>
80
- </p>
81
- <div class="row g-1">
82
- <div class="col-sm-6">
83
- <div class="video-header">Movie Gen Audio</div>
84
- <div class="video-container">
85
- <iframe src="https://youtube.com/embed/5gQNPK99CIk"></iframe>
86
- </div>
87
- </div>
88
- <div class="col-sm-6">
89
- <div class="video-header">Ours</div>
90
- <div class="video-container">
91
- <iframe src="https://youtube.com/embed/AbwnTzG-BpA"></iframe>
92
- </div>
93
- </div>
94
- </div>
95
- <br>
96
-
97
- <p id="moviegen3" style="overflow: hidden;">
98
- Example 3: Shovel scrapes against dry earth.
99
- <span style="float:right;"><a href="#index">Back to index</a></span>
100
- </p>
101
- <div class="row g-1">
102
- <div class="col-sm-6">
103
- <div class="video-header">Movie Gen Audio</div>
104
- <div class="video-container">
105
- <iframe src="https://youtube.com/embed/PUKGyEve7XQ"></iframe>
106
- </div>
107
- </div>
108
- <div class="col-sm-6">
109
- <div class="video-header">Ours</div>
110
- <div class="video-container">
111
- <iframe src="https://youtube.com/embed/CNn7i8VNkdc"></iframe>
112
- </div>
113
- </div>
114
- </div>
115
- <br>
116
-
117
-
118
- <p id="moviegen4" style="overflow: hidden;">
119
- (Failure case) Example 4: Creamy sound of mashed potatoes being scooped.
120
- <span style="float:right;"><a href="#index">Back to index</a></span>
121
- </p>
122
- <div class="row g-1">
123
- <div class="col-sm-6">
124
- <div class="video-header">Movie Gen Audio</div>
125
- <div class="video-container">
126
- <iframe src="https://youtube.com/embed/PJv1zxR9JjQ"></iframe>
127
- </div>
128
- </div>
129
- <div class="col-sm-6">
130
- <div class="video-header">Ours</div>
131
- <div class="video-container">
132
- <iframe src="https://youtube.com/embed/c3-LJ1lNsPQ"></iframe>
133
- </div>
134
- </div>
135
- </div>
136
- <br>
137
-
138
- </div>
139
-
140
- <div id="hunyuan_sora_all">
141
-
142
- <h2 id="hunyuan" style="text-align: center;">Results on Videos Generated by Hunyuan</h2>
143
- <p style="overflow: hidden;">
144
- <span style="float:right;"><a href="#index">Back to index</a></span>
145
- </p>
146
- <div class="row g-1">
147
- <div class="col-sm-6">
148
- <div class="video-header">Typing</div>
149
- <div class="video-container">
150
- <iframe src="https://youtube.com/embed/8ln_9hhH_nk"></iframe>
151
- </div>
152
- </div>
153
- <div class="col-sm-6">
154
- <div class="video-header">Water is rushing down a stream and pouring</div>
155
- <div class="video-container">
156
- <iframe src="https://youtube.com/embed/5df1FZFQj30"></iframe>
157
- </div>
158
- </div>
159
- </div>
160
- <div class="row g-1">
161
- <div class="col-sm-6">
162
- <div class="video-header">Waves on beach</div>
163
- <div class="video-container">
164
- <iframe src="https://youtube.com/embed/7wQ9D5WgpFc"></iframe>
165
- </div>
166
- </div>
167
- <div class="col-sm-6">
168
- <div class="video-header">Water droplet</div>
169
- <div class="video-container">
170
- <iframe src="https://youtube.com/embed/q7M2nsalGjM"></iframe>
171
- </div>
172
- </div>
173
- </div>
174
- <br>
175
-
176
- <h2 id="sora" style="text-align: center;">Results on Videos Generated by Sora</h2>
177
- <p style="overflow: hidden;">
178
- <span style="float:right;"><a href="#index">Back to index</a></span>
179
- </p>
180
- <div class="row g-1">
181
- <div class="col-sm-6">
182
- <div class="video-header">Ships riding waves</div>
183
- <div class="video-container">
184
- <iframe src="https://youtube.com/embed/JbgQzHHytk8"></iframe>
185
- </div>
186
- </div>
187
- <div class="col-sm-6">
188
- <div class="video-header">Train (no text prompt given)</div>
189
- <div class="video-container">
190
- <iframe src="https://youtube.com/embed/xOW7zrjpWC8"></iframe>
191
- </div>
192
- </div>
193
- </div>
194
- <div class="row g-1">
195
- <div class="col-sm-6">
196
- <div class="video-header">Seashore (no text prompt given)</div>
197
- <div class="video-container">
198
- <iframe src="https://youtube.com/embed/fIuw5Y8ZZ9E"></iframe>
199
- </div>
200
- </div>
201
- <div class="col-sm-6">
202
- <div class="video-header">Surfing (failure: unprompted music)</div>
203
- <div class="video-container">
204
- <iframe src="https://youtube.com/embed/UcSTk-v0M_s"></iframe>
205
- </div>
206
- </div>
207
- </div>
208
- <br>
209
-
210
- <div id="mochi_ltx_all">
211
- <h2 id="mochi" style="text-align: center;">Results on Videos Generated by Mochi 1</h2>
212
- <p style="overflow: hidden;">
213
- <span style="float:right;"><a href="#index">Back to index</a></span>
214
- </p>
215
- <div class="row g-1">
216
- <div class="col-sm-6">
217
- <div class="video-header">Magical fire and lightning (no text prompt given)</div>
218
- <div class="video-container">
219
- <iframe src="https://youtube.com/embed/tTlRZaSMNwY"></iframe>
220
- </div>
221
- </div>
222
- <div class="col-sm-6">
223
- <div class="video-header">Storm (no text prompt given)</div>
224
- <div class="video-container">
225
- <iframe src="https://youtube.com/embed/4hrZTMJUy3w"></iframe>
226
- </div>
227
- </div>
228
- </div>
229
- <br>
230
-
231
- <h2 id="ltx" style="text-align: center;">Results on Videos Generated by LTX-Video</h2>
232
- <p style="overflow: hidden;">
233
- <span style="float:right;"><a href="#index">Back to index</a></span>
234
- </p>
235
- <div class="row g-1">
236
- <div class="col-sm-6">
237
- <div class="video-header">Firewood burning and cracking</div>
238
- <div class="video-container">
239
- <iframe src="https://youtube.com/embed/P7_DDpgev0g"></iframe>
240
- </div>
241
- </div>
242
- <div class="col-sm-6">
243
- <div class="video-header">Waterfall, water splashing</div>
244
- <div class="video-container">
245
- <iframe src="https://youtube.com/embed/4MvjceYnIO0"></iframe>
246
- </div>
247
- </div>
248
- </div>
249
- <br>
250
-
251
- </div>
252
-
253
- </body>
254
- </html>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
MMAudio/docs/video_main.html DELETED
@@ -1,98 +0,0 @@
1
- <!DOCTYPE html>
2
- <html lang="en">
3
- <head>
4
- <!-- Google tag (gtag.js) -->
5
- <script async src="https://www.googletagmanager.com/gtag/js?id=G-0JKBJ3WRJZ"></script>
6
- <script>
7
- window.dataLayer = window.dataLayer || [];
8
- function gtag(){dataLayer.push(arguments);}
9
- gtag('js', new Date());
10
- gtag('config', 'G-0JKBJ3WRJZ');
11
- </script>
12
-
13
- <link href='https://fonts.googleapis.com/css?family=Source+Sans+Pro' rel='stylesheet' type='text/css'>
14
- <meta charset="UTF-8">
15
- <title>MMAudio</title>
16
-
17
- <link rel="icon" type="image/png" href="images/icon.png">
18
-
19
- <meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1, user-scalable=no">
20
- <!-- CSS only -->
21
- <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.0.1/dist/css/bootstrap.min.css" rel="stylesheet"
22
- integrity="sha384-+0n0xVW2eSR5OomGNYDnhzAbDsOXxcvSN1TPprVMTNDbiYZCxYbOOl7+AMvyTG2x" crossorigin="anonymous">
23
- <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.7.1/jquery.min.js"></script>
24
-
25
- <link rel="stylesheet" href="style_videos.css">
26
-
27
- <script type="text/javascript">
28
- $(document).ready(function(){
29
- $("#content").load("video_gen.html #moviegen_all");
30
- $("#load_moveigen").click(function(){
31
- $("#content").load("video_gen.html #moviegen_all");
32
- });
33
- $("#load_hunyuan_sora").click(function(){
34
- $("#content").load("video_gen.html #hunyuan_sora_all");
35
- });
36
- $("#load_mochi_ltx").click(function(){
37
- $("#content").load("video_gen.html #mochi_ltx_all");
38
- });
39
- $("#load_vgg1").click(function(){
40
- $("#content").load("video_vgg.html #vgg1");
41
- });
42
- $("#load_vgg2").click(function(){
43
- $("#content").load("video_vgg.html #vgg2");
44
- });
45
- $("#load_vgg3").click(function(){
46
- $("#content").load("video_vgg.html #vgg3");
47
- });
48
- $("#load_vgg4").click(function(){
49
- $("#content").load("video_vgg.html #vgg4");
50
- });
51
- $("#load_vgg5").click(function(){
52
- $("#content").load("video_vgg.html #vgg5");
53
- });
54
- $("#load_vgg6").click(function(){
55
- $("#content").load("video_vgg.html #vgg6");
56
- });
57
- $("#load_vgg_extra").click(function(){
58
- $("#content").load("video_vgg.html #vgg_extra");
59
- });
60
- });
61
- </script>
62
- </head>
63
- <body>
64
- <h1 id="index" style="text-align: center;">Index</h1>
65
- <p><b>(Click on the links to load the corresponding videos)</b> <span style="float:right;"><a href="index.html">Back to project page</a></span></p>
66
-
67
- <ol>
68
- <li>
69
- <a href="#" id="load_moveigen">Comparisons with Movie Gen Audio on Videos Generated by MovieGen</a>
70
- </li>
71
- <li>
72
- <a href="#" id="load_hunyuan_sora">Results on Videos Generated by Hunyuan and Sora</a>
73
- </li>
74
- <li>
75
- <a href="#" id="load_mochi_ltx">Results on Videos Generated by Mochi 1 and LTX-Video</a>
76
- </li>
77
- <li>
78
- On VGGSound
79
- <ol>
80
- <li><a id='load_vgg1' href="#">Example 1: Wolf howling</a></li>
81
- <li><a id='load_vgg2' href="#">Example 2: Striking a golf ball</a></li>
82
- <li><a id='load_vgg3' href="#">Example 3: Hitting a drum</a></li>
83
- <li><a id='load_vgg4' href="#">Example 4: Dog barking</a></li>
84
- <li><a id='load_vgg5' href="#">Example 5: Playing a string instrument</a></li>
85
- <li><a id='load_vgg6' href="#">Example 6: A group of people playing tambourines</a></li>
86
- <li><a id='load_vgg_extra' href="#">Extra results & failure cases</a></li>
87
- </ol>
88
- </li>
89
- </ol>
90
-
91
- <div id="content" class="container-fluid">
92
-
93
- </div>
94
- <br>
95
- <br>
96
-
97
- </body>
98
- </html>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
MMAudio/docs/video_vgg.html DELETED
@@ -1,452 +0,0 @@
1
- <!DOCTYPE html>
2
- <html lang="en">
3
- <head>
4
- <!-- Google tag (gtag.js) -->
5
- <script async src="https://www.googletagmanager.com/gtag/js?id=G-0JKBJ3WRJZ"></script>
6
- <script>
7
- window.dataLayer = window.dataLayer || [];
8
- function gtag(){dataLayer.push(arguments);}
9
- gtag('js', new Date());
10
- gtag('config', 'G-0JKBJ3WRJZ');
11
- </script>
12
-
13
- <link href='https://fonts.googleapis.com/css?family=Source+Sans+Pro' rel='stylesheet' type='text/css'>
14
- <meta charset="UTF-8">
15
- <title>MMAudio</title>
16
-
17
- <meta name="viewport" content="width=device-width, initial-scale=1">
18
- <!-- CSS only -->
19
- <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.0.1/dist/css/bootstrap.min.css" rel="stylesheet"
20
- integrity="sha384-+0n0xVW2eSR5OomGNYDnhzAbDsOXxcvSN1TPprVMTNDbiYZCxYbOOl7+AMvyTG2x" crossorigin="anonymous">
21
- <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
22
-
23
- <link rel="stylesheet" href="style_videos.css">
24
- </head>
25
- <body>
26
-
27
- <div id="vgg1">
28
- <h2 style="text-align: center;">Comparisons with state-of-the-art methods in VGGSound</h2>
29
- <p style="overflow: hidden;">
30
- Example 1: Wolf howling.
31
- <span style="float:right;"><a href="#index">Back to index</a></span>
32
- </p>
33
- <div class="row g-1">
34
- <div class="col-sm-3">
35
- <div class="video-header">Ground-truth</div>
36
- <div class="video-container">
37
- <iframe src="https://youtube.com/embed/9J_V74gqMUA"></iframe>
38
- </div>
39
- </div>
40
- <div class="col-sm-3">
41
- <div class="video-header">Ours</div>
42
- <div class="video-container">
43
- <iframe src="https://youtube.com/embed/P6O8IpjErPc"></iframe>
44
- </div>
45
- </div>
46
- <div class="col-sm-3">
47
- <div class="video-header">V2A-Mapper</div>
48
- <div class="video-container">
49
- <iframe src="https://youtube.com/embed/w-5eyqepvTk"></iframe>
50
- </div>
51
- </div>
52
- <div class="col-sm-3">
53
- <div class="video-header">FoleyCrafter</div>
54
- <div class="video-container">
55
- <iframe src="https://youtube.com/embed/VOLfoZlRkzo"></iframe>
56
- </div>
57
- </div>
58
- </div>
59
- <div class="row g-1">
60
- <div class="col-sm-3">
61
- <div class="video-header">Frieren</div>
62
- <div class="video-container">
63
- <iframe src="https://youtube.com/embed/49owKyA5Pa8"></iframe>
64
- </div>
65
- </div>
66
- <div class="col-sm-3">
67
- <div class="video-header">VATT</div>
68
- <div class="video-container">
69
- <iframe src="https://youtube.com/embed/QVtrFgbeGDM"></iframe>
70
- </div>
71
- </div>
72
- <div class="col-sm-3">
73
- <div class="video-header">V-AURA</div>
74
- <div class="video-container">
75
- <iframe src="https://youtube.com/embed/8r0uEfSNjvI"></iframe>
76
- </div>
77
- </div>
78
- <div class="col-sm-3">
79
- <div class="video-header">Seeing and Hearing</div>
80
- <div class="video-container">
81
- <iframe src="https://youtube.com/embed/bn-sLg2qulk"></iframe>
82
- </div>
83
- </div>
84
- </div>
85
- </div>
86
-
87
- <div id="vgg2">
88
- <h2 style="text-align: center;">Comparisons with state-of-the-art methods in VGGSound</h2>
89
- <p style="overflow: hidden;">
90
- Example 2: Striking a golf ball.
91
- <span style="float:right;"><a href="#index">Back to index</a></span>
92
- </p>
93
-
94
- <div class="row g-1">
95
- <div class="col-sm-3">
96
- <div class="video-header">Ground-truth</div>
97
- <div class="video-container">
98
- <iframe src="https://youtube.com/embed/1hwSu42kkho"></iframe>
99
- </div>
100
- </div>
101
- <div class="col-sm-3">
102
- <div class="video-header">Ours</div>
103
- <div class="video-container">
104
- <iframe src="https://youtube.com/embed/kZibDoDCNxI"></iframe>
105
- </div>
106
- </div>
107
- <div class="col-sm-3">
108
- <div class="video-header">V2A-Mapper</div>
109
- <div class="video-container">
110
- <iframe src="https://youtube.com/embed/jgKfLBLhh7Y"></iframe>
111
- </div>
112
- </div>
113
- <div class="col-sm-3">
114
- <div class="video-header">FoleyCrafter</div>
115
- <div class="video-container">
116
- <iframe src="https://youtube.com/embed/Lfsx8mOPcJo"></iframe>
117
- </div>
118
- </div>
119
- </div>
120
- <div class="row g-1">
121
- <div class="col-sm-3">
122
- <div class="video-header">Frieren</div>
123
- <div class="video-container">
124
- <iframe src="https://youtube.com/embed/tz-LpbB0MBc"></iframe>
125
- </div>
126
- </div>
127
- <div class="col-sm-3">
128
- <div class="video-header">VATT</div>
129
- <div class="video-container">
130
- <iframe src="https://youtube.com/embed/RTDUHMi08n4"></iframe>
131
- </div>
132
- </div>
133
- <div class="col-sm-3">
134
- <div class="video-header">V-AURA</div>
135
- <div class="video-container">
136
- <iframe src="https://youtube.com/embed/N-3TDOsPnZQ"></iframe>
137
- </div>
138
- </div>
139
- <div class="col-sm-3">
140
- <div class="video-header">Seeing and Hearing</div>
141
- <div class="video-container">
142
- <iframe src="https://youtube.com/embed/QnsHnLn4gB0"></iframe>
143
- </div>
144
- </div>
145
- </div>
146
- </div>
147
-
148
- <div id="vgg3">
149
- <h2 style="text-align: center;">Comparisons with state-of-the-art methods in VGGSound</h2>
150
- <p style="overflow: hidden;">
151
- Example 3: Hitting a drum.
152
- <span style="float:right;"><a href="#index">Back to index</a></span>
153
- </p>
154
-
155
- <div class="row g-1">
156
- <div class="col-sm-3">
157
- <div class="video-header">Ground-truth</div>
158
- <div class="video-container">
159
- <iframe src="https://youtube.com/embed/0oeIwq77w0Q"></iframe>
160
- </div>
161
- </div>
162
- <div class="col-sm-3">
163
- <div class="video-header">Ours</div>
164
- <div class="video-container">
165
- <iframe src="https://youtube.com/embed/-UtPV9ohuIM"></iframe>
166
- </div>
167
- </div>
168
- <div class="col-sm-3">
169
- <div class="video-header">V2A-Mapper</div>
170
- <div class="video-container">
171
- <iframe src="https://youtube.com/embed/9yivkgN-zwc"></iframe>
172
- </div>
173
- </div>
174
- <div class="col-sm-3">
175
- <div class="video-header">FoleyCrafter</div>
176
- <div class="video-container">
177
- <iframe src="https://youtube.com/embed/kkCsXPOlBvY"></iframe>
178
- </div>
179
- </div>
180
- </div>
181
- <div class="row g-1">
182
- <div class="col-sm-3">
183
- <div class="video-header">Frieren</div>
184
- <div class="video-container">
185
- <iframe src="https://youtube.com/embed/MbNKsVsuvig"></iframe>
186
- </div>
187
- </div>
188
- <div class="col-sm-3">
189
- <div class="video-header">VATT</div>
190
- <div class="video-container">
191
- <iframe src="https://youtube.com/embed/2yYviBjrpBw"></iframe>
192
- </div>
193
- </div>
194
- <div class="col-sm-3">
195
- <div class="video-header">V-AURA</div>
196
- <div class="video-container">
197
- <iframe src="https://youtube.com/embed/9yivkgN-zwc"></iframe>
198
- </div>
199
- </div>
200
- <div class="col-sm-3">
201
- <div class="video-header">Seeing and Hearing</div>
202
- <div class="video-container">
203
- <iframe src="https://youtube.com/embed/6dnyQt4Fuhs"></iframe>
204
- </div>
205
- </div>
206
- </div>
207
- </div>
208
- </div>
209
-
210
- <div id="vgg4">
211
- <h2 style="text-align: center;">Comparisons with state-of-the-art methods in VGGSound</h2>
212
- <p style="overflow: hidden;">
213
- Example 4: Dog barking.
214
- <span style="float:right;"><a href="#index">Back to index</a></span>
215
- </p>
216
-
217
- <div class="row g-1">
218
- <div class="col-sm-3">
219
- <div class="video-header">Ground-truth</div>
220
- <div class="video-container">
221
- <iframe src="https://youtube.com/embed/ckaqvTyMYAw"></iframe>
222
- </div>
223
- </div>
224
- <div class="col-sm-3">
225
- <div class="video-header">Ours</div>
226
- <div class="video-container">
227
- <iframe src="https://youtube.com/embed/_aRndFZzZ-I"></iframe>
228
- </div>
229
- </div>
230
- <div class="col-sm-3">
231
- <div class="video-header">V2A-Mapper</div>
232
- <div class="video-container">
233
- <iframe src="https://youtube.com/embed/mNCISP3LBl0"></iframe>
234
- </div>
235
- </div>
236
- <div class="col-sm-3">
237
- <div class="video-header">FoleyCrafter</div>
238
- <div class="video-container">
239
- <iframe src="https://youtube.com/embed/phZBQ3L7foE"></iframe>
240
- </div>
241
- </div>
242
- </div>
243
- <div class="row g-1">
244
- <div class="col-sm-3">
245
- <div class="video-header">Frieren</div>
246
- <div class="video-container">
247
- <iframe src="https://youtube.com/embed/Sb5Mg1-ORao"></iframe>
248
- </div>
249
- </div>
250
- <div class="col-sm-3">
251
- <div class="video-header">VATT</div>
252
- <div class="video-container">
253
- <iframe src="https://youtube.com/embed/eHmAGOmtDDg"></iframe>
254
- </div>
255
- </div>
256
- <div class="col-sm-3">
257
- <div class="video-header">V-AURA</div>
258
- <div class="video-container">
259
- <iframe src="https://youtube.com/embed/NEGa3krBrm0"></iframe>
260
- </div>
261
- </div>
262
- <div class="col-sm-3">
263
- <div class="video-header">Seeing and Hearing</div>
264
- <div class="video-container">
265
- <iframe src="https://youtube.com/embed/aO0EAXlwE7A"></iframe>
266
- </div>
267
- </div>
268
- </div>
269
- </div>
270
-
271
- <div id="vgg5">
272
- <h2 style="text-align: center;">Comparisons with state-of-the-art methods in VGGSound</h2>
273
- <p style="overflow: hidden;">
274
- Example 5: Playing a string instrument.
275
- <span style="float:right;"><a href="#index">Back to index</a></span>
276
- </p>
277
-
278
- <div class="row g-1">
279
- <div class="col-sm-3">
280
- <div class="video-header">Ground-truth</div>
281
- <div class="video-container">
282
- <iframe src="https://youtube.com/embed/KP1QhWauIOc"></iframe>
283
- </div>
284
- </div>
285
- <div class="col-sm-3">
286
- <div class="video-header">Ours</div>
287
- <div class="video-container">
288
- <iframe src="https://youtube.com/embed/ovaJhWSquYE"></iframe>
289
- </div>
290
- </div>
291
- <div class="col-sm-3">
292
- <div class="video-header">V2A-Mapper</div>
293
- <div class="video-container">
294
- <iframe src="https://youtube.com/embed/N723FS9lcy8"></iframe>
295
- </div>
296
- </div>
297
- <div class="col-sm-3">
298
- <div class="video-header">FoleyCrafter</div>
299
- <div class="video-container">
300
- <iframe src="https://youtube.com/embed/t0N4ZAAXo58"></iframe>
301
- </div>
302
- </div>
303
- </div>
304
- <div class="row g-1">
305
- <div class="col-sm-3">
306
- <div class="video-header">Frieren</div>
307
- <div class="video-container">
308
- <iframe src="https://youtube.com/embed/8YSRs03QNNA"></iframe>
309
- </div>
310
- </div>
311
- <div class="col-sm-3">
312
- <div class="video-header">VATT</div>
313
- <div class="video-container">
314
- <iframe src="https://youtube.com/embed/vOpMz55J1kY"></iframe>
315
- </div>
316
- </div>
317
- <div class="col-sm-3">
318
- <div class="video-header">V-AURA</div>
319
- <div class="video-container">
320
- <iframe src="https://youtube.com/embed/9JHC75vr9h0"></iframe>
321
- </div>
322
- </div>
323
- <div class="col-sm-3">
324
- <div class="video-header">Seeing and Hearing</div>
325
- <div class="video-container">
326
- <iframe src="https://youtube.com/embed/9w0JckNzXmY"></iframe>
327
- </div>
328
- </div>
329
- </div>
330
- </div>
331
-
332
- <div id="vgg6">
333
- <h2 style="text-align: center;">Comparisons with state-of-the-art methods in VGGSound</h2>
334
- <p style="overflow: hidden;">
335
- Example 6: A group of people playing tambourines.
336
- <span style="float:right;"><a href="#index">Back to index</a></span>
337
- </p>
338
-
339
- <div class="row g-1">
340
- <div class="col-sm-3">
341
- <div class="video-header">Ground-truth</div>
342
- <div class="video-container">
343
- <iframe src="https://youtube.com/embed/mx6JLxzUkRc"></iframe>
344
- </div>
345
- </div>
346
- <div class="col-sm-3">
347
- <div class="video-header">Ours</div>
348
- <div class="video-container">
349
- <iframe src="https://youtube.com/embed/oLirHhP9Su8"></iframe>
350
- </div>
351
- </div>
352
- <div class="col-sm-3">
353
- <div class="video-header">V2A-Mapper</div>
354
- <div class="video-container">
355
- <iframe src="https://youtube.com/embed/HkLkHMqptv0"></iframe>
356
- </div>
357
- </div>
358
- <div class="col-sm-3">
359
- <div class="video-header">FoleyCrafter</div>
360
- <div class="video-container">
361
- <iframe src="https://youtube.com/embed/rpHiiODjmNU"></iframe>
362
- </div>
363
- </div>
364
- </div>
365
- <div class="row g-1">
366
- <div class="col-sm-3">
367
- <div class="video-header">Frieren</div>
368
- <div class="video-container">
369
- <iframe src="https://youtube.com/embed/1mVD3fJ0LpM"></iframe>
370
- </div>
371
- </div>
372
- <div class="col-sm-3">
373
- <div class="video-header">VATT</div>
374
- <div class="video-container">
375
- <iframe src="https://youtube.com/embed/yjVFnJiEJlw"></iframe>
376
- </div>
377
- </div>
378
- <div class="col-sm-3">
379
- <div class="video-header">V-AURA</div>
380
- <div class="video-container">
381
- <iframe src="https://youtube.com/embed/neVeMSWtRkU"></iframe>
382
- </div>
383
- </div>
384
- <div class="col-sm-3">
385
- <div class="video-header">Seeing and Hearing</div>
386
- <div class="video-container">
387
- <iframe src="https://youtube.com/embed/EUE7YwyVWz8"></iframe>
388
- </div>
389
- </div>
390
- </div>
391
- </div>
392
-
393
- <div id="vgg_extra">
394
- <h2 style="text-align: center;">Comparisons with state-of-the-art methods in VGGSound</h2>
395
- <p style="overflow: hidden;">
396
- <span style="float:right;"><a href="#index">Back to index</a></span>
397
- </p>
398
-
399
- <div class="row g-1">
400
- <div class="col-sm-3">
401
- <div class="video-header">Moving train</div>
402
- <div class="video-container">
403
- <iframe src="https://youtube.com/embed/Ta6H45rBzJc"></iframe>
404
- </div>
405
- </div>
406
- <div class="col-sm-3">
407
- <div class="video-header">Water splashing</div>
408
- <div class="video-container">
409
- <iframe src="https://youtube.com/embed/hl6AtgHXpb4"></iframe>
410
- </div>
411
- </div>
412
- <div class="col-sm-3">
413
- <div class="video-header">Skateboarding</div>
414
- <div class="video-container">
415
- <iframe src="https://youtube.com/embed/n4sCNi_9buI"></iframe>
416
- </div>
417
- </div>
418
- <div class="col-sm-3">
419
- <div class="video-header">Synchronized clapping</div>
420
- <div class="video-container">
421
- <iframe src="https://youtube.com/embed/oxexfpLn7FE"></iframe>
422
- </div>
423
- </div>
424
- </div>
425
-
426
- <br><br>
427
-
428
- <div id="extra-failure">
429
- <h2 style="text-align: center;">Failure cases</h2>
430
- <p style="overflow: hidden;">
431
- <span style="float:right;"><a href="#index">Back to index</a></span>
432
- </p>
433
-
434
- <div class="row g-1">
435
- <div class="col-sm-6">
436
- <div class="video-header">Human speech</div>
437
- <div class="video-container">
438
- <iframe src="https://youtube.com/embed/nx0CyrDu70Y"></iframe>
439
- </div>
440
- </div>
441
- <div class="col-sm-6">
442
- <div class="video-header">Unfamiliar vision input</div>
443
- <div class="video-container">
444
- <iframe src="https://youtube.com/embed/hfnAqmK3X7w"></iframe>
445
- </div>
446
- </div>
447
- </div>
448
- </div>
449
- </div>
450
-
451
- </body>
452
- </html>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
MMAudio/eval_onsets.py DELETED
@@ -1,141 +0,0 @@
1
- # Modified from https://github.com/XYPB/CondFoleyGen/blob/main/predict_onset.py
2
-
3
- import argparse
4
- import copy
5
- import os
6
- from pathlib import Path
7
-
8
- import librosa
9
- import numpy as np
10
- from sklearn.metrics import (average_precision_score, f1_score, precision_recall_curve)
11
- from tqdm import tqdm
12
-
13
- sample_rate = 22050
14
- conf_interval = int(0.05 * 22050)
15
- duration = 8
16
-
17
-
18
- def onset_nms(onsets, wav_norm, window=0.05):
19
- confidence = [np.max(wav_norm[o - conf_interval:o + conf_interval]) for o in onsets]
20
-
21
- onset_remain = onsets.tolist()
22
- output = []
23
- sorted_idx = np.argsort(confidence)[::-1]
24
- for idx in sorted_idx:
25
- cur = onsets[idx]
26
- if cur not in onset_remain:
27
- continue
28
- output.append(cur)
29
- onset_remain.remove(cur)
30
- for o in onset_remain:
31
- if abs(cur - o) < window * sample_rate:
32
- onset_remain.remove(o)
33
- return np.array(sorted(output))
34
-
35
-
36
- def predict_audio(audio_path: Path, delta: float) -> np.ndarray:
37
- wav, _ = librosa.load(audio_path, sr=sample_rate)
38
- wav = wav[:duration * sample_rate]
39
- onsets = librosa.onset.onset_detect(y=wav, sr=sample_rate, units='samples', delta=delta)
40
- wav_norm = (wav - wav.min()) / (wav.max() - wav.min() + 1e-6)
41
-
42
- return onsets, wav_norm
43
-
44
-
45
- def read_gt(gt_file: Path) -> np.ndarray:
46
- all_times = []
47
- with open(gt_file, 'r') as f:
48
- lines = f.readlines()
49
- for l in lines:
50
- time = float(l.split(' ')[0])
51
- if time >= duration:
52
- break
53
- all_times.append(time)
54
- return np.array(all_times)
55
-
56
-
57
- def main():
58
- parser = argparse.ArgumentParser()
59
- parser.add_argument('--input_dir', type=Path)
60
- parser.add_argument('--gt_dir', type=Path)
61
- parser.add_argument('--delta', type=float, default=0.3)
62
- args = parser.parse_args()
63
-
64
- input_dir = args.input_dir
65
- gt_dir = args.gt_dir
66
- delta = args.delta
67
-
68
- overall_acc = 0
69
- overall_ap = 0
70
- overall_f1 = 0
71
-
72
- audio_files = sorted(os.listdir(input_dir))
73
- audio_files = [f for f in audio_files if f.endswith('.flac') or f.endswith('.wav')]
74
- for audio_file in tqdm(audio_files):
75
- base_name = Path(audio_file).stem
76
- gt_name = base_name.replace('_denoised', '_times')
77
- gt_file = gt_dir / f'{gt_name}.txt'
78
- gt_times = read_gt(gt_file) * sample_rate
79
-
80
- onsets, wav_norm = predict_audio(input_dir / audio_file, delta)
81
- onsets = onset_nms(onsets, wav_norm)
82
-
83
- onsets_onuse = copy.deepcopy(onsets.tolist())
84
- onsets_res = [0 for _ in onsets_onuse]
85
-
86
- y_gt = []
87
- y_pred = []
88
- hit_cnt = 0
89
- for gt_onset in gt_times:
90
- diff = [abs(pred_onset - gt_onset) for pred_onset in onsets_onuse]
91
- idx_in_window = [idx for idx in range(len(onsets_onuse)) if diff[idx] < delta * 22050]
92
- if len(idx_in_window) == 0:
93
- y_gt.append(1)
94
- y_pred.append(0)
95
- else:
96
- conf_in_window = [wav_norm[onsets[idx]] for idx in idx_in_window]
97
- max_conf_idx = np.argsort(conf_in_window)[-1]
98
- match_idx = idx_in_window[max_conf_idx]
99
- conf = np.max(wav_norm[onsets_onuse[match_idx] -
100
- conf_interval:onsets_onuse[match_idx] + conf_interval])
101
- hit_cnt += 1
102
- y_gt.append(1)
103
- y_pred.append(conf)
104
- # y_pred.append(1)
105
- for i in range(len(onsets)):
106
- if onsets[i] == onsets_onuse[match_idx]:
107
- onsets_res[i] = 1
108
- onsets_onuse.remove(onsets_onuse[match_idx])
109
- if len(onsets_onuse) == 0:
110
- break
111
-
112
- for o in onsets_onuse:
113
- y_gt.append(0)
114
- y_pred.append(np.max(wav_norm[o - conf_interval:o + conf_interval]))
115
- # y_pred.append(1)
116
-
117
- acc = hit_cnt / len(gt_times) if len(gt_times) != 0 else 0
118
- ap = average_precision_score(y_gt, y_pred)
119
- f1 = f1_score(y_gt, [1 if p > 0 else 0 for p in y_pred])
120
- # print(y_gt, y_pred, ap, f1)
121
-
122
- overall_acc += acc
123
- overall_ap += ap
124
- overall_f1 += f1
125
-
126
- overall_acc /= len(audio_files)
127
- overall_ap /= len(audio_files)
128
- overall_f1 /= len(audio_files)
129
- print(f'Overall accuracy: {overall_acc:.4f}')
130
- print(f'Overall AP: {overall_ap:.4f}')
131
- print(f'Overall F1: {overall_f1:.4f}')
132
-
133
- # write to file
134
- with open(input_dir / 'eval_results.txt', 'w') as f:
135
- f.write(f'Overall accuracy: {overall_acc:.4f}\n')
136
- f.write(f'Overall AP: {overall_ap:.4f}\n')
137
- f.write(f'Overall F1: {overall_f1:.4f}\n')
138
-
139
-
140
- if __name__ == '__main__':
141
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
MMAudio/gradio_demo.py DELETED
@@ -1,343 +0,0 @@
1
- import gc
2
- import logging
3
- from argparse import ArgumentParser
4
- from datetime import datetime
5
- from fractions import Fraction
6
- from pathlib import Path
7
-
8
- import gradio as gr
9
- import torch
10
- import torchaudio
11
-
12
- from mmaudio.eval_utils import (ModelConfig, VideoInfo, all_model_cfg, generate, load_image,
13
- load_video, make_video, setup_eval_logging)
14
- from mmaudio.model.flow_matching import FlowMatching
15
- from mmaudio.model.networks import MMAudio, get_my_mmaudio
16
- from mmaudio.model.sequence_config import SequenceConfig
17
- from mmaudio.model.utils.features_utils import FeaturesUtils
18
-
19
- torch.backends.cuda.matmul.allow_tf32 = True
20
- torch.backends.cudnn.allow_tf32 = True
21
-
22
- log = logging.getLogger()
23
-
24
- device = 'cpu'
25
- if torch.cuda.is_available():
26
- device = 'cuda'
27
- elif torch.backends.mps.is_available():
28
- device = 'mps'
29
- else:
30
- log.warning('CUDA/MPS are not available, running on CPU')
31
- dtype = torch.bfloat16
32
-
33
- model: ModelConfig = all_model_cfg['large_44k_v2']
34
- model.download_if_needed()
35
- output_dir = Path('./output/gradio')
36
-
37
- setup_eval_logging()
38
-
39
-
40
- def get_model() -> tuple[MMAudio, FeaturesUtils, SequenceConfig]:
41
- seq_cfg = model.seq_cfg
42
-
43
- net: MMAudio = get_my_mmaudio(model.model_name).to(device, dtype).eval()
44
- net.load_weights(torch.load(model.model_path, map_location=device, weights_only=True))
45
- log.info(f'Loaded weights from {model.model_path}')
46
-
47
- feature_utils = FeaturesUtils(tod_vae_ckpt=model.vae_path,
48
- synchformer_ckpt=model.synchformer_ckpt,
49
- enable_conditions=True,
50
- mode=model.mode,
51
- bigvgan_vocoder_ckpt=model.bigvgan_16k_path,
52
- need_vae_encoder=False)
53
- feature_utils = feature_utils.to(device, dtype).eval()
54
-
55
- return net, feature_utils, seq_cfg
56
-
57
-
58
- net, feature_utils, seq_cfg = get_model()
59
-
60
-
61
- @torch.inference_mode()
62
- def video_to_audio(video: gr.Video, prompt: str, negative_prompt: str, seed: int, num_steps: int,
63
- cfg_strength: float, duration: float):
64
-
65
- rng = torch.Generator(device=device)
66
- if seed >= 0:
67
- rng.manual_seed(seed)
68
- else:
69
- rng.seed()
70
- fm = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=num_steps)
71
-
72
- video_info = load_video(video, duration)
73
- clip_frames = video_info.clip_frames
74
- sync_frames = video_info.sync_frames
75
- duration = video_info.duration_sec
76
- clip_frames = clip_frames.unsqueeze(0)
77
- sync_frames = sync_frames.unsqueeze(0)
78
- seq_cfg.duration = duration
79
- net.update_seq_lengths(seq_cfg.latent_seq_len, seq_cfg.clip_seq_len, seq_cfg.sync_seq_len)
80
-
81
- audios = generate(clip_frames,
82
- sync_frames, [prompt],
83
- negative_text=[negative_prompt],
84
- feature_utils=feature_utils,
85
- net=net,
86
- fm=fm,
87
- rng=rng,
88
- cfg_strength=cfg_strength)
89
- audio = audios.float().cpu()[0]
90
-
91
- current_time_string = datetime.now().strftime('%Y%m%d_%H%M%S')
92
- output_dir.mkdir(exist_ok=True, parents=True)
93
- video_save_path = output_dir / f'{current_time_string}.mp4'
94
- make_video(video_info, video_save_path, audio, sampling_rate=seq_cfg.sampling_rate)
95
- gc.collect()
96
- return video_save_path
97
-
98
-
99
- @torch.inference_mode()
100
- def image_to_audio(image: gr.Image, prompt: str, negative_prompt: str, seed: int, num_steps: int,
101
- cfg_strength: float, duration: float):
102
-
103
- rng = torch.Generator(device=device)
104
- if seed >= 0:
105
- rng.manual_seed(seed)
106
- else:
107
- rng.seed()
108
- fm = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=num_steps)
109
-
110
- image_info = load_image(image)
111
- clip_frames = image_info.clip_frames
112
- sync_frames = image_info.sync_frames
113
- clip_frames = clip_frames.unsqueeze(0)
114
- sync_frames = sync_frames.unsqueeze(0)
115
- seq_cfg.duration = duration
116
- net.update_seq_lengths(seq_cfg.latent_seq_len, seq_cfg.clip_seq_len, seq_cfg.sync_seq_len)
117
-
118
- audios = generate(clip_frames,
119
- sync_frames, [prompt],
120
- negative_text=[negative_prompt],
121
- feature_utils=feature_utils,
122
- net=net,
123
- fm=fm,
124
- rng=rng,
125
- cfg_strength=cfg_strength,
126
- image_input=True)
127
- audio = audios.float().cpu()[0]
128
-
129
- current_time_string = datetime.now().strftime('%Y%m%d_%H%M%S')
130
- output_dir.mkdir(exist_ok=True, parents=True)
131
- video_save_path = output_dir / f'{current_time_string}.mp4'
132
- video_info = VideoInfo.from_image_info(image_info, duration, fps=Fraction(1))
133
- make_video(video_info, video_save_path, audio, sampling_rate=seq_cfg.sampling_rate)
134
- gc.collect()
135
- return video_save_path
136
-
137
-
138
- @torch.inference_mode()
139
- def text_to_audio(prompt: str, negative_prompt: str, seed: int, num_steps: int, cfg_strength: float,
140
- duration: float):
141
-
142
- rng = torch.Generator(device=device)
143
- if seed >= 0:
144
- rng.manual_seed(seed)
145
- else:
146
- rng.seed()
147
- fm = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=num_steps)
148
-
149
- clip_frames = sync_frames = None
150
- seq_cfg.duration = duration
151
- net.update_seq_lengths(seq_cfg.latent_seq_len, seq_cfg.clip_seq_len, seq_cfg.sync_seq_len)
152
-
153
- audios = generate(clip_frames,
154
- sync_frames, [prompt],
155
- negative_text=[negative_prompt],
156
- feature_utils=feature_utils,
157
- net=net,
158
- fm=fm,
159
- rng=rng,
160
- cfg_strength=cfg_strength)
161
- audio = audios.float().cpu()[0]
162
-
163
- current_time_string = datetime.now().strftime('%Y%m%d_%H%M%S')
164
- output_dir.mkdir(exist_ok=True, parents=True)
165
- audio_save_path = output_dir / f'{current_time_string}.flac'
166
- torchaudio.save(audio_save_path, audio, seq_cfg.sampling_rate)
167
- gc.collect()
168
- return audio_save_path
169
-
170
-
171
- video_to_audio_tab = gr.Interface(
172
- fn=video_to_audio,
173
- description="""
174
- Project page: <a href="https://hkchengrex.com/MMAudio/">https://hkchengrex.com/MMAudio/</a><br>
175
- Code: <a href="https://github.com/hkchengrex/MMAudio">https://github.com/hkchengrex/MMAudio</a><br>
176
-
177
- NOTE: It takes longer to process high-resolution videos (>384 px on the shorter side).
178
- Doing so does not improve results.
179
- """,
180
- inputs=[
181
- gr.Video(),
182
- gr.Text(label='Prompt'),
183
- gr.Text(label='Negative prompt', value='music'),
184
- gr.Number(label='Seed (-1: random)', value=-1, precision=0, minimum=-1),
185
- gr.Number(label='Num steps', value=25, precision=0, minimum=1),
186
- gr.Number(label='Guidance Strength', value=4.5, minimum=1),
187
- gr.Number(label='Duration (sec)', value=8, minimum=1),
188
- ],
189
- outputs='playable_video',
190
- cache_examples=False,
191
- title='MMAudio — Video-to-Audio Synthesis',
192
- examples=[
193
- [
194
- 'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_beach.mp4',
195
- 'waves, seagulls',
196
- '',
197
- 0,
198
- 25,
199
- 4.5,
200
- 10,
201
- ],
202
- [
203
- 'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_serpent.mp4',
204
- '',
205
- 'music',
206
- 0,
207
- 25,
208
- 4.5,
209
- 10,
210
- ],
211
- [
212
- 'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_seahorse.mp4',
213
- 'bubbles',
214
- '',
215
- 0,
216
- 25,
217
- 4.5,
218
- 10,
219
- ],
220
- [
221
- 'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_india.mp4',
222
- 'Indian holy music',
223
- '',
224
- 0,
225
- 25,
226
- 4.5,
227
- 10,
228
- ],
229
- [
230
- 'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_galloping.mp4',
231
- 'galloping',
232
- '',
233
- 0,
234
- 25,
235
- 4.5,
236
- 10,
237
- ],
238
- [
239
- 'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_kraken.mp4',
240
- 'waves, storm',
241
- '',
242
- 0,
243
- 25,
244
- 4.5,
245
- 10,
246
- ],
247
- [
248
- 'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/mochi_storm.mp4',
249
- 'storm',
250
- '',
251
- 0,
252
- 25,
253
- 4.5,
254
- 10,
255
- ],
256
- [
257
- 'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/hunyuan_spring.mp4',
258
- '',
259
- '',
260
- 0,
261
- 25,
262
- 4.5,
263
- 10,
264
- ],
265
- [
266
- 'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/hunyuan_typing.mp4',
267
- 'typing',
268
- '',
269
- 0,
270
- 25,
271
- 4.5,
272
- 10,
273
- ],
274
- [
275
- 'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/hunyuan_wake_up.mp4',
276
- '',
277
- '',
278
- 0,
279
- 25,
280
- 4.5,
281
- 10,
282
- ],
283
- [
284
- 'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_nyc.mp4',
285
- '',
286
- '',
287
- 0,
288
- 25,
289
- 4.5,
290
- 10,
291
- ],
292
- ])
293
-
294
- text_to_audio_tab = gr.Interface(
295
- fn=text_to_audio,
296
- description="""
297
- Project page: <a href="https://hkchengrex.com/MMAudio/">https://hkchengrex.com/MMAudio/</a><br>
298
- Code: <a href="https://github.com/hkchengrex/MMAudio">https://github.com/hkchengrex/MMAudio</a><br>
299
- """,
300
- inputs=[
301
- gr.Text(label='Prompt'),
302
- gr.Text(label='Negative prompt'),
303
- gr.Number(label='Seed (-1: random)', value=-1, precision=0, minimum=-1),
304
- gr.Number(label='Num steps', value=25, precision=0, minimum=1),
305
- gr.Number(label='Guidance Strength', value=4.5, minimum=1),
306
- gr.Number(label='Duration (sec)', value=8, minimum=1),
307
- ],
308
- outputs='audio',
309
- cache_examples=False,
310
- title='MMAudio — Text-to-Audio Synthesis',
311
- )
312
-
313
- image_to_audio_tab = gr.Interface(
314
- fn=image_to_audio,
315
- description="""
316
- Project page: <a href="https://hkchengrex.com/MMAudio/">https://hkchengrex.com/MMAudio/</a><br>
317
- Code: <a href="https://github.com/hkchengrex/MMAudio">https://github.com/hkchengrex/MMAudio</a><br>
318
-
319
- NOTE: It takes longer to process high-resolution images (>384 px on the shorter side).
320
- Doing so does not improve results.
321
- """,
322
- inputs=[
323
- gr.Image(type='filepath'),
324
- gr.Text(label='Prompt'),
325
- gr.Text(label='Negative prompt'),
326
- gr.Number(label='Seed (-1: random)', value=-1, precision=0, minimum=-1),
327
- gr.Number(label='Num steps', value=25, precision=0, minimum=1),
328
- gr.Number(label='Guidance Strength', value=4.5, minimum=1),
329
- gr.Number(label='Duration (sec)', value=8, minimum=1),
330
- ],
331
- outputs='playable_video',
332
- cache_examples=False,
333
- title='MMAudio — Image-to-Audio Synthesis (experimental)',
334
- )
335
-
336
- if __name__ == "__main__":
337
- parser = ArgumentParser()
338
- parser.add_argument('--port', type=int, default=7860)
339
- args = parser.parse_args()
340
-
341
- gr.TabbedInterface([video_to_audio_tab, text_to_audio_tab, image_to_audio_tab],
342
- ['Video-to-Audio', 'Text-to-Audio', 'Image-to-Audio (experimental)']).launch(
343
- server_port=args.port, allowed_paths=[output_dir])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
MMAudio/sets/vgg-test.tsv DELETED
The diff for this file is too large to render. See raw diff
 
MMAudio/sets/vgg-train.tsv DELETED
The diff for this file is too large to render. See raw diff
 
MMAudio/sets/vgg-val.tsv DELETED
@@ -1,2049 +0,0 @@
1
- id label
2
- --96EN9NUQM_000242 alarm clock ringing
3
- -2toZf00LvI_000012 bowling impact
4
- -8OE7Vydkl4_000221 bowling impact
5
- -AEZuuoyJug_000030 playing violin, fiddle
6
- -CUgrFw8TEI_000045 dog whimpering
7
- -CexapzRAPQ_000051 ferret dooking
8
- -DHGwygUsQc_000030 skateboarding
9
- -G-o-Y4WuaU_000139 playing harmonica
10
- -G_2v0L4U_s_000078 playing tennis
11
- -HIPq7T3eFI_000011 driving motorcycle
12
- -I8C3cRr5TY_000030 female singing
13
- -K232jBK8VQ_000030 car passing by
14
- -L_RH-nw11I_000025 vacuum cleaner cleaning floors
15
- -MfBpxtGQmE_000020 ambulance siren
16
- -NYZDjBz60I_000085 child singing
17
- -QWcNg6FCgE_000022 playing bass guitar
18
- -T06kz4MI20_000030 female singing
19
- -UJJsEdgqMQ_000011 horse clip-clop
20
- -VyLmfnIc5Q_000162 driving snowmobile
21
- -W3y3qz3yp8_000256 people eating crisps
22
- -WBvJuF2UOk_000030 playing acoustic guitar
23
- -Yep0TGjWmc_000140 subway, metro, underground
24
- -YrSxLTPdcA_000004 underwater bubbling
25
- -YwZOeyAQC8_000002 baby laughter
26
- -Zd-ZSnZ3so_000159 playing banjo
27
- -_mqzXgg5eQ_000046 ripping paper
28
- -c7lpU-_-V8_000030 motorboat, speedboat acceleration
29
- -c96lccP5nc_000200 skidding
30
- -eqkzAKGBZg_000030 playing drum kit
31
- -geN4ECfl0Q_000030 playing bass guitar
32
- -ibjrtJo9rY_000030 duck quacking
33
- -nEg1olBLcw_000030 male singing
34
- -s2G3Kto0Gw_000030 typing on computer keyboard
35
- -s6dPB8fyQQ_000030 playing electric guitar
36
- -tGOjLdrF6g_000087 playing squash
37
- -v12qcLw5u0_000187 machine gun shooting
38
- -vC3oqlxf4I_000010 slot machine
39
- -vY141CdTc4_000030 playing bass guitar
40
- -vmyjjovGXM_000116 cattle, bovinae cowbell
41
- -vra5dNsP4w_000080 playing bass guitar
42
- -w7WfMgSBD4_000047 lighting firecrackers
43
- -wJ_UfBsiR0_000280 playing accordion
44
- -xzWsDpVEiE_000060 child speech, kid speaking
45
- -yby37u00N4_000030 playing violin, fiddle
46
- -zHk3s6BkpA_000030 chainsawing trees
47
- -zZR-ps0nJY_000137 hail
48
- 0-fd-lvizrY_000024 yodelling
49
- 00eb49xIULo_000030 female speech, woman speaking
50
- 01LPFe-13Aw_000030 playing electric guitar
51
- 01W8XIz7KDM_000007 donkey, ass braying
52
- 02t6zmS4RAk_000102 playing didgeridoo
53
- 038-gneOcks_000309 people eating crisps
54
- 04m_7jCGHko_000030 wind noise
55
- 04sf3v7xOzo_000005 cat meowing
56
- 055LCXe4pR8_000012 people whistling
57
- 09qDi4Auiyo_000030 playing electric guitar
58
- 0Ca2CTVwOxs_000019 cuckoo bird calling
59
- 0CvAFdtyVlo_000023 underwater bubbling
60
- 0G0mSrzOZ2M_000400 driving buses
61
- 0IvNbabusiY_000030 playing flute
62
- 0JPlNHX2HQ8_000049 playing accordion
63
- 0Lro_JzyUX0_000030 male speech, man speaking
64
- 0McmdH07r7w_000050 playing flute
65
- 0OHWW60khJ4_000030 playing bass guitar
66
- 0PZQL-Msz0s_000030 horse clip-clop
67
- 0RFEHUrGOP0_000170 playing acoustic guitar
68
- 0SsaL_YNyjY_000030 waterfall burbling
69
- 0T4gZQwzyKY_000030 people crowd
70
- 0U_Q9JTATCk_000044 owl hooting
71
- 0WIzNXqWrZk_000204 playing hockey
72
- 0XzJKHmoN6w_000019 duck quacking
73
- 0cMnDz8SSwQ_000014 disc scratching
74
- 0dkhsBmUZSY_000030 people cheering
75
- 0fQJ9nShofs_000093 dinosaurs bellowing
76
- 0hCiGC4c97g_000033 crow cawing
77
- 0hWyQpwHNDU_000030 motorboat, speedboat acceleration
78
- 0iVM2GY3R_c_000030 ambulance siren
79
- 0kar1O-1Ckk_000114 playing french horn
80
- 0m3kYCMUuCk_000000 cattle, bovinae cowbell
81
- 0sY8RR7V_q4_000220 female singing
82
- 0tJevlglhe4_000010 railroad car, train wagon
83
- 0uHGQmkKMr0_000223 people marching
84
- 0yAboI4QC6k_000109 hail
85
- 1-2zGkXe070_000098 rope skipping
86
- 10fjkn2eM_M_000050 slot machine
87
- 12tsmtyIALQ_000009 cat meowing
88
- 13LB6yibhQ8_000009 scuba diving
89
- 1CIxzqH4zzM_000040 ice cracking
90
- 1Fp6zPswdjI_000233 tapping guitar
91
- 1JMgZaCb9WM_000204 playing steelpan
92
- 1MCjHVRBDTk_000055 slot machine
93
- 1MLUEfkJDSw_000001 beat boxing
94
- 1MPwoS-R83A_000030 cat meowing
95
- 1Mx2iDMsZj8_000018 playing french horn
96
- 1NTsWn1Gir4_000103 playing snare drum
97
- 1NvpdqTAf3U_000030 skidding
98
- 1NwFHr4VHS0_000090 playing clarinet
99
- 1RB0gsxkPBo_000020 lions growling
100
- 1RSK3TFru0g_000000 sailing
101
- 1T1PLOWu65c_000250 skiing
102
- 1TARmg2FYJQ_000010 people whistling
103
- 1V65GzuCqaw_000030 bird chirping, tweeting
104
- 1Vn7SftZxS4_000030 rowboat, canoe, kayak rowing
105
- 1WaTnza9cn0_000160 playing violin, fiddle
106
- 1YGJDa3aCGo_000289 fire truck siren
107
- 1_CC87jIhXk_000382 swimming
108
- 1acVFuCvOJg_000512 canary calling
109
- 1bBdyTowO-M_000041 parrot talking
110
- 1dO7fONpkvE_000000 people farting
111
- 1eYmBacWt3k_000027 civil defense siren
112
- 1f9IgOjZjn4_000037 rapping
113
- 1gVugA2dsi4_000332 dinosaurs bellowing
114
- 1gXDaVse3SQ_000387 planing timber
115
- 1inu4aoQFKM_000164 planing timber
116
- 1kdGia7plHk_000030 playing electric guitar
117
- 1nDhQKLRJbg_000030 playing marimba, xylophone
118
- 1p8YDM6gG6Y_000014 dog howling
119
- 1t63KIS6F4I_000070 people sobbing
120
- 1x7wVFMW4dk_000030 playing acoustic guitar
121
- 1zWc46eeWLU_000167 playing sitar
122
- 2-Ipq91ns0k_000036 playing bass drum
123
- 21OWtKgJlIE_000270 canary calling
124
- 23ky1UGWeKg_000190 playing bass guitar
125
- 26KmPM2YkmQ_000004 ambulance siren
126
- 2A5eS9kMm-U_000018 owl hooting
127
- 2CebaASg1m4_000030 male singing
128
- 2EeOU7PgSck_000030 female singing
129
- 2F2NSNlc6dQ_000030 male singing
130
- 2FNZwK-4sUA_000030 female speech, woman speaking
131
- 2Jt4iqSqNTg_000012 bird chirping, tweeting
132
- 2LBEllUpWiA_000000 volcano explosion
133
- 2MDjnJzuUaU_000015 skidding
134
- 2NIaPAfScHM_000030 motorboat, speedboat acceleration
135
- 2NjwuyNgNoE_000050 playing hammond organ
136
- 2P7ZXBq5r04_000274 playing cornet
137
- 2RPPKMapBWY_000036 ice cream truck, ice cream van
138
- 2SlVaOyh69w_000219 cattle mooing
139
- 2Sto24aXwao_000097 baltimore oriole calling
140
- 2VdOQylRl08_000002 playing lacrosse
141
- 2YIZLARm8sI_000201 parrot talking
142
- 2d43OFDr5aI_000001 frog croaking
143
- 2ehs70MWQTs_000050 waterfall burbling
144
- 2fCC4BkdMT0_000106 basketball bounce
145
- 2fn6GFSwTEw_000096 cap gun shooting
146
- 2iwPgYGH_Ew_000400 railroad car, train wagon
147
- 2jy1b77hxXc_000136 playing bass guitar
148
- 2lALVOKDQNM_000059 dog howling
149
- 2myGIZCgZ2g_000018 tractor digging
150
- 2rSFLrwcvcY_000020 pheasant crowing
151
- 2szJ9STQPUk_000030 male singing
152
- 2w6jRF1Ekhs_000130 playing sitar
153
- 2xlWTgqPUOA_000004 beat boxing
154
- 2yeuzECPVUI_000033 playing badminton
155
- 2zev5MpJKPc_000039 chicken clucking
156
- 33NCPZjFuLE_000056 playing oboe
157
- 35MtyyqqQyw_000030 playing acoustic guitar
158
- 35c4EPiZ8JM_000030 horse clip-clop
159
- 35iGp2g_U6A_000000 church bell ringing
160
- 37Tl9YROdbA_000077 playing trombone
161
- 3EcAiTE0JyE_000052 playing theremin
162
- 3JyLYEjo4ok_000000 people giggling
163
- 3LfWg5Be60Q_000163 people burping
164
- 3MOG_CAcWkw_000142 playing badminton
165
- 3NcIWxDdTW0_000050 dog growling
166
- 3O8InHTYtk0_000020 male singing
167
- 3Okx0T5vpFc_000192 airplane flyby
168
- 3OxJ7KtIb2A_000100 playing saxophone
169
- 3QHNbJ_XATY_000036 civil defense siren
170
- 3S2-TODd__k_000090 train horning
171
- 3VK-nOg0-RQ_000046 pheasant crowing
172
- 3VSUuTABb3U_000074 wind chime
173
- 3WUTEMZv3EI_000046 slot machine
174
- 3YuBzhAU_Yc_000000 race car, auto racing
175
- 3cMrwXYnjd4_000026 air horn
176
- 3d5tPNd4Olk_000020 wind noise
177
- 3dBQbWPOjjI_000030 playing acoustic guitar
178
- 3djcJkGeJK8_000293 running electric fan
179
- 3e8ECt9wF5Y_000015 playing saxophone
180
- 3en9IzSPnNU_000027 driving snowmobile
181
- 3gTMehPiQ9s_000150 playing harpsichord
182
- 3kXROE2wcRA_000069 bowling impact
183
- 3p9aVzs8aYA_000030 female singing
184
- 3u3iunnXAOs_000432 playing hammond organ
185
- 3wboiuBfavA_000172 people nose blowing
186
- 3yolbg1tH9U_000030 male singing
187
- 4-_AWdbZnzE_000005 playing trombone
188
- 42Iss6TfcpQ_000742 lip smacking
189
- 433xsSMNLf4_000070 playing electronic organ
190
- 43ijm8y4z2o_000030 horse clip-clop
191
- 44UMQ5ZFuuY_000030 engine accelerating, revving, vroom
192
- 457yRHL0f2E_000030 female singing
193
- 45iXudFVQ4E_000000 subway, metro, underground
194
- 46LjKw-7mU0_000030 male singing
195
- 47QYxqXGZ3w_000244 people shuffling
196
- 47SP2azKv8Q_000030 playing electric guitar
197
- 47YlecLyyK0_000030 playing acoustic guitar
198
- 47y5k6vaUxE_000089 francolin calling
199
- 49gi-iYJ1F0_000107 tap dancing
200
- 4CLnZSI8aPs_000092 hair dryer drying
201
- 4DcOTOS_LE0_000454 sliding door
202
- 4DzuWR9ekko_000000 playing bugle
203
- 4E6mA8Y2Be0_000060 using sewing machines
204
- 4FOFcRJR9go_000084 playing glockenspiel
205
- 4H29LCZTMBs_000050 using sewing machines
206
- 4K345_DRFRk_000056 playing volleyball
207
- 4Ofe_ManxZc_000047 playing french horn
208
- 4OxCr981HvY_000016 ice cracking
209
- 4SlcVylJxxk_000297 arc welding
210
- 4WGMFP00rIg_000030 playing acoustic guitar
211
- 4YnMOFstVnk_000066 parrot talking
212
- 4_QGupz8UNA_000189 hail
213
- 4aFirNGu_P8_000381 planing timber
214
- 4dhyddSUAWg_000175 police radio chatter
215
- 4dkU-c4g1VM_000111 dog barking
216
- 4h9o2iL6nps_000050 child speech, kid speaking
217
- 4hU6jqQQUto_000009 playing harpsichord
218
- 4iBqpFUnPoA_000170 fireworks banging
219
- 4j7GbxZQjB8_000024 car engine knocking
220
- 4jHrFbnaVRc_000294 firing muskets
221
- 4kvqtJEFqjw_000190 playing bagpipes
222
- 4ldID97D-oU_000020 people coughing
223
- 4n657Imjmjo_000015 sheep bleating
224
- 4o2IRyXi-aY_000667 playing harpsichord
225
- 4rehS_cPodk_000020 female speech, woman speaking
226
- 4t_Qz9RyUm8_000006 alarm clock ringing
227
- 4yUvIrchOzQ_000280 playing saxophone
228
- 4zf3qRiZ3Ok_000030 child singing
229
- 4zsLfdNLUD4_000033 cat hissing
230
- 50OgBbJZUUc_000064 typing on typewriter
231
- 50jxPCLUFdU_000002 cricket chirping
232
- 53ohFLBl0iE_000052 alarm clock ringing
233
- 542uea0zO1I_000036 sea lion barking
234
- 54XBPEFJQc4_000076 playing djembe
235
- 574NjiOGi5s_000030 female singing
236
- 58KzLvK1OYs_000144 dog growling
237
- 5901zjV6oAo_000006 swimming
238
- 5AFKEd8nSpg_000050 people sniggering
239
- 5CtoZvJaGAM_000096 woodpecker pecking tree
240
- 5D201VjroT0_000229 sharpen knife
241
- 5D2E7s9bEf0_000010 basketball bounce
242
- 5EPnuy_sKHI_000010 singing bowl
243
- 5IZv217s4_E_000049 playing badminton
244
- 5KRxqVykvvI_000030 printer printing
245
- 5S3QDnRCnOQ_000003 tapping guitar
246
- 5Sv97J7mksY_000030 playing electric guitar
247
- 5UqwkZ1XK18_000050 helicopter
248
- 5VyCTHzLVdU_000011 playing bongo
249
- 5WVhslWt1wU_000030 female singing
250
- 5Wb1zMq_DiU_000020 fireworks banging
251
- 5XK1Vgiwllc_000073 playing mandolin
252
- 5X_B2L1-4Bc_000030 playing electric guitar
253
- 5briopN06L8_000000 playing piano
254
- 5eHlhJ-ZOpg_000030 playing hammond organ
255
- 5fZn_7LbKSI_000020 people burping
256
- 5hi4T4Gp6v4_000002 air horn
257
- 5hjKe0FWq9E_000002 horse neighing
258
- 5iEbFJkG6Xg_000557 bird squawking
259
- 5jQLK4Z1EH4_000020 wind noise
260
- 5jt7lR8WY3g_000172 playing castanets
261
- 5lV59hZgwRM_000009 scuba diving
262
- 5mBCF05DV5s_000280 church bell ringing
263
- 5mJ7_05tlhs_000005 crow cawing
264
- 5nscL4EBrXA_000030 male singing
265
- 5r1zW38AWvs_000057 wind chime
266
- 5rP9Z4jEq6s_000024 cap gun shooting
267
- 5xJdFysNSf4_000110 race car, auto racing
268
- 5xefixXFNwk_000020 playing bass guitar
269
- 64eXDlUgPoA_000079 lighting firecrackers
270
- 64lQIoDGX6o_000040 playing marimba, xylophone
271
- 64ollREPrUw_000132 raining
272
- 64zPbHPyiwE_000030 male speech, man speaking
273
- 659mhmSPXWA_000276 bee, wasp, etc. buzzing
274
- 65u3pwOEcBg_000002 frog croaking
275
- 67hDkeQalow_000030 motorboat, speedboat acceleration
276
- 68mXCuRvQkw_000045 people burping
277
- 6ARTjahUaYY_000030 playing electric guitar
278
- 6BQgJ0tvUkc_000162 baby babbling
279
- 6CYhRsU4F34_000000 people whistling
280
- 6EcmHiscsOc_000287 lighting firecrackers
281
- 6GsamqJ5tFU_000075 airplane flyby
282
- 6IMlkVOKxJw_000032 cap gun shooting
283
- 6IQkdce9a7Q_000184 slot machine
284
- 6KO3eMyEeOg_000000 race car, auto racing
285
- 6LB-qRj_zW4_000030 horse clip-clop
286
- 6LKFDTu9vRQ_000018 playing french horn
287
- 6NxeHScEnJE_000000 dog bow-wow
288
- 6OhTwJrVxXs_000028 playing timbales
289
- 6RGa6DvWpt0_000035 people marching
290
- 6UcuQgsHFCA_000142 playing french horn
291
- 6Y6CvX7EP68_000030 singing choir
292
- 6ZbVXBeNsX8_000125 playing didgeridoo
293
- 6aYfccsgIjk_000094 baby crying
294
- 6gTR_Avjz6g_000170 playing cymbal
295
- 6j2g_OZnW74_000189 missile launch
296
- 6mE_v9a5dbM_000030 male singing
297
- 6o0mZVMfKss_000140 people clapping
298
- 6of3tx7IOik_000030 wind noise
299
- 6shIFnN-LsY_000141 playing flute
300
- 6v53uAVpXC4_000071 people babbling
301
- 6wpifZcwOJU_000023 underwater bubbling
302
- 6xAClSJ21qA_000491 rapping
303
- 6xgTrufXcCM_000126 wood thrush calling
304
- 6yBZH5cV7GE_000030 playing electric guitar
305
- 6z_pfZ6Rvfs_000023 playing table tennis
306
- 7-7r-FRwp_w_000041 playing glockenspiel
307
- 72d2TsdeSg8_000000 tap dancing
308
- 75FLwnGZJTc_000125 playing oboe
309
- 75m0cvRBGY0_000030 vehicle horn, car horn, honking
310
- 76F-K-7HUXE_000010 lions roaring
311
- 77rq4-p4vV8_000030 wind noise
312
- 78hdsP0edMg_000030 railroad car, train wagon
313
- 7Ck8cfF2rl0_000200 otter growling
314
- 7ELF2dbWe5w_000010 female singing
315
- 7I_wdG-eOc0_000106 playing hammond organ
316
- 7JT43yyNGkk_000003 black capped chickadee calling
317
- 7JX-Bx0BETQ_000205 rapping
318
- 7LMkG7uISis_000102 playing gong
319
- 7MuetSj86N0_000490 bird squawking
320
- 7NyPcaVKao4_000025 dog growling
321
- 7Odi8SKArQI_000030 playing saxophone
322
- 7P-1-qzwyYA_000055 magpie calling
323
- 7Qr1ncg86N4_000007 lions roaring
324
- 7TMOCRG4EBA_000030 female singing
325
- 7U5V5Teqo8Q_000000 dog barking
326
- 7V6NAsZ86xw_000000 beat boxing
327
- 7VT8p9Er3n8_000020 mynah bird singing
328
- 7Y3u8Aj8UV4_000010 driving motorcycle
329
- 7YGUQYRwnHs_000019 horse neighing
330
- 7YTsyqVSEeI_000006 child singing
331
- 7b__KH3VA_o_000035 people booing
332
- 7caL9c6N1zc_000122 child singing
333
- 7gdSJ30FfNU_000490 people hiccup
334
- 7h7_U2q-VwY_000276 dog baying
335
- 7hdXzJpOXiY_000018 police car (siren)
336
- 7iT77hG1X18_000063 playing erhu
337
- 7kIhqlZok8c_000074 running electric fan
338
- 7kIhqlZok8c_000241 running electric fan
339
- 7lz-THXCwi8_000030 male speech, man speaking
340
- 7ogdSWU90s4_000100 opening or closing drawers
341
- 7pc3c5ZGbwo_000030 ocean burbling
342
- 7rLRSpEqgZk_000253 playing sitar
343
- 7tsuYUeV7_k_000191 airplane flyby
344
- 7vF2Qq0Pg6w_000024 ice cream truck, ice cream van
345
- 7xZxYm27FdA_000020 toilet flushing
346
- 7xaNqQ8FAwI_000100 mynah bird singing
347
- 7yBOHsPAJgw_000040 vacuum cleaner cleaning floors
348
- 7yXROxIZfeo_000053 raining
349
- 80KT6bYCFkg_000077 playing tuning fork
350
- 81ACguOEqoM_000042 electric shaver, electric razor shaving
351
- 82ic2Xisrqg_000030 car engine knocking
352
- 83mmLOdwZlA_000081 air conditioning noise
353
- 85Nd7APr5Os_000028 ice cracking
354
- 85Nd7APr5Os_000052 ice cracking
355
- 87-ZrpDyRHE_000238 cat hissing
356
- 88oLbuKd7Rg_000030 car passing by
357
- 8906Y6i-h10_000102 playing cymbal
358
- 89NzFtLSRSo_000030 engine accelerating, revving, vroom
359
- 8DDro-N5-54_000029 swimming
360
- 8GTkmen1bBg_000110 playing piano
361
- 8IdUE6nhR3E_000030 playing violin, fiddle
362
- 8N0GxZtk9wE_000051 playing didgeridoo
363
- 8NMHjXutgVs_000333 electric shaver, electric razor shaving
364
- 8Rh7NvJDexA_000068 tapping guitar
365
- 8VEqGk0W4xY_000192 playing darts
366
- 8XH7xIWWC6c_000090 people cheering
367
- 8Y9VKxl-1gE_000063 rapping
368
- 8ZWKl-_qHM0_000010 driving buses
369
- 8_xdWIziFpI_000030 baby crying
370
- 8b2ASj5nmos_000251 playing darts
371
- 8c9PJLozdtA_000020 playing bass drum
372
- 8jkr7bOR8ck_000146 playing table tennis
373
- 8lIh0qRN7PE_000220 cattle mooing
374
- 8m7VIFtS4gc_000000 typing on typewriter
375
- 8n76LfbY3qo_000232 chainsawing trees
376
- 8ngu3TPmfZQ_000110 playing drum kit
377
- 8ugZkKeLL7Y_000030 male speech, man speaking
378
- 8vE2wod7rhE_000030 horse neighing
379
- 8ytjUazIdno_000023 playing glockenspiel
380
- 9-N8v-cC0Tg_000002 air horn
381
- 9-xW047dMpk_000125 missile launch
382
- 913ItBzDHLQ_000124 playing synthesizer
383
- 91kWVMnyKxA_000019 bird squawking
384
- 92G0bdxj5ck_000091 mouse pattering
385
- 93a7wS41kLc_000060 splashing water
386
- 93rlsDDmFYo_000110 playing cymbal
387
- 95UKs8K92C4_000110 playing timpani
388
- 96wdXcwIbgk_001238 playing bongo
389
- 97svDuqFctI_000139 playing steel guitar, slide guitar
390
- 98Nc3x8U1JI_000187 playing badminton
391
- 993A2y5lv-s_000030 bird chirping, tweeting
392
- 99WZAe6QKUc_000030 people whispering
393
- 99cGCS0ko2Q_000120 playing saxophone
394
- 99ylFYthGcI_000004 donkey, ass braying
395
- 9A8hgZdD__g_000030 horse clip-clop
396
- 9CFR1VdlIMc_000142 rope skipping
397
- 9CmEsDtIz_Q_000060 playing accordion
398
- 9D9sfe1eaK8_000000 frog croaking
399
- 9HtYErt1moA_000100 playing drum kit
400
- 9IwjfATt51Y_000030 playing french horn
401
- 9JwaE3BmICE_000061 female speech, woman speaking
402
- 9LFGIpAO3NE_000161 tapping guitar
403
- 9LY2BJ2fqts_000090 people gargling
404
- 9Q1RM-pY2yY_000180 playing bongo
405
- 9UvWyax1fEU_000000 people booing
406
- 9Y3ausHODlk_000557 playing electronic organ
407
- 9ZCgk2e7wZM_000269 woodpecker pecking tree
408
- 9ZE18L9NN1Y_000105 striking pool
409
- 9af_fvuAY8E_000167 barn swallow calling
410
- 9exZEq85L1k_000260 playing trombone
411
- 9fvJeyH-4II_000100 fireworks banging
412
- 9gJ4NQYcakk_000030 male speech, man speaking
413
- 9k0OwVahe5Y_000066 playing cymbal
414
- 9pBp5wd9rpw_000043 firing cannon
415
- 9s6jvP1V56w_000080 people crowd
416
- 9t7YT0OKpaM_000130 playing cello
417
- 9tyf9HGsIe4_000000 people finger snapping
418
- 9zriIjvwqJw_000480 people burping
419
- A08MfFxzmxo_000030 playing accordion
420
- A0tXM5fSFrw_000062 alligators, crocodiles hissing
421
- A1-T0wdI8Nw_000070 sloshing water
422
- A1vf6We9a_Q_000290 mouse pattering
423
- A2GEU2r5KnQ_000030 playing acoustic guitar
424
- A551qSirV68_000298 alligators, crocodiles hissing
425
- A55rHYLkwQk_000000 singing bowl
426
- A95vqV9oM6g_000081 people marching
427
- A9pIMNKQWCk_000005 footsteps on snow
428
- AApTo3l6NfA_000030 train horning
429
- AFmF56HVvVg_000151 cat purring
430
- AI3om1uyCH0_000009 fire truck siren
431
- AJhEl41TC5s_000443 lathe spinning
432
- AM8-hH1Oahw_000510 driving buses
433
- AOPgsB4hsH8_000206 electric grinder grinding
434
- ARFBC4LeCFY_000019 chicken clucking
435
- ARrb06s5a0Y_000082 donkey, ass braying
436
- AUufm-TAVg8_000101 playing french horn
437
- AW-JhveJXFw_000030 typing on typewriter
438
- AXDomj6KnkE_000141 playing tabla
439
- AZgG_6NE8j4_000240 parrot talking
440
- Ac0OxSV8Nqk_000030 female speech, woman speaking
441
- ActIkLSW20Y_000350 railroad car, train wagon
442
- AefLmdFYR6k_000136 playing tambourine
443
- AfrcYQw5mXw_000010 telephone bell ringing
444
- Agl-AQmIYBE_000030 rowboat, canoe, kayak rowing
445
- AhUYTb14QZU_000020 chimpanzee pant-hooting
446
- AiriN8WOgiI_000123 playing bass drum
447
- AjD1BiY0o8E_000001 pheasant crowing
448
- AlTNj6IWey4_000112 airplane flyby
449
- AlebU-Vdy18_000022 playing theremin
450
- ApMojxDfms0_000273 magpie calling
451
- AteFCZfJfLY_000011 vehicle horn, car horn, honking
452
- AvguIvLb0GY_000027 electric shaver, electric razor shaving
453
- Aw5BwrqdmHc_000010 canary calling
454
- Aw9arRIoBR4_000030 sliding door
455
- Aygl9-ur8NU_000001 gibbon howling
456
- Az1M0iLYjIg_000030 ice cream truck, ice cream van
457
- B1ax5dX6XrU_000215 wood thrush calling
458
- B7cF_In3_-c_000030 horse neighing
459
- B7zgWPjx8hg_000026 wind noise
460
- B9Mk5n5Zwjg_000240 driving buses
461
- BBumD37-y80_000110 train horning
462
- BC4LglYv70Q_000108 playing oboe
463
- BH8QYqAvO2k_000020 playing vibraphone
464
- BJ31LCL3Dy4_000100 crow cawing
465
- BLWCHd07ATw_000080 playing electronic organ
466
- BM1fw080pSs_000030 female speech, woman speaking
467
- BM4YyahEm8Q_000078 spraying water
468
- BO1K4wXy2CI_000299 mosquito buzzing
469
- BPD7Qj1U_Bo_000131 playing theremin
470
- BPxW7nP4loQ_000003 eagle screaming
471
- BTdIM1mncyA_000030 female speech, woman speaking
472
- BWw6dgq07Qo_000053 playing clarinet
473
- BYJ2UHIHCLU_000009 hail
474
- BbUBFko93XE_000031 people sneezing
475
- BbfDej2cM2I_000001 volcano explosion
476
- BdIWeKYKIzk_000000 train horning
477
- BfyYYuE12dw_000006 goose honking
478
- BkXyLmdb8Yw_000290 playing hammond organ
479
- BkwStRX3xE0_000010 fireworks banging
480
- BmsTQHrCwB8_000215 heart sounds, heartbeat
481
- BnRtoIC87Po_000030 female speech, woman speaking
482
- BniijKHywXM_000103 playing tambourine
483
- Bo271H1XM40_000127 arc welding
484
- BpV7n-YUtos_000248 rope skipping
485
- BpfM3evN6H8_000009 people eating apple
486
- BsHgr_sj6ec_000058 playing bongo
487
- BvdvbeIdUtk_000072 people booing
488
- C1fdGRZRPtU_000040 barn swallow calling
489
- C2kKMjYETRQ_000050 playing acoustic guitar
490
- C5ik_rcugw8_000040 people marching
491
- C7zLftUgskY_000035 playing harp
492
- C85lxZAStBk_000056 playing hammond organ
493
- CA3sbGHEE3c_000001 people screaming
494
- CFazHdGsxcU_000155 lighting firecrackers
495
- CG-2XtQI6sM_000000 cat hissing
496
- CG_qvz_V1Jo_000374 playing gong
497
- CJjTs72p1gI_000002 alarm clock ringing
498
- CKwtP-eN1Zk_000014 striking pool
499
- CUY3hob5V_o_000010 opening or closing car doors
500
- CUqga7lwvfM_000080 subway, metro, underground
501
- CVVrs_KA6sU_000030 rowboat, canoe, kayak rowing
502
- CViouHw-mfQ_000122 playing cello
503
- CYTTsSPohw0_000122 planing timber
504
- CdcfD8mg-k4_000030 singing choir
505
- CeD6RlRSr8M_000099 cat purring
506
- Cg3XzrFzzpM_000060 playing accordion
507
- CgevwvZLE3c_000219 running electric fan
508
- CheeUmf4IhE_000030 fireworks banging
509
- ChhTVgWMxiI_000030 duck quacking
510
- CiLwbeRDj8E_000000 people crowd
511
- CjwqjkkoJHY_000199 car engine starting
512
- CkYUBci5xEM_000070 using sewing machines
513
- Cntxv6aE3DY_000030 sliding door
514
- Co1qXvuwkes_000146 arc welding
515
- CpW7umx_bi0_000067 playing mandolin
516
- CqgPmVXNdNQ_000058 striking pool
517
- Csr7c9uFvQk_000028 dog whimpering
518
- CvN_oC0AGvM_000340 toilet flushing
519
- Cvgc82TDNnE_000025 lions roaring
520
- CvxL2n9DX6w_000251 lighting firecrackers
521
- CyW4FoAJ1MU_000260 police car (siren)
522
- CzGGyIj84Hs_000030 pigeon, dove cooing
523
- D-HXQTcZNGU_000130 female speech, woman speaking
524
- D109-sQNo1k_000028 sliding door
525
- D6BCygx6jcs_000000 dog howling
526
- D7kL3EEOyR4_000050 helicopter
527
- DAy_bV1d9c4_000046 playing squash
528
- DGU-HbuX6rs_000230 people crowd
529
- DJan9OSSF7c_000060 plastic bottle crushing
530
- DO-9yuU9brk_000028 sea lion barking
531
- DOi5UxxTknA_000041 driving snowmobile
532
- DPpo_Whnuqc_000022 missile launch
533
- DQelhAtUyHY_000030 playing electric guitar
534
- DR7TdSc2ahQ_000030 sliding door
535
- DSThhOKXU-c_000250 playing bass guitar
536
- DSgKhbtDWWo_000400 playing flute
537
- DX5_AglGFMw_000349 metronome
538
- DXfTYgSGLac_000177 alligators, crocodiles hissing
539
- DZo15IMYpmA_000206 vacuum cleaner cleaning floors
540
- DaMG8zJSkuw_000100 playing trumpet
541
- DbgRhWmYTJk_000002 frog croaking
542
- Dbi2L5z8U-w_000020 driving motorcycle
543
- DdZ6PSUQoQA_000050 female speech, woman speaking
544
- DfZmOeeF_CI_000024 lathe spinning
545
- DhaOFNnOC8o_000102 playing steel guitar, slide guitar
546
- DhxWWDGdF8I_000159 frog croaking
547
- Dj3sIimPrCk_000330 pigeon, dove cooing
548
- DmSsL0Xde-I_000005 missile launch
549
- DpIKdB4c_JU_000030 sloshing water
550
- DqnMEAN1GVc_000098 baby laughter
551
- DrPa82cqlSM_000008 playing mandolin
552
- DroorVxOn5s_000030 engine accelerating, revving, vroom
553
- DsVtCIaWv-Y_000377 hedge trimmer running
554
- DtRqBLRUTRo_000069 playing clarinet
555
- Dtiv9RNaA4U_000106 train horning
556
- Duk5ikgbUfU_000030 playing violin, fiddle
557
- DuyL15HJn6M_000036 elk bugling
558
- DvascfU3OM4_000233 playing bongo
559
- DwE0cQ3Xz70_000030 chainsawing trees
560
- Dxxg6NenmBQ_000153 playing oboe
561
- E0ocfyjk1lw_000129 scuba diving
562
- E22HBR9rEkI_000030 lawn mowing
563
- E22UuQ6SRf4_000001 fire truck siren
564
- E4IHTinI-3k_000010 people whistling
565
- E4dvhMWr7K0_000140 playing didgeridoo
566
- E5ICgH7JVFI_000003 driving snowmobile
567
- E67GhkgB8Jc_000033 cell phone buzzing
568
- E6tu_4cO7ok_000107 playing cornet
569
- E8LoFlcAC-M_000051 playing vibraphone
570
- EDtJ88ZJtWo_000008 playing bagpipes
571
- EEJp_Ssp0No_000004 dog howling
572
- EG2bfvkpzjk_000136 playing steelpan
573
- EGKE_rOo-Gg_000030 playing violin, fiddle
574
- EHHBn9EAtg4_000040 people booing
575
- EHHefsog-aM_000069 black capped chickadee calling
576
- EHkkma0y1T8_000030 people sneezing
577
- EJudk9RWsZI_000000 car engine starting
578
- EKkFWhdVAOU_000032 woodpecker pecking tree
579
- ETcwLdOldMg_000000 blowtorch igniting
580
- EU3OmHbOUo0_000000 cattle mooing
581
- EbnPPw9P3MQ_000409 snake rattling
582
- Ee1Glgpx3YE_000038 scuba diving
583
- EeUHgSkCSi8_000666 turkey gobbling
584
- EhDl29RiF74_000085 black capped chickadee calling
585
- EhaE7gijT78_000119 baby laughter
586
- EkbcNbEn1Z8_000063 opening or closing car doors
587
- EoubRuwDlrw_000038 canary calling
588
- ErdH1gc3ZmU_000003 playing cornet
589
- F186zkBSFjE_000110 helicopter
590
- F1ZVQSywml4_000040 skateboarding
591
- F3yETAYfYZg_000009 playing theremin
592
- F6xLA2AA2GA_000090 people crowd
593
- FAdeuN1uc-M_000230 subway, metro, underground
594
- FCir2lQei8M_000030 playing harpsichord
595
- FEltES9TUEU_000008 hammering nails
596
- FGWcwpr_SeM_000133 fire truck siren
597
- FGoXt7LIK3U_000010 police car (siren)
598
- FHz8YQy4q5A_000027 tractor digging
599
- FIPu0jd8I28_000030 people screaming
600
- FIpCyWCy9Qc_000030 playing violin, fiddle
601
- FRxNI559-Xs_000280 railroad car, train wagon
602
- FUVXK29tUwQ_000000 owl hooting
603
- FWuYLFTe3_8_000000 playing trombone
604
- FXzP5bUz-Lo_000017 horse clip-clop
605
- FauD2eg73V8_000030 playing electronic organ
606
- Fd_SXrGw6ag_000030 playing marimba, xylophone
607
- Fe9YJozRi78_000148 child singing
608
- FfpD5XC8b5w_000137 playing bongo
609
- FglnuP1jpRY_000030 playing cello
610
- FhHBIlZ_5T8_000035 wood thrush calling
611
- Fj34VCzy_Og_000030 horse clip-clop
612
- Fpqf057G_SY_000000 chipmunk chirping
613
- FpwtNUX45qU_000047 snake hissing
614
- Frs4_Uf8Tq4_000127 ice cracking
615
- FtCT62fiyrU_000270 church bell ringing
616
- FtNV_Gq62l8_000019 cat meowing
617
- FudSk5EUbAY_000156 playing ukulele
618
- FvZqgCIbO2Q_000003 hail
619
- FyszP9lfbDk_000001 playing didgeridoo
620
- G-5AgMNzjv4_000017 vehicle horn, car horn, honking
621
- G-Eokh465wM_000030 printer printing
622
- G-IdABSxeHI_000097 dinosaurs bellowing
623
- G-jsAK9ITwM_000030 ocean burbling
624
- G6FhQuR3_88_000000 playing congas
625
- G6nSnVQCxBQ_000095 elephant trumpeting
626
- G7E7D2Z_Juo_000070 people burping
627
- G7F8HVNw1lI_000081 scuba diving
628
- G9AKWSzZtWI_000030 people eating
629
- G9F38sObAns_000025 playing harpsichord
630
- GAFJeF_AqZA_000086 hail
631
- GBf5DgubSuE_000030 wind noise
632
- GD8dVFZaWNU_000030 skateboarding
633
- GDQjuDpqnJI_000030 wind noise
634
- GL1TqKjpv1Q_000047 playing theremin
635
- GLA-upuVPSA_000057 police radio chatter
636
- GLtFkIbCZOY_000140 pigeon, dove cooing
637
- GMNJCJ0ykfc_000050 male singing
638
- GOFDdcvXq40_000030 goose honking
639
- GPl4twCSrLQ_000001 coyote howling
640
- GS_JqZCyqOc_000050 stream burbling
641
- GT2frI8BMMM_000013 vehicle horn, car horn, honking
642
- GTZkjw4aVn0_000030 engine accelerating, revving, vroom
643
- GUSlicDnqIA_000045 playing congas
644
- GX4kLN3hW4Y_000149 planing timber
645
- GXIPKWMIVhs_000072 playing oboe
646
- GXRHmy5Bqas_000008 vehicle horn, car horn, honking
647
- GYJCyn2piCc_000329 lip smacking
648
- GZoVDjx9ltQ_000235 playing erhu
649
- GZoypVKRpCo_000003 cuckoo bird calling
650
- G_hP5gvRfNw_000033 cat growling
651
- GaFqib8bCLM_000019 tapping guitar
652
- GbTzdC4mOtQ_000030 machine gun shooting
653
- GbUoljsX3lg_000672 people gargling
654
- GgUkhedV5e0_000190 female speech, woman speaking
655
- GhizOxu0ZpI_000060 people belly laughing
656
- GidBfE5JU3s_000005 vehicle horn, car horn, honking
657
- GjKjnplphn4_000200 playing acoustic guitar
658
- Gjide6V8U-E_000039 dog growling
659
- GoMH9AL7YRA_000050 ambulance siren
660
- GvPc1ncg0OY_000138 people booing
661
- Gwp62TNrER0_000014 barn swallow calling
662
- GxUovR3d2aM_000019 car engine knocking
663
- GzAdcTtwkM0_000011 missile launch
664
- H-ZKdWCEhbI_000140 fire crackling
665
- H-jnsSCa-c8_000090 playing lacrosse
666
- H-rd3O5haG8_000070 playing bass guitar
667
- H0zmJjMoV-4_000012 playing squash
668
- H1lx8lLLceQ_000120 machine gun shooting
669
- H2r4JHm00Vg_000260 sheep bleating
670
- H6onyc5r6os_000024 heart sounds, heartbeat
671
- H6z_gPH8m2A_000055 people crowd
672
- H7BcUVlPDsg_000026 parrot talking
673
- HCPKDz63_s4_000000 child speech, kid speaking
674
- HCuORBJf-Ho_000027 playing cornet
675
- HL36YvzbFYs_000210 goose honking
676
- HL_E1j069EI_000030 female speech, woman speaking
677
- HOD29VAXJD8_000030 car engine knocking
678
- HOI0ZaKLAMM_000030 fireworks banging
679
- HOI7KapLzz4_000030 playing violin, fiddle
680
- HOupeg-QhHk_000073 yodelling
681
- HQSafj2aCNI_000100 playing banjo
682
- HQlV2jYCz5k_000030 playing violin, fiddle
683
- HR35d67Dhts_000108 singing bowl
684
- HRaGv5q3P3E_000000 opening or closing drawers
685
- HTRRMT1NQOc_000060 playing saxophone
686
- HTqUtEGJ0As_000030 people whistling
687
- HUP72tlgzyE_000066 playing badminton
688
- HUWvhtKby-A_000033 car engine knocking
689
- HW2o3t3fE_k_000062 francolin calling
690
- HX2ccFGAuMU_000163 electric shaver, electric razor shaving
691
- HX5BeffFwV0_000008 smoke detector beeping
692
- HaABMNzUOvo_000030 wind rustling leaves
693
- Hakqd6g2jaY_000110 helicopter
694
- HcO60nHH4W0_000023 playing bass drum
695
- HckqMrtU3dg_000133 playing double bass
696
- HebxWsaO-LA_000115 train whistling
697
- HkCt4hh_x58_000030 rowboat, canoe, kayak rowing
698
- HlUvoEXQZYk_000007 playing tambourine
699
- Hlp5qKMfdYk_000180 playing bass guitar
700
- Honj-TQHx3U_000129 airplane
701
- Hqhi7LioGyM_000030 playing marimba, xylophone
702
- HsCj9l5Barg_000045 fire truck siren
703
- HsX5XlPFOWI_000380 lawn mowing
704
- Hum53_V1zw8_000001 wind noise
705
- Hwp_62TYhDk_000110 playing marimba, xylophone
706
- I-WMZh-ieC8_000280 playing harp
707
- I-qeWJGSXuQ_000083 playing bassoon
708
- I4ffG1Bh-d8_000156 playing oboe
709
- I5wV1AFabIA_000029 frog croaking
710
- I6_30m_TQ2o_000000 playing tuning fork
711
- IBy30oL3yxw_000399 playing harpsichord
712
- ICajcUYAan8_000410 people babbling
713
- IEiseWb8Tao_000080 playing acoustic guitar
714
- IF92YmTMtdk_000089 cattle, bovinae cowbell
715
- IFGbGcs3bQQ_000034 chinchilla barking
716
- IHaWOJuekYY_000109 tap dancing
717
- IINqN6L2NsY_000285 tapping guitar
718
- IJvYFkrfjBg_000049 tornado roaring
719
- IKj9E33H8e8_000012 pig oinking
720
- ILBWV9AFKDU_000115 playing ukulele
721
- IN-9DFoS3fM_000007 bird squawking
722
- IWVztd9QsXg_000005 owl hooting
723
- IWhgJgeUQuA_000090 playing bagpipes
724
- IYhq5aun18M_000181 police radio chatter
725
- IZAasx5KIKE_000010 fireworks banging
726
- IaAKobKeOtU_000271 people marching
727
- IeD5tKVhuI4_000030 playing synthesizer
728
- IeK6EDl8Z_k_000033 people clapping
729
- IeW36MTcnBs_000117 dog growling
730
- Ieca4fwxfyY_000049 tractor digging
731
- IicM8tOXAFg_000146 pheasant crowing
732
- Ik40yoz30vE_000068 woodpecker pecking tree
733
- Il82kphC6es_000172 dinosaurs bellowing
734
- IlLCyGNjG3M_000060 playing harp
735
- InxgcOFzxWY_000070 chicken clucking
736
- IpDU10kKguU_000311 vacuum cleaner cleaning floors
737
- IqCRbzhPkvU_000000 lawn mowing
738
- IrcX151sayY_000098 tapping guitar
739
- IrkyGrHjygY_000020 tapping guitar
740
- Irx-WWFsQYU_000667 people eating
741
- ItnOPd_CktY_000020 people coughing
742
- IuTgZQVcMBg_000007 sloshing water
743
- Ivho6H4q1zk_000017 typing on typewriter
744
- Iylzuk-0j64_000163 slot machine
745
- J0ZBjy_EEtg_000015 people clapping
746
- J18R3qBnJtA_000120 waterfall burbling
747
- J1kAKMeULF8_000500 subway, metro, underground
748
- J3K5HEX3gko_000030 playing banjo
749
- J4VeWujsLJg_000030 typing on computer keyboard
750
- J5ugw2GUbnY_000001 dog whimpering
751
- J7fVkoC-Ha8_000711 people eating crisps
752
- J82OaPeyioI_000030 horse clip-clop
753
- JC33o6YxH9c_000220 playing piano
754
- JHNBF0WJ-EM_000029 people belly laughing
755
- JIdUC1zZb9M_000060 rowboat, canoe, kayak rowing
756
- JK4YikH2myA_000161 playing vibraphone
757
- JKrghKg6UBU_000260 ocean burbling
758
- JKxdjXEI9Wc_000015 eagle screaming
759
- JLPpMZlBOEI_000038 playing accordion
760
- JQ3bFZbatGk_000030 people running
761
- JQr-BRXrjN4_000002 airplane
762
- JVevxopJjU8_000823 playing tabla
763
- JXi1ZtJecYo_000001 bowling impact
764
- J_k6z7_YVJU_000090 playing piano
765
- Jbiig_IQdIo_000282 cap gun shooting
766
- JcXhB_4B32o_000090 playing clarinet
767
- JeJGThFGm80_000001 lighting firecrackers
768
- JfAjUMKjoVI_000460 playing harp
769
- JfiFq8tn5Pk_000009 playing steel guitar, slide guitar
770
- Jk-SBbw7Afg_000140 driving buses
771
- K-B9CIVeQ_U_000030 horse clip-clop
772
- K-MCXLQmnFA_000004 playing banjo
773
- K3KsP-m_c5I_000353 basketball bounce
774
- K5HBK1c7noI_000010 cat meowing
775
- KBc_FdBzN2U_000017 wind chime
776
- KFFJI_TZmoY_000047 crow cawing
777
- KJwga4gMEzU_000239 people slurping
778
- KJxSJR3v6oE_000013 church bell ringing
779
- KKd2qSxww1o_000002 typing on typewriter
780
- KM_VudA7hgo_000030 people running
781
- KOzRB30gxpE_000362 planing timber
782
- KQbCjNzlYPs_000082 writing on blackboard with chalk
783
- KU5WQZsoKRE_000079 child singing
784
- K_8tBU1LYxU_000000 chicken crowing
785
- Kbc8ioemPlA_000081 tractor digging
786
- KdD8xho7ymw_000037 cat purring
787
- KfqdB93utIg_000000 waterfall burbling
788
- KfyYM6nq--A_000011 playing vibraphone
789
- KnwgxGWxp7Y_000025 people whistling
790
- Kp0W7S-oExs_000030 driving buses
791
- Kq0Dbp3C4d0_000017 dog howling
792
- KrUuPSM4LxM_000215 magpie calling
793
- KsuQWEN0COQ_000199 playing darts
794
- Kus5SmqOIrA_000024 mynah bird singing
795
- Kwha8UYndzI_000090 playing didgeridoo
796
- Kz4Jm9_iFeg_000038 hail
797
- KzK6d6Qpu_o_000010 dog barking
798
- KztFbSJPxg0_000197 planing timber
799
- L4u9LOjcXoE_000000 people sobbing
800
- LAaJfzvvlTI_000053 lions roaring
801
- LAx_fanEB_g_000168 arc welding
802
- LB2EbSmDSKw_000007 baby laughter
803
- LBH_D9h18bw_000042 rope skipping
804
- LCcPzeH_Cn4_000160 sailing
805
- LE49c8e5VMU_000049 mynah bird singing
806
- LEpzp8DnWyY_000026 sharpen knife
807
- LGMZ9c7q8tE_000168 cat purring
808
- LHYHo8wJF74_000342 playing oboe
809
- LJsSbG5A1y0_000000 lighting firecrackers
810
- LL618LsL2zY_000030 pig oinking
811
- LMbyOx04l9E_000036 vehicle horn, car horn, honking
812
- LNl3ANFth4Y_000021 mynah bird singing
813
- LOLFOiNiS1o_000067 sharpen knife
814
- LPTsZZVr06o_000030 people eating
815
- LSsYBN_RvPc_000122 rapping
816
- LWrztDg2BGI_000245 playing synthesizer
817
- LYUkVukRObA_000236 pigeon, dove cooing
818
- L_Da1Sv1iKU_000028 playing didgeridoo
819
- L_OvLmH_feU_000021 dog growling
820
- LaGhL-3ctOc_000048 playing double bass
821
- LbjkUR-ERQw_000049 opening or closing drawers
822
- LciaPQ1XV3c_000217 playing badminton
823
- Lfmcj5VW6VE_000050 playing acoustic guitar
824
- LgdtTzvKnT4_000030 rowboat, canoe, kayak rowing
825
- Lj4Ngu0ars8_000138 electric shaver, electric razor shaving
826
- LlRZR8xPOEw_000021 frog croaking
827
- Lmp51YN-7wc_000466 people marching
828
- LtqXpk2YGls_000010 chainsawing trees
829
- LuxrhiicesU_000000 donkey, ass braying
830
- LvzMerRGbCE_000099 bouncing on trampoline
831
- LxdOWpwSzi0_000400 mouse pattering
832
- Lz8Ytz12MrU_000120 chopping wood
833
- M0EaEBlx5fk_000126 yodelling
834
- M9cNmb9HKPc_000110 turkey gobbling
835
- MApvC99wovc_000159 car engine starting
836
- MEtdxR3RdEA_000180 playing piano
837
- MFEhejrPVmw_000040 dog barking
838
- MMTvsiahcsc_000002 fire crackling
839
- MMjEIFDYQvc_000117 yodelling
840
- MQPNvRDVuUs_000100 playing french horn
841
- MQPggq37uX8_000003 scuba diving
842
- MQcS6DqCjKQ_000030 playing vibraphone
843
- MRnnE9MTm64_000052 driving snowmobile
844
- MTD6-1mrtP8_000072 owl hooting
845
- MVmJujaAocY_000030 baby crying
846
- MXmetP4F-EU_000019 door slamming
847
- MdUG2H5K5eg_000117 roller coaster running
848
- MenAsca8z6s_000137 slot machine
849
- Mf6bCl5HKgc_000000 wind chime
850
- MfSXrFJt6d4_000007 motorboat, speedboat acceleration
851
- Mhzz75z8mbY_000166 playing ukulele
852
- Mk8fhA3DAsA_000030 turkey gobbling
853
- MkrFhq3F_z4_000100 playing accordion
854
- MlX7I-OZIyk_000062 playing timpani
855
- Mmyr6Gpclbk_000070 bird chirping, tweeting
856
- Mnv4KVEt18I_000018 people giggling
857
- Msh94MTYC6A_000290 chainsawing trees
858
- MshXUve673A_000363 elk bugling
859
- Mvn2oFoKxwI_000128 people booing
860
- Mvue0y_EsDU_000000 orchestra
861
- MwVghEDjyQM_000030 people sobbing
862
- MwsoiJOqg_g_000030 duck quacking
863
- MwyzEfk2xbA_000054 playing double bass
864
- Mzc3DajWA0k_000030 ice cracking
865
- Mzgas545UXU_000090 playing snare drum
866
- N09QFSbvIC4_000150 playing electronic organ
867
- N2DQWIePoLs_000030 playing violin, fiddle
868
- N3_jZV1ejnA_000030 crow cawing
869
- N5CNEOKptjo_000000 splashing water
870
- N8cNWpCL0Rs_000183 owl hooting
871
- N9cM9BdATNs_000081 people booing
872
- NAETplWD64g_000030 playing harpsichord
873
- NAk-PU3X_DQ_000026 mynah bird singing
874
- NBeonGAqO84_000032 playing bugle
875
- NCdkXluu-D8_000350 playing harpsichord
876
- NFd5Zot-0_c_000006 heart sounds, heartbeat
877
- NJfJ4E9EVoM_000120 people whispering
878
- NN6mOUDBjEM_000042 dog howling
879
- NWSsGcjVRDw_000245 playing tabla
880
- NZs6RgHZOoI_000013 firing muskets
881
- NfcCnLiHlqU_000134 playing erhu
882
- NhO6B0zM9Pc_000030 playing electronic organ
883
- NjKRF79wl5Y_000110 wind noise
884
- Nkz9_eGsHKY_000057 people booing
885
- NmdqThtOVro_000160 beat boxing
886
- NnNm_oqkG0o_000050 people sobbing
887
- NniPHshHj9M_000068 playing didgeridoo
888
- NqxCX4G3N2g_000107 playing volleyball
889
- NrCNo4V7RVM_000030 lawn mowing
890
- NrWxMrh7cGw_000210 playing cornet
891
- NtJQ6W2o0EI_000075 canary calling
892
- NwIDavS0llk_000010 chicken crowing
893
- O-C9p_sK_eI_000030 horse clip-clop
894
- O0QV4_JRM0M_000002 car engine knocking
895
- O15FUv56iCc_000040 playing cymbal
896
- O3geFV-GoqM_000031 fire truck siren
897
- O5LFB39yCA4_000085 missile launch
898
- O5TMWyFd1DQ_000180 playing vibraphone
899
- O6_sGC3v96g_000006 wood thrush calling
900
- O7KCtFRaWck_000080 alarm clock ringing
901
- OEu8pZpN8ZA_000000 using sewing machines
902
- OIqUka8BOS8_021217 warbler chirping
903
- OLtTuBhG-og_000075 playing squash
904
- OTVFQoNRQTs_000060 playing bass guitar
905
- OWlCVuOznw0_000019 arc welding
906
- OZ14CiqpJL8_000010 child speech, kid speaking
907
- Ocdu7Lz0IuU_000003 francolin calling
908
- OdGHvGlSUcM_000157 playing timpani
909
- Of59qi5xxkM_000050 playing drum kit
910
- OiIaJb68Haw_000050 playing banjo
911
- OjOQ0K6lza8_000018 playing tuning fork
912
- Okd7ksWR-fc_000547 swimming
913
- Oljdv3iSTBc_000047 people eating noodle
914
- Om-Uc7ia1f0_000320 playing bass guitar
915
- OrueZOVOAD8_000010 motorboat, speedboat acceleration
916
- OtDVd-1zaqU_000030 motorboat, speedboat acceleration
917
- Ow1ZEhmP3qU_000116 typing on typewriter
918
- OyoJ99jDQdo_000147 playing didgeridoo
919
- P1eMMIK0cTs_000011 mynah bird singing
920
- P2taxpwuzcw_000053 wood thrush calling
921
- P2wbv4C6bBA_000210 barn swallow calling
922
- P35m_Rn7HbA_000030 motorboat, speedboat acceleration
923
- P5Y1D-fSVfg_000054 lip smacking
924
- P6sG1m6C4zI_000081 mouse pattering
925
- PavKY6YlSl4_000026 people whistling
926
- PawUc0pqf9M_000260 car engine starting
927
- PbomocKzqKU_000109 splashing water
928
- PeJxiP0CPn4_000025 playing didgeridoo
929
- PfwBOCxEst8_000243 cheetah chirrup
930
- PjbxRjKvzw4_000030 wind noise
931
- Pll-TpbHen4_000067 airplane
932
- Pp61sP7bols_000076 cap gun shooting
933
- PrIQbadXX74_000692 playing oboe
934
- PsmihTl5Cx8_000060 pig oinking
935
- Pu4BCOv6e5Q_000020 fireworks banging
936
- PvE48Ub_CgA_000034 bird chirping, tweeting
937
- PvpA8y7-ZC4_000101 people burping
938
- Pvt8VUQ_Bso_000030 playing vibraphone
939
- PxEpiEid_c8_000177 slot machine
940
- Py5s1uL46L0_000100 male singing
941
- Pz618GchhGI_000001 otter growling
942
- Pz9BhPMUzv8_000258 lathe spinning
943
- Q38lPvwj5Gw_000234 swimming
944
- Q57DFiTwcM4_000221 people eating noodle
945
- Q5jnMD1z86k_000287 people eating noodle
946
- Q7X3fyId2U0_000090 tornado roaring
947
- Q7ZPnRQraJk_000200 playing clarinet
948
- Q9AvyaxgRRo_000141 playing steel guitar, slide guitar
949
- QBFaKTDXCCQ_000120 playing acoustic guitar
950
- QBfcf-k5U28_000007 vehicle horn, car horn, honking
951
- QCX3H9wXgpo_000053 cap gun shooting
952
- QEcQtxP1fdg_000056 playing bongo
953
- QGkUBiVG8-Y_000034 owl hooting
954
- QH5ZtCI9Hts_000125 chopping wood
955
- QL6Ws4i07is_000040 goat bleating
956
- QO9sbXhMq08_000220 people hiccup
957
- QOFuXRetSLI_000064 arc welding
958
- QT1nE5lR7wA_000035 cat growling
959
- QTqN9c6661s_000000 forging swords
960
- QUMzyZRYpWs_000019 playing steel guitar, slide guitar
961
- QXEq7sE7dqg_000030 police car (siren)
962
- QXjaLCotbpY_000055 playing zither
963
- QZ-cG6VdBHM_000070 helicopter
964
- QaBmzAFivPQ_000096 people marching
965
- QcL-X7hJJYQ_000000 people whistling
966
- Qd9_UMNMhcA_000010 typing on computer keyboard
967
- QdEYMboSweA_000001 playing oboe
968
- Qdrcv-ZjC-g_000037 car engine starting
969
- QgEi6pAW36g_000150 male speech, man speaking
970
- QgHYiH6ES08_000120 basketball bounce
971
- QgQKqaMqRgs_000062 playing bongo
972
- QhRcayuLZ48_000390 playing piano
973
- Qlj2HEcX05Q_000136 playing saxophone
974
- QpA1_cezBwA_000123 mouse clicking
975
- Qr2PeUXBJu4_000197 playing erhu
976
- Qs8RjZlOcdU_000030 car passing by
977
- QsHeqaa4Ckc_000072 people whistling
978
- QsNHM92SIvo_000000 people whistling
979
- QvEMTs9_RQE_000010 lawn mowing
980
- Qwxa7ZCEBQs_000006 cricket chirping
981
- QyRrtn5AoSg_000280 playing saxophone
982
- QypTigdvLWU_000017 firing cannon
983
- R0YwusOkMx0_000008 bowling impact
984
- R1h5rRHM3oI_000000 donkey, ass braying
985
- R29qwv_mh4E_000018 playing bassoon
986
- R3VnztSX-k8_000200 ice cream truck, ice cream van
987
- R7KnzEqUGAc_000040 playing cymbal
988
- R7bSeIfRG-Y_000590 eating with cutlery
989
- R8cWq9GoEpE_000037 pheasant crowing
990
- RBHqcDacio0_000182 beat boxing
991
- RCIMcizSSZU_000044 francolin calling
992
- RDuDqEmKucQ_000030 motorboat, speedboat acceleration
993
- RJNjaPizyKg_000099 playing theremin
994
- RKZmAYXXWbg_000247 canary calling
995
- RM6uf-sdVQI_000043 playing bass drum
996
- RN96eLdMN_I_000005 bull bellowing
997
- ROsAOQe62gs_000050 playing electronic organ
998
- RVqCdL7_G2Y_000030 car engine knocking
999
- RWnvolYKQ2o_000414 lip smacking
1000
- R_SNrPUIa1A_000140 playing bassoon
1001
- R_yW6SKe_-M_000080 people booing
1002
- RaawVrMvP7k_000048 pheasant crowing
1003
- Rb0IEIeJTKY_000002 basketball bounce
1004
- Rc_exQXrUG0_000100 skidding
1005
- ReZUlDwGaLY_000080 playing marimba, xylophone
1006
- Ria-XrpfgsA_000089 people marching
1007
- Rifu8nB2cCs_000043 cat purring
1008
- Riu9TpsQ_mk_000009 pig oinking
1009
- RmGGiQMURcQ_000022 people sniggering
1010
- RoLNzNAv-Ig_000030 motorboat, speedboat acceleration
1011
- RrpMoJrp4AY_000180 people crowd
1012
- RsYAulhucVI_000011 lions roaring
1013
- RtHMCINXA0s_000052 cricket chirping
1014
- Rur-IfwPZho_000051 dog howling
1015
- RwE9JAktTvU_000580 people coughing
1016
- RyV40yhlOeU_000419 people marching
1017
- S29c6T__5HU_000003 playing timpani
1018
- S3Ipyd9HHLk_000185 magpie calling
1019
- S45cdr4x-mc_000080 chainsawing trees
1020
- S9fw7NHd2eo_000380 playing electric guitar
1021
- SBYzwBhUpYs_000166 playing badminton
1022
- SBwOIJoGChM_000116 hammering nails
1023
- SCjdlZSW8nY_000111 playing table tennis
1024
- SGkzdDWFIHI_000085 playing bass drum
1025
- SHebWHn0c2Y_000005 chopping wood
1026
- SPnZIDCnKwM_000030 orchestra
1027
- SS6iMabGB1Y_000020 chimpanzee pant-hooting
1028
- ST33aEP5Hbc_000006 train horning
1029
- SXC13GS87Co_000031 woodpecker pecking tree
1030
- SXHYr-7nPaw_000030 playing drum kit
1031
- SYDQX7Whjm4_000061 woodpecker pecking tree
1032
- SYWqIfMOmGE_000051 hammering nails
1033
- S_0v5j4S100_000039 cat purring
1034
- S_qPgRNSkIw_000370 people clapping
1035
- SbXyRN0DD-g_000080 dog bow-wow
1036
- Sc-Ld96kbN0_000144 playing synthesizer
1037
- SdCzaAUA6Xs_000005 playing djembe
1038
- SeZm-iy9n8M_000150 playing electronic organ
1039
- Sf0aZczIZVU_000040 playing cello
1040
- SgYh5Lb7tlM_000130 playing flute
1041
- SifYJFmSSRw_000123 playing marimba, xylophone
1042
- Sl4weBj8xfc_000030 typing on computer keyboard
1043
- SoVEYhxQabk_000103 canary calling
1044
- Spm_zrjedzk_000392 cap gun shooting
1045
- Sqq2dUA8t3A_000586 playing harmonica
1046
- SvJ0kUY22C8_000055 turkey gobbling
1047
- Sw6qDVMsR5M_000030 playing violin, fiddle
1048
- SwQie7apk78_000198 playing darts
1049
- SyEVBFw_9oE_000120 people screaming
1050
- SyfyWK7dKXA_000021 playing squash
1051
- SzmORuHD4g4_000059 wind chime
1052
- T-AN31N4LD0_000050 people screaming
1053
- T0NMgZC7CDU_000011 chipmunk chirping
1054
- T19Xf5-OTHw_000130 playing piano
1055
- T2zZbnu_NtM_000029 playing table tennis
1056
- T4KEGH_8lY8_000119 playing timpani
1057
- TAdH0kUJj9k_000050 helicopter
1058
- TCUnK4k7QZ0_000000 telephone bell ringing
1059
- TDh8_ixGzIo_000030 printer printing
1060
- TGngN3n7EMw_000024 airplane flyby
1061
- TLSmnnnyhEk_000030 people shuffling
1062
- TMyd50KWyNo_000311 people slurping
1063
- TNCcQfbselM_000120 francolin calling
1064
- TQapWHNS5FE_000024 car engine knocking
1065
- TRW01xXMMqg_000210 playing accordion
1066
- TRt_14JcRWQ_000080 playing bass drum
1067
- TTElms_ZWqI_000428 hair dryer drying
1068
- TTstWFDMmqc_000030 people whistling
1069
- TUPEF6PQxow_000132 rapping
1070
- T_iuImHtqUI_000010 people sobbing
1071
- Ta__Ev0mkBk_000030 chainsawing trees
1072
- TakDv24Tiq0_000032 plastic bottle crushing
1073
- TcN0QofoTvg_000221 playing erhu
1074
- TdkhMZZvdgc_000006 owl hooting
1075
- Tdyh5ziqH-U_000007 lions roaring
1076
- TiaGOZ-ibxw_000411 people booing
1077
- TriRWR9YiNk_000016 frog croaking
1078
- Tse5rzNV5dk_000084 pheasant crowing
1079
- Tze9ybKops4_000020 playing synthesizer
1080
- U3-h9ZARqD4_000264 police radio chatter
1081
- U34oQw93afs_000219 playing tambourine
1082
- U3zsgbf9WHQ_000194 horse neighing
1083
- U4RRMpX2wCU_000010 toilet flushing
1084
- U55bYLMVKiw_000193 pheasant crowing
1085
- U6vVDGaKL3Q_000354 bouncing on trampoline
1086
- U9qUXBqIoZ0_000106 dog howling
1087
- UA62hwIBgGY_000020 chicken clucking
1088
- UFIi1OuMx0o_000302 rope skipping
1089
- UGwl5VOHuaw_000200 playing accordion
1090
- UIFxlzHYPBM_000060 gibbon howling
1091
- UJ1lZOY9LSY_000035 playing didgeridoo
1092
- UM1j8kFaxi8_000020 motorboat, speedboat acceleration
1093
- UOL-hbkzUN4_000010 barn swallow calling
1094
- UOlwg402_r4_000070 people clapping
1095
- UPUwaW8jfhA_000030 ice cream truck, ice cream van
1096
- UQonGRRRpv4_000024 goose honking
1097
- UUKyUUjv8qg_000030 church bell ringing
1098
- UZAB21OSorM_000007 electric shaver, electric razor shaving
1099
- UZYfRXafn9I_000005 ferret dooking
1100
- UZp0AcdimvA_000021 cattle, bovinae cowbell
1101
- UeCkRYU_SuM_000100 playing accordion
1102
- Uf2j1VbOk8c_000055 pheasant crowing
1103
- UfG4dP0szuY_000040 fireworks banging
1104
- UjTYiJ0dm8s_000002 vehicle horn, car horn, honking
1105
- UkdS0cwAGYE_000010 car engine starting
1106
- UnGLtJX29Hc_000043 planing timber
1107
- UoFgJXGWJXA_000111 playing congas
1108
- UpWivODbpIY_000059 owl hooting
1109
- UsJAb6aftq8_000580 playing bagpipes
1110
- UuuQH-TFxMo_000034 missile launch
1111
- UzKZijSs4-A_000004 fox barking
1112
- UzPSMiqeH3Y_000118 singing choir
1113
- V-ZbY0SL2XI_000040 people sniggering
1114
- V1ALglq7_x8_000018 dog growling
1115
- V6lQVpw888U_000590 machine gun shooting
1116
- V6y-jCli4I4_000000 cuckoo bird calling
1117
- V7SGeTSJz9w_000090 skateboarding
1118
- V82SmRI0GHY_000030 playing clarinet
1119
- V83lIhKVraY_000125 playing darts
1120
- VCEicqV_2Xw_000030 ambulance siren
1121
- VDXN0xwWgRA_000083 playing bass guitar
1122
- VDzkPfnI1g4_000093 playing djembe
1123
- VEER910vqMk_000002 duck quacking
1124
- VEhmvrgrZb0_000000 chicken clucking
1125
- VFj1vFMV3dQ_000025 playing darts
1126
- VGrI3TMjWog_000120 playing vibraphone
1127
- VHQjG81NcXE_000030 crow cawing
1128
- VS9R3iOc4Vk_000027 pheasant crowing
1129
- VU9W8Y1E5u4_000030 bouncing on trampoline
1130
- VdxslFvStdo_000370 female speech, woman speaking
1131
- VfXlyIjtfo4_000117 baby babbling
1132
- Vgs_XjEqKl0_000020 people sobbing
1133
- Vh4E5JPTMBM_000146 typing on typewriter
1134
- VhLn9pUFwXw_000039 chopping wood
1135
- VhUG4vTpPUo_000324 ripping paper
1136
- VhsFniEZO-k_000026 mynah bird singing
1137
- Vkbp8VmL3pM_000040 people sobbing
1138
- VkgLWYydiPE_000125 tractor digging
1139
- VlGuwiKwJAM_000027 playing sitar
1140
- VlkgwzKAamE_000051 ripping paper
1141
- Vnnw7lK63rg_000041 playing snare drum
1142
- Vt3qBXzyS5k_000280 eating with cutlery
1143
- VwZ8gzI3qNE_000106 people slapping
1144
- VwqcV76E6Nk_000000 people booing
1145
- VwqqmiiznQU_000028 woodpecker pecking tree
1146
- Vxs0xCJI92Y_000080 driving motorcycle
1147
- Vzb427ZmWvw_000220 fireworks banging
1148
- W0PwVllBxkI_000114 playing steel guitar, slide guitar
1149
- W1o_XgU8lec_000050 skateboarding
1150
- W2_8zRHaEPk_000150 playing vibraphone
1151
- W2gkFTFR8mw_000047 rope skipping
1152
- W4eT7fj-aIA_000201 driving snowmobile
1153
- W5oXrz8dqBk_000030 playing piano
1154
- W5wBkCwEEmY_000140 playing banjo
1155
- W7OJevEgq7w_000000 dog bow-wow
1156
- W7u5kEt-q-8_000000 playing tennis
1157
- W9L5rTbcMFA_000004 people eating noodle
1158
- WABbXpAT_UA_000049 playing bagpipes
1159
- WAhoodHHm2w_000001 playing squash
1160
- WBOqGIqUwGg_000090 people sniggering
1161
- WD0aVtBqoxo_000120 goose honking
1162
- WDmJ4ZtLuNU_000102 playing timbales
1163
- WGHTlOM4-3w_000050 sheep bleating
1164
- WH7LBLKyEkA_000241 playing mandolin
1165
- WIWRYG4vJC4_000020 people burping
1166
- WIZTFH-LGpo_000001 planing timber
1167
- WJQ27fShKvk_000000 playing tennis
1168
- WQFZLDitkkM_000067 eletric blender running
1169
- WQuoH_HyUAk_000030 playing cello
1170
- WRvPzjj5uoE_000134 ice cream truck, ice cream van
1171
- WWzD6E9Wp_k_000260 playing cornet
1172
- WXMt58sLsf8_000028 zebra braying
1173
- WZ568vdA7bU_000070 plastic bottle crushing
1174
- We-E7-Sx3Zo_000260 barn swallow calling
1175
- Wg86ercBjY0_000002 playing clarinet
1176
- Wh8A7CAuLe0_000028 barn swallow calling
1177
- Whjk5Fvue1o_000030 singing choir
1178
- Wj0qIPUjTfE_000008 lions roaring
1179
- WqKP-0cSKgs_000030 dog bow-wow
1180
- WvRkqVmRH0g_000088 playing harp
1181
- WvcM0ueEjfo_000050 people burping
1182
- Ww3CMatNd84_000721 cat purring
1183
- WxQHtaD0Yqg_000028 tractor digging
1184
- X-o1Twh5SFY_000032 playing steelpan
1185
- X0gT3reH8A8_000120 people sniggering
1186
- X17lq90OIO8_000020 dog barking
1187
- X5C9NY9MjA4_000105 train whistling
1188
- X7EGSxA-aCI_000132 child singing
1189
- XBAwcPvVSoA_000068 lathe spinning
1190
- XDMTylVtYx4_000190 race car, auto racing
1191
- XEOUYLlaef4_000003 rope skipping
1192
- XJnKU_SXYlM_000049 playing tabla
1193
- XK4Ws-xvt10_000267 vacuum cleaner cleaning floors
1194
- XKp4HCxVmaI_000017 vehicle horn, car horn, honking
1195
- XLTqSk1Z3D0_000000 police radio chatter
1196
- XM6eeVHjmLk_000001 dog growling
1197
- XNgq-cDV7FI_000101 dinosaurs bellowing
1198
- XOTSovKwxLk_000030 child speech, kid speaking
1199
- XSJzshsMz30_000030 chainsawing trees
1200
- XTDo4OaFapg_000100 hammering nails
1201
- XU8dCEdiGWc_000010 crow cawing
1202
- XUyBxCbiv7A_000073 playing bassoon
1203
- XVveRibUh18_000023 frog croaking
1204
- XWp8qMpnD00_000026 electric shaver, electric razor shaving
1205
- XYZ4Nd4qV-I_000101 people humming
1206
- XdSCT_cQDbE_000010 splashing water
1207
- Xgm17YbPztk_000022 playing didgeridoo
1208
- XiExpKM1Hpo_000160 playing trombone
1209
- XlJ-tAbzzSg_000234 alligators, crocodiles hissing
1210
- XtExs7nIzts_000034 people booing
1211
- Xv4AVT2QYhA_000100 rowboat, canoe, kayak rowing
1212
- Xxq7CElxJLc_000063 singing choir
1213
- Y798EuJZaPU_000017 playing squash
1214
- Y9Oee-VRfVA_000339 airplane
1215
- YC_k4W1YaDw_000030 race car, auto racing
1216
- YD41QET24SM_000125 playing badminton
1217
- YD7jTek7yVU_000206 arc welding
1218
- YEatlg_b0BY_000054 people burping
1219
- YISopDKuQ0k_000050 playing accordion
1220
- YJ5xLJ85AwM_000106 tractor digging
1221
- YOTnbp40tf4_000030 male singing
1222
- YOrImbuhsQ8_000027 lions roaring
1223
- YS_zTwf-FRo_000092 playing ukulele
1224
- YU78jPcU6FI_000070 playing trumpet
1225
- YUXZVAQ1iJ4_000007 volcano explosion
1226
- YUcdJy-rpD8_000590 raining
1227
- YVOmkmjoT40_000030 ocean burbling
1228
- YYgYiO9DjEY_000161 tap dancing
1229
- YbALYr-5WpM_000000 playing harmonica
1230
- YbOztklOkF0_000023 goose honking
1231
- YcvHv44MYiU_000027 barn swallow calling
1232
- YdjsatpizhE_000023 airplane flyby
1233
- Ye72yJyWxs8_000021 airplane flyby
1234
- YeEySSrxwpg_000078 barn swallow calling
1235
- YfZp5C7xrKs_000181 playing bassoon
1236
- YgySYOAi8JQ_000396 skiing
1237
- YhJwTBFij48_000015 motorboat, speedboat acceleration
1238
- YjCLRifFCj0_000010 skateboarding
1239
- YjJioclqdQ8_000150 wind noise
1240
- Ys1P04EjGH4_000196 playing bassoon
1241
- Ys9j6IBcFBo_000024 opening or closing car doors
1242
- YvBCKb1LbCk_000095 fire truck siren
1243
- Yvq8WrFpXhE_000057 people crowd
1244
- YwNdDHEhm2g_000005 duck quacking
1245
- YwTFxcWCac8_000381 electric grinder grinding
1246
- YyqqXEmYPIA_000020 ambulance siren
1247
- YzBaTwjmikc_000018 hammering nails
1248
- Z-V-1iUbMWI_000520 lions growling
1249
- Z1BhAXfiZtU_000037 vacuum cleaner cleaning floors
1250
- Z4QR8uvx_Wk_000169 reversing beeps
1251
- Z5SyUJSDCOA_000562 ripping paper
1252
- Z7Hzc1Yw2aY_000060 sloshing water
1253
- Z93pTtHnDXo_000110 playing vibraphone
1254
- Z9nG2fIh214_000075 chinchilla barking
1255
- ZALP7Di4HaM_000180 playing saxophone
1256
- ZAZZ1wImM9M_000010 singing choir
1257
- ZCA_NapBTlg_000060 dog barking
1258
- ZDDnEdzjyrE_000597 playing tambourine
1259
- ZFGcmmpt1bs_000094 playing bagpipes
1260
- ZL_MxixlnHE_000079 reversing beeps
1261
- ZNboftBNdyY_000406 cap gun shooting
1262
- ZPODO-Ehl_M_000030 male singing
1263
- ZQO_uhrJPNA_000110 playing violin, fiddle
1264
- ZUjum5gZMKM_000140 playing accordion
1265
- Z_Bk_CnpWsY_000198 people sneezing
1266
- Z_sW4UxpbbY_000050 using sewing machines
1267
- ZbtuNDtoyOI_000030 sliding door
1268
- ZcskQV2A2cQ_000030 playing flute
1269
- ZdtaSkUkrIE_000256 police radio chatter
1270
- ZeDa5hT2ffk_000071 police radio chatter
1271
- Zgbuj3y2iuY_000210 cattle mooing
1272
- Zh2whhvFWsM_000016 pigeon, dove cooing
1273
- ZhLwVzOZziA_000368 blowtorch igniting
1274
- Zi3FOnx4nuk_000001 playing table tennis
1275
- Zj73Wh6LEiU_000120 skateboarding
1276
- ZjN9CL7B-9I_000239 playing timbales
1277
- ZkfUo4l9ruc_000090 chainsawing trees
1278
- Zl_ZWSLB8Ic_000024 sheep bleating
1279
- Zs8liAFeuuQ_000058 smoke detector beeping
1280
- ZtPoTqVxVvU_000050 helicopter
1281
- Zu0BpngzT_Q_000007 bowling impact
1282
- ZuwSkX0RQQY_000343 playing tennis
1283
- ZxmKMSUpbvc_000065 car engine idling
1284
- ZxpiZiSAm9I_000060 turkey gobbling
1285
- Zy70U6w0yXw_000088 mynah bird singing
1286
- ZyUqhIDVuNc_000541 scuba diving
1287
- Zz0fhQuHZEE_000012 penguins braying
1288
- _0iRtZRG6UA_000047 woodpecker pecking tree
1289
- _4RRKzDUd60_000079 lathe spinning
1290
- _7GnnuKVVCM_000023 engine accelerating, revving, vroom
1291
- _8FhgH9k7Rw_000120 vacuum cleaner cleaning floors
1292
- _9wN5d1Z1ak_000024 lions roaring
1293
- _CF34A0RrPs_000018 horse neighing
1294
- _Cks36T64zE_000061 striking pool
1295
- _DdVu5sPsjk_000490 people whispering
1296
- _GaEZe-Z73k_000233 fire crackling
1297
- _HRn4aOhjhU_000016 canary calling
1298
- _H_W34UobYU_000459 bouncing on trampoline
1299
- _HcIHVLRzpM_000450 female singing
1300
- _NShiXyBmsY_000270 train wheels squealing
1301
- _Ow1h1eTNk0_000178 playing trombone
1302
- _SfaPFwwJHs_000026 train wheels squealing
1303
- _T0iCBHWKt0_000101 pig oinking
1304
- _T5ZUrmRiQI_000108 playing ukulele
1305
- _Uyw_Legahg_000045 tap dancing
1306
- _VOx5BWJsyQ_000030 raining
1307
- _WQQ3QvGrYw_000340 child speech, kid speaking
1308
- _WUAz2RAZZc_000201 planing timber
1309
- _YF3aFSsgUk_000093 playing steel guitar, slide guitar
1310
- _YhSeML8rQo_000109 alligators, crocodiles hissing
1311
- _aX_UzkXRd0_000140 helicopter
1312
- _cvucKdFb5I_000043 people booing
1313
- _dIzu78Ld2w_000166 lathe spinning
1314
- _gQFB_Utuf0_000077 cat caterwauling
1315
- _j8zzvBts98_000000 splashing water
1316
- _m6lwfMU8Eo_000272 electric shaver, electric razor shaving
1317
- _pSMw5FKHX0_000040 people sobbing
1318
- _pfccpy7Cqc_000180 typing on typewriter
1319
- _ru-n--PRNA_000030 police car (siren)
1320
- _t-Abwz6JG4_000031 baby babbling
1321
- _t259gootxc_000190 female speech, woman speaking
1322
- _u9zUuBdo1k_000000 cat growling
1323
- _vkXDgupDN8_000250 sailing
1324
- _wvB2HlVn1I_000050 engine accelerating, revving, vroom
1325
- _xGLwynjhSs_000010 playing french horn
1326
- _xq-9GZBfrg_000014 pheasant crowing
1327
- _yVgX3hi1OQ_000195 driving snowmobile
1328
- _zTmqhuLwAM_000001 donkey, ass braying
1329
- a0LIemH5Cw0_000010 people clapping
1330
- a3ZAFViNYyk_000000 swimming
1331
- a57DUeBMeHY_000320 rowboat, canoe, kayak rowing
1332
- a6CPpulnJ2A_000420 stream burbling
1333
- a8fa79w2aIQ_000023 lighting firecrackers
1334
- aC3nlLHFOfk_000030 playing violin, fiddle
1335
- aCnLa_H0-P0_000000 magpie calling
1336
- aDXQSTbKlIc_000010 playing cornet
1337
- aE32elV-Jtk_000210 people crowd
1338
- aG1wGSIqGR4_000013 frog croaking
1339
- aHzkCSXsrqg_000038 vacuum cleaner cleaning floors
1340
- aJ41sea1s0U_000080 people farting
1341
- aNArqTW4cbc_000025 vehicle horn, car horn, honking
1342
- aNOELrfjAYY_000000 vehicle horn, car horn, honking
1343
- aRI4l67ZlYQ_000063 planing timber
1344
- aSYCwv_hda8_000030 subway, metro, underground
1345
- aSleAKgkDDk_000000 playing accordion
1346
- aVs2QBhLIhY_000162 playing didgeridoo
1347
- acYp_SYmHs8_000164 running electric fan
1348
- aclGsdr83pM_000400 playing saxophone
1349
- aezIOAga5V8_000070 child speech, kid speaking
1350
- agMolFR_pFc_000075 train whistling
1351
- agrdgrC2cdI_001076 dinosaurs bellowing
1352
- ah5cSy0yXs0_000178 slot machine
1353
- aiTXGmkpfnk_000030 playing trumpet
1354
- ainzK7QuseU_000001 dog whimpering
1355
- aj6kdMafoek_000693 hair dryer drying
1356
- aju2z1N0aOo_000030 wind rustling leaves
1357
- ap3PdrjChdo_000040 playing bassoon
1358
- apTvGua1-FY_000271 playing guiro
1359
- asXWEB_SBEI_000060 playing cello
1360
- atT7DPwTkds_000130 people clapping
1361
- auHL-4XCFAk_000030 driving buses
1362
- b-8lh_tfhLQ_000124 hair dryer drying
1363
- b-gza98ikBo_000020 playing snare drum
1364
- b2bpNgK0Cnc_000250 orchestra
1365
- b4Bu0AHwBWs_000084 woodpecker pecking tree
1366
- b4WK1A7DK18_000018 crow cawing
1367
- b8q6Z7dtRvg_000030 playing flute
1368
- bBMcsO6IeDE_000021 lions roaring
1369
- bF89h31EEzg_000000 golf driving
1370
- bFmIV3pNJPY_000001 basketball bounce
1371
- bI_4_x735PA_000020 typing on computer keyboard
1372
- bJtu55jpzNc_000140 playing violin, fiddle
1373
- bJzkn2kRh8g_000070 helicopter
1374
- bLAz_kbihLE_000147 elk bugling
1375
- bMNcdb3Eeds_000064 civil defense siren
1376
- bN9fXjHalIY_000065 playing timpani
1377
- bPNt6iVmemQ_000504 playing bongo
1378
- bPfP2rjJfDY_000609 playing ukulele
1379
- bQV7q5VRaH0_000174 car engine knocking
1380
- bT8QfAM9NRA_000197 cutting hair with electric trimmers
1381
- bVdI6laTOXI_000480 people screaming
1382
- bVskpqAJF8E_000116 people eating crisps
1383
- bYT-N-_u448_000217 civil defense siren
1384
- bZUN1tQnuDQ_000001 child singing
1385
- b_C-fNIS8aI_000000 cat purring
1386
- baVILr18Y9A_000015 civil defense siren
1387
- bd-swxc3o4w_000260 playing hammond organ
1388
- bo9sSwEqnzs_000030 orchestra
1389
- bokQgOSQ2OA_000001 playing squash
1390
- bpF6KhK8El0_000030 police car (siren)
1391
- bsM-z2joYss_000030 child speech, kid speaking
1392
- bsUBSFHXY0g_000040 helicopter
1393
- bukJZ1FxymQ_000390 male speech, man speaking
1394
- bw3GIZLj6kM_000000 playing piano
1395
- bx5BUbiIXFw_000107 child singing
1396
- bzxjT3h2ir8_000105 lip smacking
1397
- c3UPyEZ1yQY_000070 typing on computer keyboard
1398
- c4M3JIyAPcM_000020 playing bass drum
1399
- c5dPZoWwmC0_000020 driving motorcycle
1400
- c6e4pxgoCls_000105 magpie calling
1401
- c84w0ECD-Lc_000010 ocean burbling
1402
- cAI0pcOwk2g_000346 elk bugling
1403
- cEddS8Y-qZc_000510 people clapping
1404
- cFNcpddGRno_000340 fireworks banging
1405
- cIHKR2E1uiQ_000303 smoke detector beeping
1406
- cJSWXGTJMcc_000018 rowboat, canoe, kayak rowing
1407
- cL_nCiBnlbk_000001 playing bugle
1408
- cMdnie91zp4_000000 playing trombone
1409
- cNUIc68WpD4_000075 people marching
1410
- cRiW0u0QY18_000030 playing trumpet
1411
- cSym5f2jySA_000005 chicken crowing
1412
- cUBHfozbsao_000044 playing harp
1413
- cV4QlanVa9w_000070 basketball bounce
1414
- cVhWB3IniBo_000014 playing tuning fork
1415
- cZfuBCVV6n8_000390 eating with cutlery
1416
- ces9pc_r6Wo_000036 child singing
1417
- ckwEyopmfKs_000024 crow cawing
1418
- cmkEW0KJDYI_000165 arc welding
1419
- cp-ZI_fQ1l0_000154 airplane flyby
1420
- cwQY1bck2G8_000070 playing bagpipes
1421
- cx4QSvep_wE_000009 train horning
1422
- cxFdK2G6wq0_000030 playing bagpipes
1423
- d-UQr-8UEUY_000069 playing saxophone
1424
- d05lXeFKDn0_000275 pheasant crowing
1425
- d4yBeEbVp1Y_000030 typing on computer keyboard
1426
- d5HmVBPY1Qc_000230 playing saxophone
1427
- d66pNyYB6WY_000013 people burping
1428
- d8gWsmBdBhE_000097 playing sitar
1429
- dBivnkxNOOc_000175 playing vibraphone
1430
- dECLS-JHWYA_000000 vacuum cleaner cleaning floors
1431
- dK46EdcZFzg_000030 playing trumpet
1432
- dNMCURn41wU_000179 playing djembe
1433
- dN_EzmXbsu8_000016 playing bass drum
1434
- dSeWq0Qd9Hs_000318 playing tambourine
1435
- dVg4IEbk-l8_000010 cat meowing
1436
- d_OIBYBwexQ_000160 playing accordion
1437
- daHwPM2azrc_000036 wood thrush calling
1438
- dfr1OFz20sI_000000 goat bleating
1439
- dgSOnxqNtFE_000246 people coughing
1440
- dgS_Fy1FiNA_000110 people burping
1441
- dhG_GSGW_RI_000004 volcano explosion
1442
- dlJm9R5t_qg_000030 playing hammond organ
1443
- dlWrMn_RDg0_000120 playing bassoon
1444
- dqymshfwGEE_000030 playing saxophone
1445
- duca08sjlbQ_000001 playing bongo
1446
- dugd_OSzghs_000203 ice cracking
1447
- e0LMGLr-T-I_000029 air conditioning noise
1448
- e3ZJnO3s53o_000016 child singing
1449
- eANsaSAzHm8_000010 driving motorcycle
1450
- eBFPD8YrqiA_000140 driving buses
1451
- eCpA_7B-k94_000030 dog bow-wow
1452
- eDqfHtuB8Hk_000015 snake rattling
1453
- eEUsoUKPxy8_000187 basketball bounce
1454
- eFaLkcfCzos_000140 playing cello
1455
- eK97_rb6BsY_000072 playing gong
1456
- eLyQDSo2NAM_000129 opening or closing drawers
1457
- eOJQsk_kdWI_000032 ice cracking
1458
- eS8Tf1hfwxk_000205 sea waves
1459
- eSEIPV-qSj0_000020 squishing water
1460
- e_3GUZmPFBI_000020 playing erhu
1461
- ebhtW1tIXRY_000002 donkey, ass braying
1462
- ecTDu-EX3WE_000019 car engine knocking
1463
- ecq96FWbCF0_000037 gibbon howling
1464
- ed4wVB_RhHw_000011 baby crying
1465
- ehw6y3_g-8A_000757 ripping paper
1466
- ej6jlkTeobU_000002 car engine knocking
1467
- el3i-oj08Q4_000173 playing oboe
1468
- f-XD-BgLWk0_000000 skidding
1469
- f5c5KuWylig_000343 vacuum cleaner cleaning floors
1470
- f6Wl-9pzib0_000032 cattle mooing
1471
- f8bMURZiPiU_000019 people whistling
1472
- f9U7g3g4voA_000026 golf driving
1473
- f9c9YZ8WgjM_000037 bull bellowing
1474
- fD362l9P3u8_000041 tornado roaring
1475
- fFn2P7ZRIeM_000480 playing clarinet
1476
- fNlGlh1GaeA_000013 heart sounds, heartbeat
1477
- fS17RfJYjS4_000001 pig oinking
1478
- fTT_D_d_5FA_000080 people clapping
1479
- f_55S5G8M2s_000000 playing harmonica
1480
- faFCcN6y-C8_000020 ferret dooking
1481
- fcyUlEGvMdc_000037 playing volleyball
1482
- fj0qlDdWt1M_000158 playing squash
1483
- fknz5hZg_3I_000295 playing darts
1484
- fmc6hwse-IA_000085 skiing
1485
- g-CydtX7btM_000086 eagle screaming
1486
- g-u5YOJu_gY_000230 lawn mowing
1487
- g1n-ZaW0QHQ_000095 reversing beeps
1488
- g5OBeqvOmRU_000001 bathroom ventilation fan running
1489
- g8E9gBfe8B4_000180 female speech, woman speaking
1490
- gH25X_mj6mc_000210 ocean burbling
1491
- gJbMwvsUyA8_000000 driving motorcycle
1492
- gL2i_DTGUEY_000028 cattle, bovinae cowbell
1493
- gLTvwzBktxE_000015 airplane
1494
- gLj93C9rRsg_000055 playing tambourine
1495
- gLokxx-ruH8_000230 playing piano
1496
- gM9WSjAPDVc_000030 people babbling
1497
- gPwtTVH44OY_000030 people shuffling
1498
- gW-1oOsNGJs_000010 playing harp
1499
- g_axbxP7Amc_000071 playing harp
1500
- gaFtxq1hBU4_000118 spraying water
1501
- gcBUpboDmjc_000033 hammering nails
1502
- gfgv17hOPIM_000040 playing marimba, xylophone
1503
- gjJ4nqwlgnE_000010 playing hammond organ
1504
- gm0HkvshnPk_000340 cattle mooing
1505
- goS6rwhPth4_000026 mynah bird singing
1506
- goz-IQ8s6uk_000050 skidding
1507
- gpaX15tTUoc_000017 cat growling
1508
- guvnNwCkhcs_000030 people sniggering
1509
- gwzqjVCFNqA_000040 playing clarinet
1510
- gyioxO7fWzI_000046 lions roaring
1511
- gyjZ7tnnZeA_000132 playing theremin
1512
- gyt54t3R_BU_000032 blowtorch igniting
1513
- gzqq0knK2FA_000003 cat meowing
1514
- h-Z5cTyu4LE_000150 wind rustling leaves
1515
- h0025UfxME0_000308 sharpen knife
1516
- h0V51dolEjA_000194 airplane flyby
1517
- h5Gq0y3qkX0_000063 volcano explosion
1518
- h7EWw2n5D5I_000050 train horning
1519
- h7pz6niHZuw_000006 donkey, ass braying
1520
- hEePXITb26o_000042 playing harp
1521
- hFVd2Em9-cc_000100 toilet flushing
1522
- hQvIg0t546Q_000146 playing vibraphone
1523
- hUlqIdQFuxE_000030 basketball bounce
1524
- hV_CjOK-mME_000030 people sniggering
1525
- hW-RxgLN2l0_000007 owl hooting
1526
- h_tdr4t6unw_000300 typing on typewriter
1527
- ha-5LhgpVmQ_000946 playing tympani
1528
- hc9aQ8VL9o0_000083 playing steel guitar, slide guitar
1529
- hcR4BiG8sZs_000150 playing clarinet
1530
- hdphUn6ihrA_000450 lawn mowing
1531
- hdqCHBTwnuQ_000253 playing badminton
1532
- hgcLJFz2WKQ_000512 police radio chatter
1533
- hlKkLqHpJ_s_000400 chicken crowing
1534
- ht3jNf66nbo_000286 missile launch
1535
- htRB8f0r2rg_000027 playing bassoon
1536
- hvoOSCZo2-E_000030 cattle, bovinae cowbell
1537
- hy87-XUmhkE_000004 playing timbales
1538
- i-SmzP7T_E8_000295 skiing
1539
- i3nEgFq4yfo_000578 heart sounds, heartbeat
1540
- i4IsKRvCLi0_000036 lions roaring
1541
- i5dz5NV4Vpc_000007 crow cawing
1542
- i6BBre7xV-c_000937 dinosaurs bellowing
1543
- i9PvGS9Xr9k_000332 lions roaring
1544
- iAqJ9lPCU4w_000024 chicken crowing
1545
- iD8gRmmiiqU_000130 driving motorcycle
1546
- iDODqIflQ1Q_000061 parrot talking
1547
- iFU48OcnO7k_000000 woodpecker pecking tree
1548
- iLaLf95DcQk_000040 duck quacking
1549
- iQMIGLrKlTI_000260 playing accordion
1550
- iSjvZiygjCQ_000510 playing cello
1551
- iTd7hOI27BE_000048 playing harp
1552
- iXslVMHwkTU_000212 people eating noodle
1553
- iZR9dpO64NA_000011 cap gun shooting
1554
- i_-LCRDriig_000030 ocean burbling
1555
- i_hhSKWxzeU_000038 frog croaking
1556
- ibd7CKcSiTI_000122 playing bass drum
1557
- icK4IQb2KsE_000000 hail
1558
- ieRU5f5P4B8_000350 cap gun shooting
1559
- ieXdQlIBgLk_000030 playing marimba, xylophone
1560
- iiUvfvkeo0c_000237 disc scratching
1561
- ijirbb9m05k_000285 swimming
1562
- ipo5U5Grsno_000020 people cheering
1563
- irUkV1DP7Cs_000030 playing cello
1564
- irhsdhRIUwI_000010 fireworks banging
1565
- itH-fbb9Ook_000250 ambulance siren
1566
- ixv1jovJe3c_000151 playing timpani
1567
- j-GF_0RxUlg_000176 playing bass guitar
1568
- j-hyPaKjCAU_000030 playing accordion
1569
- j0NNSluEaS0_000150 heart sounds, heartbeat
1570
- j15Ldqb_XVw_000020 fireworks banging
1571
- j2OhKQ6sm0o_000077 people eating noodle
1572
- j3A_ekLNu1Y_000008 car passing by
1573
- j4GHwj1Yqz8_000076 ice cream truck, ice cream van
1574
- j5oZYOBOppQ_000003 mouse squeaking
1575
- j6f4pheXNDE_000108 tractor digging
1576
- jB-OcexH1n0_000033 cat caterwauling
1577
- jBCKFPXuFOw_000086 strike lighter
1578
- jBZ1C1ihCIY_000005 playing bongo
1579
- jL4h1-_LECU_000022 church bell ringing
1580
- jQRurvUk2xs_000051 writing on blackboard with chalk
1581
- jVG2LQ2kA1Q_000067 playing glockenspiel
1582
- j_WKRbDVZhs_000071 barn swallow calling
1583
- j_vtU1U9rg0_000042 playing volleyball
1584
- jb92NmGYNbU_000279 police radio chatter
1585
- jd2ENRtbxRQ_000010 people coughing
1586
- ji-27X81tIs_000133 playing bassoon
1587
- ji4T1ArqCz0_000017 fire truck siren
1588
- ji8HeUiTfoU_000030 orchestra
1589
- jld-wHLRUWM_000020 playing accordion
1590
- jmLX2yQ4eKk_000007 hail
1591
- jsw5soBYfsc_000020 people farting
1592
- jt7w_UY4yUI_000040 lions growling
1593
- jxnPU7Okb5U_000043 playing snare drum
1594
- jzw_Wa_TXVo_000018 playing squash
1595
- k-AKVEheu4g_000096 alligators, crocodiles hissing
1596
- k-jDS1jp_AA_000014 firing cannon
1597
- k4h2VtrPwus_000161 rapping
1598
- kAWAs_7SaKw_000000 bird chirping, tweeting
1599
- kBmcp8nL6Kg_000195 playing didgeridoo
1600
- kCsmvK06SCA_000254 playing sitar
1601
- kDwFyUvAi4w_000077 playing bongo
1602
- kEQJJyYkYTY_000200 child speech, kid speaking
1603
- kL6xemyurI8_000140 people eating apple
1604
- kPp7CwFBl1c_000030 playing violin, fiddle
1605
- kPpaeW3DObU_000481 playing castanets
1606
- kPus6xz6fN8_000030 car engine knocking
1607
- kSdqIpAMz_M_000175 playing snare drum
1608
- kSwrdM7UD98_000057 owl hooting
1609
- kTyaqJIhX6Q_000020 playing accordion
1610
- kVMXMaTyEbE_000116 playing theremin
1611
- kVtj0bAYAF8_000000 people sobbing
1612
- kW23iJgtyfk_000002 raining
1613
- k_NIUqHoNz4_000037 playing bassoon
1614
- khZPuH00RNc_000332 yodelling
1615
- kjtZNsHp_a0_000330 lawn mowing
1616
- kkgjiCKHvoY_000449 firing cannon
1617
- kmPmQ6aylRc_000012 reversing beeps
1618
- koTbsmbqyxo_000103 people booing
1619
- kp_7Sd6s0h8_000306 people eating apple
1620
- kqvpyaIls0c_000090 playing cello
1621
- ksaiDSSJeOg_000030 playing cymbal
1622
- ktBzLsiL6l0_000157 playing steelpan
1623
- kxl_ZU3j99A_000415 missile launch
1624
- ky92PHpUpEA_000050 playing accordion
1625
- kyEDPVvDQt4_000040 bird wings flapping
1626
- kz849EPouys_000318 magpie calling
1627
- kzntbWmyWBg_000074 playing squash
1628
- l0DQpxoSr2Q_000040 playing banjo
1629
- l3i-cKkVL-o_000007 car engine knocking
1630
- l3rzkrm98J0_000001 alligators, crocodiles hissing
1631
- l3uGoel_Ats_000000 people crowd
1632
- l4XYVX79H58_000400 people babbling
1633
- l5LnwNRK7Bw_000030 playing cello
1634
- l6uZDuUsdpc_000010 people burping
1635
- l7ELBtiVtQ8_000190 striking pool
1636
- l8bdmlXL-Lk_000197 playing didgeridoo
1637
- l9ple4xWo3w_000193 chopping food
1638
- lAF2dHM7Tyc_000170 playing electric guitar
1639
- lEzMz9odWXM_000058 lathe spinning
1640
- lGsxnfOPaUw_000022 baby crying
1641
- lGtRJjnC4PI_000210 airplane
1642
- lKhe8BxkRnU_000025 wind rustling leaves
1643
- lLme6yedI6w_000040 cricket chirping
1644
- lN2kwc34bo0_000050 train horning
1645
- lP5znTMLevo_000030 playing bagpipes
1646
- lQG8CRumj3g_000560 playing cello
1647
- lUWrhn9z9FI_000096 pumping water
1648
- lXIaZksDY38_000030 people shuffling
1649
- lXwEV2S1rt4_000150 using sewing machines
1650
- lc1QTC0R_CQ_000018 people shuffling
1651
- ld9b7tfnqTE_000109 playing erhu
1652
- ldF2EJCVY3g_000147 playing theremin
1653
- ldvcH7bOy_o_000184 playing french horn
1654
- levuF973w8s_000250 playing french horn
1655
- lg6X9iqcqXI_000233 playing table tennis
1656
- lg7DqdnmkmE_000130 skateboarding
1657
- lj-PczKzEaw_000040 using sewing machines
1658
- ljXTXoBG9rg_000077 pheasant crowing
1659
- lnatlhCU5kI_000420 singing choir
1660
- loMPOYNM66g_000123 playing timpani
1661
- lqVp4OJ4hbY_000044 lions roaring
1662
- lr1RLADQXNg_000110 helicopter
1663
- lrFFGvB03Fw_000071 golf driving
1664
- lsBttXzhPHw_000144 playing sitar
1665
- lt5H2iH9Ln8_000120 chicken crowing
1666
- lwgKXn21ymc_000774 people whispering
1667
- lxFVAc2dHVM_000152 fire crackling
1668
- lzLgjt8VRmU_000000 skateboarding
1669
- m-4-BAv8cCQ_000380 lawn mowing
1670
- m-NpPmAkncw_000030 male singing
1671
- m0g-zWJJClA_000150 playing banjo
1672
- m1lFSuSixy8_000350 people marching
1673
- m1lFSuSixy8_000613 people marching
1674
- m2E4i-EzHIE_000085 people finger snapping
1675
- m4j5XY09HlE_000021 car engine idling
1676
- mCyvq9TF5Ms_000052 typing on typewriter
1677
- mInTDyk6c2A_000012 writing on blackboard with chalk
1678
- mPnRdL1sC48_000240 people eating crisps
1679
- mQ60N4HdDyI_000102 machine gun shooting
1680
- mRCzIaqRG_c_000000 using sewing machines
1681
- mWGLXbNhuB4_000096 hammering nails
1682
- m_7BjYa44lo_000030 child speech, kid speaking
1683
- ma0P7XOsBgE_000030 people running
1684
- ma2RuCUufcI_000036 fox barking
1685
- maUlA8WWTEQ_000004 hail
1686
- maVHGHl01Yc_000034 lathe spinning
1687
- mcVY3xsxgcU_000060 playing bagpipes
1688
- mi9AokZ8m5s_000849 shot football
1689
- mjK1vNF3lKE_000023 playing theremin
1690
- mlihNhHFGTM_000030 playing harpsichord
1691
- mt13n4XleGY_000030 orchestra
1692
- mwu46g-jnac_000170 bird chirping, tweeting
1693
- n-PjT4mDn9Y_000173 playing bagpipes
1694
- n0PnM0u47m4_000042 mynah bird singing
1695
- n0gO6pPICi4_000065 playing mandolin
1696
- n21m6N5UmNk_000002 firing cannon
1697
- n2CgftHGLJ0_000030 driving buses
1698
- n3bX64Z_Yds_000000 playing clarinet
1699
- n4wpVSIu7c0_000087 beat boxing
1700
- n6PQq584nWA_000010 playing trumpet
1701
- n8vhraccEnc_000009 dog howling
1702
- nAtvzIyRwnU_000100 playing saxophone
1703
- nEBUuVsMtGE_000000 church bell ringing
1704
- nGIVQLeZ76E_000103 bowling impact
1705
- nHDsu69zzSA_000000 skidding
1706
- nIHYEEVzuzE_000095 canary calling
1707
- nJ7TBigS5bY_000018 people booing
1708
- nLOOmtvC9Hc_000066 playing steel guitar, slide guitar
1709
- nLVmclZYZMY_000200 people screaming
1710
- nP0vO3Xv10M_000010 dog barking
1711
- nPCYkMhaLYs_000024 roller coaster running
1712
- nTo6W-50CDg_000018 whale calling
1713
- nXc-dHK2A2A_000016 playing theremin
1714
- n_F_tRGGoEA_000107 frog croaking
1715
- ngJ_Us2C19g_000040 police car (siren)
1716
- niYH8Dpt4uE_000140 cattle mooing
1717
- nnyll58-lrA_000009 wind chime
1718
- nowY2-6reIk_000030 pigeon, dove cooing
1719
- nz0qYNbFGD4_000030 people coughing
1720
- o2-6TSqWPCY_000170 people clapping
1721
- o2qd4hsquvE_000056 bird squawking
1722
- o4F5dtUXivA_000034 playing steelpan
1723
- o6kY64rTk2k_000291 singing bowl
1724
- o7mBR043UCs_000014 pig oinking
1725
- o8iHgGRzcTE_000020 people clapping
1726
- o8oMY-WgW9Y_000030 wind rustling leaves
1727
- o9uGfNn4JyU_000062 lions roaring
1728
- oBrRQ5SiJTQ_000210 driving motorcycle
1729
- oCZ3WCK5BZU_000000 driving motorcycle
1730
- oDAI33ybJlo_000029 playing theremin
1731
- oDuiwpaep1k_000035 sliding door
1732
- oEEOscuru6s_000280 playing flute
1733
- oEXqWoSZ9Ww_000024 playing erhu
1734
- oG6EUnQjeF8_000077 swimming
1735
- oIUi8gFI_XY_000178 cat purring
1736
- oIXRSpjo7vk_000170 wood thrush calling
1737
- oJ4m2OvhA8Q_000100 playing cello
1738
- oSsLQCIJjyE_000030 singing bowl
1739
- oVxKyGnz-IA_000230 chainsawing trees
1740
- oXXHkjFLN3E_000237 electric shaver, electric razor shaving
1741
- oX_XdxqTE9Y_000110 bird chirping, tweeting
1742
- oYe46obCJhc_000039 alarm clock ringing
1743
- oZ6l0EStee4_000011 police car (siren)
1744
- oZKVPzRyn50_000432 playing electronic organ
1745
- oad_agP1oJU_000287 playing harpsichord
1746
- od2HXuT_NuI_000100 playing cello
1747
- oePtbOc8Hqs_000000 foghorn
1748
- oeSxlmkPj78_000030 ocean burbling
1749
- ofFtXFnfebQ_000684 cat purring
1750
- ohh7mWALd_k_000473 pheasant crowing
1751
- olZa2vOpbD4_000110 male speech, man speaking
1752
- omiGYobPra4_000100 toilet flushing
1753
- onqGNrWQ7us_000587 machine gun shooting
1754
- or7ikBeUhBg_000020 driving buses
1755
- osA1JXFL2Gk_000021 parrot talking
1756
- otp3r8SfygA_000102 people shuffling
1757
- p-DcPCo7Swo_000086 playing double bass
1758
- p4RWTSRg6Bg_000290 people crowd
1759
- p5LsBog-XRk_000130 playing saxophone
1760
- p5j91ecL43Y_000030 people whispering
1761
- p8HTTAhm5ic_000100 waterfall burbling
1762
- pAe8kcpjZII_000010 playing theremin
1763
- pKUzj3ckXvI_000010 toilet flushing
1764
- pNiB5w3JBVI_000003 spraying water
1765
- pQrnDC-kPHk_000106 sharpen knife
1766
- pRdi3oChUR4_000020 baltimore oriole calling
1767
- pUMZEzdKmPM_000136 owl hooting
1768
- pVJY1Q137cw_000681 cat purring
1769
- pX_Sg3xDAUg_000000 people burping
1770
- p_KsZsJwH0w_000555 sharpen knife
1771
- pdzAs6Be2sY_000139 people gargling
1772
- piYKrS14dxA_000113 mynah bird singing
1773
- pnFtPlslgGw_000019 plastic bottle crushing
1774
- ppDvhlGr5nI_000003 golf driving
1775
- ppLjxFk8C4M_000023 heart sounds, heartbeat
1776
- pqDHX5R4sdg_000220 female singing
1777
- pqElMm80SX8_000025 airplane flyby
1778
- prq7EqBGWaY_000035 playing harpsichord
1779
- psz3LAhSi9U_000001 yodelling
1780
- pu9pO-rCzy4_000153 people farting
1781
- pugRM2Nsnyo_000283 church bell ringing
1782
- pukny4fvbOQ_000040 playing clarinet
1783
- pxpIsajKD-Y_000042 reversing beeps
1784
- pxpIsajKD-Y_000065 reversing beeps
1785
- pyHJrlNMYwo_000350 sheep bleating
1786
- pzixqhh0xG4_000175 golf driving
1787
- q0Hz09My-_E_000018 lions roaring
1788
- q0R8KXxZOZM_000070 people farting
1789
- q0lahEg486Y_000295 tractor digging
1790
- q1oBXqEFXy4_000070 sloshing water
1791
- q5fUdJoUrAE_000257 beat boxing
1792
- q7cvNFoT9nQ_000027 lighting firecrackers
1793
- qA-yeGwsVn4_000018 pheasant crowing
1794
- qBDrrE6LnUo_000103 bird chirping, tweeting
1795
- qBmsSZQ7HNg_000360 railroad car, train wagon
1796
- qCcC7n2mOC0_000074 playing harpsichord
1797
- qIcEYC46zmI_000087 playing cornet
1798
- qJJEBEajF1M_000017 air conditioning noise
1799
- qL-4fJyDGXc_000893 people eating noodle
1800
- qNi5Xlf2ZVY_000510 people clapping
1801
- qORUGCczq74_000042 swimming
1802
- qRm5Yh3JPSg_000016 playing tambourine
1803
- qRwun6pFuNA_000010 playing banjo
1804
- qTRrHj-DNYc_000137 dinosaurs bellowing
1805
- qW9b8qu_KrU_000180 lions growling
1806
- qXFgtkhWLgM_000134 child singing
1807
- q_ZMlkVS740_000222 playing congas
1808
- qbmNcYH52eo_000516 striking pool
1809
- qdl6t1bDb-8_000400 eating with cutlery
1810
- qgv0riPveBQ_000030 bird chirping, tweeting
1811
- qiw2I1oQIVQ_000057 playing snare drum
1812
- qjBkiP7mBNI_000597 ripping paper
1813
- qmjK_Wi0IK8_000080 people cheering
1814
- qoPAdSFZ4f0_000370 chopping wood
1815
- qpjOCvQEHdo_000080 people cheering
1816
- qrNCI310T9Y_000018 chicken clucking
1817
- qsj_OgZZDvQ_000080 tap dancing
1818
- qsrNWdcjwwY_000320 female speech, woman speaking
1819
- quF2HA3u2JY_000101 cupboard opening or closing
1820
- quZSWDeSywg_000040 toilet flushing
1821
- qv51EqZA8eE_000291 train horning
1822
- qxeCxC_zpvU_000202 playing french horn
1823
- r24KMnV5Rrk_000030 people running
1824
- r42dJt0hxro_000010 gibbon howling
1825
- r47N9mdOeXc_000030 playing violin, fiddle
1826
- r4Zm5lEsI-M_000110 vehicle horn, car horn, honking
1827
- r7e4wJy4NP8_000090 motorboat, speedboat acceleration
1828
- r96LZqBtlwg_000050 dog whimpering
1829
- r9uN-AltjDQ_000130 lawn mowing
1830
- rAXnOxWHaLs_000030 playing french horn
1831
- rAth9ueRqM4_000040 whale calling
1832
- rD4zq3CvJSo_000130 people slapping
1833
- rEdr-j9oAN0_000074 playing french horn
1834
- rFA1GBcIGN4_000067 playing ukulele
1835
- rFgrOflwKPg_000290 playing trombone
1836
- rLuNw3Cm7rs_000024 lighting firecrackers
1837
- rMDnGZU7jzE_000001 dog baying
1838
- rQthEYYXM-k_000030 people sniggering
1839
- rRP810El--s_000958 fire truck siren
1840
- rSHvW5dGanw_000150 fireworks banging
1841
- rSWPVWkAbec_000000 bee, wasp, etc. buzzing
1842
- rTNSzUXd3wk_000180 playing double bass
1843
- rVnkDOvLWm8_000180 cap gun shooting
1844
- raz3OUu768k_000068 playing clarinet
1845
- rfqqBv3eriU_000160 stream burbling
1846
- rgdMDo5TBic_000355 playing squash
1847
- rn381TUMxyE_000298 arc welding
1848
- rs2FL8HJfGE_000030 people sniggering
1849
- rwVhTlLcBO0_000099 playing erhu
1850
- rx2lqMvj2Wo_000052 squishing water
1851
- rz9PZZA04z8_000183 playing badminton
1852
- s2QrQdxzLwQ_000074 playing glockenspiel
1853
- s8zSSYQM0Tc_000127 footsteps on snow
1854
- s9gzcUg_nlM_000030 playing drum kit
1855
- sFTyeq295xU_000041 people humming
1856
- sIHApNhq2Ik_000002 bird squawking
1857
- sLEEurjCsAY_000051 typing on typewriter
1858
- sLOjC8EWrHA_000070 driving buses
1859
- sOg4MNTWx_0_000000 skateboarding
1860
- sUHlRRyS2YM_000009 pigeon, dove cooing
1861
- sUs8O9toO4M_000311 dinosaurs bellowing
1862
- sXDJvBEzqjs_000000 dog bow-wow
1863
- sYy0lPjLEXQ_000100 playing cymbal
1864
- s_FLZ-ekB2A_000088 telephone bell ringing
1865
- sa6B5XyFYIg_000040 playing bagpipes
1866
- scm7r0uBepU_000467 mouse clicking
1867
- smBHJiEPCRI_000030 duck quacking
1868
- snbtH1P3MVA_000119 playing timbales
1869
- snyzyJlTBbg_000003 dog baying
1870
- surXSGAnpM0_000000 playing harmonica
1871
- sxiVIGK5AEc_000010 people crowd
1872
- syysO74ja30_000007 playing gong
1873
- szQ-4VQQQsI_000020 railroad car, train wagon
1874
- t0XoS_8YVP4_000728 magpie calling
1875
- t2xJjZp1D1E_000030 dog growling
1876
- t3YfjKEmei4_000080 race car, auto racing
1877
- t3u3ykowlvs_000030 raining
1878
- tD9rMw8YPBI_000030 child speech, kid speaking
1879
- tDayTL0ivzU_000014 playing timbales
1880
- tJChPvDD-hI_000035 parrot talking
1881
- tLFNgY5NBMk_000001 playing bassoon
1882
- tRw0KL6PMFU_000060 skateboarding
1883
- tTePTFQV52M_000030 pig oinking
1884
- tV0sIqEryIY_000037 wind chime
1885
- tWDG6UsiG3s_000090 people babbling
1886
- tYBxgXg8yxw_000046 woodpecker pecking tree
1887
- tYzH5rkbuBQ_000000 frog croaking
1888
- tm9rnG0455k_000010 skidding
1889
- tuqcWxh_mdc_000012 baby crying
1890
- twWBQjLyuxw_000014 bull bellowing
1891
- u1nAQ6GgJ7Y_000154 playing volleyball
1892
- u6AV24u4OMQ_000052 rope skipping
1893
- u6c5tvrkqVA_000187 playing timbales
1894
- u88CrTGAqbo_000000 lawn mowing
1895
- uEPueBOV06U_000109 yodelling
1896
- uGQ0TW02gBo_000004 frog croaking
1897
- uI5eona1hc4_000000 elk bugling
1898
- uIHnphQWVRA_000169 opening or closing drawers
1899
- uIg0I7pAjvM_000030 race car, auto racing
1900
- uJSDmIF4dhE_000260 driving buses
1901
- uK0jcVxT-Pg_000030 driving buses
1902
- uLm5oUt3XG4_000031 playing tabla
1903
- uSmduC6gJxg_000050 rowboat, canoe, kayak rowing
1904
- uWdgdlJqI2Y_000019 basketball bounce
1905
- uWq8Q_cIEwE_000086 playing ukulele
1906
- uZghS49MC1k_000180 skidding
1907
- u_85N9h_cGs_000050 car passing by
1908
- udVSYrFacsc_000072 playing cornet
1909
- ugUyp_keJO4_000022 mouse clicking
1910
- uiPC88KDlW4_000022 engine accelerating, revving, vroom
1911
- unF6DdqG4l8_000050 people whistling
1912
- upZ0sKmaZrI_000167 playing lacrosse
1913
- uvUEfRqpEQU_000145 singing choir
1914
- uyNyWLJIci8_000000 fire truck siren
1915
- v5OdaMw5hhk_000030 playing snare drum
1916
- vADdI9YTMRs_000243 playing timbales
1917
- vJk_Jzr2YIs_000080 playing hammond organ
1918
- vLLiaCDHSPY_000010 dog barking
1919
- vUORRJqXp7A_000036 playing table tennis
1920
- vXupVqDfK34_000116 cricket chirping
1921
- v_cxwPhwaBQ_000000 people farting
1922
- varD0b9CTgs_000020 people belly laughing
1923
- vcwXIa-QB8A_000025 sailing
1924
- vdXavSaj8-M_000070 playing accordion
1925
- vgIgTWqXtms_000023 child singing
1926
- vhqkCDgsuh4_000255 people booing
1927
- vkA-v4DSriM_000229 playing tabla
1928
- vktUwc0Cs7w_000170 playing clarinet
1929
- vpAGr_NrM_w_000050 fireworks banging
1930
- vzoQdjPITKw_000030 pigeon, dove cooing
1931
- w-9xoB74oF0_000004 opening or closing car electric windows
1932
- w-JaJ11OqQY_000345 people slurping
1933
- w3kMt-zQ9t4_000215 playing table tennis
1934
- w5T582MCzlY_000011 running electric fan
1935
- w5vaBVSxgKg_000030 lawn mowing
1936
- w8puug1pEUA_000170 stream burbling
1937
- w9K_AmeWhlo_000071 fire crackling
1938
- wAnqT37UgYY_000034 dog growling
1939
- wEbJ-9cmSaE_000003 playing cornet
1940
- wHdgExbL6dA_000034 playing badminton
1941
- wOYLWY6UCu8_000262 playing ukulele
1942
- wP-96GP6bsU_000000 vehicle horn, car horn, honking
1943
- wT6-Isia2PQ_000149 child singing
1944
- wTQ-1cd8owI_000181 dog bow-wow
1945
- wUNpHu61l7Q_000190 male singing
1946
- wVJ-S2zYxug_000040 playing drum kit
1947
- wX4Ya3D20H8_000039 scuba diving
1948
- wXsrff4No40_000237 playing hockey
1949
- wYZc2-3ViXs_000155 civil defense siren
1950
- wZj294W4RVU_000094 fire crackling
1951
- w_yGhgrow38_000091 eletric blender running
1952
- wdk-RmsGdyw_000310 driving buses
1953
- wdlfOAR03iY_000000 playing glockenspiel
1954
- we-ONoZIkWE_000018 dog howling
1955
- wegIxELjtz4_000334 people eating noodle
1956
- whIS2UodgLI_000002 gibbon howling
1957
- wkwjx0oMAjw_000021 beat boxing
1958
- wnW4qgQQg3g_000050 playing cello
1959
- wrFyu2T1XOo_000000 hail
1960
- wsHPe19Y9Nc_000081 electric shaver, electric razor shaving
1961
- wtyuiWygNTc_000000 zebra braying
1962
- wuAcPWyHMXo_000008 lions roaring
1963
- wwQPX3zjV4s_000028 elk bugling
1964
- x0_AiAhfeV0_000068 eagle screaming
1965
- x0bbH2Tao_0_000000 dog howling
1966
- x1Rt2zN-oXo_000000 dog growling
1967
- x1bXQS9dUAc_000140 playing violin, fiddle
1968
- x2uCcPNM6Nw_000030 pigeon, dove cooing
1969
- x3cLaiaaF0M_000032 skiing
1970
- x68R1rmvKgc_000060 female singing
1971
- x6d8ytnWNDI_000045 barn swallow calling
1972
- x8yymm3DtVA_000022 playing cello
1973
- xK1vy_6H2VM_000010 scuba diving
1974
- xMa1vAUhTfM_000429 ice cream truck, ice cream van
1975
- xN_CePbfjVg_000004 playing bass drum
1976
- xPIhTw0fbzI_000010 train horning
1977
- xQaYumd1O48_000004 lions growling
1978
- xS4brO1qu0g_000591 playing hockey
1979
- xUCKcoE3K6Q_000313 lip smacking
1980
- xVDGIF1pFvQ_000030 driving buses
1981
- xVEXWvj0iWo_000060 rowboat, canoe, kayak rowing
1982
- xWBMt4fI95M_000063 scuba diving
1983
- xWgd4OMcKbs_000263 people nose blowing
1984
- xY9mlbn2IhY_000000 people burping
1985
- xYAHwbhWEgM_000030 playing violin, fiddle
1986
- xbNNxwGRG20_000062 cattle, bovinae cowbell
1987
- xdUbCcEbipM_000290 people crowd
1988
- xeS25F6uHic_000162 airplane flyby
1989
- xetF74UUCGk_000001 ice cream truck, ice cream van
1990
- xf0cheS5wFM_000090 playing piano
1991
- xfT0HF1Pbxk_000003 playing sitar
1992
- xg_3Uas3z40_000240 skateboarding
1993
- xibFeibkfWM_000036 alligators, crocodiles hissing
1994
- xj0Xi47RC88_000200 lawn mowing
1995
- xkUzsvSImy4_000306 people eating crisps
1996
- xm0N3HXnSWc_000361 rope skipping
1997
- xoViga6dJa4_000141 playing steelpan
1998
- xocKilOzrb4_000065 reversing beeps
1999
- xq5kMmAFYx8_000030 playing double bass
2000
- xqv96EPg7so_000200 railroad car, train wagon
2001
- xtvQjd6cwC4_000040 playing bagpipes
2002
- y3TRiYwDbHo_000287 playing oboe
2003
- y6wsRU2aNx4_000040 railroad car, train wagon
2004
- y95ml0IYGr4_000440 chainsawing trees
2005
- yA_63YfQ034_000022 dog growling
2006
- yBwMu2NueR0_000284 rapping
2007
- yE_SP127xy8_000010 people crowd
2008
- yEfhYsMd1yc_000006 playing double bass
2009
- yH3PJfYi_gs_000109 car engine starting
2010
- yJGtoH8INnA_000084 tapping guitar
2011
- yJN5_1tfqXo_000075 magpie calling
2012
- yMMmjb3BRi0_000030 dog bow-wow
2013
- yOhdod2Kg40_000210 playing bassoon
2014
- yPJiPWkeT3U_000254 playing gong
2015
- yPUYU6t3rwo_000370 bee, wasp, etc. buzzing
2016
- yQzzdP-4iBU_000002 planing timber
2017
- yUL9UefoANU_000128 tractor digging
2018
- yVzIaZzLH38_000130 bee, wasp, etc. buzzing
2019
- yYPNrg-s-NI_000060 child singing
2020
- ybnXdQfSNZs_000001 police radio chatter
2021
- ycN30BUfzeo_000070 playing clarinet
2022
- ygOHZ_55jME_000174 electric shaver, electric razor shaving
2023
- yjyZgzYuuSQ_000089 cat purring
2024
- yo5I2MTqv9E_000030 playing marimba, xylophone
2025
- ywD_am3uZh8_000020 splashing water
2026
- ywYLMe6y-S0_000040 playing piano
2027
- z9CCSNKepA8_000537 striking pool
2028
- z9crgUIWcmA_000000 dog barking
2029
- zBgR_gj8NGg_000083 striking pool
2030
- zGbJAz-3Ao8_000070 playing banjo
2031
- zGn9k6j8kVo_000049 rope skipping
2032
- zILE3kr9nIU_000030 mouse pattering
2033
- zJPgE79wkE4_000000 playing tennis
2034
- zMKJFnBr1Gw_000013 reversing beeps
2035
- zPlyG_ryFpg_000006 sliding door
2036
- zRU8A0m9Op8_000145 driving snowmobile
2037
- zVCqTRlc7NU_000020 fire truck siren
2038
- zYPY3Fh1Xjo_000000 skidding
2039
- zcZ0WVQ8t8s_000210 splashing water
2040
- zhPLdAMVAuo_000257 church bell ringing
2041
- zl6hP51zURM_000075 playing oboe
2042
- zlt2EGxum58_000174 bouncing on trampoline
2043
- zmSPCArJHB0_000190 bird squawking
2044
- zpqGedo-jm4_000043 cell phone buzzing
2045
- zrKMC4fAKp0_000202 playing cello
2046
- zsnU7rt_Qq0_000005 baby laughter
2047
- zw7dTh-Lx3o_000074 canary calling
2048
- zzP5qr-ZxHY_000199 people marching
2049
- zzftU8z4aOI_000230 skateboarding
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
MMAudio/train.py DELETED
@@ -1,209 +0,0 @@
1
- import logging
2
- import math
3
- import random
4
- from datetime import timedelta
5
- from pathlib import Path
6
-
7
- import hydra
8
- import numpy as np
9
- import torch
10
- import torch.distributed as distributed
11
- from hydra import compose
12
- from hydra.core.hydra_config import HydraConfig
13
- from omegaconf import DictConfig, open_dict
14
- from torch.distributed.elastic.multiprocessing.errors import record
15
-
16
- from mmaudio.data.data_setup import setup_training_datasets, setup_val_datasets
17
- from mmaudio.model.sequence_config import CONFIG_16K, CONFIG_44K
18
- from mmaudio.runner import Runner
19
- from mmaudio.sample import sample
20
- from mmaudio.utils.dist_utils import info_if_rank_zero, local_rank, world_size
21
- from mmaudio.utils.logger import TensorboardLogger
22
- from mmaudio.utils.synthesize_ema import synthesize_ema
23
-
24
- torch.backends.cuda.matmul.allow_tf32 = True
25
- torch.backends.cudnn.allow_tf32 = True
26
-
27
- log = logging.getLogger()
28
-
29
-
30
def distributed_setup():
    """Join the NCCL process group and report this worker's placement.

    Returns:
        The ``(local_rank, world_size)`` pair as imported from
        ``mmaudio.utils.dist_utils``.
    """
    # Timeout handed to init_process_group: two hours.
    group_timeout = timedelta(hours=2)
    distributed.init_process_group(backend="nccl", timeout=group_timeout)

    log.info(f'Initialized: local_rank={local_rank}, world_size={world_size}')
    return local_rank, world_size
34
-
35
-
36
@record
@hydra.main(version_base='1.3.2', config_path='config', config_name='train_config.yaml')
def train(cfg: DictConfig):
    """Run distributed training, periodic validation/evaluation, and final sampling.

    The function, in order:

    1. Selects the CUDA device, initialises the process group and composes
       the evaluation config up front.
    2. Patches ``cfg.data_dim`` with the sequence lengths of the 16k or 44k
       sequence config, chosen by the suffix of ``cfg.model``.
    3. Builds the training / validation / evaluation loaders and a ``Runner``.
    4. Resumes from ``cfg['checkpoint']``, else from the latest checkpoint
       found by the runner, else optionally loads ``cfg['weights']``.
    5. Iterates until ``cfg['num_iterations']`` is reached, running a
       validation pass every ``cfg.val_interval`` iterations and an
       inference + evaluation pass every ``cfg.eval_interval`` iterations.
    6. Saves a checkpoint and weights in a ``finally`` clause (unless
       ``cfg.debug``), synthesizes EMA weights on rank 0, and calls
       ``sample`` with the evaluation config.

    Args:
        cfg: Hydra configuration loaded from ``config/train_config.yaml``.

    Raises:
        ValueError: If ``cfg.model`` ends with neither ``'16k'`` nor ``'44k'``.
        Exception: Any error raised inside the training loop is logged and
            re-raised after the ``finally`` clause has run.
    """
    # initial setup
    torch.cuda.set_device(local_rank)
    torch.backends.cudnn.benchmark = cfg.cudnn_benchmark
    distributed_setup()
    num_gpus = world_size
    run_dir = HydraConfig.get().run.dir

    # compose early such that it does not rely on future hard disk reading
    eval_cfg = compose('eval_config', overrides=[f'exp_id={cfg.exp_id}'])

    # patch data dim
    if cfg.model.endswith('16k'):
        seq_cfg = CONFIG_16K
    elif cfg.model.endswith('44k'):
        seq_cfg = CONFIG_44K
    else:
        raise ValueError(f'Unknown model: {cfg.model}')
    with open_dict(cfg):
        cfg.data_dim.latent_seq_len = seq_cfg.latent_seq_len
        cfg.data_dim.clip_seq_len = seq_cfg.clip_seq_len
        cfg.data_dim.sync_seq_len = seq_cfg.sync_seq_len

    # wrap python logger with a tensorboard logger
    # NOTE: this rebinds `log` locally; every `log` below is the wrapper.
    log = TensorboardLogger(cfg.exp_id,
                            run_dir,
                            logging.getLogger(),
                            is_rank0=(local_rank == 0),
                            enable_email=cfg.enable_email and not cfg.debug)

    info_if_rank_zero(log, f'All configuration: {cfg}')
    info_if_rank_zero(log, f'Number of GPUs detected: {num_gpus}')

    # number of dataloader workers
    info_if_rank_zero(log, f'Number of dataloader workers (per GPU): {cfg.num_workers}')

    # Set seeds to ensure the same initialization
    torch.manual_seed(cfg.seed)
    np.random.seed(cfg.seed)
    random.seed(cfg.seed)

    # setting up configurations
    info_if_rank_zero(log, f'Training configuration: {cfg}')
    # cfg.batch_size is the global batch size; convert it to a per-GPU size
    cfg.batch_size //= num_gpus
    info_if_rank_zero(log, f'Batch size (per GPU): {cfg.batch_size}')

    # total number of training iterations to run
    total_iterations = cfg['num_iterations']

    # setup datasets
    dataset, sampler, loader = setup_training_datasets(cfg)
    info_if_rank_zero(log, f'Number of training samples: {len(dataset)}')
    info_if_rank_zero(log, f'Number of training batches: {len(loader)}')

    val_dataset, val_loader, eval_loader = setup_val_datasets(cfg)
    info_if_rank_zero(log, f'Number of val samples: {len(val_dataset)}')
    val_cfg = cfg.data.ExtractedVGG_val

    # compute and set mean and std
    latent_mean, latent_std = dataset.compute_latent_stats()

    # construct the trainer
    trainer = Runner(cfg,
                     log=log,
                     run_path=run_dir,
                     for_training=True,
                     latent_mean=latent_mean,
                     latent_std=latent_std).enter_train()
    # snapshot of the initial RNG state, re-applied before every
    # validation / evaluation pass so they all start from the same state
    eval_rng_clone = trainer.rng.graphsafe_get_state()

    # load previous checkpoint if needed
    if cfg['checkpoint'] is not None:
        curr_iter = trainer.load_checkpoint(cfg['checkpoint'])
        cfg['checkpoint'] = None
        info_if_rank_zero(log, 'Model checkpoint loaded!')
    else:
        # if run_dir exists, load the latest checkpoint
        checkpoint = trainer.get_latest_checkpoint_path()
        if checkpoint is not None:
            curr_iter = trainer.load_checkpoint(checkpoint)
            info_if_rank_zero(log, 'Latest checkpoint loaded!')
        else:
            # load previous network weights if needed
            curr_iter = 0
            if cfg['weights'] is not None:
                info_if_rank_zero(log, 'Loading weights from the disk')
                trainer.load_weights(cfg['weights'])
                cfg['weights'] = None

    # determine max epoch
    total_epoch = math.ceil(total_iterations / len(loader))
    current_epoch = curr_iter // len(loader)
    info_if_rank_zero(log, f'We will approximately use {total_epoch} epochs.')

    # training loop
    try:
        # Need this to select random bases in different workers
        np.random.seed(np.random.randint(2**30 - 1) + local_rank * 1000)
        while curr_iter < total_iterations:
            # Crucial for randomness!
            sampler.set_epoch(current_epoch)
            current_epoch += 1
            log.debug(f'Current epoch: {current_epoch}')

            trainer.enter_train()
            trainer.log.data_timer.start()
            for data in loader:
                trainer.train_pass(data, curr_iter)

                if (curr_iter + 1) % cfg.val_interval == 0:
                    # swap into an eval rng state, i.e., use the same seed for every validation pass
                    train_rng_snapshot = trainer.rng.graphsafe_get_state()
                    trainer.rng.graphsafe_set_state(eval_rng_clone)
                    info_if_rank_zero(log, f'Iteration {curr_iter}: validating')
                    for data in val_loader:
                        trainer.validation_pass(data, curr_iter)
                    distributed.barrier()
                    trainer.val_integrator.finalize('val', curr_iter, ignore_timer=True)
                    trainer.rng.graphsafe_set_state(train_rng_snapshot)

                if (curr_iter + 1) % cfg.eval_interval == 0:
                    save_eval = (curr_iter + 1) % cfg.save_eval_interval == 0
                    # same rng swap as above so every evaluation pass starts from the same state
                    train_rng_snapshot = trainer.rng.graphsafe_get_state()
                    trainer.rng.graphsafe_set_state(eval_rng_clone)
                    # distinct message so this pass is not confused with validation in the logs
                    info_if_rank_zero(log, f'Iteration {curr_iter}: evaluating')
                    # NOTE(review): audio_path is only bound if eval_loader yields at
                    # least one batch; an empty loader would raise NameError below.
                    for data in eval_loader:
                        audio_path = trainer.inference_pass(data,
                                                            curr_iter,
                                                            val_cfg,
                                                            save_eval=save_eval)
                    distributed.barrier()
                    trainer.rng.graphsafe_set_state(train_rng_snapshot)
                    trainer.eval(audio_path, curr_iter, val_cfg)

                curr_iter += 1

                # stop mid-epoch once the iteration budget is exhausted
                if curr_iter >= total_iterations:
                    break
    except Exception as e:
        log.error(f'Error occurred at iteration {curr_iter}!')
        log.critical(e.message if hasattr(e, 'message') else str(e))
        raise
    finally:
        # runs on both normal completion and failure
        if not cfg.debug:
            trainer.save_checkpoint(curr_iter)
            trainer.save_weights(curr_iter)

    # Release the trainer and cached GPU memory before EMA synthesis and sampling
    del trainer
    torch.cuda.empty_cache()

    # Synthesize EMA
    if local_rank == 0:
        log.info(f'Synthesizing EMA with sigma={cfg.ema.default_output_sigma}')
        ema_sigma = cfg.ema.default_output_sigma
        state_dict = synthesize_ema(cfg, ema_sigma, step=None)
        save_dir = Path(run_dir) / f'{cfg.exp_id}_ema_final.pth'
        torch.save(state_dict, save_dir)
        log.info(f'Synthesized EMA saved to {save_dir}!')
    # all ranks wait here until rank 0 has finished the block above
    distributed.barrier()

    log.info(f'Evaluation: {eval_cfg}')
    sample(eval_cfg)

    # clean-up
    log.complete()
    distributed.barrier()
    distributed.destroy_process_group()
206
-
207
-
208
- if __name__ == '__main__':
209
- train()