Spaces:

JackIsNotInTheBox
/

Generate_Audio_for_Video

Running

Jack Wu committed on Mar 25

Commit

22b2290

1 Parent(s): 3886668

Remove non-inference files from all three model folders

Keep only what is imported at runtime by app.py:
- TARO: remove dataset.py, infer.py, loss.py, train.py, train.sh, preprocess/, README.md
- MMAudio: remove train.py, batch_eval.py, eval_onsets.py, demo.py, gradio_demo.py,
config/, docs/, sets/, training/, README.md, LICENSE, .gitignore
- HunyuanVideo-Foley: remove infer.py, gradio_app.py, tests/, assets/, build_package.sh,
download_test_videos.sh, DEVELOPMENT.md, INSTALL.md, LICENSE, MANIFEST.in,
NOTICE, pytest.ini, README.md, .gitattributes, .gitignore, .pre-commit-config.yaml

Update .gitignore to permanently exclude all of the above.

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitignore +47 -0
HunyuanVideo-Foley/.gitattributes +0 -3
HunyuanVideo-Foley/.gitignore +0 -159
HunyuanVideo-Foley/.pre-commit-config.yaml +0 -38
HunyuanVideo-Foley/DEVELOPMENT.md +0 -187
HunyuanVideo-Foley/INSTALL.md +0 -203
HunyuanVideo-Foley/LICENSE +0 -77
HunyuanVideo-Foley/MANIFEST.in +0 -38
HunyuanVideo-Foley/NOTICE +0 -27
HunyuanVideo-Foley/README.md +0 -519
HunyuanVideo-Foley/build_package.sh +0 -58
HunyuanVideo-Foley/download_test_videos.sh +0 -11
HunyuanVideo-Foley/gradio_app.py +0 -834
HunyuanVideo-Foley/infer.py +0 -304
HunyuanVideo-Foley/pytest.ini +0 -11
HunyuanVideo-Foley/tests/__init__.py +0 -1
HunyuanVideo-Foley/tests/test_config_utils.py +0 -89
HunyuanVideo-Foley/tests/test_media_utils.py +0 -82
MMAudio/.gitignore +0 -146
MMAudio/LICENSE +0 -21
MMAudio/README.md +0 -198
MMAudio/batch_eval.py +0 -110
MMAudio/config/__init__.py +0 -0
MMAudio/config/base_config.yaml +0 -62
MMAudio/config/data/base.yaml +0 -70
MMAudio/config/eval_config.yaml +0 -17
MMAudio/config/eval_data/base.yaml +0 -22
MMAudio/config/hydra/job_logging/custom-eval.yaml +0 -32
MMAudio/config/hydra/job_logging/custom-no-rank.yaml +0 -32
MMAudio/config/hydra/job_logging/custom-simplest.yaml +0 -26
MMAudio/config/hydra/job_logging/custom.yaml +0 -33
MMAudio/config/train_config.yaml +0 -41
MMAudio/demo.py +0 -141
MMAudio/docs/EVAL.md +0 -23
MMAudio/docs/MODELS.md +0 -50
MMAudio/docs/TRAINING.md +0 -184
MMAudio/docs/demo.html +0 -81
MMAudio/docs/images/icon.png +0 -0
MMAudio/docs/index.html +0 -156
MMAudio/docs/style.css +0 -78
MMAudio/docs/style_videos.css +0 -52
MMAudio/docs/video_gen.html +0 -254
MMAudio/docs/video_main.html +0 -98
MMAudio/docs/video_vgg.html +0 -452
MMAudio/eval_onsets.py +0 -141
MMAudio/gradio_demo.py +0 -343
MMAudio/sets/vgg-test.tsv +0 -0
MMAudio/sets/vgg-train.tsv +0 -0
MMAudio/sets/vgg-val.tsv +0 -2049
MMAudio/train.py +0 -209

.gitignore CHANGED Viewed

	@@ -0,0 +1,47 @@

+# ---- TARO: training / preprocessing / standalone inference script / docs (not imported by app.py) ----
+TARO/dataset.py
+TARO/infer.py
+TARO/loss.py
+TARO/train.py
+TARO/train.sh
+TARO/preprocess/
+TARO/README.md
+# ---- MMAudio: training / eval / demos / configs / docs / repo metadata (not imported by app.py) ----
+MMAudio/batch_eval.py
+MMAudio/eval_onsets.py
+MMAudio/train.py
+MMAudio/demo.py
+MMAudio/gradio_demo.py
+MMAudio/config/
+MMAudio/docs/
+MMAudio/sets/
+MMAudio/training/
+MMAudio/README.md
+MMAudio/.gitignore
+MMAudio/LICENSE
+# ---- HunyuanVideo-Foley: standalone scripts / build / test / docs / repo metadata (not imported by app.py) ----
+HunyuanVideo-Foley/.gitattributes
+HunyuanVideo-Foley/.gitignore
+HunyuanVideo-Foley/.pre-commit-config.yaml
+HunyuanVideo-Foley/assets/
+HunyuanVideo-Foley/build_package.sh
+HunyuanVideo-Foley/download_test_videos.sh
+HunyuanVideo-Foley/gradio_app.py
+HunyuanVideo-Foley/infer.py
+HunyuanVideo-Foley/DEVELOPMENT.md
+HunyuanVideo-Foley/INSTALL.md
+HunyuanVideo-Foley/LICENSE
+HunyuanVideo-Foley/MANIFEST.in
+HunyuanVideo-Foley/NOTICE
+HunyuanVideo-Foley/pytest.ini
+HunyuanVideo-Foley/README.md
+HunyuanVideo-Foley/tests/
+# ---- Python / IDE ----
+__pycache__/
+*.pyc
+.venv/
+.DS_Store
+.idea/

HunyuanVideo-Foley/.gitattributes DELETED Viewed

@@ -1,3 +0,0 @@
-assets/data_pipeline.png filter=lfs diff=lfs merge=lfs -text
-assets/model_arch.png filter=lfs diff=lfs merge=lfs -text
-*.png filter=lfs diff=lfs merge=lfs -text

HunyuanVideo-Foley/.gitignore DELETED Viewed

@@ -1,159 +0,0 @@
-# Byte-compiled / optimized / DLL files
-__pycache__/
-*.py[cod]
-*$py.class
-# C extensions
-*.so
-# Distribution / packaging
-.Python
-build/
-develop-eggs/
-dist/
-downloads/
-eggs/
-.eggs/
-lib/
-lib64/
-parts/
-sdist/
-var/
-wheels/
-pip-wheel-metadata/
-share/python-wheels/
-*.egg-info/
-.installed.cfg
-*.egg
-MANIFEST
-# PyInstaller
-#  Usually these files are written by a python script from a template
-#  before PyInstaller builds the exe, so as to inject date/other infos into it.
-*.manifest
-*.spec
-# Installer logs
-pip-log.txt
-pip-delete-this-directory.txt
-# Unit test / coverage reports
-htmlcov/
-.tox/
-.nox/
-.coverage
-.coverage.*
-.cache
-nosetests.xml
-coverage.xml
-*.cover
-*.py,cover
-.hypothesis/
-.pytest_cache/
-# Translations
-*.mo
-*.pot
-# Django stuff:
-*.log
-local_settings.py
-db.sqlite3
-db.sqlite3-journal
-# Flask stuff:
-instance/
-.webassets-cache
-# Scrapy stuff:
-.scrapy
-# Sphinx documentation
-docs/_build/
-# PyBuilder
-target/
-# Jupyter Notebook
-.ipynb_checkpoints
-# IPython
-profile_default/
-ipython_config.py
-# pyenv
-.python-version
-# pipenv
-#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
-#   However, in case of collaboration, if having platform-specific dependencies or dependencies
-#   having no cross-platform support, pipenv may install dependencies that don't work, or not
-#   install all needed dependencies.
-#Pipfile.lock
-# PEP 582; used by e.g. github.com/David-OConnor/pyflow
-__pypackages__/
-# Celery stuff
-celerybeat-schedule
-celerybeat.pid
-# SageMath parsed files
-*.sage.py
-# Environments
-.env
-.venv
-env/
-venv/
-ENV/
-env.bak/
-venv.bak/
-# Spyder project settings
-.spyderproject
-.spyproject
-# Rope project settings
-.ropeproject
-# mkdocs documentation
-/site
-# mypy
-.mypy_cache/
-.dmypy.json
-dmypy.json
-# Pyre type checker
-.pyre/
-# ==========================================
-# Custom settings
-# ==========================================
-# For MacOS
-.DS_Store
-# For IDEs
-.idea/
-.vscode/
-pyrightconfig.json
-.cursorignore
-assets/
-examples/
-# For global settings
-__*/
-**/my_*
-tmp*.*
-.my*
-# Model checkpoints
-*.pt
-*.ckpt
-*.pth
-*.safetensors
-CLAUDE.md

HunyuanVideo-Foley/.pre-commit-config.yaml DELETED Viewed

@@ -1,38 +0,0 @@
-repos:
-  - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.4.0
-    hooks:
-      - id: trailing-whitespace
-      - id: end-of-file-fixer
-      - id: check-yaml
-      - id: check-added-large-files
-      - id: check-merge-conflict
-      - id: debug-statements
-      - id: check-docstring-first
-  - repo: https://github.com/psf/black
-    rev: 23.3.0
-    hooks:
-      - id: black
-        language_version: python3
-        args: [--line-length=120]
-  - repo: https://github.com/pycqa/isort
-    rev: 5.12.0
-    hooks:
-      - id: isort
-        args: [--profile, black, --line-length=120]
-  - repo: https://github.com/pycqa/flake8
-    rev: 6.0.0
-    hooks:
-      - id: flake8
-        args: [--max-line-length=120]
-        additional_dependencies: [flake8-docstrings]
-  - repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v1.3.0
-    hooks:
-      - id: mypy
-        additional_dependencies: [types-all]
-        args: [--ignore-missing-imports]

HunyuanVideo-Foley/DEVELOPMENT.md DELETED Viewed

@@ -1,187 +0,0 @@
-# Development Guide
-This document provides guidelines for developing and contributing to the HunyuanVideo-Foley project.
-## Code Style and Quality
-### Code Formatting
-We use the following tools to maintain consistent code style:
-- **Black**: Code formatter with 120 character line length
-- **isort**: Import sorter compatible with Black
-- **flake8**: Linting and style checking
-- **mypy**: Static type checking
-### Pre-commit Hooks
-Install pre-commit hooks to automatically format code before commits:
-```bash
-pip install pre-commit
-pre-commit install
-```
-### Manual Code Formatting
-Format code manually:
-```bash
-# Format all Python files
-black --line-length 120 .
-# Sort imports
-isort --profile black --line-length 120 .
-# Check code style
-flake8 --max-line-length 120
-# Type checking
-mypy --ignore-missing-imports .
-```
-## Project Structure
-```
-hunyuanvideo_foley/
-├── models/                 # Model implementations
-│   ├── hifi_foley.py      # Main model
-│   ├── nn/                # Neural network layers
-│   ├── dac_vae/           # Audio VAE
-│   └── synchformer/       # Synchronization model
-├── utils/                 # Utilities
-│   ├── config_utils.py    # Configuration handling
-│   ├── feature_utils.py   # Feature extraction
-│   ├── model_utils.py     # Model loading/saving
-│   └── media_utils.py     # Audio/video processing
-└── constants.py           # Project constants
-```
-## Coding Standards
-### Error Handling
-- Use custom exceptions for domain-specific errors
-- Always validate inputs at function boundaries
-- Log errors with appropriate levels (ERROR, WARNING, INFO)
-- Provide helpful error messages to users
-### Type Hints
-- Add type hints to all function parameters and return values
-- Use `Optional[Type]` for nullable parameters
-- Import types from `typing` module
-### Documentation
-- Add docstrings to all public functions and classes
-- Use Google-style docstrings
-- Document parameters, return values, and exceptions
-### Example Function
-```python
-def process_video(
-    video_path: str,
-    max_duration: Optional[float] = None
-) -> Tuple[np.ndarray, float]:
-    """
-    Process video file and extract frames.
-    Args:
-        video_path: Path to input video file
-        max_duration: Maximum duration in seconds (optional)
-    Returns:
-        Tuple of (frames array, duration in seconds)
-    Raises:
-        FileNotFoundError: If video file doesn't exist
-        VideoProcessingError: If video processing fails
-    """
-    if not os.path.exists(video_path):
-        raise FileNotFoundError(f"Video file not found: {video_path}")
-    # Implementation here...
-```
-## Testing
-### Running Tests
-```bash
-# Run all tests
-python -m pytest
-# Run specific test file
-python -m pytest tests/test_media_utils.py
-# Run with coverage
-python -m pytest --cov=hunyuanvideo_foley
-```
-### Writing Tests
-- Place tests in `tests/` directory
-- Name test files as `test_*.py`
-- Use descriptive test function names
-- Test edge cases and error conditions
-## Development Workflow
-1. **Setup Environment**
-   ```bash
-   python -m venv venv
-   source venv/bin/activate  # Linux/Mac
-   # or
-   venv\Scripts\activate     # Windows
-   pip install -r requirements.txt
-   pip install -e .
-   ```
-2. **Install Development Tools**
-   ```bash
-   pre-commit install
-   ```
-3. **Make Changes**
-   - Follow the coding standards above
-   - Add tests for new functionality
-   - Update documentation as needed
-4. **Run Quality Checks**
-   ```bash
-   black --check --line-length 120 .
-   isort --check-only --profile black .
-   flake8 --max-line-length 120
-   mypy --ignore-missing-imports .
-   pytest
-   ```
-5. **Commit Changes**
-   ```bash
-   git add .
-   git commit -m "feat: add new feature"
-   ```
-## Performance Considerations
-- Use `torch.no_grad()` for inference-only code
-- Leverage GPU when available
-- Implement batch processing where possible
-- Profile code to identify bottlenecks
-## Dependencies
-- Keep dependencies minimal and well-maintained
-- Pin versions for reproducibility
-- Separate development dependencies from runtime dependencies
-- Document any special installation requirements
-## Configuration
-- Use centralized configuration in `constants.py`
-- Support environment variable overrides
-- Provide sensible defaults for all parameters
-- Validate configuration at startup

HunyuanVideo-Foley/INSTALL.md DELETED Viewed

@@ -1,203 +0,0 @@
-# 安装指南 - HunyuanVideo-Foley
-本文档提供了将 HunyuanVideo-Foley 作为 Python 包安装和使用的详细指南。
-## 安装方式
-### 方式1：从源码安装（推荐）
-```bash
-# 克隆仓库
-git clone https://github.com/Tencent-Hunyuan/HunyuanVideo-Foley
-cd HunyuanVideo-Foley
-# 安装包（开发模式）
-pip install -e .
-# 或安装包含所有可选依赖
-pip install -e .[all]
-```
-### 方式2：直接从GitHub安装
-```bash
-pip install git+https://github.com/Tencent-Hunyuan/HunyuanVideo-Foley.git
-```
-### 方式3：构建wheel包安装
-```bash
-# 在项目根目录下
-python setup.py bdist_wheel
-pip install dist/hunyuanvideo_foley-1.0.0-py3-none-any.whl
-```
-## 特殊依赖安装
-由于某些依赖不在PyPI上，需要单独安装：
-```bash
-# 安装audiotools（必需）
-pip install git+https://github.com/descriptinc/audiotools
-# 安装特定版本的transformers（支持SigLIP2）
-pip install git+https://github.com/huggingface/transformers@v4.49.0-SigLIP-2
-```
-## 可选依赖安装
-```bash
-# 安装开发依赖
-pip install hunyuanvideo-foley[dev]
-# 安装测试依赖
-pip install hunyuanvideo-foley[test]
-# 安装Gradio界面依赖
-pip install hunyuanvideo-foley[gradio]
-# 安装所有可选依赖
-pip install hunyuanvideo-foley[all]
-```
-## 验证安装
-```bash
-# 检查包是否正确安装
-python -c "import hunyuanvideo_foley; print(hunyuanvideo_foley.__version__)"
-# 检查命令行工具
-hunyuanvideo-foley --help
-```
-## 使用方法
-### 1. 作为Python包使用
-```python
-import hunyuanvideo_foley as hvf
-# 加载模型
-model_dict, cfg = hvf.load_model(
-    model_path="path/to/model",
-    config_path="configs/hunyuanvideo-foley-xxl.yaml"
-)
-# 处理特征
-visual_feats, text_feats, audio_len = hvf.feature_process(
-    video_path="video.mp4",
-    prompt="footsteps on gravel",
-    model_dict=model_dict,
-    cfg=cfg
-)
-# 生成音频
-audio, sample_rate = hvf.denoise_process(
-    visual_feats, text_feats, audio_len,
-    model_dict, cfg
-)
-```
-### 2. 使用命令行工具
-```bash
-# 单个视频处理
-hunyuanvideo-foley \
-    --model_path ./pretrained_models \
-    --single_video video.mp4 \
-    --single_prompt "footsteps on gravel" \
-    --output_dir ./outputs
-# 批量处理
-hunyuanvideo-foley \
-    --model_path ./pretrained_models \
-    --csv_path batch_videos.csv \
-    --output_dir ./outputs
-# 启动Gradio界面
-hunyuanvideo-foley --gradio --model_path ./pretrained_models
-```
-### 3. 使用原始脚本（向后兼容）
-```bash
-# 使用原始infer.py脚本
-python infer.py --model_path ./pretrained_models --single_video video.mp4 --single_prompt "audio description"
-# 启动Gradio应用
-export HIFI_FOLEY_MODEL_PATH=./pretrained_models
-python gradio_app.py
-```
-## 开发环境设置
-如果你想参与开发：
-```bash
-# 克隆项目
-git clone https://github.com/Tencent-Hunyuan/HunyuanVideo-Foley
-cd HunyuanVideo-Foley
-# 安装开发版本
-pip install -e .[dev]
-# 安装pre-commit钩子
-pre-commit install
-# 运行测试
-python -m pytest
-# 代码格式化
-black --line-length 120 .
-isort --profile black .
-# 类型检查
-mypy --ignore-missing-imports .
-```
-## 系统要求
-- **Python**: 3.8+
-- **操作系统**: Linux（主要支持），macOS，Windows
-- **GPU内存**: 推荐 ≥24GB VRAM（如RTX 3090/4090）
-- **CUDA版本**: 12.4 或 11.8（推荐）
-## 故障排除
-### 常见问题
-1. **ImportError: No module named 'audiotools'**
-   ```bash
-   pip install git+https://github.com/descriptinc/audiotools
-   ```
-2. **CUDA内存不足**
-   - 使用较小的批次大小
-   - 确保GPU有足够的VRAM（推荐24GB+）
-3. **transformers版本问题**
-   ```bash
-   pip install git+https://github.com/huggingface/transformers@v4.49.0-SigLIP-2
-   ```
-### 获取帮助
-- 查看项目README: [GitHub](https://github.com/Tencent-Hunyuan/HunyuanVideo-Foley)
-- 报告问题: [GitHub Issues](https://github.com/Tencent-Hunyuan/HunyuanVideo-Foley/issues)
-- 论文: [arXiv:2508.16930](https://arxiv.org/abs/2508.16930)
-## 模型下载
-```bash
-# 使用HuggingFace Hub
-git clone https://huggingface.co/tencent/HunyuanVideo-Foley
-# 或使用huggingface-cli
-huggingface-cli download tencent/HunyuanVideo-Foley
-```
-## 配置文件
-包安装后，配置文件位于：
-- `hunyuanvideo_foley/configs/` 目录
-- 默认配置：`configs/hunyuanvideo-foley-xxl.yaml`

HunyuanVideo-Foley/LICENSE DELETED Viewed

@@ -1,77 +0,0 @@
-TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT
-Tencent HunyuanVideo-Foley Release Date: August 28, 2025
-THIS LICENSE AGREEMENT DOES NOT APPLY IN THE EUROPEAN UNION, UNITED KINGDOM AND SOUTH KOREA AND IS EXPRESSLY LIMITED TO THE TERRITORY, AS DEFINED BELOW.
-By clicking to agree or by using, reproducing, modifying, distributing, performing or displaying any portion or element of the Tencent Hunyuan Works, including via any Hosted Service, You will be deemed to have recognized and accepted the content of this Agreement, which is effective immediately.
-1.	DEFINITIONS.
-a.	“Acceptable Use Policy” shall mean the policy made available by Tencent as set forth in the Exhibit A.
-b.	“Agreement” shall mean the terms and conditions for use, reproduction, distribution, modification, performance and displaying of Tencent Hunyuan Works or any portion or element thereof set forth herein.
-c.	“Documentation” shall mean the specifications, manuals and documentation for Tencent Hunyuan made publicly available by Tencent.
-d.	“Hosted Service” shall mean a hosted service offered via an application programming interface (API), web access, or any other electronic or remote means.
-e.	“Licensee,” “You” or “Your” shall mean a natural person or legal entity exercising the rights granted by this Agreement and/or using the Tencent Hunyuan Works for any purpose and in any field of use.
-f.	“Materials” shall mean, collectively, Tencent’s proprietary Tencent Hunyuan and Documentation (and any portion thereof) as made available by Tencent under this Agreement.
-g.	“Model Derivatives” shall mean all: (i) modifications to Tencent Hunyuan or any Model Derivative of Tencent Hunyuan; (ii) works based on Tencent Hunyuan or any Model Derivative of Tencent Hunyuan; or (iii) any other machine learning model which is created by transfer of patterns of the weights, parameters, operations, or Output of Tencent Hunyuan or any Model Derivative of Tencent Hunyuan, to that model in order to cause that model to perform similarly to Tencent Hunyuan or a Model Derivative of Tencent Hunyuan, including distillation methods, methods that use intermediate data representations, or methods based on the generation of synthetic data Outputs by Tencent Hunyuan or a Model Derivative of Tencent Hunyuan for training that model. For clarity, Outputs by themselves are not deemed Model Derivatives.
-h.	“Output” shall mean the information and/or content output of Tencent Hunyuan or a Model Derivative that results from operating or otherwise using Tencent Hunyuan or a Model Derivative, including via a Hosted Service.
-i.	“Tencent,” “We” or “Us” shall mean the applicable entity or entities in the Tencent corporate family that own(s) intellectual property or other rights embodied in or utilized by the Materials.
-j.	“Tencent Hunyuan” shall mean the large language models, text/image/video/audio/3D generation models, and multimodal large language models and their software and algorithms, including trained model weights, parameters (including optimizer states), machine-learning model code, inference-enabling code, training-enabling code, fine-tuning enabling code and other elements of the foregoing made publicly available by Us, including, without limitation to, Tencent HunyuanVideo-Foley released at [https://github.com/Tencent-Hunyuan/HunyuanVideo-Foley].
-k.	“Tencent Hunyuan Works” shall mean: (i) the Materials; (ii) Model Derivatives; and (iii) all derivative works thereof.
-l.	“Territory” shall mean the worldwide territory, excluding the territory of the European Union, United Kingdom and South Korea.
-m.	“Third Party” or “Third Parties” shall mean individuals or legal entities that are not under common control with Us or You.
-n.	“including” shall mean including but not limited to.
-2.	GRANT OF RIGHTS.
-We grant You, for the Territory only, a non-exclusive, non-transferable and royalty-free limited license under Tencent’s intellectual property or other rights owned by Us embodied in or utilized by the Materials to use, reproduce, distribute, create derivative works of (including Model Derivatives), and make modifications to the Materials, only in accordance with the terms of this Agreement and the Acceptable Use Policy, and You must not violate (or encourage or permit anyone else to violate) any term of this Agreement or the Acceptable Use Policy.
-3.	DISTRIBUTION.
-You may, subject to Your compliance with this Agreement, distribute or make available to Third Parties the Tencent Hunyuan Works, exclusively in the Territory, provided that You meet all of the following conditions:
-a.	You must provide all such Third Party recipients of the Tencent Hunyuan Works or products or services using them a copy of this Agreement;
-b.	You must cause any modified files to carry prominent notices stating that You changed the files;
-c.	You are encouraged to: (i) publish at least one technology introduction blogpost or one public statement expressing Your experience of using the Tencent Hunyuan Works; and (ii) mark the products or services developed by using the Tencent Hunyuan Works to indicate that the product/service is “Powered by Tencent Hunyuan”; and
-d.	All distributions to Third Parties (other than through a Hosted Service) must be accompanied by a “Notice” text file that contains the following notice: “Tencent Hunyuan is licensed under the Tencent Hunyuan Community License Agreement, Copyright © 2025 Tencent. All Rights Reserved. The trademark rights of “Tencent Hunyuan” are owned by Tencent or its affiliate.”
-You may add Your own copyright statement to Your modifications and, except as set forth in this Section and in Section 5, may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Model Derivatives as a whole, provided Your use, reproduction, modification, distribution, performance and display of the work otherwise complies with the terms and conditions of this Agreement (including as regards the Territory). If You receive Tencent Hunyuan Works from a Licensee as part of an integrated end user product, then this Section 3 of this Agreement will not apply to You.
-4.	ADDITIONAL COMMERCIAL TERMS.
-If, on the Tencent Hunyuan version release date, the monthly active users of all products or services made available by or for Licensee is greater than 100 million monthly active users in the preceding calendar month, You must request a license from Tencent, which Tencent may grant to You in its sole discretion, and You are not authorized to exercise any of the rights under this Agreement unless or until Tencent otherwise expressly grants You such rights.
-5.	RULES OF USE.
-a.	Your use of the Tencent Hunyuan Works must comply with applicable laws and regulations (including trade compliance laws and regulations) and adhere to the Acceptable Use Policy for the Tencent Hunyuan Works, which is hereby incorporated by reference into this Agreement. You must include the use restrictions referenced in these Sections 5(a) and 5(b) as an enforceable provision in any agreement (e.g., license agreement, terms of use, etc.) governing the use and/or distribution of Tencent Hunyuan Works and You must provide notice to subsequent users to whom You distribute that Tencent Hunyuan Works are subject to the use restrictions in these Sections 5(a) and 5(b).
-b.	You must not use the Tencent Hunyuan Works or any Output or results of the Tencent Hunyuan Works to improve any other AI model (other than Tencent Hunyuan or Model Derivatives thereof).
-c.	You must not use, reproduce, modify, distribute, or display the Tencent Hunyuan Works, Output or results of the Tencent Hunyuan Works outside the Territory. Any such use outside the Territory is unlicensed and unauthorized under this Agreement.
-6.	INTELLECTUAL PROPERTY.
-a.	Subject to Tencent’s ownership of Tencent Hunyuan Works made by or for Tencent and intellectual property rights therein, conditioned upon Your compliance with the terms and conditions of this Agreement, as between You and Tencent, You will be the owner of any derivative works and modifications of the Materials and any Model Derivatives that are made by or for You.
-b.	No trademark licenses are granted under this Agreement, and in connection with the Tencent Hunyuan Works, Licensee may not use any name or mark owned by or associated with Tencent or any of its affiliates, except as required for reasonable and customary use in describing and distributing the Tencent Hunyuan Works. Tencent hereby grants You a license to use “Tencent Hunyuan” (the “Mark”) in the Territory solely as required to comply with the provisions of Section 3(c), provided that You comply with any applicable laws related to trademark protection. All goodwill arising out of Your use of the Mark will inure to the benefit of Tencent.
-c.	If You commence a lawsuit or other proceedings (including a cross-claim or counterclaim in a lawsuit) against Us or any person or entity alleging that the Materials or any Output, or any portion of any of the foregoing, infringe any intellectual property or other right owned or licensable by You, then all licenses granted to You under this Agreement shall terminate as of the date such lawsuit or other proceeding is filed. You will defend, indemnify and hold harmless Us from and against any claim by any Third Party arising out of or related to Your or the Third Party’s use or distribution of the Tencent Hunyuan Works.
-d.	Tencent claims no rights in Outputs You generate. You and Your users are solely responsible for Outputs and their subsequent uses.
-7.	DISCLAIMERS OF WARRANTY AND LIMITATIONS OF LIABILITY.
-a.	We are not obligated to support, update, provide training for, or develop any further version of the Tencent Hunyuan Works or to grant any license thereto.
-b.	UNLESS AND ONLY TO THE EXTENT REQUIRED BY APPLICABLE LAW, THE TENCENT HUNYUAN WORKS AND ANY OUTPUT AND RESULTS THEREFROM ARE PROVIDED “AS IS” WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES OF ANY KIND INCLUDING ANY WARRANTIES OF TITLE, MERCHANTABILITY, NONINFRINGEMENT, COURSE OF DEALING, USAGE OF TRADE, OR FITNESS FOR A PARTICULAR PURPOSE. YOU ARE SOLELY RESPONSIBLE FOR DETERMINING THE APPROPRIATENESS OF USING, REPRODUCING, MODIFYING, PERFORMING, DISPLAYING OR DISTRIBUTING ANY OF THE TENCENT HUNYUAN WORKS OR OUTPUTS AND ASSUME ANY AND ALL RISKS ASSOCIATED WITH YOUR OR A THIRD PARTY’S USE OR DISTRIBUTION OF ANY OF THE TENCENT HUNYUAN WORKS OR OUTPUTS AND YOUR EXERCISE OF RIGHTS AND PERMISSIONS UNDER THIS AGREEMENT.
-c.	TO THE FULLEST EXTENT PERMITTED BY APPLICABLE LAW, IN NO EVENT SHALL TENCENT OR ITS AFFILIATES BE LIABLE UNDER ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, TORT, NEGLIGENCE, PRODUCTS LIABILITY, OR OTHERWISE, FOR ANY DAMAGES, INCLUDING ANY DIRECT, INDIRECT, SPECIAL, INCIDENTAL, EXEMPLARY, CONSEQUENTIAL OR PUNITIVE DAMAGES, OR LOST PROFITS OF ANY KIND ARISING FROM THIS AGREEMENT OR RELATED TO ANY OF THE TENCENT HUNYUAN WORKS OR OUTPUTS, EVEN IF TENCENT OR ITS AFFILIATES HAVE BEEN ADVISED OF THE POSSIBILITY OF ANY OF THE FOREGOING.
-8.	SURVIVAL AND TERMINATION.
-a.	The term of this Agreement shall commence upon Your acceptance of this Agreement or access to the Materials and will continue in full force and effect until terminated in accordance with the terms and conditions herein.
-b.	We may terminate this Agreement if You breach any of the terms or conditions of this Agreement. Upon termination of this Agreement, You must promptly delete and cease use of the Tencent Hunyuan Works. Sections 6(a), 6(c), 7 and 9 shall survive the termination of this Agreement.
-9.	GOVERNING LAW AND JURISDICTION.
-a.	This Agreement and any dispute arising out of or relating to it will be governed by the laws of the Hong Kong Special Administrative Region of the People’s Republic of China, without regard to conflict of law principles, and the UN Convention on Contracts for the International Sale of Goods does not apply to this Agreement.
-b.	Exclusive jurisdiction and venue for any dispute arising out of or relating to this Agreement will be a court of competent jurisdiction in the Hong Kong Special Administrative Region of the People’s Republic of China, and Tencent and Licensee consent to the exclusive jurisdiction of such court with respect to any such dispute.
-EXHIBIT A
-ACCEPTABLE USE POLICY
-Tencent reserves the right to update this Acceptable Use Policy from time to time.
-Last modified: November 5, 2024
-Tencent endeavors to promote safe and fair use of its tools and features, including Tencent Hunyuan. You agree not to use Tencent Hunyuan or Model Derivatives:
-1.	Outside the Territory;
-2.	In any way that violates any applicable national, federal, state, local, international or any other law or regulation;
-3.	To harm Yourself or others;
-4.	To repurpose or distribute output from Tencent Hunyuan or any Model Derivatives to harm Yourself or others;
-5.	To override or circumvent the safety guardrails and safeguards We have put in place;
-6.	For the purpose of exploiting, harming or attempting to exploit or harm minors in any way;
-7.	To generate or disseminate verifiably false information and/or content with the purpose of harming others or influencing elections;
-8.	To generate or facilitate false online engagement, including fake reviews and other means of fake online engagement;
-9.	To intentionally defame, disparage or otherwise harass others;
-10.	To generate and/or disseminate malware (including ransomware) or any other content to be used for the purpose of harming electronic systems;
-11.	To generate or disseminate personal identifiable information with the purpose of harming others;
-12.	To generate or disseminate information (including images, code, posts, articles), and place the information in any public context (including –through the use of bot generated tweets), without expressly and conspicuously identifying that the information and/or content is machine generated;
-13.	To impersonate another individual without consent, authorization, or legal right;
-14.	To make high-stakes automated decisions in domains that affect an individual’s safety, rights or wellbeing (e.g., law enforcement, migration, medicine/health, management of critical infrastructure, safety components of products, essential services, credit, employment, housing, education, social scoring, or insurance);
-15.	In a manner that violates or disrespects the social ethics and moral standards of other countries or regions;
-16.	To perform, facilitate, threaten, incite, plan, promote or encourage violent extremism or terrorism;
-17.	For any use intended to discriminate against or harm individuals or groups based on protected characteristics or categories, online or offline social behavior or known or predicted personal or personality characteristics;
-18.	To intentionally exploit any of the vulnerabilities of a specific group of persons based on their age, social, physical or mental characteristics, in order to materially distort the behavior of a person pertaining to that group in a manner that causes or is likely to cause that person or another person physical or psychological harm;
-19.	For military purposes;
-20.	To engage in the unauthorized or unlicensed practice of any profession including, but not limited to, financial, legal, medical/health, or other professional practices.

HunyuanVideo-Foley/MANIFEST.in DELETED Viewed

@@ -1,38 +0,0 @@
-# Include package metadata and documentation
-include README.md
-include LICENSE
-include NOTICE
-include DEVELOPMENT.md
-include CLAUDE.md
-include requirements.txt
-include pyproject.toml
-include pytest.ini
-# Include configuration files
-include configs/*.yaml
-include configs/*.yml
-recursive-include hunyuanvideo_foley/configs *.yaml *.yml
-# Include test assets if any
-include assets/*.csv
-include assets/*.txt
-recursive-include assets/test_videos *
-# Include example scripts
-include *.py
-include *.sh
-# Include test files
-recursive-include tests *.py
-# Exclude unnecessary files
-global-exclude *.pyc
-global-exclude *.pyo
-global-exclude *~
-global-exclude .DS_Store
-global-exclude __pycache__
-prune .git
-prune .github
-prune examples/*/outputs
-prune **/__pycache__
-prune **/*.pyc

HunyuanVideo-Foley/NOTICE DELETED Viewed

@@ -1,27 +0,0 @@
-Usage and Legal Notices:
-Tencent is pleased to support the open source community by making Tencent HunyuanVideo-Foley available.
-Copyright (C) 2025 Tencent. All rights reserved.
-Tencent HunyuanVideo-Foley is licensed under TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT, which can be found in this repository called "LICENSE", except for the third-party components listed below. Tencent HunyuanVideo-Foley does not impose any additional limitations beyond what is outlined in the respective licenses of these third-party components. Users must comply with all terms and conditions of original licenses of these third-party components and must ensure that the usage of the third party components adheres to all relevant laws and regulations.
-For avoidance of doubts, Tencent HunyuanVideo-Foley means the large language models and their software and algorithms, including trained model weights, parameters (including optimizer states), machine-learning model code, inference-enabling code, training-enabling code, fine-tuning enabling code and other elements of the foregoing made publicly available by Tencent in accordance with the TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
-Other dependencies and licenses:
-Open Source Software Licensed under the MIT License:
---------------------------------------------------------------------
-1. syncformer
-Copyright (c) 2024 Vladimir Iashin
-Terms of the MIT License:
---------------------------------------------------------------------
-Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
-The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

HunyuanVideo-Foley/README.md DELETED Viewed

@@ -1,519 +0,0 @@
-<div align="center">
-https://github.com/Tencent-Hunyuan/HunyuanVideo-Foley
-<img src="assets/logo.png" alt="HunyuanVideo-Foley Logo" width="400">
-<h4>Multimodal Diffusion with Representation Alignment for High-Fidelity Foley Audio Generation</h4>
-<p align="center">
-  <strong>Professional-grade AI sound effect generation for video content creators</strong>
-</p>
-<div align="center">
-  <a href=https://github.com/Tencent-Hunyuan/HunyuanVideo-Foley target="_blank"><img src=https://img.shields.io/badge/Code-black.svg?logo=github height=22px></a>
-  <a href=https://szczesnys.github.io/hunyuanvideo-foley target="_blank"><img src=https://img.shields.io/badge/Page-bb8a2e.svg?logo=github height=22px></a>
-  <a href=https://huggingface.co/tencent/HunyuanVideo-Foley target="_blank"><img src=https://img.shields.io/badge/%F0%9F%A4%97%20Models-d96902.svg height=22px></a>
-  <a href=https://huggingface.co/spaces/tencent/HunyuanVideo-Foley  target="_blank"><img src=https://img.shields.io/badge/%F0%9F%A4%97%20Demo-276cb4.svg height=22px></a>
-  <a href=https://arxiv.org/abs/2508.16930 target="_blank"><img src=https://img.shields.io/badge/Report-b5212f.svg?logo=arxiv height=22px></a>
-  <a href=https://x.com/TencentHunyuan target="_blank"><img src=https://img.shields.io/badge/Hunyuan-black.svg?logo=x height=22px></a>
-  <a href=https://discord.gg/YEyGGn6Bte target="_blank"><img src=https://img.shields.io/badge/Hunyuan-141984.svg?logo=discord height=22px></a>
-</div>
-</div>
----
-<div align="center">
-### 👥 **Authors**
-<div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); padding: 20px; border-radius: 15px; margin: 20px 0;">
-**Sizhe Shan**<sup>1,2*</sup> • **Qiulin Li**<sup>1,3*</sup> • **Yutao Cui**<sup>1</sup> • **Miles Yang**<sup>1</sup>  • **Yuehai Wang**<sup>2</sup> • **Qun Yang**<sup>3</sup> • **Jin Zhou**<sup>1†</sup> • **Zhao Zhong**<sup>1</sup>
-</div>
-<div style="margin-top: 15px; font-size: 14px; color: #666;">
-🏢 <sup>1</sup>**Tencent Hunyuan** • 🎓 <sup>2</sup>**Zhejiang University** • ✈️ <sup>3</sup>**Nanjing University of Aeronautics and Astronautics**
-*Equal contribution • †Project lead
-</div>
-</div>
----
-## 🔥🔥🔥 **News**
-<div style="background: linear-gradient(135deg, #e3f2fd 0%, #bbdefb 100%); padding: 20px; border-radius: 15px; margin: 20px 0; border-left: 5px solid #2196f3;">
-- **[2025.9.29]** 🚀 **HunyuanVideo-Foley-XL Model Release** - Release XL-sized model with offload inference support, significantly reducing VRAM requirements.
-- **[2025.8.28]** 🌟 **HunyuanVideo-Foley Open Source Release** - Inference code and model weights publicly available.
-</div>
----
-## 🎥 **Demo & Showcase**
-<div align="center">
-> **Experience the magic of AI-generated Foley audio in perfect sync with video content!**
-<div style="border: 3px solid #4A90E2; border-radius: 15px; padding: 10px; margin: 20px 0; background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);">
-  <video src="https://github.com/user-attachments/assets/d6e1b6fd-6980-4a68-8717-74298d064195" width="80%" controls style="border-radius: 10px; box-shadow: 0 8px 32px rgba(0,0,0,0.1);"> </video>
-  <p><em>🎬 Watch how HunyuanVideo-Foley generates immersive sound effects synchronized with video content</em></p>
-</div>
----
-## 🤝 **Community Contributions**
-<div style="background: #f8f9fa; padding: 20px; border-radius: 10px; border-left: 4px solid #28a745; margin: 20px 0; color: #333;">
-**ComfyUI Integration** - Thanks to the amazing community for creating ComfyUI nodes:
-- **[if-ai/ComfyUI_HunyuanVideoFoley](https://github.com/if-ai/ComfyUI_HunyuanVideoFoley)** - ComfyUI workflow integration which supports cpu offloading and FP8 quantization
-- **[phazei/ComfyUI-HunyuanVideo-Foley](https://github.com/phazei/ComfyUI-HunyuanVideo-Foley)** - Alternative ComfyUI node implementation which supports different precision modes
-</div>
-<div align="center" style="margin: 20px 0;">
-**🌟 We encourage and appreciate community contributions that make HunyuanVideo-Foley more accessible!**
-</div>
----
-### ✨ **Key Highlights**
-<table align="center" style="border: none; margin: 20px 0;">
-<tr>
-<td align="center" width="33%">
-🎭 **Multi-scenario Sync**
-High-quality audio synchronized with complex video scenes
-</td>
-<td align="center" width="33%">
-🧠 **Multi-modal Balance**
-Perfect harmony between visual and textual information
-</td>
-<td align="center" width="33%">
-🎵 **48kHz Hi-Fi Output**
-Professional-grade audio generation with crystal clarity
-</td>
-</tr>
-</table>
-</div>
----
-## 📄 **Abstract**
-<div align="center" style="background: linear-gradient(135deg, #ffeef8 0%, #f0f8ff 100%); padding: 30px; border-radius: 20px; margin: 20px 0; border-left: 5px solid #ff6b9d; color: #333;">
-**🚀 Tencent Hunyuan** open-sources **HunyuanVideo-Foley**, an end-to-end video sound effect generation model!
-*A professional-grade AI tool specifically designed for video content creators, widely applicable to diverse scenarios including short video creation, film production, advertising creativity, and game development.*
-</div>
-### 🎯 **Core Highlights**
-<div style="display: grid; grid-template-columns: 1fr; gap: 15px; margin: 20px 0;">
-<div style="border-left: 4px solid #4CAF50; padding: 15px; background: #f8f9fa; border-radius: 8px; color: #333;">
-**🎬 Multi-scenario Audio-Visual Synchronization**
-Supports generating high-quality audio that is synchronized and semantically aligned with complex video scenes, enhancing realism and immersive experience for film/TV and gaming applications.
-</div>
-<div style="border-left: 4px solid #2196F3; padding: 15px; background: #f8f9fa; border-radius: 8px; color: #333;">
-**⚖️ Multi-modal Semantic Balance**
-Intelligently balances visual and textual information analysis, comprehensively orchestrates sound effect elements, avoids one-sided generation, and meets personalized dubbing requirements.
-</div>
-<div style="border-left: 4px solid #FF9800; padding: 15px; background: #f8f9fa; border-radius: 8px; color: #333;">
-**🎵 High-fidelity Audio Output**
-Self-developed 48kHz audio VAE perfectly reconstructs sound effects, music, and vocals, achieving professional-grade audio generation quality.
-</div>
-</div>
-<div align="center" style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 20px; border-radius: 15px; margin: 20px 0; color: #333;">
-**🏆 SOTA Performance Achieved**
-*HunyuanVideo-Foley comprehensively leads the field across multiple evaluation benchmarks, achieving new state-of-the-art levels in audio fidelity, visual-semantic alignment, temporal alignment, and distribution matching - surpassing all open-source solutions!*
-</div>
-<div align="center">
-![Performance Overview](assets/pan_chart.png)
-*📊 Performance comparison across different evaluation metrics - HunyuanVideo-Foley leads in all categories*
-</div>
----
-## 🔧 **Technical Architecture**
-### 📊 **Data Pipeline Design**
-<div align="center" style="margin: 20px 0; color: #333;">
-![Data Pipeline](assets/data_pipeline.png)
-*🔄 Comprehensive data processing pipeline for high-quality text-video-audio datasets*
-</div>
-<div style="background: #f8f9fa; padding: 20px; border-radius: 10px; border-left: 4px solid #17a2b8; margin: 20px 0;">
-The **TV2A (Text-Video-to-Audio)** task presents a complex multimodal generation challenge requiring large-scale, high-quality datasets. Our comprehensive data pipeline systematically identifies and excludes unsuitable content to produce robust and generalizable audio generation capabilities.
-</div>
-### 🏗️ **Model Architecture**
-<div align="center" style="margin: 20px 0; color: #333;">
-![Model Architecture](assets/model_arch.png)
-*🧠 HunyuanVideo-Foley hybrid architecture with multimodal and unimodal transformer blocks*
-</div>
-<div style="background: #f8f9fa; padding: 20px; border-radius: 10px; border-left: 4px solid #28a745; margin: 20px 0;">
-**HunyuanVideo-Foley** employs a sophisticated hybrid architecture:
-- **🔄 Multimodal Transformer Blocks**: Process visual-audio streams simultaneously
-- **🎵 Unimodal Transformer Blocks**: Focus on audio stream refinement
-- **👁️ Visual Encoding**: Pre-trained encoder extracts visual features from video frames
-- **📝 Text Processing**: Semantic features extracted via pre-trained text encoder
-- **🎧 Audio Encoding**: Latent representations with Gaussian noise perturbation
-- **⏰ Temporal Alignment**: Synchformer-based frame-level synchronization with gated modulation
-</div>
----
-## 📈 **Performance Benchmarks**
-### 🎬 **MovieGen-Audio-Bench Results**
-<div align="center">
-> *Objective and Subjective evaluation results demonstrating superior performance across all metrics*
-</div>
-<div style="overflow-x: auto; margin: 20px 0;">
-| 🏆 **Method** | **PQ** ↑ | **PC** ↓ | **CE** ↑ | **CU** ↑ | **IB** ↑ | **DeSync** ↓ | **CLAP** ↑ | **MOS-Q** ↑ | **MOS-S** ↑ | **MOS-T** ↑ |
-|:-------------:|:--------:|:--------:|:--------:|:--------:|:--------:|:-------------:|:-----------:|:------------:|:------------:|:------------:|
-| FoleyGrafter | 6.27 | 2.72 | 3.34 | 5.68 | 0.17 | 1.29 | 0.14 | 3.36±0.78 | 3.54±0.88 | 3.46±0.95 |
-| V-AURA | 5.82 | 4.30 | 3.63 | 5.11 | 0.23 | 1.38 | 0.14 | 2.55±0.97 | 2.60±1.20 | 2.70±1.37 |
-| Frieren | 5.71 | 2.81 | 3.47 | 5.31 | 0.18 | 1.39 | 0.16 | 2.92±0.95 | 2.76±1.20 | 2.94±1.26 |
-| MMAudio | 6.17 | 2.84 | 3.59 | 5.62 | 0.27 | 0.80 | 0.35 | 3.58±0.84 | 3.63±1.00 | 3.47±1.03 |
-| ThinkSound | 6.04 | 3.73 | 3.81 | 5.59 | 0.18 | 0.91 | 0.20 | 3.20±0.97 | 3.01±1.04 | 3.02±1.08 |
-| **HunyuanVideo-Foley (ours)** | **6.59** | **2.74** | **3.88** | **6.13** | **0.35** | **0.74** | **0.33** | **4.14±0.68** | **4.12±0.77** | **4.15±0.75** |
-</div>
-### 🎯 **Kling-Audio-Eval Results**
-<div align="center">
-> *Comprehensive objective evaluation showcasing state-of-the-art performance*
-</div>
-<div style="overflow-x: auto; margin: 20px 0;">
-| 🏆 **Method** | **FD_PANNs** ↓ | **FD_PASST** ↓ | **KL** ↓ | **IS** ↑ | **PQ** ↑ | **PC** ↓ | **CE** ↑ | **CU** ↑ | **IB** ↑ | **DeSync** ↓ | **CLAP** ↑ |
-|:-------------:|:--------------:|:--------------:|:--------:|:--------:|:--------:|:--------:|:--------:|:--------:|:--------:|:-------------:|:-----------:|
-| FoleyGrafter | 22.30 | 322.63 | 2.47 | 7.08 | 6.05 | 2.91 | 3.28 | 5.44 | 0.22 | 1.23 | 0.22 |
-| V-AURA | 33.15 | 474.56 | 3.24 | 5.80 | 5.69 | 3.98 | 3.13 | 4.83 | 0.25 | 0.86 | 0.13 |
-| Frieren | 16.86 | 293.57 | 2.95 | 7.32 | 5.72 | 2.55 | 2.88 | 5.10 | 0.21 | 0.86 | 0.16 |
-| MMAudio | 9.01 | 205.85 | 2.17 | 9.59 | 5.94 | 2.91 | 3.30 | 5.39 | 0.30 | 0.56 | 0.27 |
-| ThinkSound | 9.92 | 228.68 | 2.39 | 6.86 | 5.78 | 3.23 | 3.12 | 5.11 | 0.22 | 0.67 | 0.22 |
-| **HunyuanVideo-Foley (ours)** | **6.07** | **202.12** | **1.89** | **8.30** | **6.12** | **2.76** | **3.22** | **5.53** | **0.38** | **0.54** | **0.24** |
-</div>
-<div align="center" style="background: linear-gradient(135deg, #4CAF50 0%, #45a049 100%); color: white; padding: 15px; border-radius: 10px; margin: 20px 0; color: #333;">
-**🎉 Outstanding Results!** HunyuanVideo-Foley achieves the best scores on the majority of evaluation metrics, demonstrating significant improvements in audio quality, synchronization, and semantic alignment.
-</div>
----
-## 🚀 **Quick Start**
-### 📦 **Installation**
-<div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 20px; border-radius: 15px; margin: 20px 0; color: #333;">
-**🔧 System Requirements**
-- **CUDA**: 12.4 or 11.8 recommended
-- **Python**: 3.8+
-- **OS**: Linux (primary support)
-- **VRAM**: 20GB for XXL model (or 12GB with `--enable_offload`), 16GB for XL model (or 8GB with `--enable_offload`)
-</div>
-#### **Step 1: Clone Repository**
-```bash
-# 📥 Clone the repository
-git clone https://github.com/Tencent-Hunyuan/HunyuanVideo-Foley
-cd HunyuanVideo-Foley
-```
-#### **Step 2: Environment Setup**
-<div style="background: #fff3cd; padding: 15px; border-radius: 8px; border-left: 4px solid #ffc107; margin: 10px 0; color: #333;">
-💡 **Tip**: We recommend using [Conda](https://docs.anaconda.com/free/miniconda/index.html) for Python environment management.
-</div>
-```bash
-# 🔧 Install dependencies
-pip install -r requirements.txt
-```
-#### **Step 3: Download Pretrained Models**
-<div style="background: #d1ecf1; padding: 15px; border-radius: 8px; border-left: 4px solid #17a2b8; margin: 10px 0;color: #333;">
-🔗 **Download Model weights from Huggingface**
-```bash
-# using git-lfs
-git clone https://huggingface.co/tencent/HunyuanVideo-Foley
-# using huggingface-cli
-huggingface-cli download tencent/HunyuanVideo-Foley
-```
-<!-- 🔗 **Download Model weights from ModelScope**   -->
-<!-- ```bash -->
-<!-- # using git-lfs -->
-<!-- git clone https://huggingface.co/tencent/HunyuanVideo-Foley -->
-<!--  -->
-<!-- # using huggingface-cli -->
-<!-- huggingface-cli download tencent/HunyuanVideo-Foley -->
-<!-- ``` -->
-</div>
----
-## 💻 **Usage**
-### 📊 **Model Specifications**
-| Model | Checkpoint | VRAM (Normal) | VRAM (Offload) |
-|-------|------------|---------------|----------------|
-| **XXL** *(Default)* | `hunyuanvideo_foley.pth` | 20GB | 12GB |
-| **XL** | `hunyuanvideo_foley_xl.pth` | 16GB | 8GB |
-### 🎬 **Single Video Generation**
-<div style="background: #e8f5e8; padding: 15px; border-radius: 8px; border-left: 4px solid #28a745; margin: 10px 0;color: #333;">
-Generate Foley audio for a single video file with text description:
-</div>
-```bash
-# Use XXL model (default, best quality)
-python3 infer.py \
-    --model_path PRETRAINED_MODEL_PATH_DIR \
-    --single_video video_path \
-    --single_prompt "audio description" \
-    --output_dir OUTPUT_DIR \
-    # --enable_offload
-# Use XL model (memory-friendly)
-python3 infer.py \
-    --model_path PRETRAINED_MODEL_PATH_DIR \
-    --model_size xl \
-    --single_video video_path \
-    --single_prompt "audio description" \
-    --output_dir OUTPUT_DIR \
-    # --enable_offload
-```
-### 📂 **Batch Processing**
-<div style="background: #fff3e0; padding: 15px; border-radius: 8px; border-left: 4px solid #ff9800; margin: 10px 0;color: #333;">
-Process multiple videos using a CSV file with video paths and descriptions:
-</div>
-```bash
-# Download sample test videos
-bash ./download_test_videos.sh
-# Batch processing
-python3 infer.py \
-    --model_path PRETRAINED_MODEL_PATH_DIR \
-    --csv_path assets/test.csv \
-    --output_dir OUTPUT_DIR \
-    # --enable_offload
-```
-### 🌐 **Interactive Web Interface**
-<div style="background: #f3e5f5; padding: 15px; border-radius: 8px; border-left: 4px solid #9c27b0; margin: 10px 0;color: #333;">
-Launch a user-friendly Gradio web interface for easy interaction:
-</div>
-```bash
-# Launch with XXL model (default)
-export HIFI_FOLEY_MODEL_PATH=PRETRAINED_MODEL_PATH_DIR
-python3 gradio_app.py
-# Launch with XL model (memory-friendly)
-export HIFI_FOLEY_MODEL_PATH=PRETRAINED_MODEL_PATH_DIR
-MODEL_SIZE=xl python3 gradio_app.py
-# Optional: Enable offload to reduce memory usage
-ENABLE_OFFLOAD=true python3 gradio_app.py
-```
-<div align="center" style="margin: 20px 0; color: #333;">
-*🚀 Then open your browser and navigate to the provided local URL to start generating Foley audio!*
-</div>
----
-## 📚 **Citation**
-<div style="background: #f8f9fa; padding: 20px; border-radius: 10px; border-left: 4px solid #6c757d; margin: 20px 0; color: #333;">
-If you find **HunyuanVideo-Foley** useful for your research, please consider citing our paper:
-</div>
-```bibtex
-@misc{shan2025hunyuanvideofoleymultimodaldiffusionrepresentation,
-      title={HunyuanVideo-Foley: Multimodal Diffusion with Representation Alignment for High-Fidelity Foley Audio Generation},
-      author={Sizhe Shan and Qiulin Li and Yutao Cui and Miles Yang and Yuehai Wang and Qun Yang and Jin Zhou and Zhao Zhong},
-      year={2025},
-      eprint={2508.16930},
-      archivePrefix={arXiv},
-      primaryClass={eess.AS},
-      url={https://arxiv.org/abs/2508.16930},
-}
-```
-## Star History
-[![Star History Chart](https://api.star-history.com/svg?repos=Tencent-Hunyuan/HunyuanVideo-Foley&type=Date)](https://www.star-history.com/#Tencent-Hunyuan/HunyuanVideo-Foley&Date)
----
-## 🙏 **Acknowledgements**
-<div align="center">
-**We extend our heartfelt gratitude to the open-source community!**
-</div>
-<table align="center" style="width: 100%; border: none; margin: 20px 0;">
-<tr>
-<td align="center" style="width: 33%; padding: 10px; vertical-align: top;">
-🎨 **[Stable Diffusion 3](https://huggingface.co/stabilityai/stable-diffusion-3-medium)**
-*Foundation diffusion models*
-</td>
-<td align="center" style="width: 33%; padding: 10px; vertical-align: top;">
-⚡ **[FLUX](https://github.com/black-forest-labs/flux)**
-*Advanced generation techniques*
-</td>
-<td align="center" style="width: 33%; padding: 10px; vertical-align: top;">
-🎵 **[MMAudio](https://github.com/hkchengrex/MMAudio)**
-*Multimodal audio generation*
-</td>
-</tr>
-<tr>
-<td align="center" style="width: 33%; padding: 10px; vertical-align: top;">
-🤗 **[HuggingFace](https://huggingface.co)**
-*Platform & diffusers library*
-</td>
-<td align="center" style="width: 33%; padding: 10px; vertical-align: top;">
-🗜️ **[DAC](https://github.com/descriptinc/descript-audio-codec)**
-*High-Fidelity Audio Compression*
-</td>
-<td align="center" style="width: 33%; padding: 10px; vertical-align: top;">
-🔗 **[Synchformer](https://github.com/v-iashin/Synchformer)**
-*Audio-Visual Synchronization*
-</td>
-</tr>
-</table>
-<div align="center" style="background: linear-gradient(135deg, #74b9ff 0%, #0984e3 100%); color: white; padding: 20px; border-radius: 15px; margin: 20px 0; color: #333;">
-**🌟 Special thanks to all researchers and developers who contribute to the advancement of AI-generated audio and multimodal learning!**
-</div>
----
-<div align="center" style="margin: 30px 0;">
-### 🔗 **Connect with Us**
-[![GitHub](https://img.shields.io/badge/GitHub-Follow-black?style=for-the-badge&logo=github)](https://github.com/Tencent-Hunyuan)
-[![Twitter](https://img.shields.io/badge/Twitter-Follow-blue?style=for-the-badge&logo=twitter)](https://twitter.com/Tencent)
-[![Hunyuan](https://img.shields.io/badge/Website-HunyuanAI-green?style=for-the-badge&logo=hunyuan)](https://hunyuan.tencent.com/)
-<p style="color: #666; margin-top: 15px; font-size: 14px;">
-© 2025 Tencent Hunyuan. All rights reserved. | Made with ❤️ for the AI community
-</p>
-</div>

HunyuanVideo-Foley/build_package.sh DELETED Viewed

@@ -1,58 +0,0 @@
-#!/bin/bash
-# Build script for the HunyuanVideo-Foley Python package.
-# Produces an sdist and a wheel under dist/ and prints install instructions.
-set -e  # Exit as soon as any command fails
-echo "🚀 开始构建 HunyuanVideo-Foley Python 包..."
-# Remove artifacts left over from a previous build
-echo "🧹 清理之前的构建文件..."
-rm -rf build/ dist/ *.egg-info/
-# Check that the required build tools (setuptools, wheel) are importable
-echo "🔍 检查构建工具..."
-python -c "import setuptools, wheel; print('✅ setuptools和wheel已安装')" || {
-    echo "❌ 请安装构建工具: pip install setuptools wheel"
-    exit 1
-}
-# Validate setup.py; a failure here only prints a warning and the build continues
-echo "🔍 验证setup.py配置..."
-python setup.py check --restructuredtext --strict || {
-    echo "⚠️  setup.py验证有警告，但继续构建..."
-}
-# Build the source distribution
-echo "📦 构建源码分发包..."
-python setup.py sdist
-# Build the wheel
-echo "🎡 构建wheel包..."
-python setup.py bdist_wheel
-# Show the build results
-echo "✅ 构建完成！生成的包："
-ls -la dist/
-# Verify the generated package; a failure here only prints a warning
-# NOTE(review): `pip check` validates the dependencies of installed packages;
-# confirm that passing wheel paths to it actually checks the built wheel.
-echo "🔍 验证生成的包..."
-python -m pip check dist/*.whl || echo "⚠️  包验证有警告"
-echo ""
-echo "📝 安装说明："
-echo "# 从wheel文件安装:"
-echo "pip install dist/hunyuanvideo_foley-1.0.0-py3-none-any.whl"
-echo ""
-echo "# 开发模式安装:"
-echo "pip install -e ."
-echo ""
-echo "# 安装所有可选依赖:"
-echo "pip install -e .[all]"
-echo ""
-echo "⚠️  注意：某些依赖需要单独安装："
-echo "pip install git+https://github.com/descriptinc/audiotools"
-echo "pip install git+https://github.com/huggingface/transformers@v4.49.0-SigLIP-2"
-echo ""
-echo "🎉 构建完成！查看 INSTALL.md 获取详细安装指南。"

HunyuanVideo-Foley/download_test_videos.sh DELETED Viewed

@@ -1,11 +0,0 @@
-#!/bin/bash
-# Download MoviegenAudioBenchSfx 10 videos
-curl -O https://texttoaudio-train-1258344703.cos.ap-guangzhou.myqcloud.com/hunyuanvideo-foley_demo/MovieGenAudioBenchSfx.tar.gz
-tar -xzvf MovieGenAudioBenchSfx.tar.gz -C ./assets
-rm MovieGenAudioBenchSfx.tar.gz
-# Download gradio example video
-curl -O https://texttoaudio-train-1258344703.cos.ap-guangzhou.myqcloud.com/hunyuanvideo-foley_demo/examples.tar.gz
-tar -xvzf examples.tar.gz
-rm examples.tar.gz

HunyuanVideo-Foley/gradio_app.py DELETED Viewed

@@ -1,834 +0,0 @@
-import os
-import tempfile
-import gradio as gr
-import torch
-import torchaudio
-from loguru import logger
-from typing import Optional, Tuple
-import random
-import numpy as np
-from hunyuanvideo_foley.utils.model_utils import load_model
-from hunyuanvideo_foley.utils.feature_utils import feature_process
-from hunyuanvideo_foley.utils.model_utils import denoise_process
-from hunyuanvideo_foley.utils.media_utils import merge_audio_video
-# Module-level state shared between functions: assigned by auto_load_models()
-# and read by infer_single_video(). All three stay None until a load succeeds.
-model_dict = None
-cfg = None
-device = None
-# Runtime configuration, each value overridable through an environment variable.
-# MODEL_PATH: directory containing the pretrained weights; set
-# HIFI_FOLEY_MODEL_PATH if they are not under ./pretrained_models/.
-MODEL_PATH = os.environ.get("HIFI_FOLEY_MODEL_PATH", "./pretrained_models/")
-# Offload is enabled only for the values "true", "1" or "yes" (case-insensitive).
-ENABLE_OFFLOAD = os.environ.get("ENABLE_OFFLOAD", "false").lower() in ("true", "1", "yes")
-MODEL_SIZE = os.environ.get("MODEL_SIZE", "xxl")  # default to xxl model
-# An empty CONFIG_PATH means auto_load_models() picks a config from MODEL_SIZE.
-CONFIG_PATH = os.environ.get("CONFIG_PATH", "")
-def setup_device(device_str: str = "auto", gpu_id: int = 0) -> torch.device:
-    """Select and return the torch device to run on.
-    Args:
-        device_str: "auto" to probe for the best available backend, "cuda" to
-            use the CUDA device numbered ``gpu_id``, or any other string,
-            which is passed to ``torch.device`` unchanged.
-        gpu_id: CUDA device index; only used when a CUDA device is selected.
-    Returns:
-        The selected ``torch.device``. The choice is also logged.
-    """
-    if device_str == "auto":
-        # Preference order: CUDA, then Apple MPS, then CPU as the fallback.
-        if torch.cuda.is_available():
-            device = torch.device(f"cuda:{gpu_id}")
-            logger.info(f"Using CUDA device: {device}")
-        elif torch.backends.mps.is_available():
-            device = torch.device("mps")
-            logger.info("Using MPS device")
-        else:
-            device = torch.device("cpu")
-            logger.info("Using CPU device")
-    else:
-        # Explicit request: no availability check is performed on this path.
-        if device_str == "cuda":
-            device = torch.device(f"cuda:{gpu_id}")
-        else:
-            device = torch.device(device_str)
-        logger.info(f"Using specified device: {device}")
-    return device
-def auto_load_models() -> str:
-    """Load the model described by the module-level settings into the globals.
-    Reads MODEL_PATH, MODEL_SIZE, CONFIG_PATH and ENABLE_OFFLOAD, and on
-    success assigns the module-level ``model_dict``, ``cfg`` and ``device``.
-    Returns:
-        A status message starting with "✅" on success or "❌" on failure.
-        Exceptions are logged and reported through the message, not raised.
-    """
-    global model_dict, cfg, device
-    try:
-        if not os.path.exists(MODEL_PATH):
-            return f"❌ Model directory not found: {MODEL_PATH}"
-        # Auto-select the device: CUDA device 0 if available, otherwise
-        # whatever setup_device() falls back to.
-        device = setup_device("auto", 0)
-        # Auto-select config if not specified
-        config_path = CONFIG_PATH
-        if not config_path:
-            config_mapping = {
-                "xl": "configs/hunyuanvideo-foley-xl.yaml",
-                "xxl": "configs/hunyuanvideo-foley-xxl.yaml"
-            }
-            # An unrecognized MODEL_SIZE falls back to the xxl config.
-            config_path = config_mapping.get(MODEL_SIZE, "configs/hunyuanvideo-foley-xxl.yaml")
-        # Load model
-        logger.info("Auto-loading model...")
-        logger.info(f"Model path: {MODEL_PATH}")
-        logger.info(f"Model size: {MODEL_SIZE}")
-        logger.info(f"Config path: {config_path}")
-        logger.info(f"Offload mode: {'enabled' if ENABLE_OFFLOAD else 'disabled'}")
-        model_dict, cfg = load_model(MODEL_PATH, config_path, device, enable_offload=ENABLE_OFFLOAD, model_size=MODEL_SIZE)
-        logger.info("✅ Model loaded successfully!")
-        return "✅ Model loaded successfully!"
-    except Exception as e:
-        # Deliberately broad: any failure is turned into a status string
-        # instead of propagating to the caller.
-        logger.error(f"Model loading failed: {str(e)}")
-        return f"❌ Model loading failed: {str(e)}"
-def infer_single_video(
-    video_file,
-    text_prompt: str,
-    neg_prompt: str = None,
-    guidance_scale: float = 4.5,
-    num_inference_steps: int = 50,
-    sample_nums: int = 1
-) -> Tuple[list, str]:
-    """Generate audio for one video and mux each sample back into the video.
-    Requires the module-level ``model_dict`` and ``cfg`` to have been set by
-    auto_load_models().
-    Args:
-        video_file: Input video, passed to feature_process() and
-            merge_audio_video(); None is rejected with an error message.
-        text_prompt: Audio description; None is treated as an empty string
-            and surrounding whitespace is stripped.
-        neg_prompt: Optional negative prompt forwarded to feature_process().
-        guidance_scale: Forwarded to denoise_process().
-        num_inference_steps: Forwarded to denoise_process().
-        sample_nums: Number of audio samples to generate; passed to
-            denoise_process() as ``batch_size`` and used as the output count.
-    Returns:
-        ``(video_paths, status_message)``. ``video_paths`` holds one merged
-        video path per sample on success and is empty on any failure;
-        failures are reported in the message rather than raised.
-    """
-    global model_dict, cfg, device
-    if model_dict is None or cfg is None:
-        return [], "❌ Please load the model first!"
-    if video_file is None:
-        return [], "❌ Please upload a video file!"
-    # Allow empty text prompt, use empty string if no prompt provided
-    if text_prompt is None:
-        text_prompt = ""
-    text_prompt = text_prompt.strip()
-    try:
-        logger.info(f"Processing video: {video_file}")
-        logger.info(f"Text prompt: {text_prompt}")
-        # Feature processing
-        visual_feats, text_feats, audio_len_in_s = feature_process(
-            video_file,
-            text_prompt,
-            model_dict,
-            cfg,
-            neg_prompt=neg_prompt
-        )
-        # Denoising process to generate multiple audio samples
-        # Note: The model now generates sample_nums audio samples per inference
-        # The denoise_process function returns audio with shape [batch_size, channels, samples]
-        logger.info(f"Generating {sample_nums} audio samples...")
-        audio, sample_rate = denoise_process(
-            visual_feats,
-            text_feats,
-            audio_len_in_s,
-            model_dict,
-            cfg,
-            guidance_scale=guidance_scale,
-            num_inference_steps=num_inference_steps,
-            batch_size=sample_nums
-        )
-        # Create temporary files to save results
-        # NOTE(review): this directory is not removed anywhere in this
-        # function; confirm that something else cleans it up.
-        temp_dir = tempfile.mkdtemp()
-        video_outputs = []
-        # Process each generated audio sample
-        for i in range(sample_nums):
-            # Save audio file
-            audio_output = os.path.join(temp_dir, f"generated_audio_{i+1}.wav")
-            torchaudio.save(audio_output, audio[i], sample_rate)
-            # Merge video and audio
-            video_output = os.path.join(temp_dir, f"video_with_audio_{i+1}.mp4")
-            merge_audio_video(audio_output, video_file, video_output)
-            video_outputs.append(video_output)
-        logger.info(f"Inference completed! Generated {sample_nums} samples.")
-        return video_outputs, f"✅ Generated {sample_nums} audio sample(s) successfully!"
-    except Exception as e:
-        # Deliberately broad: any failure becomes an empty result plus a
-        # status message instead of an exception.
-        logger.error(f"Inference failed: {str(e)}")
-        return [], f"❌ Inference failed: {str(e)}"
-def update_video_outputs(video_list, status_msg):
-    """Spread the generated videos over six fixed output slots.
-    Args:
-        video_list: Generated video paths; entries beyond the sixth are dropped.
-        status_msg: Status message appended after the six slots.
-    Returns:
-        A 7-tuple: six video slots (unused ones are None) followed by
-        ``status_msg``.
-    """
-    # Initialize all outputs as None
-    outputs = [None] * 6
-    # Set values based on generated videos
-    for i, video_path in enumerate(video_list[:6]):  # Max 6 samples
-        outputs[i] = video_path
-    # Return all outputs plus status message
-    return tuple(outputs + [status_msg])
-def create_gradio_interface():
-    """Create Gradio interface"""
-    # Custom CSS for beautiful interface with better contrast
-    css = """
-    .gradio-container {
-        font-family: 'Inter', -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif;
-        background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
-        min-height: 100vh;
-    }
-    .main-header {
-        text-align: center;
-        padding: 2rem 0;
-        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
-        border-radius: 20px;
-        margin-bottom: 2rem;
-        box-shadow: 0 8px 32px rgba(0,0,0,0.15);
-    }
-    .main-header h1 {
-        color: white;
-        font-size: 3rem;
-        font-weight: 700;
-        margin-bottom: 0.5rem;
-        text-shadow: 0 2px 10px rgba(0,0,0,0.3);
-    }
-    .main-header p {
-        color: rgba(255, 255, 255, 0.95);
-        font-size: 1.2rem;
-        font-weight: 300;
-    }
-    .status-card {
-        background: white;
-        border-radius: 15px;
-        padding: 1rem;
-        margin-bottom: 1.5rem;
-        border: 1px solid #e1e5e9;
-        box-shadow: 0 4px 20px rgba(0,0,0,0.08);
-    }
-    .status-card label {
-        color: #2d3748 !important;
-        font-weight: 600 !important;
-    }
-    .usage-guide h3 {
-        color: #2d3748 !important;
-        font-weight: 600 !important;
-        margin-bottom: 0.5rem !important;
-    }
-    .usage-guide p {
-        color: #4a5568 !important;
-        font-size: 1rem !important;
-        line-height: 1.6 !important;
-        margin: 0.5rem 0 !important;
-    }
-    .usage-guide strong {
-        color: #1a202c !important;
-        font-weight: 700 !important;
-    }
-    .usage-guide em {
-        color: #1a202c !important;
-        font-weight: 700 !important;
-        font-style: normal !important;
-    }
-    .main-interface {
-        margin-bottom: 2rem;
-    }
-    .input-section {
-        background: white;
-        border-radius: 20px;
-        padding: 2rem;
-        margin-right: 1rem;
-        box-shadow: 0 8px 32px rgba(0,0,0,0.1);
-        border: 1px solid #e1e5e9;
-    }
-    .input-section h3 {
-        color: #2d3748 !important;
-        font-weight: 600 !important;
-        margin-bottom: 1rem !important;
-    }
-    .input-section label {
-        color: #4a5568 !important;
-        font-weight: 500 !important;
-    }
-    .output-section {
-        background: white;
-        border-radius: 20px;
-        padding: 2rem;
-        margin-left: 1rem;
-        box-shadow: 0 8px 32px rgba(0,0,0,0.1);
-        border: 1px solid #e1e5e9;
-    }
-    .output-section h3 {
-        color: #2d3748 !important;
-        font-weight: 600 !important;
-        margin-bottom: 1rem !important;
-    }
-    .output-section label {
-        color: #4a5568 !important;
-        font-weight: 500 !important;
-    }
-    .examples-section h3 {
-        color: #2d3748 !important;
-        font-weight: 600 !important;
-        margin-bottom: 1.5rem !important;
-    }
-    .generate-btn {
-        background: linear-gradient(45deg, #667eea, #764ba2) !important;
-        border: none !important;
-        color: white !important;
-        font-weight: 600 !important;
-        font-size: 1.1rem !important;
-        padding: 12px 30px !important;
-        border-radius: 25px !important;
-        box-shadow: 0 4px 15px rgba(102, 126, 234, 0.4) !important;
-        transition: all 0.3s ease !important;
-    }
-    .generate-btn:hover {
-        transform: translateY(-2px) !important;
-        box-shadow: 0 8px 25px rgba(102, 126, 234, 0.6) !important;
-    }
-    .examples-section {
-        background: white;
-        border-radius: 20px;
-        padding: 2rem;
-        margin-top: 2rem;
-        box-shadow: 0 8px 32px rgba(0,0,0,0.1);
-        border: 1px solid #e1e5e9;
-    }
-    .examples-section p {
-        color: #4a5568 !important;
-        margin-bottom: 1rem !important;
-    }
-    .example-row {
-        background: #f8fafc;
-        border: 1px solid #e2e8f0;
-        border-radius: 15px;
-        padding: 1.5rem;
-        margin: 1rem 0;
-        transition: all 0.3s ease;
-        align-items: center;
-    }
-    .example-row:hover {
-        border-color: #667eea;
-        transform: translateY(-2px);
-        box-shadow: 0 4px 20px rgba(102, 126, 234, 0.15);
-    }
-    .example-row .markdown {
-        color: #2d3748 !important;
-    }
-    .example-row .markdown p {
-        color: #2d3748 !important;
-        margin: 0.5rem 0 !important;
-        line-height: 1.5 !important;
-    }
-    .example-row .markdown strong {
-        color: #1a202c !important;
-        font-weight: 600 !important;
-    }
-    /* Example grid layout styles */
-    .example-grid-row {
-        margin: 1rem 0;
-        gap: 1rem;
-    }
-    .example-item {
-        background: #f8fafc;
-        border: 1px solid #e2e8f0;
-        border-radius: 15px;
-        padding: 1rem;
-        transition: all 0.3s ease;
-        margin: 0.25rem;
-        max-width: 250px;
-        margin-left: auto;
-        margin-right: auto;
-    }
-    .example-item:hover {
-        border-color: #667eea;
-        transform: translateY(-2px);
-        box-shadow: 0 4px 20px rgba(102, 126, 234, 0.15);
-    }
-    .example-caption {
-        margin: 0.5rem 0 !important;
-        min-height: 2.8rem !important;
-        display: flex !important;
-        align-items: flex-start !important;
-    }
-    .example-caption p {
-        color: #2d3748 !important;
-        font-size: 0.9rem !important;
-        line-height: 1.4 !important;
-        margin: 0.5rem 0 !important;
-    }
-    /* Multi-video gallery styles */
-    .additional-samples {
-        margin-top: 1rem;
-        gap: 0.5rem;
-    }
-    .additional-samples .gradio-video {
-        border-radius: 10px;
-        overflow: hidden;
-    }
-    /* Video gallery responsive layout */
-    .video-gallery {
-        display: grid;
-        gap: 1rem;
-        margin-top: 1rem;
-    }
-    .video-gallery.single {
-        grid-template-columns: 1fr;
-    }
-    .video-gallery.dual {
-        grid-template-columns: 1fr 1fr;
-    }
-    .video-gallery.multi {
-        grid-template-columns: repeat(2, 1fr);
-        grid-template-rows: auto auto auto;
-    }
-    .footer-text {
-        color: #718096 !important;
-        text-align: center;
-        padding: 2rem;
-        font-size: 0.9rem;
-    }
-    /* Video component styling for consistent size */
-    .input-section video,
-    .output-section video,
-    .example-row video {
-        width: 100% !important;
-        height: 300px !important;
-        object-fit: contain !important;
-        border-radius: 10px !important;
-        background-color: #000 !important;
-    }
-    .example-row video {
-        height: 150px !important;
-    }
-    /* Fix for additional samples video display */
-    .additional-samples video {
-        height: 150px !important;
-        object-fit: contain !important;
-        border-radius: 10px !important;
-        background-color: #000 !important;
-    }
-    .additional-samples .gradio-video {
-        border-radius: 10px !important;
-        overflow: hidden !important;
-        background-color: #000 !important;
-    }
-    .additional-samples .gradio-video > div {
-        background-color: #000 !important;
-        border-radius: 10px !important;
-    }
-    /* Video container styling */
-    .input-section .video-container,
-    .output-section .video-container,
-    .example-row .video-container {
-        background-color: #000 !important;
-        border-radius: 10px !important;
-        display: flex !important;
-        align-items: center !important;
-        justify-content: center !important;
-        overflow: hidden !important;
-    }
-    /* Ensure proper alignment */
-    .example-row {
-        display: flex !important;
-        align-items: stretch !important;
-    }
-    .example-row > div {
-        display: flex !important;
-        flex-direction: column !important;
-        justify-content: center !important;
-    }
-    /* Video wrapper for better control */
-    .video-wrapper {
-        position: relative !important;
-        width: 100% !important;
-        background: #000 !important;
-        border-radius: 10px !important;
-        overflow: hidden !important;
-        display: flex !important;
-        align-items: center !important;
-        justify-content: center !important;
-    }
-    """
-    with gr.Blocks(css=css, title="HunyuanVideo-Foley") as app:
-        # Main header
-        with gr.Column(elem_classes=["main-header"]):
-            gr.HTML("""
-            <h1>🎵 HunyuanVideo-Foley</h1>
-            <p>Text-Video-to-Audio Synthesis: Generate realistic audio from video and text descriptions</p>
-            """)
-        # Usage Guide
-        with gr.Column(elem_classes=["status-card"]):
-            gr.Markdown("""
-            ### 📋 Quick Start Guide
-            **1.** Upload your video file\t**2.** Add optional text description\t**3.** Adjust sample numbers (1-6)\t**4.** Click Generate Audio
-            💡 For quick start, you can load the prepared examples by clicking the button.
-            """, elem_classes=["usage-guide"])
-        # Main inference interface - Input and Results side by side
-        with gr.Row(elem_classes=["main-interface"]):
-            # Input section
-            with gr.Column(scale=1, elem_classes=["input-section"]):
-                gr.Markdown("### 📹 Video Input")
-                video_input = gr.Video(
-                    label="Upload Video",
-                    info="Supported formats: MP4, AVI, MOV, etc.",
-                    height=300
-                )
-                text_input = gr.Textbox(
-                    label="🎯 Audio Description (English)",
-                    placeholder="A person walks on frozen ice",
-                    lines=3,
-                    info="Describe the audio you want to generate (optional)"
-                )
-                neg_prompt_input = gr.Textbox(
-                    label="🚫 Negative Prompt",
-                    placeholder="noisy, harsh",
-                    lines=2,
-                    info="Describe what you want to avoid in the generated audio (optional, default: 'noisy, harsh')"
-                )
-                with gr.Row():
-                    guidance_scale = gr.Slider(
-                        minimum=1.0,
-                        maximum=10.0,
-                        value=4.5,
-                        step=0.1,
-                        label="🎚️ CFG Scale",
-                    )
-                    inference_steps = gr.Slider(
-                        minimum=10,
-                        maximum=100,
-                        value=50,
-                        step=5,
-                        label="⚡ Steps",
-                    )
-                    sample_nums = gr.Slider(
-                        minimum=1,
-                        maximum=6,
-                        value=1,
-                        step=1,
-                        label="🎲 Sample Nums",
-                    )
-                generate_btn = gr.Button(
-                    "🎵 Generate Audio",
-                    variant="primary",
-                    elem_classes=["generate-btn"]
-                )
-            # Results section
-            with gr.Column(scale=1, elem_classes=["output-section"]):
-                gr.Markdown("### 🎥 Generated Results")
-                # Multi-video gallery for displaying multiple generated samples
-                with gr.Column():
-                    # Primary video (Sample 1)
-                    video_output_1 = gr.Video(
-                        label="Sample 1",
-                        height=250,
-                        visible=True
-                    )
-                    # Additional videos (Samples 2-6) - initially hidden
-                    with gr.Row(elem_classes=["additional-samples"]):
-                        with gr.Column(scale=1):
-                            video_output_2 = gr.Video(
-                                label="Sample 2",
-                                height=150,
-                                visible=False
-                            )
-                            video_output_3 = gr.Video(
-                                label="Sample 3",
-                                height=150,
-                                visible=False
-                            )
-                        with gr.Column(scale=1):
-                            video_output_4 = gr.Video(
-                                label="Sample 4",
-                                height=150,
-                                visible=False
-                            )
-                            video_output_5 = gr.Video(
-                                label="Sample 5",
-                                height=150,
-                                visible=False
-                            )
-                    # Sample 6 - full width
-                    video_output_6 = gr.Video(
-                        label="Sample 6",
-                        height=150,
-                        visible=False
-                    )
-                result_text = gr.Textbox(
-                    label="Status",
-                    interactive=False,
-                    lines=2
-                )
-        # Examples section at the bottom
-        with gr.Column(elem_classes=["examples-section"]):
-            gr.Markdown("### 🌟 Examples")
-            gr.Markdown("Click on any example to load it into the interface above")
-            # Define your custom examples here - 8 examples total
-            examples_data = [
-                # Example 1
-                {
-                    "caption": "A person walks on frozen ice",
-                    "video_path": "examples/1_video.mp4",
-                    "result_path": "examples/1_result.mp4"
-                },
-                # Example 2
-                {
-                    "caption": "With a faint sound as their hands parted, the two embraced, a soft 'mm' escaping between them.",
-                    "video_path": "examples/2_video.mp4",
-                    "result_path": "examples/2_result.mp4"
-                },
-                # Example 3
-                {
-                    "caption": "The sound of the number 3's bouncing footsteps is as light and clear as glass marbles hitting the ground. Each step carries a magical sound.",
-                    "video_path": "examples/3_video.mp4",
-                    "result_path": "examples/3_result.mp4"
-                },
-                # Example 4
-                {
-                    "caption": "gentle gurgling of the stream's current, and music plays in the background which is a beautiful and serene piano solo with a hint of classical charm, evoking a sense of peace and serenity in people's hearts.",
-                    "video_path": "examples/4_video.mp4",
-                    "result_path": "examples/4_result.mp4"
-                },
-                # Example 5 - Add your new examples here
-                {
-                    "caption": "snow crunching under the snowboard's edge.",
-                    "video_path": "examples/5_video.mp4",
-                    "result_path": "examples/5_result.mp4"
-                },
-                # Example 6
-                {
-                    "caption": "The crackling of the fire, the whooshing of the flames, and the occasional crisp popping of charred leaves filled the forest.",
-                    "video_path": "examples/6_video.mp4",
-                    "result_path": "examples/6_result.mp4"
-                },
-                # Example 7
-                {
-                    "caption": "humming of the scooter engine accelerates slowly.",
-                    "video_path": "examples/7_video.mp4",
-                    "result_path": "examples/7_result.mp4"
-                },
-                # Example 8
-                {
-                    "caption": "splash of water and loud thud as person hits the surface.",
-                    "video_path": "examples/8_video.mp4",
-                    "result_path": "examples/8_result.mp4"
-                }
-            ]
-            # Create example grid - 4 examples per row, 2 rows total
-            example_buttons = []
-            for row in range(2):  # 2 rows
-                with gr.Row(elem_classes=["example-grid-row"]):
-                    for col in range(4):  # 4 columns
-                        idx = row * 4 + col
-                        if idx < len(examples_data):
-                            example = examples_data[idx]
-                            with gr.Column(scale=1, elem_classes=["example-item"]):
-                                # Video thumbnail
-                                if os.path.exists(example['video_path']):
-                                    example_video = gr.Video(
-                                        value=example['video_path'],
-                                        label=f"Example {idx+1}",
-                                        interactive=False,
-                                        show_label=True,
-                                        height=180
-                                    )
-                                else:
-                                    example_video = gr.HTML(f"""
-                                    <div style="background: #f0f0f0; padding: 15px; text-align: center; border-radius: 8px; height: 180px; display: flex; align-items: center; justify-content: center;">
-                                        <div>
-                                            <p style="color: #666; margin: 0; font-size: 12px;">📹 Video not found</p>
-                                            <small style="color: #999; font-size: 10px;">{example['video_path']}</small>
-                                        </div>
-                                    </div>
-                                    """)
-                                # Caption (truncated for grid layout)
-                                caption_preview = example['caption'][:60] + "..." if len(example['caption']) > 60 else example['caption']
-                                gr.Markdown(f"{caption_preview}", elem_classes=["example-caption"])
-                                # Load button
-                                example_btn = gr.Button(
-                                    f"Load Example {idx+1}",
-                                    variant="secondary",
-                                    size="sm"
-                                )
-                                example_buttons.append((example_btn, example))
-        # Event handlers
-        def process_inference(video_file, text_prompt, neg_prompt, guidance_scale, inference_steps, sample_nums):
-            """Call ``infer_single_video`` and hand its result to ``update_video_outputs``.
-
-            ``sample_nums`` is converted to ``int`` before the call; the
-            ``(video_list, status_msg)`` pair returned by ``infer_single_video``
-            is forwarded unchanged.
-            """
-            sample_count = int(sample_nums)
-            generated = infer_single_video(
-                video_file, text_prompt, neg_prompt, guidance_scale, inference_steps, sample_count
-            )
-            videos, status = generated
-            return update_video_outputs(videos, status)
-        # Add dynamic visibility control based on sample_nums
-        def update_visibility(sample_nums):
-            """Return six ``gr.update`` objects controlling which sample slots are shown.
-
-            Slot 1 is always visible; slot ``k`` (2 to 6) is visible when
-            ``int(sample_nums) >= k``.
-            """
-            requested = int(sample_nums)
-            # Slot 1 is unconditionally shown; the remaining slots follow the slider.
-            updates = [gr.update(visible=True)]
-            updates.extend(gr.update(visible=requested >= slot) for slot in range(2, 7))
-            return updates
-        # Update visibility when sample_nums changes
-        sample_nums.change(
-            fn=update_visibility,
-            inputs=[sample_nums],
-            outputs=[video_output_1, video_output_2, video_output_3, video_output_4, video_output_5, video_output_6]
-        )
-        generate_btn.click(
-            fn=process_inference,
-            inputs=[video_input, text_input, neg_prompt_input, guidance_scale, inference_steps, sample_nums],
-            outputs=[
-                video_output_1,  # Sample 1 value
-                video_output_2,  # Sample 2 value
-                video_output_3,  # Sample 3 value
-                video_output_4,  # Sample 4 value
-                video_output_5,  # Sample 5 value
-                video_output_6,  # Sample 6 value
-                result_text
-            ]
-        )
-        # Add click handlers for example buttons
-        for btn, example in example_buttons:
-            def create_example_handler(ex):
-                """Build a zero-argument click handler that loads example ``ex``.
-
-                A factory is used so that each button's handler captures its own
-                example rather than the loop variable.
-
-                The returned handler yields, in order: the input video path (or
-                ``None`` if the file is missing), the caption, the fixed negative
-                prompt ``"noisy, harsh"``, the result video path (or ``None``),
-                and a status message.
-                """
-                def handler():
-                    # Missing files become None; the status text reports them below.
-                    video_file = ex['video_path'] if os.path.exists(ex['video_path']) else None
-                    result_video = ex['result_path'] if os.path.exists(ex['result_path']) else None
-                    caption = ex['caption']
-                    # Only append an ellipsis when the caption was actually truncated,
-                    # so short captions are not shown as if text had been cut off.
-                    caption_preview = caption[:50] + "..." if len(caption) > 50 else caption
-                    status_msg = f"✅ Loaded example with caption: {caption_preview}"
-                    if not video_file:
-                        status_msg += f"\n⚠️ Video file not found: {ex['video_path']}"
-                    if not result_video:
-                        status_msg += f"\n⚠️ Result video not found: {ex['result_path']}"
-                    return video_file, caption, "noisy, harsh", result_video, status_msg
-                return handler
-            btn.click(
-                fn=create_example_handler(example),
-                outputs=[video_input, text_input, neg_prompt_input, video_output_1, result_text]
-            )
-        # Footer
-        gr.HTML("""
-        <div class="footer-text">
-            <p>🚀 Powered by HunyuanVideo-Foley | Generate high-quality audio from video and text descriptions</p>
-        </div>
-        """)
-    return app
-def set_manual_seed(global_seed):
-    """Seed the Python, NumPy and PyTorch random number generators with ``global_seed``."""
-    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
-        seed_fn(global_seed)
-if __name__ == "__main__":
-    # Script entry point: seed the RNGs, configure logging, load the models,
-    # build the Gradio UI and serve it.
-    set_manual_seed(1)
-    # Setup logging: drop the existing handlers and route messages through print().
-    logger.remove()
-    logger.add(lambda msg: print(msg, end=''), level="INFO")
-    # Auto-load model
-    logger.info("Starting application and loading model...")
-    model_load_result = auto_load_models()
-    logger.info(model_load_result)
-    # Create and launch Gradio app
-    app = create_gradio_interface()
-    # Log completion status
-    # NOTE(review): the UI is launched regardless of the load result; only the
-    # "ready" log line depends on the result containing "successfully".
-    if "successfully" in model_load_result:
-        logger.info("Application ready, model loaded")
-    # server_name="0.0.0.0" makes the server listen on all network interfaces.
-    app.launch(
-        server_name="0.0.0.0",
-        server_port=8080,
-        share=False,
-        debug=False,
-        show_error=True
-    )

HunyuanVideo-Foley/infer.py DELETED Viewed

@@ -1,304 +0,0 @@
-import os
-import argparse
-import random
-import numpy as np
-import torch
-import pandas as pd
-import torchaudio
-from loguru import logger
-from hunyuanvideo_foley.utils.model_utils import load_model
-from hunyuanvideo_foley.utils.feature_utils import feature_process
-from hunyuanvideo_foley.utils.model_utils import denoise_process
-from hunyuanvideo_foley.utils.media_utils import merge_audio_video
-def set_manual_seed(global_seed):
-    """Apply ``global_seed`` to the ``random``, NumPy and PyTorch generators, in that order."""
-    seeders = (random.seed, np.random.seed, torch.manual_seed)
-    for apply_seed in seeders:
-        apply_seed(global_seed)
-def infer(video_path, prompt, model_dict, cfg, guidance_scale=4.5, num_inference_steps=50, neg_prompt=None):
-    """Generate audio for one video/prompt pair.
-
-    Features are extracted with ``feature_process`` and then passed to
-    ``denoise_process`` together with the sampling settings.
-
-    Returns:
-        A ``(audio, sample_rate)`` tuple, where ``audio`` is element 0 of the
-        audio object returned by ``denoise_process``.
-    """
-    features = feature_process(video_path, prompt, model_dict, cfg, neg_prompt=neg_prompt)
-    visual_feats, text_feats, audio_len_in_s = features
-    denoised = denoise_process(
-        visual_feats,
-        text_feats,
-        audio_len_in_s,
-        model_dict,
-        cfg,
-        guidance_scale=guidance_scale,
-        num_inference_steps=num_inference_steps,
-    )
-    audio_batch, sample_rate = denoised
-    return audio_batch[0], sample_rate
-def generate_audio(model_dict, cfg, csv_path, output_dir, guidance_scale=4.5, num_inference_steps=50, neg_prompt=None):
-    """Generate audio and a merged video for every row of a CSV file.
-
-    Each row supplies a ``video`` path and a ``prompt``. Outputs are written to
-    ``output_dir`` as ``<index:04d>.wav`` and ``<index:04d>.mp4``; a row is
-    skipped only when both of its output files already exist.
-    """
-    os.makedirs(output_dir, exist_ok=True)
-    rows = pd.read_csv(csv_path)
-    for row_id, record in rows.iterrows():
-        source_video = record['video']
-        prompt_text = record['prompt']
-        logger.info(f"Processing video: {source_video}")
-        logger.info(f"Prompt: {prompt_text}")
-        wav_path = os.path.join(output_dir, f"{row_id:04d}.wav")
-        mp4_path = os.path.join(output_dir, f"{row_id:04d}.mp4")
-        # Nothing to do when both outputs are already on disk.
-        if os.path.exists(wav_path) and os.path.exists(mp4_path):
-            continue
-        waveform, rate = infer(
-            source_video,
-            prompt_text,
-            model_dict,
-            cfg,
-            guidance_scale=guidance_scale,
-            num_inference_steps=num_inference_steps,
-            neg_prompt=neg_prompt,
-        )
-        torchaudio.save(wav_path, waveform, rate)
-        merge_audio_video(wav_path, source_video, mp4_path)
-    logger.info(f"All audio files saved to {output_dir}")
-def parse_args(argv=None):
-    """Parse and validate the command-line arguments of the inference script.
-
-    Args:
-        argv: Optional list of argument strings. When ``None`` (the default)
-            the arguments are taken from ``sys.argv``.
-
-    Returns:
-        The parsed ``argparse.Namespace``. ``config_path`` is filled in from
-        ``model_size`` when it was not given explicitly.
-
-    Exactly one of ``--csv_path`` / ``--single_video`` must be supplied, and
-    ``--single_prompt`` is required together with ``--single_video``; violations
-    end the program through ``parser.error``.
-    """
-    parser = argparse.ArgumentParser(
-        description="HunyuanVideo-Foley: Generate audio from video and text prompts",
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter
-    )
-    parser.add_argument(
-        "--model_path",
-        type=str,
-        required=True,
-        help="Path to the pretrained model dir"
-    )
-    parser.add_argument(
-        "--config_path",
-        type=str,
-        help="Path to the configuration file (.yaml file). If not specified, will be inferred from model_size"
-    )
-    parser.add_argument(
-        "--model_size",
-        type=str,
-        choices=["xl", "xxl"],
-        default="xxl",
-        help="Model size (xl/xxl). Auto-selects config and model file (default: xxl)"
-    )
-    input_group = parser.add_mutually_exclusive_group(required=True)
-    input_group.add_argument(
-        "--csv_path",
-        type=str,
-        # The batch loop reads the 'video' and 'prompt' columns of each row.
-        help="Path to CSV file containing video paths and text prompts (columns: 'video', 'prompt')"
-    )
-    input_group.add_argument(
-        "--single_video",
-        type=str,
-        help="Path to a single video file for inference"
-    )
-    parser.add_argument(
-        "--single_prompt",
-        type=str,
-        help="Text prompt for single video (required when using --single_video)"
-    )
-    parser.add_argument(
-        "--neg_prompt",
-        type=str,
-        default=None,
-        help="Negative prompt to avoid during generation (default: 'noisy, harsh')"
-    )
-    parser.add_argument(
-        "--output_dir",
-        type=str,
-        required=True,
-        help="Directory to save generated audio and video files"
-    )
-    parser.add_argument(
-        "--guidance_scale",
-        type=float,
-        default=4.5,
-        help="Guidance scale for classifier-free guidance (higher = more text adherence)"
-    )
-    parser.add_argument(
-        "--num_inference_steps",
-        type=int,
-        default=50,
-        help="Number of denoising steps for diffusion sampling"
-    )
-    parser.add_argument(
-        "--audio_length",
-        type=float,
-        default=None,
-        help="Maximum audio length in seconds (default: video length)"
-    )
-    parser.add_argument(
-        "--device",
-        type=str,
-        default="auto",
-        choices=["auto", "cpu", "cuda", "mps"],
-        help="Device to use for inference"
-    )
-    parser.add_argument(
-        "--gpu_id",
-        type=int,
-        default=0,
-        help="GPU ID to use when device is cuda"
-    )
-    parser.add_argument(
-        "--batch_size",
-        type=int,
-        default=1,
-        help="Batch size for processing multiple videos"
-    )
-    parser.add_argument(
-        "--skip_existing",
-        action="store_true",
-        help="Skip processing if output files already exist"
-    )
-    parser.add_argument(
-        "--save_video",
-        action="store_true",
-        default=True,
-        help="Save video with generated audio merged"
-    )
-    # --save_video defaults to True, so on its own it could never be switched
-    # off; this companion flag writes False into the same destination.
-    parser.add_argument(
-        "--no_save_video",
-        dest="save_video",
-        action="store_false",
-        help="Do not save a video with the generated audio merged"
-    )
-    parser.add_argument(
-        "--log_level",
-        type=str,
-        default="INFO",
-        choices=["DEBUG", "INFO", "WARNING", "ERROR"],
-        help="Logging level"
-    )
-    parser.add_argument(
-        "--enable_offload",
-        action="store_true",
-        help="Enable model offloading to reduce peak memory usage (good for small VRAM GPUs)"
-    )
-    args = parser.parse_args(argv)
-    if args.single_video and not args.single_prompt:
-        parser.error("--single_prompt is required when using --single_video")
-    # model_size always has a value (it defaults to "xxl" and is restricted by
-    # `choices`), so it is used to infer config_path whenever none was given.
-    if not args.config_path:
-        config_mapping = {
-            "xl": "configs/hunyuanvideo-foley-xl.yaml",
-            "xxl": "configs/hunyuanvideo-foley-xxl.yaml"
-        }
-        args.config_path = config_mapping[args.model_size]
-        logger.info(f"Auto-selected config for {args.model_size} model: {args.config_path}")
-    return args
-def setup_device(device_str, gpu_id=0):
-    """Resolve ``device_str`` to a ``torch.device`` and log the choice.
-
-    ``"auto"`` prefers CUDA (using ``gpu_id``), then MPS, then CPU.
-    ``"cuda"`` selects ``cuda:<gpu_id>``; any other value is passed to
-    ``torch.device`` as-is.
-    """
-    if device_str != "auto":
-        target = f"cuda:{gpu_id}" if device_str == "cuda" else device_str
-        chosen = torch.device(target)
-        logger.info(f"Using specified device: {chosen}")
-        return chosen
-    # Automatic selection, in order of preference.
-    if torch.cuda.is_available():
-        chosen = torch.device(f"cuda:{gpu_id}")
-        logger.info(f"Using CUDA device: {chosen}")
-        return chosen
-    if torch.backends.mps.is_available():
-        chosen = torch.device("mps")
-        logger.info("Using MPS device")
-        return chosen
-    chosen = torch.device("cpu")
-    logger.info("Using CPU device")
-    return chosen
-def process_single_video(video_path, prompt, model_dict, cfg, output_dir, args):
-    """Generate audio for one video and optionally merge it back into the video.
-
-    Outputs are written to ``output_dir`` as ``<video_name>_generated.wav`` and,
-    when ``args.save_video`` is set, ``<video_name>_with_audio.mp4``.
-
-    With ``args.skip_existing``, an audio file that already exists is never
-    regenerated: the function returns immediately if no video is wanted or the
-    merged video is also present, and otherwise only redoes the merge step
-    using the existing audio.
-
-    Args:
-        video_path: Path of the input video.
-        prompt: Text prompt passed to ``infer``.
-        model_dict, cfg: Forwarded unchanged to ``infer``.
-        output_dir: Directory receiving the output files.
-        args: Namespace providing ``skip_existing``, ``save_video``,
-            ``guidance_scale``, ``num_inference_steps`` and ``neg_prompt``.
-    """
-    logger.info(f"Processing single video: {video_path}")
-    logger.info(f"Text prompt: {prompt}")
-    video_name = os.path.splitext(os.path.basename(video_path))[0]
-    output_audio_path = os.path.join(output_dir, f"{video_name}_generated.wav")
-    output_video_path = os.path.join(output_dir, f"{video_name}_with_audio.mp4")
-    if args.skip_existing and os.path.exists(output_audio_path):
-        logger.info(f"Skipping existing audio file: {output_audio_path}")
-        if not args.save_video:
-            return
-        if os.path.exists(output_video_path):
-            logger.info(f"Skipping existing video file: {output_video_path}")
-            return
-        # The audio exists but the merged video does not: reuse the audio
-        # instead of running inference again and overwriting it.
-        merge_audio_video(output_audio_path, video_path, output_video_path)
-        logger.info(f"Video with audio saved to: {output_video_path}")
-        return
-    audio, sample_rate = infer(
-        video_path, prompt, model_dict, cfg,
-        guidance_scale=args.guidance_scale,
-        num_inference_steps=args.num_inference_steps,
-        neg_prompt=args.neg_prompt
-    )
-    torchaudio.save(output_audio_path, audio, sample_rate)
-    logger.info(f"Audio saved to: {output_audio_path}")
-    if args.save_video:
-        merge_audio_video(output_audio_path, video_path, output_video_path)
-        logger.info(f"Video with audio saved to: {output_video_path}")
-def main():
-    """Command-line entry point.
-
-    Seeds the RNGs, parses arguments, configures logging, selects the device,
-    verifies that the required input paths exist, loads the models and then
-    runs either single-video or CSV batch generation.
-
-    Raises:
-        SystemExit: With status 1 when a required path does not exist.
-    """
-    set_manual_seed(1)
-    args = parse_args()
-    logger.remove()
-    logger.add(lambda msg: print(msg, end=''), level=args.log_level)
-    device = setup_device(args.device, args.gpu_id)
-    # (label, path) pairs that must exist before any model is loaded.
-    required_paths = [
-        ("Model file", args.model_path),
-        ("Config file", args.config_path),
-    ]
-    if args.csv_path:
-        required_paths.append(("CSV file", args.csv_path))
-    elif args.single_video:
-        required_paths.append(("Video file", args.single_video))
-    for label, path in required_paths:
-        if not os.path.exists(path):
-            logger.error(f"{label} not found: {path}")
-            # Raise SystemExit directly: the exit() helper is injected by the
-            # `site` module and is not guaranteed to exist in every runtime.
-            raise SystemExit(1)
-    os.makedirs(args.output_dir, exist_ok=True)
-    logger.info(f"Output directory: {args.output_dir}")
-    logger.info("Loading models...")
-    model_dict, cfg = load_model(args.model_path, args.config_path, device, enable_offload=args.enable_offload, model_size=args.model_size)
-    if args.single_video:
-        process_single_video(
-            args.single_video, args.single_prompt,
-            model_dict, cfg, args.output_dir, args
-        )
-    else:
-        generate_audio(
-            model_dict, cfg,
-            args.csv_path, args.output_dir,
-            guidance_scale=args.guidance_scale,
-            num_inference_steps=args.num_inference_steps,
-            neg_prompt=args.neg_prompt
-        )
-    logger.info("Processing completed!")
-if __name__ == "__main__":
-    main()

HunyuanVideo-Foley/pytest.ini DELETED Viewed

@@ -1,11 +0,0 @@
-# In pytest.ini the section must be named [pytest]; [tool:pytest] is only
-# recognised in setup.cfg, so options placed under it here are ignored.
-[pytest]
-testpaths = tests
-python_files = test_*.py
-python_functions = test_*
-addopts =
-    --verbose
-    --tb=short
-    --strict-markers
-    --disable-warnings
-markers =
-    slow: marks tests as slow (deselect with '-m "not slow"')

HunyuanVideo-Foley/tests/__init__.py DELETED Viewed

	@@ -1 +0,0 @@
1	- # Test suite for HunyuanVideo-Foley

HunyuanVideo-Foley/tests/test_config_utils.py DELETED Viewed

@@ -1,89 +0,0 @@
-"""Tests for configuration utilities."""
-import pytest
-import tempfile
-import yaml
-from pathlib import Path
-from hunyuanvideo_foley.utils.config_utils import AttributeDict, load_yaml
-class TestAttributeDict:
-    """Behavioural checks for the ``AttributeDict`` wrapper."""
-    def test_dict_access(self):
-        """Values are reachable with subscript syntax, including nested ones."""
-        wrapped = AttributeDict({"key1": "value1", "key2": {"nested": "value2"}})
-        assert wrapped["key1"] == "value1"
-        assert wrapped["key2"]["nested"] == "value2"
-    def test_attribute_access(self):
-        """Values are reachable with dotted attribute syntax, including nested ones."""
-        wrapped = AttributeDict({"key1": "value1", "key2": {"nested": "value2"}})
-        assert wrapped.key1 == "value1"
-        assert wrapped.key2.nested == "value2"
-    def test_list_handling(self):
-        """A list payload supports indexing, and dict items inside it support attributes."""
-        wrapped = AttributeDict([1, 2, {"nested": "value"}])
-        assert wrapped[0] == 1
-        assert wrapped[2].nested == "value"
-    def test_keys_method(self):
-        """``keys()`` reports every top-level key."""
-        wrapped = AttributeDict({"key1": "value1", "key2": "value2"})
-        found = list(wrapped.keys())
-        for expected in ("key1", "key2"):
-            assert expected in found
-    def test_get_method(self):
-        """``get()`` returns the stored value, or the fallback for a missing key."""
-        wrapped = AttributeDict({"key1": "value1"})
-        assert wrapped.get("key1") == "value1"
-        assert wrapped.get("nonexistent", "default") == "default"
-class TestLoadYaml:
-    """Checks for ``load_yaml`` on valid, missing and malformed files."""
-    def test_load_valid_yaml(self):
-        """A well-formed file is loaded and exposes nested values as attributes."""
-        payload = {"model": {"name": "test_model", "params": {"lr": 0.001}}}
-        # delete=False keeps the file on disk after the handle is closed so that
-        # load_yaml can open it by name; it is removed in the finally block.
-        with tempfile.NamedTemporaryFile(mode='w', suffix='.yaml', delete=False) as handle:
-            yaml.dump(payload, handle)
-        temp_file = handle.name
-        try:
-            loaded = load_yaml(temp_file)
-            assert loaded.model.name == "test_model"
-            assert loaded.model.params.lr == 0.001
-        finally:
-            Path(temp_file).unlink()
-    def test_load_nonexistent_file(self):
-        """A path that does not exist raises ``FileNotFoundError``."""
-        with pytest.raises(FileNotFoundError):
-            load_yaml("nonexistent.yaml")
-    def test_load_invalid_yaml(self):
-        """Malformed YAML content raises ``yaml.YAMLError``."""
-        with tempfile.NamedTemporaryFile(mode='w', suffix='.yaml', delete=False) as handle:
-            handle.write("invalid: yaml: content: [\n")  # Invalid YAML
-        temp_file = handle.name
-        try:
-            with pytest.raises(yaml.YAMLError):
-                load_yaml(temp_file)
-        finally:
-            Path(temp_file).unlink()

HunyuanVideo-Foley/tests/test_media_utils.py DELETED Viewed

@@ -1,82 +0,0 @@
-"""Tests for media utilities."""
-import pytest
-import tempfile
-import os
-from unittest.mock import patch, MagicMock
-from hunyuanvideo_foley.utils.media_utils import merge_audio_video, MediaProcessingError
-class TestMergeAudioVideo:
-    """Test cases for merge_audio_video function."""
-    def test_invalid_audio_path(self):
-        """Test with non-existent audio file."""
-        with pytest.raises(MediaProcessingError, match="Audio file not found"):
-            merge_audio_video("nonexistent.wav", "video.mp4", "output.mp4")
-    def test_invalid_video_path(self):
-        """Test with non-existent video file."""
-        with tempfile.NamedTemporaryFile(suffix='.wav') as audio_file:
-            with pytest.raises(MediaProcessingError, match="Video file not found"):
-                merge_audio_video(audio_file.name, "nonexistent.mp4", "output.mp4")
-    @patch('subprocess.Popen')
-    def test_successful_merge(self, mock_popen):
-        """Test successful merge operation."""
-        # Create temporary files
-        with tempfile.NamedTemporaryFile(suffix='.wav') as audio_file, \
-             tempfile.NamedTemporaryFile(suffix='.mp4') as video_file, \
-             tempfile.NamedTemporaryFile(suffix='.mp4', delete=False) as output_file:
-            # Mock successful subprocess
-            mock_process = MagicMock()
-            mock_process.returncode = 0
-            mock_process.communicate.return_value = ("", "")
-            mock_popen.return_value = mock_process
-            result = merge_audio_video(
-                audio_file.name,
-                video_file.name,
-                output_file.name
-            )
-            assert result == output_file.name
-            mock_popen.assert_called_once()
-            # Cleanup
-            os.unlink(output_file.name)
-    @patch('subprocess.Popen')
-    def test_ffmpeg_failure(self, mock_popen):
-        """Test ffmpeg failure handling."""
-        # Create temporary files
-        with tempfile.NamedTemporaryFile(suffix='.wav') as audio_file, \
-             tempfile.NamedTemporaryFile(suffix='.mp4') as video_file:
-            # Mock failed subprocess
-            mock_process = MagicMock()
-            mock_process.returncode = 1
-            mock_process.communicate.return_value = ("", "FFmpeg error")
-            mock_popen.return_value = mock_process
-            with pytest.raises(MediaProcessingError, match="FFmpeg failed"):
-                merge_audio_video(
-                    audio_file.name,
-                    video_file.name,
-                    "output.mp4"
-                )
-    @patch('subprocess.Popen', side_effect=FileNotFoundError)
-    def test_ffmpeg_not_found(self, mock_popen):
-        """Test ffmpeg not found error."""
-        with tempfile.NamedTemporaryFile(suffix='.wav') as audio_file, \
-             tempfile.NamedTemporaryFile(suffix='.mp4') as video_file:
-            with pytest.raises(FileNotFoundError, match="ffmpeg not found"):
-                merge_audio_video(
-                    audio_file.name,
-                    video_file.name,
-                    "output.mp4"
-                )

MMAudio/.gitignore DELETED Viewed

@@ -1,146 +0,0 @@
-run_*.sh
-log/
-saves
-saves/
-weights/
-weights
-output/
-output
-pretrained/
-workspace
-workspace/
-ext_weights/
-ext_weights
-.checkpoints/
-.vscode/
-training/example_output/
-# Byte-compiled / optimized / DLL files
-__pycache__/
-*.py[cod]
-*$py.class
-# C extensions
-*.so
-# Distribution / packaging
-.Python
-build/
-develop-eggs/
-dist/
-downloads/
-eggs/
-.eggs/
-lib/
-lib64/
-parts/
-sdist/
-var/
-wheels/
-pip-wheel-metadata/
-share/python-wheels/
-*.egg-info/
-.installed.cfg
-*.egg
-MANIFEST
-# PyInstaller
-#  Usually these files are written by a python script from a template
-#  before PyInstaller builds the exe, so as to inject date/other infos into it.
-*.manifest
-*.spec
-# Installer logs
-pip-log.txt
-pip-delete-this-directory.txt
-# Unit test / coverage reports
-htmlcov/
-.tox/
-.nox/
-.coverage
-.coverage.*
-.cache
-nosetests.xml
-coverage.xml
-*.cover
-*.py,cover
-.hypothesis/
-.pytest_cache/
-# Translations
-*.mo
-*.pot
-# Django stuff:
-*.log
-local_settings.py
-db.sqlite3
-db.sqlite3-journal
-# Flask stuff:
-instance/
-.webassets-cache
-# Scrapy stuff:
-.scrapy
-# Sphinx documentation
-docs/_build/
-# PyBuilder
-target/
-# Jupyter Notebook
-.ipynb_checkpoints
-# IPython
-profile_default/
-ipython_config.py
-# pyenv
-.python-version
-# pipenv
-#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
-#   However, in case of collaboration, if having platform-specific dependencies or dependencies
-#   having no cross-platform support, pipenv may install dependencies that don't work, or not
-#   install all needed dependencies.
-#Pipfile.lock
-# PEP 582; used by e.g. github.com/David-OConnor/pyflow
-__pypackages__/
-# Celery stuff
-celerybeat-schedule
-celerybeat.pid
-# SageMath parsed files
-*.sage.py
-# Environments
-.env
-.venv
-env/
-venv/
-ENV/
-env.bak/
-venv.bak/
-# Spyder project settings
-.spyderproject
-.spyproject
-# Rope project settings
-.ropeproject
-# mkdocs documentation
-/site
-# mypy
-.mypy_cache/
-.dmypy.json
-dmypy.json
-# Pyre type checker
-.pyre/

MMAudio/LICENSE DELETED Viewed

@@ -1,21 +0,0 @@
-MIT License
-Copyright (c) 2024 Sony Research Inc.
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.

MMAudio/README.md DELETED Viewed

@@ -1,198 +0,0 @@
-<div align="center">
-https://github.com/hkchengrex/MMAudio
-<p align="center">
-  <h2>MMAudio</h2>
-  <a href="https://arxiv.org/abs/2412.15322">Paper</a> | <a href="https://hkchengrex.github.io/MMAudio">Webpage</a> | <a href="https://huggingface.co/hkchengrex/MMAudio/tree/main">Models</a> | <a href="https://huggingface.co/spaces/hkchengrex/MMAudio"> Huggingface Demo</a> | <a href="https://colab.research.google.com/drive/1TAaXCY2-kPk4xE4PwKB3EqFbSnkUuzZ8?usp=sharing">Colab Demo</a> | <a href="https://replicate.com/zsxkib/mmaudio">Replicate Demo</a>
-</p>
-</div>
-## [Taming Multimodal Joint Training for High-Quality Video-to-Audio Synthesis](https://hkchengrex.github.io/MMAudio)
-[Ho Kei Cheng](https://hkchengrex.github.io/), [Masato Ishii](https://scholar.google.co.jp/citations?user=RRIO1CcAAAAJ), [Akio Hayakawa](https://scholar.google.com/citations?user=sXAjHFIAAAAJ), [Takashi Shibuya](https://scholar.google.com/citations?user=XCRO260AAAAJ), [Alexander Schwing](https://www.alexander-schwing.de/), [Yuki Mitsufuji](https://www.yukimitsufuji.com/)
-University of Illinois Urbana-Champaign, Sony AI, and Sony Group Corporation
-CVPR 2025
-## Highlight
-MMAudio generates synchronized audio given video and/or text inputs.
-Our key innovation is multimodal joint training which allows training on a wide range of audio-visual and audio-text datasets.
-Moreover, a synchronization module aligns the generated audio with the video frames.
-Check out this fun video:
-[![Does Your Voice Match Your Face?](https://img.youtube.com/vi/SLz3NWLyHxg/0.jpg)](https://youtu.be/SLz3NWLyHxg)
-[[Does Your Voice Match Your Face? https://youtu.be/SLz3NWLyHxg]](https://youtu.be/SLz3NWLyHxg)
-## Results
-(All audio from our algorithm MMAudio)
-Videos from Sora:
-https://github.com/user-attachments/assets/82afd192-0cee-48a1-86ca-bd39b8c8f330
-Videos from Veo 2:
-https://github.com/user-attachments/assets/8a11419e-fee2-46e0-9e67-dfb03c48d00e
-Videos from MovieGen/Hunyuan Video/VGGSound:
-https://github.com/user-attachments/assets/29230d4e-21c1-4cf8-a221-c28f2af6d0ca
-For more results, visit https://hkchengrex.com/MMAudio/video_main.html.
-## Installation
-We have only tested this on Ubuntu.
-### Prerequisites
-We recommend using a [miniforge](https://github.com/conda-forge/miniforge) environment.
-- Python 3.9+
-- PyTorch **2.5.1+** and corresponding torchvision/torchaudio (pick your CUDA version https://pytorch.org/, pip install recommended)
-<!-- - ffmpeg<7 ([this is required by torchaudio](https://pytorch.org/audio/master/installation.html#optional-dependencies), you can install it in a miniforge environment with `conda install -c conda-forge 'ffmpeg<7'`) -->
-**1. Install prerequisites if not yet met:**
-```bash
-pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 --upgrade
-```
-(Or any other CUDA versions that your GPUs/driver support)
-<!-- ```
-conda install -c conda-forge 'ffmpeg<7'
-```
-(Optional, if you use miniforge and don't already have the appropriate ffmpeg) -->
-**2. Clone our repository:**
-```bash
-git clone https://github.com/hkchengrex/MMAudio.git
-```
-**3. Install with pip (install pytorch first before attempting this!):**
-```bash
-cd MMAudio
-pip install -e .
-```
-(If you encounter the `File "setup.py" not found` error, upgrade your pip with `pip install --upgrade pip`)
-**Pretrained models:**
-The models will be downloaded automatically when you run the demo script. MD5 checksums are provided in `mmaudio/utils/download_utils.py`.
-The models are also available at https://huggingface.co/hkchengrex/MMAudio/tree/main
-See [MODELS.md](docs/MODELS.md) for more details.
-## Demo
-By default, these scripts use the `large_44k_v2` model.
-In our experiments, inference only takes around 6GB of GPU memory (in 16-bit mode) which should fit in most modern GPUs.
-### Command-line interface
-With `demo.py`
-```bash
-python demo.py --duration=8 --video=<path to video> --prompt "your prompt"
-```
-The output (audio in `.flac` format, and video in `.mp4` format) will be saved in `./output`.
-See the file for more options.
-Simply omit the `--video` option for text-to-audio synthesis.
-The default output (and training) duration is 8 seconds. Longer/shorter durations could also work, but a large deviation from the training duration may result in lower quality.
-### Gradio interface
-Supports video-to-audio and text-to-audio synthesis.
-You can also try experimental image-to-audio synthesis which duplicates the input image to a video for processing. This might be interesting to some but it is not something MMAudio has been trained for.
-Use [port forwarding](https://unix.stackexchange.com/questions/115897/whats-ssh-port-forwarding-and-whats-the-difference-between-ssh-local-and-remot) (e.g., `ssh -L 7860:localhost:7860 server`) if necessary. The default port is `7860` which you can specify with `--port`.
-```bash
-python gradio_demo.py
-```
-### FAQ
-1. Video processing
-    - Processing higher-resolution videos takes longer due to encoding and decoding (which can take >95% of the processing time!), but it does not improve the quality of results.
-    - The CLIP encoder resizes input frames to 384×384 pixels.
-    - Synchformer resizes the shorter edge to 224 pixels and applies a center crop, focusing only on the central square of each frame.
-2. Frame rates
-    - The CLIP model operates at 8 FPS, while Synchformer works at 25 FPS.
-    - Frame rate conversion happens on-the-fly via the video reader.
-    - For input videos with a frame rate below 25 FPS, frames will be duplicated to match the required rate.
-3. Failure cases
-As with most models of this type, failures can occur, and the reasons are not always clear. Below are some known failure modes. If you notice a failure mode or believe there’s a bug, feel free to open an issue in the repository.
-4. Performance variations
-We notice that there can be subtle performance variations in different hardware and software environments. Some of the reasons include using/not using `torch.compile`, video reader library/backend, inference precision, batch sizes, random seeds, etc. We (will) provide pre-computed results on standard benchmarks for reference. Results obtained from this codebase should be similar but might not be exactly the same.
-### Known limitations
-1. The model sometimes generates unintelligible human speech-like sounds
-2. The model sometimes generates background music (without explicit training, it would not be high quality)
-3. The model struggles with unfamiliar concepts, e.g., it can generate "gunfires" but not "RPG firing".
-We believe all of these three limitations can be addressed with more high-quality training data.
-## Training
-See [TRAINING.md](docs/TRAINING.md).
-## Evaluation
-See [EVAL.md](docs/EVAL.md).
-## Training Datasets
-MMAudio was trained on several datasets, including [AudioSet](https://research.google.com/audioset/), [Freesound](https://github.com/LAION-AI/audio-dataset/blob/main/laion-audio-630k/README.md), [VGGSound](https://www.robots.ox.ac.uk/~vgg/data/vggsound/), [AudioCaps](https://audiocaps.github.io/), and [WavCaps](https://github.com/XinhaoMei/WavCaps). These datasets are subject to specific licenses, which can be accessed on their respective websites. We do not guarantee that the pre-trained models are suitable for commercial use. Please use them at your own risk.
-## Update Logs
-- 2025-03-09: Uploaded the corrected tsv files. See [TRAINING.md](docs/TRAINING.md).
-- 2025-02-27: Disabled the GradScaler by default to improve training stability. See #49.
-- 2024-12-23: Added training and batch evaluation scripts.
-- 2024-12-14: Removed the `ffmpeg<7` requirement for the demos by replacing `torio.io.StreamingMediaDecoder` with `pyav` for reading frames. The read frames are also cached, so we are not reading the same frames again during reconstruction. This should speed things up and make installation less of a hassle.
-- 2024-12-13: Improved for-loop processing in CLIP/Sync feature extraction by introducing a batch size multiplier. We can approximately use 40x batch size for CLIP/Sync without using more memory, thereby speeding up processing. Removed VAE encoder during inference -- we don't need it.
-- 2024-12-11: Replaced `torio.io.StreamingMediaDecoder` with `pyav` for reading framerate when reconstructing the input video. `torio.io.StreamingMediaDecoder` does not work reliably in huggingface ZeroGPU's environment, and I suspect that it might not work in some other environments as well.
-## Citation
-```bibtex
-@inproceedings{cheng2025taming,
-  title={{MMAudio}: Taming Multimodal Joint Training for High-Quality Video-to-Audio Synthesis},
-  author={Cheng, Ho Kei and Ishii, Masato and Hayakawa, Akio and Shibuya, Takashi and Schwing, Alexander and Mitsufuji, Yuki},
-  booktitle={CVPR},
-  year={2025}
-}
-```
-## Relevant Repositories
-- [av-benchmark](https://github.com/hkchengrex/av-benchmark) for benchmarking results.
-## License
-- The code in this repository is released under the MIT license as found in the [LICENSE file](LICENSE)
-- The checkpoints are released on Hugging Face under the CC-BY-NC 4.0 license as found at [https://creativecommons.org/licenses/by-nc/4.0/](https://creativecommons.org/licenses/by-nc/4.0/).
-## Disclaimer
-We have no affiliation with and have no knowledge of the party behind the domain "mmaudio.net".
-## Acknowledgement
-Many thanks to:
-- [Make-An-Audio 2](https://github.com/bytedance/Make-An-Audio-2) for the 16kHz BigVGAN pretrained model and the VAE architecture
-- [BigVGAN](https://github.com/NVIDIA/BigVGAN)
-- [Synchformer](https://github.com/v-iashin/Synchformer)
-- [EDM2](https://github.com/NVlabs/edm2) for the magnitude-preserving VAE network architecture

MMAudio/batch_eval.py DELETED Viewed

@@ -1,110 +0,0 @@
-import logging
-import os
-from pathlib import Path
-import hydra
-import torch
-import torch.distributed as distributed
-import torchaudio
-from hydra.core.hydra_config import HydraConfig
-from omegaconf import DictConfig
-from tqdm import tqdm
-from mmaudio.data.data_setup import setup_eval_dataset
-from mmaudio.eval_utils import ModelConfig, all_model_cfg, generate
-from mmaudio.model.flow_matching import FlowMatching
-from mmaudio.model.networks import MMAudio, get_my_mmaudio
-from mmaudio.model.utils.features_utils import FeaturesUtils
-torch.backends.cuda.matmul.allow_tf32 = True
-torch.backends.cudnn.allow_tf32 = True
-local_rank = int(os.environ['LOCAL_RANK'])
-world_size = int(os.environ['WORLD_SIZE'])
-log = logging.getLogger()
-@torch.inference_mode()
-@hydra.main(version_base='1.3.2', config_path='config', config_name='eval_config.yaml')
-def main(cfg: DictConfig):
-    device = 'cuda'
-    torch.cuda.set_device(local_rank)
-    if cfg.model not in all_model_cfg:
-        raise ValueError(f'Unknown model variant: {cfg.model}')
-    model: ModelConfig = all_model_cfg[cfg.model]
-    model.download_if_needed()
-    seq_cfg = model.seq_cfg
-    run_dir = Path(HydraConfig.get().run.dir)
-    if cfg.output_name is None:
-        output_dir = run_dir / cfg.dataset
-    else:
-        output_dir = run_dir / f'{cfg.dataset}-{cfg.output_name}'
-    output_dir.mkdir(parents=True, exist_ok=True)
-    # load a pretrained model
-    seq_cfg.duration = cfg.duration_s
-    net: MMAudio = get_my_mmaudio(cfg.model).to(device).eval()
-    net.load_weights(torch.load(model.model_path, map_location=device, weights_only=True))
-    log.info(f'Loaded weights from {model.model_path}')
-    net.update_seq_lengths(seq_cfg.latent_seq_len, seq_cfg.clip_seq_len, seq_cfg.sync_seq_len)
-    log.info(f'Latent seq len: {seq_cfg.latent_seq_len}')
-    log.info(f'Clip seq len: {seq_cfg.clip_seq_len}')
-    log.info(f'Sync seq len: {seq_cfg.sync_seq_len}')
-    # misc setup
-    rng = torch.Generator(device=device)
-    rng.manual_seed(cfg.seed)
-    fm = FlowMatching(cfg.sampling.min_sigma,
-                      inference_mode=cfg.sampling.method,
-                      num_steps=cfg.sampling.num_steps)
-    feature_utils = FeaturesUtils(tod_vae_ckpt=model.vae_path,
-                                  synchformer_ckpt=model.synchformer_ckpt,
-                                  enable_conditions=True,
-                                  mode=model.mode,
-                                  bigvgan_vocoder_ckpt=model.bigvgan_16k_path,
-                                  need_vae_encoder=False)
-    feature_utils = feature_utils.to(device).eval()
-    if cfg.compile:
-        net.preprocess_conditions = torch.compile(net.preprocess_conditions)
-        net.predict_flow = torch.compile(net.predict_flow)
-        feature_utils.compile()
-    dataset, loader = setup_eval_dataset(cfg.dataset, cfg)
-    with torch.amp.autocast(enabled=cfg.amp, dtype=torch.bfloat16, device_type=device):
-        for batch in tqdm(loader):
-            audios = generate(batch.get('clip_video', None),
-                              batch.get('sync_video', None),
-                              batch.get('caption', None),
-                              feature_utils=feature_utils,
-                              net=net,
-                              fm=fm,
-                              rng=rng,
-                              cfg_strength=cfg.cfg_strength,
-                              clip_batch_size_multiplier=64,
-                              sync_batch_size_multiplier=64)
-            audios = audios.float().cpu()
-            names = batch['name']
-            for audio, name in zip(audios, names):
-                torchaudio.save(output_dir / f'{name}.flac', audio, seq_cfg.sampling_rate)
-def distributed_setup():
-    distributed.init_process_group(backend="nccl")
-    local_rank = distributed.get_rank()
-    world_size = distributed.get_world_size()
-    log.info(f'Initialized: local_rank={local_rank}, world_size={world_size}')
-    return local_rank, world_size
-if __name__ == '__main__':
-    distributed_setup()
-    main()
-    # clean-up
-    distributed.destroy_process_group()

MMAudio/config/__init__.py DELETED Viewed

File without changes

MMAudio/config/base_config.yaml DELETED Viewed

@@ -1,62 +0,0 @@
-defaults:
-  - data: base
-  - eval_data: base
-  - override hydra/job_logging: custom-simplest
-  - _self_
-hydra:
-  run:
-    dir: ./output/${exp_id}
-  output_subdir: ${now:%Y-%m-%d_%H-%M-%S}-hydra
-enable_email: False
-model: small_16k
-exp_id: default
-debug: False
-cudnn_benchmark: True
-compile: True
-amp: True
-weights: null
-checkpoint: null
-seed: 14159265
-num_workers: 10 # per-GPU
-pin_memory: False # set to True if your system can handle it, i.e., have enough memory
-# NOTE: This DOES NOT affect the model during inference in any way
-# they are just for the dataloader to fill in the missing data in multi-modal loading
-# to change the sequence length for the model, see networks.py
-data_dim:
-  text_seq_len: 77
-  clip_dim: 1024
-  sync_dim: 768
-  text_dim: 1024
-# ema configuration
-ema:
-  enable: True
-  sigma_rels: [0.05, 0.1]
-  update_every: 1
-  checkpoint_every: 5_000
-  checkpoint_folder: ${hydra:run.dir}/ema_ckpts
-  default_output_sigma: 0.05
-# sampling
-sampling:
-  mean: 0.0
-  scale: 1.0
-  min_sigma: 0.0
-  method: euler
-  num_steps: 25
-# classifier-free guidance
-null_condition_probability: 0.1
-cfg_strength: 4.5
-# checkpoint paths to external modules
-vae_16k_ckpt: ./ext_weights/v1-16.pth
-vae_44k_ckpt: ./ext_weights/v1-44.pth
-bigvgan_vocoder_ckpt: ./ext_weights/best_netG.pt
-synchformer_ckpt: ./ext_weights/synchformer_state_dict.pth

MMAudio/config/data/base.yaml DELETED Viewed

@@ -1,70 +0,0 @@
-VGGSound:
-  root: ../data/video
-  subset_name: sets/vgg3-train.tsv
-  fps: 8
-  height: 384
-  width: 384
-  sample_duration_sec: 8.0
-VGGSound_test:
-  root: ../data/video
-  subset_name: sets/vgg3-test.tsv
-  fps: 8
-  height: 384
-  width: 384
-  sample_duration_sec: 8.0
-VGGSound_val:
-  root: ../data/video
-  subset_name: sets/vgg3-val.tsv
-  fps: 8
-  height: 384
-  width: 384
-  sample_duration_sec: 8.0
-ExtractedVGG:
-  tsv: ../data/v1-16-memmap/vgg-train.tsv
-  memmap_dir: ../data/v1-16-memmap/vgg-train
-ExtractedVGG_test:
-  tag: test
-  gt_cache: ../data/eval-cache/vggsound-test
-  output_subdir: null
-  tsv: ../data/v1-16-memmap/vgg-test.tsv
-  memmap_dir: ../data/v1-16-memmap/vgg-test
-ExtractedVGG_val:
-  tag: val
-  gt_cache: ../data/eval-cache/vggsound-val
-  output_subdir: val
-  tsv: ../data/v1-16-memmap/vgg-val.tsv
-  memmap_dir: ../data/v1-16-memmap/vgg-val
-AudioCaps:
-  tsv: ../data/v1-16-memmap/audiocaps.tsv
-  memmap_dir: ../data/v1-16-memmap/audiocaps
-AudioSetSL:
-  tsv: ../data/v1-16-memmap/audioset_sl.tsv
-  memmap_dir: ../data/v1-16-memmap/audioset_sl
-BBCSound:
-  tsv: ../data/v1-16-memmap/bbcsound.tsv
-  memmap_dir: ../data/v1-16-memmap/bbcsound
-FreeSound:
-  tsv: ../data/v1-16-memmap/freesound.tsv
-  memmap_dir: ../data/v1-16-memmap/freesound
-Clotho:
-  tsv: ../data/v1-16-memmap/clotho.tsv
-  memmap_dir: ../data/v1-16-memmap/clotho
-Example_video:
-  tsv: ./training/example_output/memmap/vgg-example.tsv
-  memmap_dir: ./training/example_output/memmap/vgg-example
-Example_audio:
-  tsv: ./training/example_output/memmap/audio-example.tsv
-  memmap_dir: ./training/example_output/memmap/audio-example

MMAudio/config/eval_config.yaml DELETED Viewed

@@ -1,17 +0,0 @@
-defaults:
-  - base_config
-  - override hydra/job_logging: custom-simplest
-  - _self_
-hydra:
-  run:
-    dir: ./output/${exp_id}
-  output_subdir: eval-${now:%Y-%m-%d_%H-%M-%S}-hydra
-exp_id: ${model}
-dataset: audiocaps
-duration_s: 8.0
-# for inference, this is the per-GPU batch size
-batch_size: 16
-output_name: null

MMAudio/config/eval_data/base.yaml DELETED Viewed

@@ -1,22 +0,0 @@
-AudioCaps:
-  audio_path: ../data/AudioCaps-test-audioldm-ver
-  # a csv file, with a header row of 'name' and 'caption'
-  # name should match the audio file name without extension
-  # Can be downloaded here: https://github.com/hkchengrex/MMAudio/releases/download/v0.1/AudioCaps_audioldm_data.csv
-  csv_path: ../data/AudioCaps-test-audioldm-ver/data.csv
-AudioCaps_full:
-  audio_path: ../data/AudioCaps-test-full-ver
-  # a csv file, with a header row of 'name' and 'caption'
-  # name should match the audio file name without extension
-  # Can be downloaded here: https://github.com/hkchengrex/MMAudio/releases/download/v0.1/AudioCaps_full_data.csv
-  csv_path: ../data/AudioCaps-test-full-ver/data.csv
-MovieGen:
-  video_path: ../data/MovieGen/MovieGenAudioBenchSfx/video_with_audio
-  jsonl_path: ../data/MovieGen/MovieGenAudioBenchSfx/metadata
-VGGSound:
-  video_path: ../data/test-videos
-  # from the officially released csv file
-  csv_path: ../data/vggsound.csv

MMAudio/config/hydra/job_logging/custom-eval.yaml DELETED Viewed

@@ -1,32 +0,0 @@
-# python logging configuration for tasks
-version: 1
-formatters:
-  simple:
-    format: '[%(asctime)s][%(levelname)s][r${oc.env:LOCAL_RANK}] - %(message)s'
-    datefmt: '%Y-%m-%d %H:%M:%S'
-  colorlog:
-    '()': 'colorlog.ColoredFormatter'
-    format: '[%(cyan)s%(asctime)s%(reset)s][%(log_color)s%(levelname)s%(reset)s] - %(message)s'
-    datefmt: '%Y-%m-%d %H:%M:%S'
-    log_colors:
-      DEBUG: purple
-      INFO: green
-      WARNING: yellow
-      ERROR: red
-      CRITICAL: red
-handlers:
-  console:
-    class: logging.StreamHandler
-    formatter: colorlog
-    stream: ext://sys.stdout
-  file:
-    class: logging.FileHandler
-    formatter: simple
-    # absolute file path
-    filename: ${hydra.runtime.output_dir}/eval-${now:%Y-%m-%d_%H-%M-%S}-rank${oc.env:LOCAL_RANK}.log
-    mode: w
-root:
-  level: INFO
-  handlers: [console, file]
-disable_existing_loggers: false

MMAudio/config/hydra/job_logging/custom-no-rank.yaml DELETED Viewed

@@ -1,32 +0,0 @@
-# python logging configuration for tasks
-version: 1
-formatters:
-  simple:
-    format: '[%(asctime)s][%(levelname)s] - %(message)s'
-    datefmt: '%Y-%m-%d %H:%M:%S'
-  colorlog:
-    '()': 'colorlog.ColoredFormatter'
-    format: '[%(cyan)s%(asctime)s%(reset)s][%(log_color)s%(levelname)s%(reset)s] - %(message)s'
-    datefmt: '%Y-%m-%d %H:%M:%S'
-    log_colors:
-      DEBUG: purple
-      INFO: green
-      WARNING: yellow
-      ERROR: red
-      CRITICAL: red
-handlers:
-  console:
-    class: logging.StreamHandler
-    formatter: colorlog
-    stream: ext://sys.stdout
-  file:
-    class: logging.FileHandler
-    formatter: simple
-    # absolute file path
-    filename: ${hydra.runtime.output_dir}/${now:%Y-%m-%d_%H-%M-%S}-eval.log
-    mode: w
-root:
-  level: INFO
-  handlers: [console, file]
-disable_existing_loggers: false

MMAudio/config/hydra/job_logging/custom-simplest.yaml DELETED Viewed

@@ -1,26 +0,0 @@
-# python logging configuration for tasks
-version: 1
-formatters:
-  simple:
-    format: '[%(asctime)s][%(levelname)s] - %(message)s'
-    datefmt: '%Y-%m-%d %H:%M:%S'
-  colorlog:
-    '()': 'colorlog.ColoredFormatter'
-    format: '[%(cyan)s%(asctime)s%(reset)s][%(log_color)s%(levelname)s%(reset)s] - %(message)s'
-    datefmt: '%Y-%m-%d %H:%M:%S'
-    log_colors:
-      DEBUG: purple
-      INFO: green
-      WARNING: yellow
-      ERROR: red
-      CRITICAL: red
-handlers:
-  console:
-    class: logging.StreamHandler
-    formatter: colorlog
-    stream: ext://sys.stdout
-root:
-  level: INFO
-  handlers: [console]
-disable_existing_loggers: false

MMAudio/config/hydra/job_logging/custom.yaml DELETED Viewed

@@ -1,33 +0,0 @@
-# @package hydra.job_logging
-# python logging configuration for tasks
-version: 1
-formatters:
-  simple:
-    format: '[%(asctime)s][%(levelname)s][r${oc.env:LOCAL_RANK}] - %(message)s'
-    datefmt: '%Y-%m-%d %H:%M:%S'
-  colorlog:
-    '()': 'colorlog.ColoredFormatter'
-    format: '[%(cyan)s%(asctime)s%(reset)s][%(blue)sr${oc.env:LOCAL_RANK}%(reset)s][%(log_color)s%(levelname)s%(reset)s] - %(message)s'
-    datefmt: '%Y-%m-%d %H:%M:%S'
-    log_colors:
-      DEBUG: purple
-      INFO: green
-      WARNING: yellow
-      ERROR: red
-      CRITICAL: red
-handlers:
-  console:
-    class: logging.StreamHandler
-    formatter: colorlog
-    stream: ext://sys.stdout
-  file:
-    class: logging.FileHandler
-    formatter: simple
-    # absolute file path
-    filename: ${hydra.runtime.output_dir}/train-${now:%Y-%m-%d_%H-%M-%S}-rank${oc.env:LOCAL_RANK}.log
-    mode: w
-root:
-  level: INFO
-  handlers: [console, file]
-disable_existing_loggers: false

MMAudio/config/train_config.yaml DELETED Viewed

@@ -1,41 +0,0 @@
-defaults:
-  - base_config
-  - override data: base
-  - override hydra/job_logging: custom
-  - _self_
-hydra:
-  run:
-    dir: ./output/${exp_id}
-  output_subdir: train-${now:%Y-%m-%d_%H-%M-%S}-hydra
-ema:
-  start: 0
-mini_train: False
-example_train: False
-enable_grad_scaler: False
-vgg_oversample_rate: 5
-log_text_interval: 200
-log_extra_interval: 20_000
-val_interval: 5_000
-eval_interval: 20_000
-save_eval_interval: 40_000
-save_weights_interval: 10_000
-save_checkpoint_interval: 10_000
-save_copy_iterations: []
-batch_size: 512
-eval_batch_size: 256 # per-GPU
-num_iterations: 300_000
-learning_rate: 1.0e-4
-linear_warmup_steps: 1_000
-lr_schedule: step
-lr_schedule_steps: [240_000, 270_000]
-lr_schedule_gamma: 0.1
-clip_grad_norm: 1.0
-weight_decay: 1.0e-6

MMAudio/demo.py DELETED Viewed

@@ -1,141 +0,0 @@
-import logging
-from argparse import ArgumentParser
-from pathlib import Path
-import torch
-import torchaudio
-from mmaudio.eval_utils import (ModelConfig, all_model_cfg, generate, load_video, make_video,
-                                setup_eval_logging)
-from mmaudio.model.flow_matching import FlowMatching
-from mmaudio.model.networks import MMAudio, get_my_mmaudio
-from mmaudio.model.utils.features_utils import FeaturesUtils
-torch.backends.cuda.matmul.allow_tf32 = True
-torch.backends.cudnn.allow_tf32 = True
-log = logging.getLogger()
-@torch.inference_mode()
-def main():
-    setup_eval_logging()
-    parser = ArgumentParser()
-    parser.add_argument('--variant',
-                        type=str,
-                        default='large_44k_v2',
-                        help='small_16k, small_44k, medium_44k, large_44k, large_44k_v2')
-    parser.add_argument('--video', type=Path, help='Path to the video file')
-    parser.add_argument('--prompt', type=str, help='Input prompt', default='')
-    parser.add_argument('--negative_prompt', type=str, help='Negative prompt', default='')
-    parser.add_argument('--duration', type=float, default=8.0)
-    parser.add_argument('--cfg_strength', type=float, default=4.5)
-    parser.add_argument('--num_steps', type=int, default=25)
-    parser.add_argument('--mask_away_clip', action='store_true')
-    parser.add_argument('--output', type=Path, help='Output directory', default='./output')
-    parser.add_argument('--seed', type=int, help='Random seed', default=42)
-    parser.add_argument('--skip_video_composite', action='store_true')
-    parser.add_argument('--full_precision', action='store_true')
-    args = parser.parse_args()
-    if args.variant not in all_model_cfg:
-        raise ValueError(f'Unknown model variant: {args.variant}')
-    model: ModelConfig = all_model_cfg[args.variant]
-    model.download_if_needed()
-    seq_cfg = model.seq_cfg
-    if args.video:
-        video_path: Path = Path(args.video).expanduser()
-    else:
-        video_path = None
-    prompt: str = args.prompt
-    negative_prompt: str = args.negative_prompt
-    output_dir: str = args.output.expanduser()
-    seed: int = args.seed
-    num_steps: int = args.num_steps
-    duration: float = args.duration
-    cfg_strength: float = args.cfg_strength
-    skip_video_composite: bool = args.skip_video_composite
-    mask_away_clip: bool = args.mask_away_clip
-    device = 'cpu'
-    if torch.cuda.is_available():
-        device = 'cuda'
-    elif torch.backends.mps.is_available():
-        device = 'mps'
-    else:
-        log.warning('CUDA/MPS are not available, running on CPU')
-    dtype = torch.float32 if args.full_precision else torch.bfloat16
-    output_dir.mkdir(parents=True, exist_ok=True)
-    # load a pretrained model
-    net: MMAudio = get_my_mmaudio(model.model_name).to(device, dtype).eval()
-    net.load_weights(torch.load(model.model_path, map_location=device, weights_only=True))
-    log.info(f'Loaded weights from {model.model_path}')
-    # misc setup
-    rng = torch.Generator(device=device)
-    rng.manual_seed(seed)
-    fm = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=num_steps)
-    feature_utils = FeaturesUtils(tod_vae_ckpt=model.vae_path,
-                                  synchformer_ckpt=model.synchformer_ckpt,
-                                  enable_conditions=True,
-                                  mode=model.mode,
-                                  bigvgan_vocoder_ckpt=model.bigvgan_16k_path,
-                                  need_vae_encoder=False)
-    feature_utils = feature_utils.to(device, dtype).eval()
-    if video_path is not None:
-        log.info(f'Using video {video_path}')
-        video_info = load_video(video_path, duration)
-        clip_frames = video_info.clip_frames
-        sync_frames = video_info.sync_frames
-        duration = video_info.duration_sec
-        if mask_away_clip:
-            clip_frames = None
-        else:
-            clip_frames = clip_frames.unsqueeze(0)
-        sync_frames = sync_frames.unsqueeze(0)
-    else:
-        log.info('No video provided -- text-to-audio mode')
-        clip_frames = sync_frames = None
-    seq_cfg.duration = duration
-    net.update_seq_lengths(seq_cfg.latent_seq_len, seq_cfg.clip_seq_len, seq_cfg.sync_seq_len)
-    log.info(f'Prompt: {prompt}')
-    log.info(f'Negative prompt: {negative_prompt}')
-    audios = generate(clip_frames,
-                      sync_frames, [prompt],
-                      negative_text=[negative_prompt],
-                      feature_utils=feature_utils,
-                      net=net,
-                      fm=fm,
-                      rng=rng,
-                      cfg_strength=cfg_strength)
-    audio = audios.float().cpu()[0]
-    if video_path is not None:
-        save_path = output_dir / f'{video_path.stem}.flac'
-    else:
-        safe_filename = prompt.replace(' ', '_').replace('/', '_').replace('.', '')
-        save_path = output_dir / f'{safe_filename}.flac'
-    torchaudio.save(save_path, audio, seq_cfg.sampling_rate)
-    log.info(f'Audio saved to {save_path}')
-    if video_path is not None and not skip_video_composite:
-        video_save_path = output_dir / f'{video_path.stem}.mp4'
-        make_video(video_info, video_save_path, audio, sampling_rate=seq_cfg.sampling_rate)
-        log.info(f'Video saved to {output_dir / video_save_path}')
-    log.info('Memory usage: %.2f GB', torch.cuda.max_memory_allocated() / (2**30))
-if __name__ == '__main__':
-    main()

MMAudio/docs/EVAL.md DELETED Viewed

@@ -1,23 +0,0 @@
-# Evaluation
-## Batch Evaluation
-To evaluate the model on a dataset, use the `batch_eval.py` script. It is significantly more efficient in large-scale evaluation compared to `demo.py`, supporting batched inference, multi-GPU inference, torch compilation, and skipping video compositions.
-An example of running this script with four GPUs is as follows:
-```bash
-OMP_NUM_THREADS=4 torchrun --standalone --nproc_per_node=4  batch_eval.py duration_s=8 dataset=vggsound model=small_16k num_workers=8
-```
-You may need to update the data paths in `config/eval_data/base.yaml`.
-More configuration options can be found in `config/base_config.yaml` and `config/eval_config.yaml`.
-You might also want to change the dataset definition if you are not evaluating on VGGSound: https://github.com/hkchengrex/MMAudio/blob/main/mmaudio/data/eval/video_dataset.py
-## Precomputed Results
-Precomputed results for VGGSound, AudioCaps, and MovieGen are available here: https://huggingface.co/datasets/hkchengrex/MMAudio-precomputed-results
-## Obtaining Quantitative Metrics
-Our evaluation code is available here: https://github.com/hkchengrex/av-benchmark

MMAudio/docs/MODELS.md DELETED Viewed

@@ -1,50 +0,0 @@
-# Pretrained models
-The models will be downloaded automatically when you run the demo script. MD5 checksums are provided in `mmaudio/utils/download_utils.py`.
-The models are also available at https://huggingface.co/hkchengrex/MMAudio/tree/main
-| Model    | Download link | File size |
-| -------- | ------- | ------- |
-| Flow prediction network, small 16kHz | <a href="https://huggingface.co/hkchengrex/MMAudio/resolve/main/weights/mmaudio_small_16k.pth" download="mmaudio_small_16k.pth">mmaudio_small_16k.pth</a> | 601M |
-| Flow prediction network, small 44.1kHz | <a href="https://huggingface.co/hkchengrex/MMAudio/resolve/main/weights/mmaudio_small_44k.pth" download="mmaudio_small_44k.pth">mmaudio_small_44k.pth</a> | 601M |
-| Flow prediction network, medium 44.1kHz | <a href="https://huggingface.co/hkchengrex/MMAudio/resolve/main/weights/mmaudio_medium_44k.pth" download="mmaudio_medium_44k.pth">mmaudio_medium_44k.pth</a> | 2.4G |
-| Flow prediction network, large 44.1kHz | <a href="https://huggingface.co/hkchengrex/MMAudio/resolve/main/weights/mmaudio_large_44k.pth" download="mmaudio_large_44k.pth">mmaudio_large_44k.pth</a> | 3.9G |
-| Flow prediction network, large 44.1kHz, v2 **(recommended)** | <a href="https://huggingface.co/hkchengrex/MMAudio/resolve/main/weights/mmaudio_large_44k_v2.pth" download="mmaudio_large_44k_v2.pth">mmaudio_large_44k_v2.pth</a> | 3.9G |
-| 16kHz VAE | <a href="https://github.com/hkchengrex/MMAudio/releases/download/v0.1/v1-16.pth">v1-16.pth</a> | 655M |
-| 16kHz BigVGAN vocoder (from Make-An-Audio 2) |<a href="https://github.com/hkchengrex/MMAudio/releases/download/v0.1/best_netG.pt">best_netG.pt</a> | 429M |
-| 44.1kHz VAE |<a href="https://github.com/hkchengrex/MMAudio/releases/download/v0.1/v1-44.pth">v1-44.pth</a> | 1.2G |
-| Synchformer visual encoder |<a href="https://github.com/hkchengrex/MMAudio/releases/download/v0.1/synchformer_state_dict.pth">synchformer_state_dict.pth</a> | 907M |
-To run the model, you need four components: a flow prediction network, visual feature extractors (Synchformer and CLIP, CLIP will be downloaded automatically), a VAE, and a vocoder. VAEs and vocoders are specific to the sampling rate (16kHz or 44.1kHz) and not model sizes.
-The 44.1kHz vocoder will be downloaded automatically.
-The `_v2` model performs worse in benchmarking (e.g., in Fréchet distance), but, in my experience, generalizes better to new data.
-The expected directory structure (full):
-```bash
-MMAudio
-├── ext_weights
-│   ├── best_netG.pt
-│   ├── synchformer_state_dict.pth
-│   ├── v1-16.pth
-│   └── v1-44.pth
-├── weights
-│   ├── mmaudio_small_16k.pth
-│   ├── mmaudio_small_44k.pth
-│   ├── mmaudio_medium_44k.pth
-│   ├── mmaudio_large_44k.pth
-│   └── mmaudio_large_44k_v2.pth
-└── ...
-```
-The expected directory structure (minimal, for the recommended model only):
-```bash
-MMAudio
-├── ext_weights
-│   ├── synchformer_state_dict.pth
-│   └── v1-44.pth
-├── weights
-│   └── mmaudio_large_44k_v2.pth
-└── ...
-```

MMAudio/docs/TRAINING.md DELETED Viewed

@@ -1,184 +0,0 @@
-# Training
-## Overview
-We have put a large emphasis on making training as fast as possible.
-Consequently, some pre-processing steps are required.
-Namely, before starting any training, we
-1. Obtain training data as videos, audios, and captions.
-2. Encode training audios into spectrograms and then with VAE into mean/std
-3. Extract CLIP and synchronization features from videos
-4. Extract CLIP features from text (captions)
-5. Encode all extracted features into [MemoryMappedTensors](https://pytorch.org/tensordict/main/reference/generated/tensordict.MemoryMappedTensor.html) with [TensorDict](https://pytorch.org/tensordict/main/reference/tensordict.html)
-**NOTE:** for maximum training speed (e.g., when training the base model with 2*H100s), you would need around 3~5 GB/s of random read speed. Spinning disks would not be able to catch up and most consumer-grade SSDs would struggle. In my experience, the best bet is to have a large enough system memory such that the OS can cache the data. This way, the data is read from RAM instead of disk.
-The current training script does not support `_v2` training.
-## Recommended Hardware Configuration
-These are what I recommend for a smooth and efficient training experience. These are not minimum requirements.
-- Single-node machine. We did not implement multi-node training
-- GPUs: for the small model, two 80G-H100s or above; for the large model, eight 80G-H100s or above
-- System memory: for 16kHz training, 600GB+; for 44kHz training, 700GB+
-- Storage: >2TB of fast NVMe storage. If you have enough system memory, OS caching will help and the storage does not need to be as fast.
-## Prerequisites
-1. Install [av-benchmark](https://github.com/hkchengrex/av-benchmark). We use this library to automatically evaluate on the validation set during training, and on the test set after training.
-2. Extract features for evaluation using [av-benchmark](https://github.com/hkchengrex/av-benchmark) for the validation and test set as a [validation cache](https://github.com/hkchengrex/MMAudio/blob/34bf089fdd2e457cd5ef33be96c0e1c8a0412476/config/data/base.yaml#L38) and a [test cache](https://github.com/hkchengrex/MMAudio/blob/34bf089fdd2e457cd5ef33be96c0e1c8a0412476/config/data/base.yaml#L31). You can also download the precomputed evaluation cache [here](https://huggingface.co/datasets/hkchengrex/MMAudio-precomputed-results/tree/main).
-3. You will need ffmpeg to extract frames from videos. Note that `torchaudio` imposes a maximum version limit (`ffmpeg<7`). You can install it as follows:
-```bash
-conda install -c conda-forge 'ffmpeg<7'
-```
-4. Download the training datasets. We used [VGGSound](https://arxiv.org/abs/2004.14368), [AudioCaps](https://audiocaps.github.io/), [WavCaps](https://arxiv.org/abs/2303.17395), and [Clotho](https://arxiv.org/abs/1910.09387) (paper to be updated). Note that the audio files in the Hugging Face release of WavCaps have been downsampled to 32kHz. To the best of our ability, we located the original (high-sampling rate) audio files and used them instead to prevent artifacts during 44.1kHz training. We did not use the "SoundBible" portion of WavCaps, since it is a small set with many short audio clips unsuitable for our training.
-5. Download the corresponding VAE (`v1-16.pth` for 16kHz training, and `v1-44.pth` for 44.1kHz training), vocoder models (`best_netG.pt` for 16kHz training; the vocoder for 44.1kHz training will be downloaded automatically), the [empty string encoding](https://github.com/hkchengrex/MMAudio/releases/download/v0.1/empty_string.pth), and Synchformer weights from [MODELS.md](https://github.com/hkchengrex/MMAudio/blob/main/docs/MODELS.md), and place them in `ext_weights/`.
-### Helpful links for downloading the datasets
-We cannot redistribute the datasets for copyright reasons, but we do find some links helpful and they might be helpful to you as well.
-- https://huggingface.co/datasets/Meranti/CLAP_freesound
-- https://huggingface.co/datasets/agkphysics/AudioSet
-- https://sound-effects.bbcrewind.co.uk/
-For certain sources of VGGSound, you might notice desynchronization between the audio and the video. This happens because the video keyframes do not always align with the start of the audio, and what happens during playback is player-dependent. We used PyTorch's decoder, which can correctly handle these cases.
-## Preparing Audio-Video-Text Features
-We have prepared some example data in `training/example_videos`.
-`training/extract_video_training_latents.py` extracts audio, video, and text features and saves them as a `TensorDict` with a `.tsv` file containing metadata to `output_dir`.
-To run this script, use the `torchrun` utility:
-```bash
-torchrun --standalone training/extract_video_training_latents.py
-```
-You can run this script with multiple GPUs (with `--nproc_per_node=<n>` after `--standalone` and before the script name) to speed up extraction.
-Modify the definitions near the top of the script to switch between 16kHz/44.1kHz extraction.
-Change the data path definitions in `data_cfg` if necessary.
-Arguments:
-- `latent_dir` -- where intermediate latent outputs are saved. It is safe to delete this directory afterwards.
-- `output_dir` -- where TensorDict and the metadata file are saved.
-Outputs produced in `output_dir`:
-1. A directory named `vgg-{split}` (i.e., in the TensorDict format), containing
-    a. `mean.memmap` mean values predicted by the VAE encoder (number of videos X sequence length X channel size)
-    b. `std.memmap` standard deviation values predicted by the VAE encoder (number of videos X sequence length X channel size)
-    c. `text_features.memmap` text features extracted from CLIP (number of videos X 77 (sequence length) X 1024)
-    d. `clip_features.memmap` clip features extracted from CLIP (number of videos X 64 (8 fps) X 1024)
-    e. `sync_features.memmap` synchronization features extracted from Synchformer (number of videos X 192 (24 fps) X 768)
-    f. `meta.json` that contains the metadata for the above memory mappings
-2. A tab-separated values file named `vgg-{split}.tsv` that contains two columns: `id` containing video file names without extension, and `label` containing corresponding text labels (i.e., captions)
-## Preparing Audio-Text Features
-We have prepared some example data in `training/example_audios`.
-1. Run `training/partition_clips.py` to partition each audio file into clips (by finding start and end points; we do not save the partitioned audio onto the disk to save disk space)
-2. Run `training/extract_audio_training_latents.py` to extract each clip's audio and text features and save them as a `TensorDict` with a `.tsv` file containing metadata to `output_dir`.
-### Partitioning the audio files
-Run
-```bash
-python training/partition_clips.py
-```
-Arguments:
-- `data_dir` -- path to a directory containing the audio files (`.flac` or `.wav`)
-- `output_dir` -- path to the output `.csv` file
-- `start` -- optional; useful when you need to run multiple processes to speed up processing -- this defines the beginning of the chunk to be processed
-- `end` -- optional; useful when you need to run multiple processes to speed up processing -- this defines the end of the chunk to be processed
-### Extracting audio and text features
-Run
-```bash
-torchrun --standalone training/extract_audio_training_latents.py
-```
-You can run this with multiple GPUs (with `--nproc_per_node=<n>`) to speed up extraction.
-Modify the definitions near the top of the script to switch between 16kHz/44.1kHz extraction.
-Arguments:
-- `data_dir` -- path to a directory containing the audio files (`.flac` or `.wav`), same as the previous step
-- `captions_tsv` -- path to the captions file, a tab-separated values (tsv) file at least with columns `id` and `caption`
-- `clips_tsv` -- path to the clips file, generated in the last step
-- `latent_dir` -- where intermediate latent outputs are saved. It is safe to delete this directory afterwards.
-- `output_dir` -- where TensorDict and the metadata file are saved.
-Outputs produced in `output_dir`:
-1. A directory named `{basename(output_dir)}` (i.e., in the TensorDict format), containing
-    a. `mean.memmap` mean values predicted by the VAE encoder (number of audios X sequence length X channel size)
-    b. `std.memmap` standard deviation values predicted by the VAE encoder (number of audios X sequence length X channel size)
-    c. `text_features.memmap` text features extracted from CLIP (number of audios X 77 (sequence length) X 1024)
-    d. `meta.json` that contains the metadata for the above memory mappings
-2. A tab-separated values file named `{basename(output_dir)}.tsv` that contains two columns: `id` containing audio file names without extension, and `label` containing corresponding text labels (i.e., captions)
-### Reference tsv files (with overlaps removed as mentioned in the paper)
-The reference tsv files can be found [here](https://github.com/hkchengrex/MMAudio/releases/tag/v0.1).
-Note that these reference tsv files are the **outputs** of `extract_audio_training_latents.py`, which means the `id` column might contain duplicate entries (one per clip). You can still use it as the `captions_tsv` input though -- the script will handle duplicates gracefully.
-Among these reference tsv files, `audioset_sl.tsv`, `bbcsound.tsv`, and `freesound.tsv` are subsets that are parts of WavCaps. These subsets might be smaller than the original datasets.
-The Clotho data contains both the development set and the validation set.
-**Update (Mar 9, 2025)**:
-We have uploaded a corrected set of reference tsv files. The previous tsv files contained some (<1%) corrupted captions (i.e., mismatch between audio and caption, see https://github.com/hkchengrex/MMAudio/issues/56). The tsv files for VGGSound are unaffected. The reason for this error is unknown, but I cannot reproduce this error in the latest version of the code. Our pre-trained models were trained with **uncorrected** tsv files. For future training, I recommend using the corrected tsv files.
-The error statistics are as follows:
-- AudioCaps: (170/43824), 0.39%
-- Freesound: (1670/180636), 0.92%
-- AudioSet: (290/100776), 0.29%
-- BBCSound: (3/29975), 0.01%
-- Clotho: (8/24332), 0.03%
-## Training on Extracted Features
-We use Distributed Data Parallel (DDP) for training.
-First, specify the data path in `config/data/base.yaml`. If you used the default parameters in the scripts above to extract features for the example data, the `Example_video` and `Example_audio` items should already be correct.
-To run training on the example data, use the following command:
-```bash
-OMP_NUM_THREADS=4 torchrun --standalone --nproc_per_node=1 train.py exp_id=debug compile=False  debug=True example_train=True  batch_size=1
-```
-This will not train a useful model, but it will check if everything is set up correctly.
-For full training on the base model with two GPUs, use the following command:
-```bash
-OMP_NUM_THREADS=4 torchrun --standalone --nproc_per_node=2 train.py exp_id=exp_1 model=small_16k
-```
-Any outputs from training will be stored in `output/<exp_id>`.
-More configuration options can be found in `config/base_config.yaml` and `config/train_config.yaml`.
-For the medium and large models, specify `vgg_oversample_rate` to be `3` to reduce overfitting.
-## Checkpoints
-Model checkpoints, including optimizer states and the latest EMA weights, are available here: https://huggingface.co/hkchengrex/MMAudio
----
-Godspeed!

MMAudio/docs/demo.html DELETED Viewed

@@ -1,81 +0,0 @@
-<!DOCTYPE html>
-<html lang="en">
-<head>
-    <!-- Google tag (gtag.js) -->
-    <script async src="https://www.googletagmanager.com/gtag/js?id=G-0JKBJ3WRJZ"></script>
-    <script>
-    window.dataLayer = window.dataLayer || [];
-    function gtag(){dataLayer.push(arguments);}
-    gtag('js', new Date());
-    gtag('config', 'G-0JKBJ3WRJZ');
-    </script>
-    <link href='https://fonts.googleapis.com/css?family=Source+Sans+Pro' rel='stylesheet' type='text/css'>
-    <meta charset="UTF-8">
-    <title>MMAudio</title>
-    <link rel="icon" type="image/png" href="images/icon.png">
-    <meta name="viewport" content="width=device-width, initial-scale=1">
-    <!-- CSS only -->
-    <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.0.1/dist/css/bootstrap.min.css" rel="stylesheet"
-        integrity="sha384-+0n0xVW2eSR5OomGNYDnhzAbDsOXxcvSN1TPprVMTNDbiYZCxYbOOl7+AMvyTG2x" crossorigin="anonymous">
-    <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.7.1/jquery.min.js"></script>
-    <link rel="stylesheet" href="style_videos.css">
-</head>
-<body>
-    <div id="moviegen_all">
-    <h2 id="moviegen" style="text-align: center;">Supplementary Videos</h2>
-    <div class="row g-1">
-        <div class="col-12 col-md-4">
-            <div class="video-header" style="font-size: large;">Golf; ground-truth</div>
-            <div class="video-container">
-                <iframe src="https://youtube.com/embed/1hwSu42kkho"></iframe>
-            </div>
-        </div>
-        <div class="col-12 col-md-4">
-            <div class="video-header" style="font-size: large;">Golf; FoleyCrafter</div>
-            <div class="video-container">
-                <iframe src="https://youtube.com/embed/Lfsx8mOPcJo"></iframe>
-                </div>
-        </div>
-        <div class="col-12 col-md-4">
-            <div class="video-header" style="font-size: large;">Golf; Ours</div>
-            <div class="video-container">
-                <iframe src="https://youtube.com/embed/kZibDoDCNxI"></iframe>
-                </div>
-        </div>
-    </div>
-    <br>
-    <div class="row g-1">
-        <div class="col-12 col-md-4">
-            <div class="video-header" style="font-size: large;">Waves; Ours</div>
-            <div class="video-container">
-                <iframe src="https://youtube.com/embed/7zQzDEuFnfI"></iframe>
-            </div>
-        </div>
-        <div class="col-12 col-md-4">
-            <div class="video-header" style="font-size: large;">Featured MMAudio</div>
-            <div class="video-container">
-                <iframe src="https://youtube.com/embed/SLz3NWLyHxg"></iframe>
-            </div>
-        </div>
-        <div class="col-12 col-md-4">
-            <div class="video-header" style="font-size: large;">Failure case</div>
-            <div class="video-container">
-                <iframe src="https://youtube.com/embed/nx0CyrDu70Y"></iframe>
-            </div>
-        </div>
-    </div>
-    <br>
-    </div>
-</body>
-</html>

MMAudio/docs/images/icon.png DELETED Viewed

Binary file (163 Bytes)

MMAudio/docs/index.html DELETED Viewed

@@ -1,156 +0,0 @@
-<!DOCTYPE html>
-<html lang="en">
-<head>
-    <!-- Google tag (gtag.js) -->
-    <script async src="https://www.googletagmanager.com/gtag/js?id=G-0JKBJ3WRJZ"></script>
-    <script>
-    window.dataLayer = window.dataLayer || [];
-    function gtag(){dataLayer.push(arguments);}
-    gtag('js', new Date());
-    gtag('config', 'G-0JKBJ3WRJZ');
-    </script>
-    <link rel="preconnect" href="https://fonts.googleapis.com">
-    <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
-    <link href="https://fonts.googleapis.com/css2?family=Source+Sans+3&display=swap" rel="stylesheet">
-    <meta charset="UTF-8">
-    <title>MMAudio</title>
-    <link rel="icon" type="image/png" href="images/icon.png">
-    <meta name="viewport" content="width=device-width, initial-scale=1">
-    <!-- CSS only -->
-    <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.0.1/dist/css/bootstrap.min.css" rel="stylesheet"
-        integrity="sha384-+0n0xVW2eSR5OomGNYDnhzAbDsOXxcvSN1TPprVMTNDbiYZCxYbOOl7+AMvyTG2x" crossorigin="anonymous">
-    <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
-    <link rel="stylesheet" href="style.css">
-</head>
-<body>
-    <body>
-        <br><br><br><br>
-        <div class="container">
-            <div class="row text-center" style="font-size:38px">
-                <div class="col strong">
-                    MMAudio: Taming Multimodal Joint Training for High-Quality Video-to-Audio Synthesis
-                </div>
-            </div>
-            <br>
-            <div class="row text-center" style="font-size:28px">
-                <div class="col">
-                    CVPR 2025
-                </div>
-            </div>
-            <br>
-            <div class="h-100 row text-center heavy justify-content-md-center" style="font-size:22px;">
-                <div class="col-sm-auto px-lg-2">
-                    <a href="https://hkchengrex.github.io/">Ho Kei Cheng<sup>1</sup></a>
-                </div>
-                <div class="col-sm-auto px-lg-2">
-                    <nobr><a href="https://scholar.google.co.jp/citations?user=RRIO1CcAAAAJ">Masato Ishii<sup>2</sup></a></nobr>
-                </div>
-                <div class="col-sm-auto px-lg-2">
-                    <nobr><a href="https://scholar.google.com/citations?user=sXAjHFIAAAAJ">Akio Hayakawa<sup>2</sup></a></nobr>
-                </div>
-                <div class="col-sm-auto px-lg-2">
-                    <nobr><a href="https://scholar.google.com/citations?user=XCRO260AAAAJ">Takashi Shibuya<sup>2</sup></a></nobr>
-                </div>
-                <div class="col-sm-auto px-lg-2">
-                    <nobr><a href="https://www.alexander-schwing.de/">Alexander Schwing<sup>1</sup></a></nobr>
-                </div>
-                <div class="col-sm-auto px-lg-2" >
-                    <nobr><a href="https://www.yukimitsufuji.com/">Yuki Mitsufuji<sup>2,3</sup></a></nobr>
-                </div>
-            </div>
-            <div class="h-100 row text-center heavy justify-content-md-center" style="font-size:22px;">
-                <div class="col-sm-auto px-lg-2">
-                    <sup>1</sup>University of Illinois Urbana-Champaign
-                </div>
-                <div class="col-sm-auto px-lg-2">
-                    <sup>2</sup>Sony AI
-                </div>
-                <div class="col-sm-auto px-lg-2">
-                    <sup>3</sup>Sony Group Corporation
-                </div>
-            </div>
-            <br>
-            <br>
-            <div class="h-100 row text-center justify-content-md-center" style="font-size:20px;">
-                <div class="col-sm-2">
-                    <a href="https://arxiv.org/abs/2412.15322">[Paper]</a>
-                </div>
-                <div class="col-sm-2">
-                    <a href="https://github.com/hkchengrex/MMAudio">[Code]</a>
-                </div>
-                <div class="col-sm-3">
-                    <a href="https://huggingface.co/spaces/hkchengrex/MMAudio">[Huggingface Demo]</a>
-                </div>
-                <div class="col-sm-2">
-                    <a href="https://colab.research.google.com/drive/1TAaXCY2-kPk4xE4PwKB3EqFbSnkUuzZ8?usp=sharing">[Colab Demo]</a>
-                </div>
-                <div class="col-sm-3">
-                    <a href="https://replicate.com/zsxkib/mmaudio">[Replicate Demo]</a>
-                </div>
-            </div>
-            <br>
-            <hr>
-            <div class="row" style="font-size:32px">
-                <div class="col strong">
-                    TL;DR
-                </div>
-            </div>
-            <br>
-            <div class="row">
-                <div class="col">
-                    <p class="light" style="text-align: left;">
-                        MMAudio generates synchronized audio given video and/or text inputs.
-                    </p>
-                    <p>
-                        Check out this fun video!
-                        <div class="video-container" style="text-align: center;">
-                            <iframe src="https://youtube.com/embed/SLz3NWLyHxg"></iframe>
-                        </div>
-                    </p>
-                </div>
-            </div>
-            <br>
-            <hr>
-            <br>
-            <div class="row" style="font-size:32px">
-                <div class="col strong">
-                    Demo
-                </div>
-            </div>
-            <br>
-            <div class="row" style="font-size:48px">
-                <div class="col strong text-center">
-                    <a href="video_main.html" style="text-decoration: underline;">&lt;More results&gt;</a>
-                </div>
-            </div>
-            <br>
-            <div class="video-container" style="text-align: center;">
-                <iframe src="https://youtube.com/embed/YElewUT2M4M"></iframe>
-            </div>
-            <br>
-            <br><br>
-            <br><br>
-        </div>
-</body>
-</html>

MMAudio/docs/style.css DELETED Viewed

@@ -1,78 +0,0 @@
-body {
-    font-family: 'Source Sans 3', sans-serif;
-    font-size: 18px;
-    margin-left: auto;
-    margin-right: auto;
-    font-weight: 400;
-    height: 100%;
-    max-width: 1000px;
-}
-table {
-    width: 100%;
-    border-collapse: collapse;
-}
-th, td {
-    border: 1px solid #ddd;
-    padding: 8px;
-    text-align: center;
-}
-th {
-    background-color: #f2f2f2;
-}
-video {
-    width: 100%;
-    height: auto;
-}
-p {
-    font-size: 28px;
-}
-h2 {
-    font-size: 36px;
-}
-.strong {
-    font-weight: 700;
-}
-.light {
-    font-weight: 100;
-}
-.heavy {
-    font-weight: 900;
-}
-.column {
-    float: left;
-}
-a:link,
-a:visited {
-    color: #05538f;
-    text-decoration: none;
-}
-a:hover {
-    color: #63cbdd;
-}
-hr {
-    border: 0;
-    height: 1px;
-    background-image: linear-gradient(to right, rgba(0, 0, 0, 0), rgba(0, 0, 0, 0.75), rgba(0, 0, 0, 0));
-}
-.video-container {
-    position: relative;
-    padding-bottom: 56.25%; /* 16:9 */
-    height: 0;
-  }
-.video-container iframe {
-    position: absolute;
-    top: 0;
-    left: 0;
-    width: 100%;
-    height: 100%;
-}

MMAudio/docs/style_videos.css DELETED Viewed

@@ -1,52 +0,0 @@
-body {
-    font-family: 'Source Sans 3', sans-serif;
-    font-size: 1.5vh;
-    font-weight: 400;
-}
-table {
-    width: 100%;
-    border-collapse: collapse;
-}
-th, td {
-    border: 1px solid #ddd;
-    padding: 8px;
-    text-align: center;
-}
-th {
-    background-color: #f2f2f2;
-}
-video {
-    width: 100%;
-    height: auto;
-}
-p {
-    font-size: 1.5vh;
-    font-weight: bold;
-}
-h2 {
-    font-size: 2vh;
-    font-weight: bold;
-}
-.video-container {
-    position: relative;
-    padding-bottom: 56.25%; /* 16:9 */
-    height: 0;
-  }
-.video-container iframe {
-    position: absolute;
-    top: 0;
-    left: 0;
-    width: 100%;
-    height: 100%;
-}
-.video-header {
-    background-color: #f2f2f2;
-    text-align: center;
-    font-size: 1.5vh;
-    font-weight: bold;
-    padding: 8px;
-}

MMAudio/docs/video_gen.html DELETED Viewed

@@ -1,254 +0,0 @@
-<!DOCTYPE html>
-<html lang="en">
-<head>
-    <!-- Google tag (gtag.js) -->
-    <script async src="https://www.googletagmanager.com/gtag/js?id=G-0JKBJ3WRJZ"></script>
-    <script>
-    window.dataLayer = window.dataLayer || [];
-    function gtag(){dataLayer.push(arguments);}
-    gtag('js', new Date());
-    gtag('config', 'G-0JKBJ3WRJZ');
-    </script>
-    <link href='https://fonts.googleapis.com/css?family=Source+Sans+Pro' rel='stylesheet' type='text/css'>
-    <meta charset="UTF-8">
-    <title>MMAudio</title>
-    <link rel="icon" type="image/png" href="images/icon.png">
-    <meta name="viewport" content="width=device-width, initial-scale=1">
-    <!-- CSS only -->
-    <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.0.1/dist/css/bootstrap.min.css" rel="stylesheet"
-        integrity="sha384-+0n0xVW2eSR5OomGNYDnhzAbDsOXxcvSN1TPprVMTNDbiYZCxYbOOl7+AMvyTG2x" crossorigin="anonymous">
-    <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.7.1/jquery.min.js"></script>
-    <link rel="stylesheet" href="style_videos.css">
-</head>
-<body>
-    <div id="moviegen_all">
-    <h2 id="moviegen" style="text-align: center;">Comparisons with Movie Gen Audio on Videos Generated by MovieGen</h2>
-    <p id="moviegen1" style="overflow: hidden;">
-        Example 1: Ice cracking with sharp snapping sound, and metal tool scraping against the ice surface.
-        <span style="float: right;"><a href="#index">Back to index</a></span>
-    </p>
-    <div class="row g-1">
-        <div class="col-sm-6">
-            <div class="video-header">Movie Gen Audio</div>
-            <div class="video-container">
-                <iframe src="https://youtube.com/embed/d7Lb0ihtGcE"></iframe>
-            </div>
-        </div>
-        <div class="col-sm-6">
-            <div class="video-header">Ours</div>
-            <div class="video-container">
-                <iframe src="https://youtube.com/embed/F4JoJ2r2m8U"></iframe>
-                </div>
-        </div>
-    </div>
-    <br>
-    <!-- <p id="moviegen2">Example 2: Rhythmic splashing and lapping of water. <span style="float:right;"><a href="#index">Back to index</a></span> </p>
-    <table>
-        <thead>
-            <tr>
-                <th>Movie Gen Audio</th>
-                <th>Ours</th>
-            </tr>
-        </thead>
-        <tbody>
-            <tr>
-                <td width="50%">
-                    <div class="video-container">
-                    <iframe src="https://youtube.com/embed/5gQNPK99CIk"></iframe>
-                    </div>
-                </td>
-                <td width="50%">
-                    <div class="video-container">
-                    <iframe src="https://youtube.com/embed/AbwnTzG-BpA"></iframe>
-                    </div>
-                </td>
-            </tr>
-        </tbody>
-    </table> -->
-    <p id="moviegen2" style="overflow: hidden;">
-        Example 2: Rhythmic splashing and lapping of water.
-        <span style="float:right;"><a href="#index">Back to index</a></span>
-    </p>
-    <div class="row g-1">
-        <div class="col-sm-6">
-            <div class="video-header">Movie Gen Audio</div>
-            <div class="video-container">
-                <iframe src="https://youtube.com/embed/5gQNPK99CIk"></iframe>
-            </div>
-        </div>
-        <div class="col-sm-6">
-            <div class="video-header">Ours</div>
-            <div class="video-container">
-                <iframe src="https://youtube.com/embed/AbwnTzG-BpA"></iframe>
-                </div>
-        </div>
-    </div>
-    <br>
-    <p id="moviegen3" style="overflow: hidden;">
-        Example 3: Shovel scrapes against dry earth.
-        <span style="float:right;"><a href="#index">Back to index</a></span>
-    </p>
-    <div class="row g-1">
-        <div class="col-sm-6">
-            <div class="video-header">Movie Gen Audio</div>
-            <div class="video-container">
-                <iframe src="https://youtube.com/embed/PUKGyEve7XQ"></iframe>
-            </div>
-        </div>
-        <div class="col-sm-6">
-            <div class="video-header">Ours</div>
-            <div class="video-container">
-                <iframe src="https://youtube.com/embed/CNn7i8VNkdc"></iframe>
-            </div>
-        </div>
-    </div>
-    <br>
-    <p id="moviegen4" style="overflow: hidden;">
-        (Failure case) Example 4: Creamy sound of mashed potatoes being scooped.
-        <span style="float:right;"><a href="#index">Back to index</a></span>
-    </p>
-    <div class="row g-1">
-        <div class="col-sm-6">
-            <div class="video-header">Movie Gen Audio</div>
-            <div class="video-container">
-                <iframe src="https://youtube.com/embed/PJv1zxR9JjQ"></iframe>
-            </div>
-        </div>
-        <div class="col-sm-6">
-            <div class="video-header">Ours</div>
-            <div class="video-container">
-                <iframe src="https://youtube.com/embed/c3-LJ1lNsPQ"></iframe>
-            </div>
-        </div>
-    </div>
-    <br>
-    </div>
-    <div id="hunyuan_sora_all">
-    <h2 id="hunyuan" style="text-align: center;">Results on Videos Generated by Hunyuan</h2>
-    <p style="overflow: hidden;">
-        <span style="float:right;"><a href="#index">Back to index</a></span>
-    </p>
-    <div class="row g-1">
-        <div class="col-sm-6">
-            <div class="video-header">Typing</div>
-            <div class="video-container">
-                <iframe src="https://youtube.com/embed/8ln_9hhH_nk"></iframe>
-            </div>
-        </div>
-        <div class="col-sm-6">
-            <div class="video-header">Water is rushing down a stream and pouring</div>
-            <div class="video-container">
-                <iframe src="https://youtube.com/embed/5df1FZFQj30"></iframe>
-            </div>
-        </div>
-    </div>
-    <div class="row g-1">
-        <div class="col-sm-6">
-            <div class="video-header">Waves on beach</div>
-            <div class="video-container">
-                <iframe src="https://youtube.com/embed/7wQ9D5WgpFc"></iframe>
-            </div>
-        </div>
-        <div class="col-sm-6">
-            <div class="video-header">Water droplet</div>
-            <div class="video-container">
-                <iframe src="https://youtube.com/embed/q7M2nsalGjM"></iframe>
-            </div>
-        </div>
-    </div>
-    <br>
-    <h2 id="sora" style="text-align: center;">Results on Videos Generated by Sora</h2>
-    <p style="overflow: hidden;">
-        <span style="float:right;"><a href="#index">Back to index</a></span>
-    </p>
-    <div class="row g-1">
-        <div class="col-sm-6">
-            <div class="video-header">Ships riding waves</div>
-            <div class="video-container">
-                <iframe src="https://youtube.com/embed/JbgQzHHytk8"></iframe>
-            </div>
-        </div>
-        <div class="col-sm-6">
-            <div class="video-header">Train (no text prompt given)</div>
-            <div class="video-container">
-                <iframe src="https://youtube.com/embed/xOW7zrjpWC8"></iframe>
-            </div>
-        </div>
-    </div>
-    <div class="row g-1">
-        <div class="col-sm-6">
-            <div class="video-header">Seashore (no text prompt given)</div>
-            <div class="video-container">
-                <iframe src="https://youtube.com/embed/fIuw5Y8ZZ9E"></iframe>
-            </div>
-        </div>
-        <div class="col-sm-6">
-            <div class="video-header">Surfing (failure: unprompted music)</div>
-            <div class="video-container">
-                <iframe src="https://youtube.com/embed/UcSTk-v0M_s"></iframe>
-            </div>
-        </div>
-    </div>
-    <br>
-    </div>
-    <div id="mochi_ltx_all">
-    <h2 id="mochi" style="text-align: center;">Results on Videos Generated by Mochi 1</h2>
-    <p style="overflow: hidden;">
-        <span style="float:right;"><a href="#index">Back to index</a></span>
-    </p>
-    <div class="row g-1">
-        <div class="col-sm-6">
-            <div class="video-header">Magical fire and lightning (no text prompt given)</div>
-            <div class="video-container">
-                <iframe src="https://youtube.com/embed/tTlRZaSMNwY"></iframe>
-            </div>
-        </div>
-        <div class="col-sm-6">
-            <div class="video-header">Storm (no text prompt given)</div>
-            <div class="video-container">
-                <iframe src="https://youtube.com/embed/4hrZTMJUy3w"></iframe>
-            </div>
-        </div>
-    </div>
-    <br>
-    <h2 id="ltx" style="text-align: center;">Results on Videos Generated by LTX-Video</h2>
-    <p style="overflow: hidden;">
-        <span style="float:right;"><a href="#index">Back to index</a></span>
-    </p>
-    <div class="row g-1">
-        <div class="col-sm-6">
-            <div class="video-header">Firewood burning and cracking</div>
-            <div class="video-container">
-                <iframe src="https://youtube.com/embed/P7_DDpgev0g"></iframe>
-            </div>
-        </div>
-        <div class="col-sm-6">
-            <div class="video-header">Waterfall, water splashing</div>
-            <div class="video-container">
-                <iframe src="https://youtube.com/embed/4MvjceYnIO0"></iframe>
-            </div>
-        </div>
-    </div>
-    <br>
-    </div>
-</body>
-</html>

MMAudio/docs/video_main.html DELETED Viewed

@@ -1,98 +0,0 @@
-<!DOCTYPE html>
-<html lang="en">
-<head>
-    <!-- Google tag (gtag.js) -->
-    <script async src="https://www.googletagmanager.com/gtag/js?id=G-0JKBJ3WRJZ"></script>
-    <script>
-    window.dataLayer = window.dataLayer || [];
-    function gtag(){dataLayer.push(arguments);}
-    gtag('js', new Date());
-    gtag('config', 'G-0JKBJ3WRJZ');
-    </script>
-    <link href='https://fonts.googleapis.com/css?family=Source+Sans+Pro' rel='stylesheet' type='text/css'>
-    <meta charset="UTF-8">
-    <title>MMAudio</title>
-    <link rel="icon" type="image/png" href="images/icon.png">
-    <meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1, user-scalable=no">
-    <!-- CSS only -->
-    <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.0.1/dist/css/bootstrap.min.css" rel="stylesheet"
-        integrity="sha384-+0n0xVW2eSR5OomGNYDnhzAbDsOXxcvSN1TPprVMTNDbiYZCxYbOOl7+AMvyTG2x" crossorigin="anonymous">
-    <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.7.1/jquery.min.js"></script>
-    <link rel="stylesheet" href="style_videos.css">
-    <script type="text/javascript">
-        $(document).ready(function(){
-            $("#content").load("video_gen.html #moviegen_all");
-            $("#load_moveigen").click(function(){
-                $("#content").load("video_gen.html #moviegen_all");
-            });
-            $("#load_hunyuan_sora").click(function(){
-                $("#content").load("video_gen.html #hunyuan_sora_all");
-            });
-            $("#load_mochi_ltx").click(function(){
-                $("#content").load("video_gen.html #mochi_ltx_all");
-            });
-            $("#load_vgg1").click(function(){
-                $("#content").load("video_vgg.html #vgg1");
-            });
-            $("#load_vgg2").click(function(){
-                $("#content").load("video_vgg.html #vgg2");
-            });
-            $("#load_vgg3").click(function(){
-                $("#content").load("video_vgg.html #vgg3");
-            });
-            $("#load_vgg4").click(function(){
-                $("#content").load("video_vgg.html #vgg4");
-            });
-            $("#load_vgg5").click(function(){
-                $("#content").load("video_vgg.html #vgg5");
-            });
-            $("#load_vgg6").click(function(){
-                $("#content").load("video_vgg.html #vgg6");
-            });
-            $("#load_vgg_extra").click(function(){
-                $("#content").load("video_vgg.html #vgg_extra");
-            });
-        });
-    </script>
-</head>
-<body>
-    <h1 id="index" style="text-align: center;">Index</h1>
-    <p><b>(Click on the links to load the corresponding videos)</b> <span style="float:right;"><a href="index.html">Back to project page</a></span></p>
-    <ol>
-        <li>
-            <a href="#" id="load_moveigen">Comparisons with Movie Gen Audio on Videos Generated by MovieGen</a>
-        </li>
-        <li>
-            <a href="#" id="load_hunyuan_sora">Results on Videos Generated by Hunyuan and Sora</a>
-        </li>
-        <li>
-            <a href="#" id="load_mochi_ltx">Results on Videos Generated by Mochi 1 and LTX-Video</a>
-        </li>
-        <li>
-            On VGGSound
-            <ol>
-                <li><a id='load_vgg1' href="#">Example 1: Wolf howling</a></li>
-                <li><a id='load_vgg2' href="#">Example 2: Striking a golf ball</a></li>
-                <li><a id='load_vgg3' href="#">Example 3: Hitting a drum</a></li>
-                <li><a id='load_vgg4' href="#">Example 4: Dog barking</a></li>
-                <li><a id='load_vgg5' href="#">Example 5: Playing a string instrument</a></li>
-                <li><a id='load_vgg6' href="#">Example 6: A group of people playing tambourines</a></li>
-                <li><a id='load_vgg_extra' href="#">Extra results &amp; failure cases</a></li>
-            </ol>
-        </li>
-    </ol>
-    <div id="content" class="container-fluid">
-    </div>
-    <br>
-    <br>
-</body>
-</html>

MMAudio/docs/video_vgg.html DELETED Viewed

@@ -1,452 +0,0 @@
-<!DOCTYPE html>
-<html lang="en">
-<head>
-    <!-- Google tag (gtag.js) -->
-    <script async src="https://www.googletagmanager.com/gtag/js?id=G-0JKBJ3WRJZ"></script>
-    <script>
-    window.dataLayer = window.dataLayer || [];
-    function gtag(){dataLayer.push(arguments);}
-    gtag('js', new Date());
-    gtag('config', 'G-0JKBJ3WRJZ');
-    </script>
-    <link href='https://fonts.googleapis.com/css?family=Source+Sans+Pro' rel='stylesheet' type='text/css'>
-    <meta charset="UTF-8">
-    <title>MMAudio</title>
-    <meta name="viewport" content="width=device-width, initial-scale=1">
-    <!-- CSS only -->
-    <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.0.1/dist/css/bootstrap.min.css" rel="stylesheet"
-        integrity="sha384-+0n0xVW2eSR5OomGNYDnhzAbDsOXxcvSN1TPprVMTNDbiYZCxYbOOl7+AMvyTG2x" crossorigin="anonymous">
-    <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
-    <link rel="stylesheet" href="style_videos.css">
-</head>
-<body>
-    <div id="vgg1">
-    <h2 style="text-align: center;">Comparisons with state-of-the-art methods in VGGSound</h2>
-    <p style="overflow: hidden;">
-        Example 1: Wolf howling.
-        <span style="float:right;"><a href="#index">Back to index</a></span>
-    </p>
-        <div class="row g-1">
-            <div class="col-sm-3">
-                <div class="video-header">Ground-truth</div>
-                <div class="video-container">
-                    <iframe src="https://youtube.com/embed/9J_V74gqMUA"></iframe>
-                </div>
-            </div>
-            <div class="col-sm-3">
-                <div class="video-header">Ours</div>
-                <div class="video-container">
-                    <iframe src="https://youtube.com/embed/P6O8IpjErPc"></iframe>
-                    </div>
-            </div>
-            <div class="col-sm-3">
-                <div class="video-header">V2A-Mapper</div>
-                <div class="video-container">
-                    <iframe src="https://youtube.com/embed/w-5eyqepvTk"></iframe>
-                    </div>
-            </div>
-            <div class="col-sm-3">
-                <div class="video-header">FoleyCrafter</div>
-                <div class="video-container">
-                    <iframe src="https://youtube.com/embed/VOLfoZlRkzo"></iframe>
-                    </div>
-            </div>
-        </div>
-        <div class="row g-1">
-            <div class="col-sm-3">
-                <div class="video-header">Frieren</div>
-                <div class="video-container">
-                    <iframe src="https://youtube.com/embed/49owKyA5Pa8"></iframe>
-                </div>
-            </div>
-            <div class="col-sm-3">
-                <div class="video-header">VATT</div>
-                <div class="video-container">
-                    <iframe src="https://youtube.com/embed/QVtrFgbeGDM"></iframe>
-                    </div>
-            </div>
-            <div class="col-sm-3">
-                <div class="video-header">V-AURA</div>
-                <div class="video-container">
-                    <iframe src="https://youtube.com/embed/8r0uEfSNjvI"></iframe>
-                    </div>
-            </div>
-            <div class="col-sm-3">
-                <div class="video-header">Seeing and Hearing</div>
-                <div class="video-container">
-                    <iframe src="https://youtube.com/embed/bn-sLg2qulk"></iframe>
-                    </div>
-            </div>
-        </div>
-    </div>
-    <div id="vgg2">
-        <h2 style="text-align: center;">Comparisons with state-of-the-art methods in VGGSound</h2>
-        <p style="overflow: hidden;">
-            Example 2: Striking a golf ball.
-            <span style="float:right;"><a href="#index">Back to index</a></span>
-        </p>
-        <div class="row g-1">
-            <div class="col-sm-3">
-                <div class="video-header">Ground-truth</div>
-                <div class="video-container">
-                    <iframe src="https://youtube.com/embed/1hwSu42kkho"></iframe>
-                </div>
-            </div>
-            <div class="col-sm-3">
-                <div class="video-header">Ours</div>
-                <div class="video-container">
-                    <iframe src="https://youtube.com/embed/kZibDoDCNxI"></iframe>
-                </div>
-            </div>
-            <div class="col-sm-3">
-                <div class="video-header">V2A-Mapper</div>
-                <div class="video-container">
-                    <iframe src="https://youtube.com/embed/jgKfLBLhh7Y"></iframe>
-                </div>
-            </div>
-            <div class="col-sm-3">
-                <div class="video-header">FoleyCrafter</div>
-                <div class="video-container">
-                    <iframe src="https://youtube.com/embed/Lfsx8mOPcJo"></iframe>
-                </div>
-            </div>
-        </div>
-        <div class="row g-1">
-            <div class="col-sm-3">
-                <div class="video-header">Frieren</div>
-                <div class="video-container">
-                    <iframe src="https://youtube.com/embed/tz-LpbB0MBc"></iframe>
-                </div>
-            </div>
-            <div class="col-sm-3">
-                <div class="video-header">VATT</div>
-                <div class="video-container">
-                    <iframe src="https://youtube.com/embed/RTDUHMi08n4"></iframe>
-                </div>
-            </div>
-            <div class="col-sm-3">
-                <div class="video-header">V-AURA</div>
-                <div class="video-container">
-                    <iframe src="https://youtube.com/embed/N-3TDOsPnZQ"></iframe>
-                </div>
-            </div>
-            <div class="col-sm-3">
-                <div class="video-header">Seeing and Hearing</div>
-                <div class="video-container">
-                    <iframe src="https://youtube.com/embed/QnsHnLn4gB0"></iframe>
-                </div>
-            </div>
-        </div>
-    </div>
-    <div id="vgg3">
-        <h2 style="text-align: center;">Comparisons with state-of-the-art methods in VGGSound</h2>
-        <p style="overflow: hidden;">
-            Example 3: Hitting a drum.
-            <span style="float:right;"><a href="#index">Back to index</a></span>
-        </p>
-        <div class="row g-1">
-            <div class="col-sm-3">
-                <div class="video-header">Ground-truth</div>
-                <div class="video-container">
-                    <iframe src="https://youtube.com/embed/0oeIwq77w0Q"></iframe>
-                </div>
-            </div>
-            <div class="col-sm-3">
-                <div class="video-header">Ours</div>
-                <div class="video-container">
-                    <iframe src="https://youtube.com/embed/-UtPV9ohuIM"></iframe>
-                </div>
-            </div>
-            <div class="col-sm-3">
-                <div class="video-header">V2A-Mapper</div>
-                <div class="video-container">
-                    <iframe src="https://youtube.com/embed/9yivkgN-zwc"></iframe>
-                </div>
-            </div>
-            <div class="col-sm-3">
-                <div class="video-header">FoleyCrafter</div>
-                <div class="video-container">
-                    <iframe src="https://youtube.com/embed/kkCsXPOlBvY"></iframe>
-                </div>
-            </div>
-        </div>
-        <div class="row g-1">
-            <div class="col-sm-3">
-                <div class="video-header">Frieren</div>
-                <div class="video-container">
-                    <iframe src="https://youtube.com/embed/MbNKsVsuvig"></iframe>
-                </div>
-            </div>
-            <div class="col-sm-3">
-                <div class="video-header">VATT</div>
-                <div class="video-container">
-                    <iframe src="https://youtube.com/embed/2yYviBjrpBw"></iframe>
-                </div>
-            </div>
-            <div class="col-sm-3">
-                <div class="video-header">V-AURA</div>
-                <div class="video-container">
-                    <iframe src="https://youtube.com/embed/9yivkgN-zwc"></iframe>
-                </div>
-            </div>
-            <div class="col-sm-3">
-                <div class="video-header">Seeing and Hearing</div>
-                <div class="video-container">
-                    <iframe src="https://youtube.com/embed/6dnyQt4Fuhs"></iframe>
-                </div>
-            </div>
-        </div>
-    </div>
-    <div id="vgg4">
-        <h2 style="text-align: center;">Comparisons with state-of-the-art methods in VGGSound</h2>
-        <p style="overflow: hidden;">
-            Example 4: Dog barking.
-            <span style="float:right;"><a href="#index">Back to index</a></span>
-        </p>
-        <div class="row g-1">
-            <div class="col-sm-3">
-                <div class="video-header">Ground-truth</div>
-                <div class="video-container">
-                    <iframe src="https://youtube.com/embed/ckaqvTyMYAw"></iframe>
-                </div>
-            </div>
-            <div class="col-sm-3">
-                <div class="video-header">Ours</div>
-                <div class="video-container">
-                    <iframe src="https://youtube.com/embed/_aRndFZzZ-I"></iframe>
-                </div>
-            </div>
-            <div class="col-sm-3">
-                <div class="video-header">V2A-Mapper</div>
-                <div class="video-container">
-                    <iframe src="https://youtube.com/embed/mNCISP3LBl0"></iframe>
-                </div>
-            </div>
-            <div class="col-sm-3">
-                <div class="video-header">FoleyCrafter</div>
-                <div class="video-container">
-                    <iframe src="https://youtube.com/embed/phZBQ3L7foE"></iframe>
-                </div>
-            </div>
-        </div>
-        <div class="row g-1">
-            <div class="col-sm-3">
-                <div class="video-header">Frieren</div>
-                <div class="video-container">
-                    <iframe src="https://youtube.com/embed/Sb5Mg1-ORao"></iframe>
-                </div>
-            </div>
-            <div class="col-sm-3">
-                <div class="video-header">VATT</div>
-                <div class="video-container">
-                    <iframe src="https://youtube.com/embed/eHmAGOmtDDg"></iframe>
-                </div>
-            </div>
-            <div class="col-sm-3">
-                <div class="video-header">V-AURA</div>
-                <div class="video-container">
-                    <iframe src="https://youtube.com/embed/NEGa3krBrm0"></iframe>
-                </div>
-            </div>
-            <div class="col-sm-3">
-                <div class="video-header">Seeing and Hearing</div>
-                <div class="video-container">
-                    <iframe src="https://youtube.com/embed/aO0EAXlwE7A"></iframe>
-                </div>
-            </div>
-        </div>
-    </div>
-    <div id="vgg5">
-        <h2 style="text-align: center;">Comparisons with state-of-the-art methods in VGGSound</h2>
-        <p style="overflow: hidden;">
-            Example 5: Playing a string instrument.
-            <span style="float:right;"><a href="#index">Back to index</a></span>
-        </p>
-        <div class="row g-1">
-            <div class="col-sm-3">
-                <div class="video-header">Ground-truth</div>
-                <div class="video-container">
-                    <iframe src="https://youtube.com/embed/KP1QhWauIOc"></iframe>
-                </div>
-            </div>
-            <div class="col-sm-3">
-                <div class="video-header">Ours</div>
-                <div class="video-container">
-                    <iframe src="https://youtube.com/embed/ovaJhWSquYE"></iframe>
-                </div>
-            </div>
-            <div class="col-sm-3">
-                <div class="video-header">V2A-Mapper</div>
-                <div class="video-container">
-                    <iframe src="https://youtube.com/embed/N723FS9lcy8"></iframe>
-                </div>
-            </div>
-            <div class="col-sm-3">
-                <div class="video-header">FoleyCrafter</div>
-                <div class="video-container">
-                    <iframe src="https://youtube.com/embed/t0N4ZAAXo58"></iframe>
-                </div>
-            </div>
-        </div>
-        <div class="row g-1">
-            <div class="col-sm-3">
-                <div class="video-header">Frieren</div>
-                <div class="video-container">
-                    <iframe src="https://youtube.com/embed/8YSRs03QNNA"></iframe>
-                </div>
-            </div>
-            <div class="col-sm-3">
-                <div class="video-header">VATT</div>
-                <div class="video-container">
-                    <iframe src="https://youtube.com/embed/vOpMz55J1kY"></iframe>
-                </div>
-            </div>
-            <div class="col-sm-3">
-                <div class="video-header">V-AURA</div>
-                <div class="video-container">
-                    <iframe src="https://youtube.com/embed/9JHC75vr9h0"></iframe>
-                </div>
-            </div>
-            <div class="col-sm-3">
-                <div class="video-header">Seeing and Hearing</div>
-                <div class="video-container">
-                    <iframe src="https://youtube.com/embed/9w0JckNzXmY"></iframe>
-                </div>
-            </div>
-        </div>
-    </div>
-    <div id="vgg6">
-        <h2 style="text-align: center;">Comparisons with state-of-the-art methods in VGGSound</h2>
-        <p style="overflow: hidden;">
-            Example 6: A group of people playing tambourines.
-            <span style="float:right;"><a href="#index">Back to index</a></span>
-        </p>
-        <div class="row g-1">
-            <div class="col-sm-3">
-                <div class="video-header">Ground-truth</div>
-                <div class="video-container">
-                    <iframe src="https://youtube.com/embed/mx6JLxzUkRc"></iframe>
-                </div>
-            </div>
-            <div class="col-sm-3">
-                <div class="video-header">Ours</div>
-                <div class="video-container">
-                    <iframe src="https://youtube.com/embed/oLirHhP9Su8"></iframe>
-                </div>
-            </div>
-            <div class="col-sm-3">
-                <div class="video-header">V2A-Mapper</div>
-                <div class="video-container">
-                    <iframe src="https://youtube.com/embed/HkLkHMqptv0"></iframe>
-                </div>
-            </div>
-            <div class="col-sm-3">
-                <div class="video-header">FoleyCrafter</div>
-                <div class="video-container">
-                    <iframe src="https://youtube.com/embed/rpHiiODjmNU"></iframe>
-                </div>
-            </div>
-        </div>
-        <div class="row g-1">
-            <div class="col-sm-3">
-                <div class="video-header">Frieren</div>
-                <div class="video-container">
-                    <iframe src="https://youtube.com/embed/1mVD3fJ0LpM"></iframe>
-                </div>
-            </div>
-            <div class="col-sm-3">
-                <div class="video-header">VATT</div>
-                <div class="video-container">
-                    <iframe src="https://youtube.com/embed/yjVFnJiEJlw"></iframe>
-                </div>
-            </div>
-            <div class="col-sm-3">
-                <div class="video-header">V-AURA</div>
-                <div class="video-container">
-                    <iframe src="https://youtube.com/embed/neVeMSWtRkU"></iframe>
-                </div>
-            </div>
-            <div class="col-sm-3">
-                <div class="video-header">Seeing and Hearing</div>
-                <div class="video-container">
-                    <iframe src="https://youtube.com/embed/EUE7YwyVWz8"></iframe>
-                </div>
-            </div>
-        </div>
-    </div>
-    <div id="vgg_extra">
-        <h2 style="text-align: center;">Comparisons with state-of-the-art methods in VGGSound</h2>
-        <p style="overflow: hidden;">
-            <span style="float:right;"><a href="#index">Back to index</a></span>
-        </p>
-        <div class="row g-1">
-            <div class="col-sm-3">
-            <div class="video-header">Moving train</div>
-            <div class="video-container">
-                <iframe src="https://youtube.com/embed/Ta6H45rBzJc"></iframe>
-            </div>
-            </div>
-            <div class="col-sm-3">
-            <div class="video-header">Water splashing</div>
-            <div class="video-container">
-                <iframe src="https://youtube.com/embed/hl6AtgHXpb4"></iframe>
-            </div>
-            </div>
-            <div class="col-sm-3">
-            <div class="video-header">Skateboarding</div>
-            <div class="video-container">
-                <iframe src="https://youtube.com/embed/n4sCNi_9buI"></iframe>
-            </div>
-            </div>
-            <div class="col-sm-3">
-            <div class="video-header">Synchronized clapping</div>
-            <div class="video-container">
-                <iframe src="https://youtube.com/embed/oxexfpLn7FE"></iframe>
-            </div>
-            </div>
-        </div>
-        <br><br>
-        <div id="extra-failure">
-            <h2 style="text-align: center;">Failure cases</h2>
-            <p style="overflow: hidden;">
-            <span style="float:right;"><a href="#index">Back to index</a></span>
-            </p>
-            <div class="row g-1">
-            <div class="col-sm-6">
-                <div class="video-header">Human speech</div>
-                <div class="video-container">
-                <iframe src="https://youtube.com/embed/nx0CyrDu70Y"></iframe>
-                </div>
-            </div>
-            <div class="col-sm-6">
-                <div class="video-header">Unfamiliar vision input</div>
-                <div class="video-container">
-                <iframe src="https://youtube.com/embed/hfnAqmK3X7w"></iframe>
-                </div>
-            </div>
-            </div>
-        </div>
-        </div>
-</body>
-</html>

MMAudio/eval_onsets.py DELETED Viewed

@@ -1,141 +0,0 @@
-# Modified from https://github.com/XYPB/CondFoleyGen/blob/main/predict_onset.py
-import argparse
-import copy
-import os
-from pathlib import Path
-import librosa
-import numpy as np
-from sklearn.metrics import (average_precision_score, f1_score, precision_recall_curve)
-from tqdm import tqdm
# Sample rate (Hz) at which every clip is loaded for onset analysis.
sample_rate = 22050
# Half-width, in samples, of the window around an onset whose peak amplitude is
# used as that onset's confidence (0.05 s). Derived from ``sample_rate`` so the
# two values cannot drift apart if the rate is ever changed.
conf_interval = int(0.05 * sample_rate)
# Only the first ``duration`` seconds of audio and ground truth are evaluated.
duration = 8
def onset_nms(onsets, wav_norm, window=0.05):
    """Apply non-maximum suppression to detected onsets.

    Onsets are visited from most to least confident; each kept onset suppresses
    every other onset closer than ``window`` seconds to it.

    Args:
        onsets: Onset positions in samples (array-like of integers).
        wav_norm: Normalized waveform; the confidence of an onset is the peak
            of ``wav_norm`` within ``conf_interval`` samples of it.
        window: Suppression radius in seconds.

    Returns:
        np.ndarray: The surviving onset positions, sorted ascending.
    """
    onsets = np.asarray(onsets)
    # Clamp the window start at 0: a negative start index would count from the
    # end of the array and typically yield an empty slice, making np.max raise.
    confidence = [
        np.max(wav_norm[max(o - conf_interval, 0):o + conf_interval]) for o in onsets
    ]
    radius = window * sample_rate
    # A boolean mask replaces the original "remove from the list while iterating
    # over it" loop, which skipped the element following every removal and let
    # onsets that should have been suppressed survive.
    suppressed = np.zeros(len(onsets), dtype=bool)
    kept = []
    for idx in np.argsort(confidence)[::-1]:
        if suppressed[idx]:
            continue
        cur = onsets[idx]
        kept.append(cur)
        suppressed[idx] = True
        suppressed |= np.abs(onsets - cur) < radius
    return np.array(sorted(kept))
def predict_audio(audio_path: Path, delta: float) -> tuple[np.ndarray, np.ndarray]:
    """Detect onsets in an audio file and min-max normalize its waveform.

    Args:
        audio_path: Audio file, loaded with librosa at ``sample_rate``.
        delta: Forwarded to ``librosa.onset.onset_detect`` as its ``delta``
            argument.

    Returns:
        A ``(onsets, wav_norm)`` pair: the detected onset positions in samples,
        and the waveform (cut to the first ``duration`` seconds) rescaled by its
        own minimum and maximum.
    """
    wav, _ = librosa.load(audio_path, sr=sample_rate)
    wav = wav[:duration * sample_rate]
    onsets = librosa.onset.onset_detect(y=wav, sr=sample_rate, units='samples', delta=delta)
    lowest = wav.min()
    # The small epsilon keeps the division finite for a constant signal.
    wav_norm = (wav - lowest) / (wav.max() - lowest + 1e-6)
    return onsets, wav_norm
def read_gt(gt_file: Path) -> np.ndarray:
    """Read ground-truth onset times (in seconds) from a text file.

    The first whitespace-separated field of every non-blank line is parsed as
    a float. Reading stops at the first time that is not below ``duration``.

    Args:
        gt_file: Path of the annotation file.

    Returns:
        np.ndarray: The onset times read, in file order.

    Raises:
        ValueError: If the first field of a non-blank line is not a number.
    """
    all_times = []
    with open(gt_file, 'r') as f:
        for line in f:
            # split() with no argument tolerates tabs, repeated spaces and
            # leading whitespace; blank lines produce no fields and are skipped.
            fields = line.split()
            if not fields:
                continue
            onset_time = float(fields[0])
            if onset_time >= duration:
                break
            all_times.append(onset_time)
    return np.array(all_times)
def _window_peak(wav_norm, center):
    """Return the largest ``wav_norm`` value within ``conf_interval`` samples of ``center``."""
    # Clamp the start at 0: a negative start index would count from the end of
    # the array and typically yield an empty slice, making np.max raise.
    start = max(center - conf_interval, 0)
    return np.max(wav_norm[start:center + conf_interval])


def main():
    """Score predicted onsets of every audio file in a directory against ground truth.

    For each ``.flac``/``.wav`` file in ``--input_dir`` the matching annotation
    ``<stem with '_denoised' replaced by '_times'>.txt`` is read from
    ``--gt_dir``. Ground-truth onsets are greedily matched to the most confident
    unmatched prediction within ``delta`` seconds, and accuracy, average
    precision and F1 are averaged over all files. The averages are printed and
    written to ``eval_results.txt`` inside ``--input_dir``.

    Raises:
        SystemExit: If ``--input_dir`` contains no ``.flac`` or ``.wav`` file.
    """
    parser = argparse.ArgumentParser()
    # Both directories are mandatory; without them the paths would be None.
    parser.add_argument('--input_dir', type=Path, required=True)
    parser.add_argument('--gt_dir', type=Path, required=True)
    parser.add_argument('--delta', type=float, default=0.3)
    args = parser.parse_args()
    input_dir = args.input_dir
    gt_dir = args.gt_dir
    delta = args.delta

    audio_files = sorted(f for f in os.listdir(input_dir) if f.endswith(('.flac', '.wav')))
    if not audio_files:
        # Averaging over zero files would otherwise divide by zero below.
        raise SystemExit(f'No .flac or .wav files found in {input_dir}')

    # ``delta`` doubles as the matching tolerance in seconds; convert to samples.
    match_window = delta * sample_rate
    overall_acc = 0
    overall_ap = 0
    overall_f1 = 0

    for audio_file in tqdm(audio_files):
        gt_name = Path(audio_file).stem.replace('_denoised', '_times')
        gt_times = read_gt(gt_dir / f'{gt_name}.txt') * sample_rate
        onsets, wav_norm = predict_audio(input_dir / audio_file, delta)
        onsets = onset_nms(onsets, wav_norm)

        # Predictions not yet assigned to a ground-truth onset.
        unmatched = onsets.tolist()
        y_gt = []
        y_pred = []
        hit_cnt = 0
        for gt_onset in gt_times:
            in_window = [
                i for i, pred_onset in enumerate(unmatched)
                if abs(pred_onset - gt_onset) < match_window
            ]
            if not in_window:
                # Missed ground-truth onset.
                y_gt.append(1)
                y_pred.append(0)
                continue
            # Index the same list the candidate positions refer to; the full
            # ``onsets`` array no longer lines up once predictions are removed.
            candidate_conf = [wav_norm[unmatched[i]] for i in in_window]
            best = in_window[np.argsort(candidate_conf)[-1]]
            matched = unmatched.pop(best)
            hit_cnt += 1
            y_gt.append(1)
            y_pred.append(_window_peak(wav_norm, matched))
            if not unmatched:
                break
        # Every prediction left over is a false positive.
        for onset in unmatched:
            y_gt.append(0)
            y_pred.append(_window_peak(wav_norm, onset))

        acc = hit_cnt / len(gt_times) if len(gt_times) != 0 else 0
        ap = average_precision_score(y_gt, y_pred)
        f1 = f1_score(y_gt, [1 if p > 0 else 0 for p in y_pred])
        overall_acc += acc
        overall_ap += ap
        overall_f1 += f1

    overall_acc /= len(audio_files)
    overall_ap /= len(audio_files)
    overall_f1 /= len(audio_files)
    summary = [
        f'Overall accuracy: {overall_acc:.4f}',
        f'Overall AP: {overall_ap:.4f}',
        f'Overall F1: {overall_f1:.4f}',
    ]
    for line in summary:
        print(line)
    # Keep a copy of the report next to the evaluated audio.
    with open(input_dir / 'eval_results.txt', 'w') as f:
        f.writelines(f'{line}\n' for line in summary)


if __name__ == '__main__':
    main()

MMAudio/gradio_demo.py DELETED Viewed

@@ -1,343 +0,0 @@
-import gc
-import logging
-from argparse import ArgumentParser
-from datetime import datetime
-from fractions import Fraction
-from pathlib import Path
-import gradio as gr
-import torch
-import torchaudio
-from mmaudio.eval_utils import (ModelConfig, VideoInfo, all_model_cfg, generate, load_image,
-                                load_video, make_video, setup_eval_logging)
-from mmaudio.model.flow_matching import FlowMatching
-from mmaudio.model.networks import MMAudio, get_my_mmaudio
-from mmaudio.model.sequence_config import SequenceConfig
-from mmaudio.model.utils.features_utils import FeaturesUtils
# Allow TF32 in CUDA matmul and cuDNN kernels.
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
log = logging.getLogger()
# Pick the compute device: CUDA first, then Apple MPS, otherwise the CPU.
if torch.cuda.is_available():
    device = 'cuda'
elif torch.backends.mps.is_available():
    device = 'mps'
else:
    device = 'cpu'
    log.warning('CUDA/MPS are not available, running on CPU')
dtype = torch.bfloat16
# Model variant served by this demo; its files are fetched at import time when
# ``download_if_needed`` decides they are missing.
model: ModelConfig = all_model_cfg['large_44k_v2']
model.download_if_needed()
# Directory that receives every generated file.
output_dir = Path('./output/gradio')
setup_eval_logging()
def get_model() -> tuple[MMAudio, FeaturesUtils, SequenceConfig]:
    """Instantiate the network and feature extractors for the configured model.

    Everything is moved to the module-level ``device``/``dtype`` and put in
    eval mode.

    Returns:
        ``(net, feature_utils, seq_cfg)`` where ``seq_cfg`` is the sequence
        configuration attached to the module-level ``model`` config.
    """
    network: MMAudio = get_my_mmaudio(model.model_name)
    network = network.to(device, dtype).eval()
    state = torch.load(model.model_path, map_location=device, weights_only=True)
    network.load_weights(state)
    log.info(f'Loaded weights from {model.model_path}')
    extractors = FeaturesUtils(
        tod_vae_ckpt=model.vae_path,
        synchformer_ckpt=model.synchformer_ckpt,
        enable_conditions=True,
        mode=model.mode,
        bigvgan_vocoder_ckpt=model.bigvgan_16k_path,
        need_vae_encoder=False,
    )
    return network, extractors.to(device, dtype).eval(), model.seq_cfg


# Loaded once at import time and shared by every request handler below.
net, feature_utils, seq_cfg = get_model()
@torch.inference_mode()
def video_to_audio(video: gr.Video, prompt: str, negative_prompt: str, seed: int, num_steps: int,
                   cfg_strength: float, duration: float):
    """Generate audio for a video and return the path of the resulting ``.mp4``.

    Args:
        video: Video input passed straight to ``load_video``.
        prompt: Text prompt for the generation.
        negative_prompt: Text passed to ``generate`` as ``negative_text``.
        seed: Non-negative values seed the generator; otherwise a random seed
            is drawn.
        num_steps: Number of Euler steps used by the flow-matching sampler.
        cfg_strength: Guidance strength forwarded to ``generate``.
        duration: Requested duration in seconds; the duration reported by
            ``load_video`` is what the sequence configuration ends up using.

    Returns:
        Path of the video written under ``output_dir``.
    """
    generator = torch.Generator(device=device)
    if seed >= 0:
        generator.manual_seed(seed)
    else:
        generator.seed()
    sampler = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=num_steps)

    info = load_video(video, duration)
    # Add a leading batch dimension of size one.
    clip_batch = info.clip_frames.unsqueeze(0)
    sync_batch = info.sync_frames.unsqueeze(0)

    # The shared sequence config and network follow the loaded clip's duration.
    seq_cfg.duration = info.duration_sec
    net.update_seq_lengths(seq_cfg.latent_seq_len, seq_cfg.clip_seq_len, seq_cfg.sync_seq_len)

    generated = generate(clip_batch,
                         sync_batch, [prompt],
                         negative_text=[negative_prompt],
                         feature_utils=feature_utils,
                         net=net,
                         fm=sampler,
                         rng=generator,
                         cfg_strength=cfg_strength)
    waveform = generated.float().cpu()[0]

    stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    output_dir.mkdir(exist_ok=True, parents=True)
    target = output_dir / f'{stamp}.mp4'
    make_video(info, target, waveform, sampling_rate=seq_cfg.sampling_rate)
    gc.collect()
    return target
@torch.inference_mode()
def image_to_audio(image: gr.Image, prompt: str, negative_prompt: str, seed: int, num_steps: int,
                   cfg_strength: float, duration: float):
    """Generate audio for a still image and return the path of the resulting ``.mp4``.

    Args:
        image: Image input passed straight to ``load_image``.
        prompt: Text prompt for the generation.
        negative_prompt: Text passed to ``generate`` as ``negative_text``.
        seed: Non-negative values seed the generator; otherwise a random seed
            is drawn.
        num_steps: Number of Euler steps used by the flow-matching sampler.
        cfg_strength: Guidance strength forwarded to ``generate``.
        duration: Duration in seconds of the generated audio and output video.

    Returns:
        Path of the video written under ``output_dir``.
    """
    generator = torch.Generator(device=device)
    if seed >= 0:
        generator.manual_seed(seed)
    else:
        generator.seed()
    sampler = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=num_steps)

    info = load_image(image)
    # Add a leading batch dimension of size one.
    clip_batch = info.clip_frames.unsqueeze(0)
    sync_batch = info.sync_frames.unsqueeze(0)

    seq_cfg.duration = duration
    net.update_seq_lengths(seq_cfg.latent_seq_len, seq_cfg.clip_seq_len, seq_cfg.sync_seq_len)

    generated = generate(clip_batch,
                         sync_batch, [prompt],
                         negative_text=[negative_prompt],
                         feature_utils=feature_utils,
                         net=net,
                         fm=sampler,
                         rng=generator,
                         cfg_strength=cfg_strength,
                         image_input=True)
    waveform = generated.float().cpu()[0]

    stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    output_dir.mkdir(exist_ok=True, parents=True)
    target = output_dir / f'{stamp}.mp4'
    # Wrap the image in a VideoInfo at one frame per second so it can be muxed
    # with the generated audio.
    still_video = VideoInfo.from_image_info(info, duration, fps=Fraction(1))
    make_video(still_video, target, waveform, sampling_rate=seq_cfg.sampling_rate)
    gc.collect()
    return target
@torch.inference_mode()
def text_to_audio(prompt: str, negative_prompt: str, seed: int, num_steps: int, cfg_strength: float,
                  duration: float):
    """Generate audio from text alone and return the path of the resulting ``.flac``.

    Args:
        prompt: Text prompt for the generation.
        negative_prompt: Text passed to ``generate`` as ``negative_text``.
        seed: Non-negative values seed the generator; otherwise a random seed
            is drawn.
        num_steps: Number of Euler steps used by the flow-matching sampler.
        cfg_strength: Guidance strength forwarded to ``generate``.
        duration: Duration in seconds of the generated audio.

    Returns:
        Path of the audio file written under ``output_dir``.
    """
    generator = torch.Generator(device=device)
    if seed >= 0:
        generator.manual_seed(seed)
    else:
        generator.seed()
    sampler = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=num_steps)

    seq_cfg.duration = duration
    net.update_seq_lengths(seq_cfg.latent_seq_len, seq_cfg.clip_seq_len, seq_cfg.sync_seq_len)

    # No visual conditioning: both frame inputs are None.
    generated = generate(None,
                         None, [prompt],
                         negative_text=[negative_prompt],
                         feature_utils=feature_utils,
                         net=net,
                         fm=sampler,
                         rng=generator,
                         cfg_strength=cfg_strength)
    waveform = generated.float().cpu()[0]

    stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    output_dir.mkdir(exist_ok=True, parents=True)
    target = output_dir / f'{stamp}.flac'
    torchaudio.save(target, waveform, seq_cfg.sampling_rate)
    gc.collect()
    return target
# Video-to-audio tab. Every example row shares the same seed (0), step count
# (25), guidance strength (4.5) and duration (10 s); only the clip and the two
# prompts differ, so the rows are expanded from a compact table.
video_to_audio_tab = gr.Interface(
    fn=video_to_audio,
    description="""
    Project page: <a href="https://hkchengrex.com/MMAudio/">https://hkchengrex.com/MMAudio/</a><br>
    Code: <a href="https://github.com/hkchengrex/MMAudio">https://github.com/hkchengrex/MMAudio</a><br>
    NOTE: It takes longer to process high-resolution videos (>384 px on the shorter side).
    Doing so does not improve results.
    """,
    inputs=[
        gr.Video(),
        gr.Text(label='Prompt'),
        gr.Text(label='Negative prompt', value='music'),
        gr.Number(label='Seed (-1: random)', value=-1, precision=0, minimum=-1),
        gr.Number(label='Num steps', value=25, precision=0, minimum=1),
        gr.Number(label='Guidance Strength', value=4.5, minimum=1),
        gr.Number(label='Duration (sec)', value=8, minimum=1),
    ],
    outputs='playable_video',
    cache_examples=False,
    title='MMAudio — Video-to-Audio Synthesis',
    examples=[
        [clip_url, example_prompt, example_negative, 0, 25, 4.5, 10]
        for clip_url, example_prompt, example_negative in (
            ('https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_beach.mp4',
             'waves, seagulls', ''),
            ('https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_serpent.mp4',
             '', 'music'),
            ('https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_seahorse.mp4',
             'bubbles', ''),
            ('https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_india.mp4',
             'Indian holy music', ''),
            ('https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_galloping.mp4',
             'galloping', ''),
            ('https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_kraken.mp4',
             'waves, storm', ''),
            ('https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/mochi_storm.mp4',
             'storm', ''),
            ('https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/hunyuan_spring.mp4',
             '', ''),
            ('https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/hunyuan_typing.mp4',
             'typing', ''),
            ('https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/hunyuan_wake_up.mp4',
             '', ''),
            ('https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_nyc.mp4',
             '', ''),
        )
    ])
# Text-to-audio tab: the same controls as the video tab minus the media input,
# and no default negative prompt; the handler's result is shown as audio.
text_to_audio_tab = gr.Interface(
    title='MMAudio — Text-to-Audio Synthesis',
    description="""
    Project page: <a href="https://hkchengrex.com/MMAudio/">https://hkchengrex.com/MMAudio/</a><br>
    Code: <a href="https://github.com/hkchengrex/MMAudio">https://github.com/hkchengrex/MMAudio</a><br>
    """,
    fn=text_to_audio,
    inputs=[
        gr.Text(label='Prompt'),
        gr.Text(label='Negative prompt'),
        gr.Number(label='Seed (-1: random)', value=-1, precision=0, minimum=-1),
        gr.Number(label='Num steps', value=25, precision=0, minimum=1),
        gr.Number(label='Guidance Strength', value=4.5, minimum=1),
        gr.Number(label='Duration (sec)', value=8, minimum=1),
    ],
    outputs='audio',
    cache_examples=False,
)
# Image-to-audio tab (experimental): the image is handed to the handler as a
# file path and the result is shown as a playable video.
image_to_audio_tab = gr.Interface(
    title='MMAudio — Image-to-Audio Synthesis (experimental)',
    description="""
    Project page: <a href="https://hkchengrex.com/MMAudio/">https://hkchengrex.com/MMAudio/</a><br>
    Code: <a href="https://github.com/hkchengrex/MMAudio">https://github.com/hkchengrex/MMAudio</a><br>
    NOTE: It takes longer to process high-resolution images (>384 px on the shorter side).
    Doing so does not improve results.
    """,
    fn=image_to_audio,
    inputs=[
        gr.Image(type='filepath'),
        gr.Text(label='Prompt'),
        gr.Text(label='Negative prompt'),
        gr.Number(label='Seed (-1: random)', value=-1, precision=0, minimum=-1),
        gr.Number(label='Num steps', value=25, precision=0, minimum=1),
        gr.Number(label='Guidance Strength', value=4.5, minimum=1),
        gr.Number(label='Duration (sec)', value=8, minimum=1),
    ],
    outputs='playable_video',
    cache_examples=False,
)
if __name__ == "__main__":
    # Command-line entry point: serve the three tabs on the requested port.
    parser = ArgumentParser()
    parser.add_argument('--port', type=int, default=7860)
    args = parser.parse_args()
    tabs = [video_to_audio_tab, text_to_audio_tab, image_to_audio_tab]
    tab_names = ['Video-to-Audio', 'Text-to-Audio', 'Image-to-Audio (experimental)']
    demo = gr.TabbedInterface(tabs, tab_names)
    # Generated files live in ``output_dir``, so it is listed in ``allowed_paths``.
    demo.launch(server_port=args.port, allowed_paths=[output_dir])

MMAudio/sets/vgg-test.tsv DELETED Viewed

The diff for this file is too large to render. See raw diff

MMAudio/sets/vgg-train.tsv DELETED Viewed

The diff for this file is too large to render. See raw diff

MMAudio/sets/vgg-val.tsv DELETED Viewed

@@ -1,2049 +0,0 @@
-id	label
---96EN9NUQM_000242	alarm clock ringing
--2toZf00LvI_000012	bowling impact
--8OE7Vydkl4_000221	bowling impact
--AEZuuoyJug_000030	playing violin, fiddle
--CUgrFw8TEI_000045	dog whimpering
--CexapzRAPQ_000051	ferret dooking
--DHGwygUsQc_000030	skateboarding
--G-o-Y4WuaU_000139	playing harmonica
--G_2v0L4U_s_000078	playing tennis
--HIPq7T3eFI_000011	driving motorcycle
--I8C3cRr5TY_000030	female singing
--K232jBK8VQ_000030	car passing by
--L_RH-nw11I_000025	vacuum cleaner cleaning floors
--MfBpxtGQmE_000020	ambulance siren
--NYZDjBz60I_000085	child singing
--QWcNg6FCgE_000022	playing bass guitar
--T06kz4MI20_000030	female singing
--UJJsEdgqMQ_000011	horse clip-clop
--VyLmfnIc5Q_000162	driving snowmobile
--W3y3qz3yp8_000256	people eating crisps
--WBvJuF2UOk_000030	playing acoustic guitar
--Yep0TGjWmc_000140	subway, metro, underground
--YrSxLTPdcA_000004	underwater bubbling
--YwZOeyAQC8_000002	baby laughter
--Zd-ZSnZ3so_000159	playing banjo
--_mqzXgg5eQ_000046	ripping paper
--c7lpU-_-V8_000030	motorboat, speedboat acceleration
--c96lccP5nc_000200	skidding
--eqkzAKGBZg_000030	playing drum kit
--geN4ECfl0Q_000030	playing bass guitar
--ibjrtJo9rY_000030	duck quacking
--nEg1olBLcw_000030	male singing
--s2G3Kto0Gw_000030	typing on computer keyboard
--s6dPB8fyQQ_000030	playing electric guitar
--tGOjLdrF6g_000087	playing squash
--v12qcLw5u0_000187	machine gun shooting
--vC3oqlxf4I_000010	slot machine
--vY141CdTc4_000030	playing bass guitar
--vmyjjovGXM_000116	cattle, bovinae cowbell
--vra5dNsP4w_000080	playing bass guitar
--w7WfMgSBD4_000047	lighting firecrackers
--wJ_UfBsiR0_000280	playing accordion
--xzWsDpVEiE_000060	child speech, kid speaking
--yby37u00N4_000030	playing violin, fiddle
--zHk3s6BkpA_000030	chainsawing trees
--zZR-ps0nJY_000137	hail
-0-fd-lvizrY_000024	yodelling
-00eb49xIULo_000030	female speech, woman speaking
-01LPFe-13Aw_000030	playing electric guitar
-01W8XIz7KDM_000007	donkey, ass braying
-02t6zmS4RAk_000102	playing didgeridoo
-038-gneOcks_000309	people eating crisps
-04m_7jCGHko_000030	wind noise
-04sf3v7xOzo_000005	cat meowing
-055LCXe4pR8_000012	people whistling
-09qDi4Auiyo_000030	playing electric guitar
-0Ca2CTVwOxs_000019	cuckoo bird calling
-0CvAFdtyVlo_000023	underwater bubbling
-0G0mSrzOZ2M_000400	driving buses
-0IvNbabusiY_000030	playing flute
-0JPlNHX2HQ8_000049	playing accordion
-0Lro_JzyUX0_000030	male speech, man speaking
-0McmdH07r7w_000050	playing flute
-0OHWW60khJ4_000030	playing bass guitar
-0PZQL-Msz0s_000030	horse clip-clop
-0RFEHUrGOP0_000170	playing acoustic guitar
-0SsaL_YNyjY_000030	waterfall burbling
-0T4gZQwzyKY_000030	people crowd
-0U_Q9JTATCk_000044	owl hooting
-0WIzNXqWrZk_000204	playing hockey
-0XzJKHmoN6w_000019	duck quacking
-0cMnDz8SSwQ_000014	disc scratching
-0dkhsBmUZSY_000030	people cheering
-0fQJ9nShofs_000093	dinosaurs bellowing
-0hCiGC4c97g_000033	crow cawing
-0hWyQpwHNDU_000030	motorboat, speedboat acceleration
-0iVM2GY3R_c_000030	ambulance siren
-0kar1O-1Ckk_000114	playing french horn
-0m3kYCMUuCk_000000	cattle, bovinae cowbell
-0sY8RR7V_q4_000220	female singing
-0tJevlglhe4_000010	railroad car, train wagon
-0uHGQmkKMr0_000223	people marching
-0yAboI4QC6k_000109	hail
-1-2zGkXe070_000098	rope skipping
-10fjkn2eM_M_000050	slot machine
-12tsmtyIALQ_000009	cat meowing
-13LB6yibhQ8_000009	scuba diving
-1CIxzqH4zzM_000040	ice cracking
-1Fp6zPswdjI_000233	tapping guitar
-1JMgZaCb9WM_000204	playing steelpan
-1MCjHVRBDTk_000055	slot machine
-1MLUEfkJDSw_000001	beat boxing
-1MPwoS-R83A_000030	cat meowing
-1Mx2iDMsZj8_000018	playing french horn
-1NTsWn1Gir4_000103	playing snare drum
-1NvpdqTAf3U_000030	skidding
-1NwFHr4VHS0_000090	playing clarinet
-1RB0gsxkPBo_000020	lions growling
-1RSK3TFru0g_000000	sailing
-1T1PLOWu65c_000250	skiing
-1TARmg2FYJQ_000010	people whistling
-1V65GzuCqaw_000030	bird chirping, tweeting
-1Vn7SftZxS4_000030	rowboat, canoe, kayak rowing
-1WaTnza9cn0_000160	playing violin, fiddle
-1YGJDa3aCGo_000289	fire truck siren
-1_CC87jIhXk_000382	swimming
-1acVFuCvOJg_000512	canary calling
-1bBdyTowO-M_000041	parrot talking
-1dO7fONpkvE_000000	people farting
-1eYmBacWt3k_000027	civil defense siren
-1f9IgOjZjn4_000037	rapping
-1gVugA2dsi4_000332	dinosaurs bellowing
-1gXDaVse3SQ_000387	planing timber
-1inu4aoQFKM_000164	planing timber
-1kdGia7plHk_000030	playing electric guitar
-1nDhQKLRJbg_000030	playing marimba, xylophone
-1p8YDM6gG6Y_000014	dog howling
-1t63KIS6F4I_000070	people sobbing
-1x7wVFMW4dk_000030	playing acoustic guitar
-1zWc46eeWLU_000167	playing sitar
-2-Ipq91ns0k_000036	playing bass drum
-21OWtKgJlIE_000270	canary calling
-23ky1UGWeKg_000190	playing bass guitar
-26KmPM2YkmQ_000004	ambulance siren
-2A5eS9kMm-U_000018	owl hooting
-2CebaASg1m4_000030	male singing
-2EeOU7PgSck_000030	female singing
-2F2NSNlc6dQ_000030	male singing
-2FNZwK-4sUA_000030	female speech, woman speaking
-2Jt4iqSqNTg_000012	bird chirping, tweeting
-2LBEllUpWiA_000000	volcano explosion
-2MDjnJzuUaU_000015	skidding
-2NIaPAfScHM_000030	motorboat, speedboat acceleration
-2NjwuyNgNoE_000050	playing hammond organ
-2P7ZXBq5r04_000274	playing cornet
-2RPPKMapBWY_000036	ice cream truck, ice cream van
-2SlVaOyh69w_000219	cattle mooing
-2Sto24aXwao_000097	baltimore oriole calling
-2VdOQylRl08_000002	playing lacrosse
-2YIZLARm8sI_000201	parrot talking
-2d43OFDr5aI_000001	frog croaking
-2ehs70MWQTs_000050	waterfall burbling
-2fCC4BkdMT0_000106	basketball bounce
-2fn6GFSwTEw_000096	cap gun shooting
-2iwPgYGH_Ew_000400	railroad car, train wagon
-2jy1b77hxXc_000136	playing bass guitar
-2lALVOKDQNM_000059	dog howling
-2myGIZCgZ2g_000018	tractor digging
-2rSFLrwcvcY_000020	pheasant crowing
-2szJ9STQPUk_000030	male singing
-2w6jRF1Ekhs_000130	playing sitar
-2xlWTgqPUOA_000004	beat boxing
-2yeuzECPVUI_000033	playing badminton
-2zev5MpJKPc_000039	chicken clucking
-33NCPZjFuLE_000056	playing oboe
-35MtyyqqQyw_000030	playing acoustic guitar
-35c4EPiZ8JM_000030	horse clip-clop
-35iGp2g_U6A_000000	church bell ringing
-37Tl9YROdbA_000077	playing trombone
-3EcAiTE0JyE_000052	playing theremin
-3JyLYEjo4ok_000000	people giggling
-3LfWg5Be60Q_000163	people burping
-3MOG_CAcWkw_000142	playing badminton
-3NcIWxDdTW0_000050	dog growling
-3O8InHTYtk0_000020	male singing
-3Okx0T5vpFc_000192	airplane flyby
-3OxJ7KtIb2A_000100	playing saxophone
-3QHNbJ_XATY_000036	civil defense siren
-3S2-TODd__k_000090	train horning
-3VK-nOg0-RQ_000046	pheasant crowing
-3VSUuTABb3U_000074	wind chime
-3WUTEMZv3EI_000046	slot machine
-3YuBzhAU_Yc_000000	race car, auto racing
-3cMrwXYnjd4_000026	air horn
-3d5tPNd4Olk_000020	wind noise
-3dBQbWPOjjI_000030	playing acoustic guitar
-3djcJkGeJK8_000293	running electric fan
-3e8ECt9wF5Y_000015	playing saxophone
-3en9IzSPnNU_000027	driving snowmobile
-3gTMehPiQ9s_000150	playing harpsichord
-3kXROE2wcRA_000069	bowling impact
-3p9aVzs8aYA_000030	female singing
-3u3iunnXAOs_000432	playing hammond organ
-3wboiuBfavA_000172	people nose blowing
-3yolbg1tH9U_000030	male singing
-4-_AWdbZnzE_000005	playing trombone
-42Iss6TfcpQ_000742	lip smacking
-433xsSMNLf4_000070	playing electronic organ
-43ijm8y4z2o_000030	horse clip-clop
-44UMQ5ZFuuY_000030	engine accelerating, revving, vroom
-457yRHL0f2E_000030	female singing
-45iXudFVQ4E_000000	subway, metro, underground
-46LjKw-7mU0_000030	male singing
-47QYxqXGZ3w_000244	people shuffling
-47SP2azKv8Q_000030	playing electric guitar
-47YlecLyyK0_000030	playing acoustic guitar
-47y5k6vaUxE_000089	francolin calling
-49gi-iYJ1F0_000107	tap dancing
-4CLnZSI8aPs_000092	hair dryer drying
-4DcOTOS_LE0_000454	sliding door
-4DzuWR9ekko_000000	playing bugle
-4E6mA8Y2Be0_000060	using sewing machines
-4FOFcRJR9go_000084	playing glockenspiel
-4H29LCZTMBs_000050	using sewing machines
-4K345_DRFRk_000056	playing volleyball
-4Ofe_ManxZc_000047	playing french horn
-4OxCr981HvY_000016	ice cracking
-4SlcVylJxxk_000297	arc welding
-4WGMFP00rIg_000030	playing acoustic guitar
-4YnMOFstVnk_000066	parrot talking
-4_QGupz8UNA_000189	hail
-4aFirNGu_P8_000381	planing timber
-4dhyddSUAWg_000175	police radio chatter
-4dkU-c4g1VM_000111	dog barking
-4h9o2iL6nps_000050	child speech, kid speaking
-4hU6jqQQUto_000009	playing harpsichord
-4iBqpFUnPoA_000170	fireworks banging
-4j7GbxZQjB8_000024	car engine knocking
-4jHrFbnaVRc_000294	firing muskets
-4kvqtJEFqjw_000190	playing bagpipes
-4ldID97D-oU_000020	people coughing
-4n657Imjmjo_000015	sheep bleating
-4o2IRyXi-aY_000667	playing harpsichord
-4rehS_cPodk_000020	female speech, woman speaking
-4t_Qz9RyUm8_000006	alarm clock ringing
-4yUvIrchOzQ_000280	playing saxophone
-4zf3qRiZ3Ok_000030	child singing
-4zsLfdNLUD4_000033	cat hissing
-50OgBbJZUUc_000064	typing on typewriter
-50jxPCLUFdU_000002	cricket chirping
-53ohFLBl0iE_000052	alarm clock ringing
-542uea0zO1I_000036	sea lion barking
-54XBPEFJQc4_000076	playing djembe
-574NjiOGi5s_000030	female singing
-58KzLvK1OYs_000144	dog growling
-5901zjV6oAo_000006	swimming
-5AFKEd8nSpg_000050	people sniggering
-5CtoZvJaGAM_000096	woodpecker pecking tree
-5D201VjroT0_000229	sharpen knife
-5D2E7s9bEf0_000010	basketball bounce
-5EPnuy_sKHI_000010	singing bowl
-5IZv217s4_E_000049	playing badminton
-5KRxqVykvvI_000030	printer printing
-5S3QDnRCnOQ_000003	tapping guitar
-5Sv97J7mksY_000030	playing electric guitar
-5UqwkZ1XK18_000050	helicopter
-5VyCTHzLVdU_000011	playing bongo
-5WVhslWt1wU_000030	female singing
-5Wb1zMq_DiU_000020	fireworks banging
-5XK1Vgiwllc_000073	playing mandolin
-5X_B2L1-4Bc_000030	playing electric guitar
-5briopN06L8_000000	playing piano
-5eHlhJ-ZOpg_000030	playing hammond organ
-5fZn_7LbKSI_000020	people burping
-5hi4T4Gp6v4_000002	air horn
-5hjKe0FWq9E_000002	horse neighing
-5iEbFJkG6Xg_000557	bird squawking
-5jQLK4Z1EH4_000020	wind noise
-5jt7lR8WY3g_000172	playing castanets
-5lV59hZgwRM_000009	scuba diving
-5mBCF05DV5s_000280	church bell ringing
-5mJ7_05tlhs_000005	crow cawing
-5nscL4EBrXA_000030	male singing
-5r1zW38AWvs_000057	wind chime
-5rP9Z4jEq6s_000024	cap gun shooting
-5xJdFysNSf4_000110	race car, auto racing
-5xefixXFNwk_000020	playing bass guitar
-64eXDlUgPoA_000079	lighting firecrackers
-64lQIoDGX6o_000040	playing marimba, xylophone
-64ollREPrUw_000132	raining
-64zPbHPyiwE_000030	male speech, man speaking
-659mhmSPXWA_000276	bee, wasp, etc. buzzing
-65u3pwOEcBg_000002	frog croaking
-67hDkeQalow_000030	motorboat, speedboat acceleration
-68mXCuRvQkw_000045	people burping
-6ARTjahUaYY_000030	playing electric guitar
-6BQgJ0tvUkc_000162	baby babbling
-6CYhRsU4F34_000000	people whistling
-6EcmHiscsOc_000287	lighting firecrackers
-6GsamqJ5tFU_000075	airplane flyby
-6IMlkVOKxJw_000032	cap gun shooting
-6IQkdce9a7Q_000184	slot machine
-6KO3eMyEeOg_000000	race car, auto racing
-6LB-qRj_zW4_000030	horse clip-clop
-6LKFDTu9vRQ_000018	playing french horn
-6NxeHScEnJE_000000	dog bow-wow
-6OhTwJrVxXs_000028	playing timbales
-6RGa6DvWpt0_000035	people marching
-6UcuQgsHFCA_000142	playing french horn
-6Y6CvX7EP68_000030	singing choir
-6ZbVXBeNsX8_000125	playing didgeridoo
-6aYfccsgIjk_000094	baby crying
-6gTR_Avjz6g_000170	playing cymbal
-6j2g_OZnW74_000189	missile launch
-6mE_v9a5dbM_000030	male singing
-6o0mZVMfKss_000140	people clapping
-6of3tx7IOik_000030	wind noise
-6shIFnN-LsY_000141	playing flute
-6v53uAVpXC4_000071	people babbling
-6wpifZcwOJU_000023	underwater bubbling
-6xAClSJ21qA_000491	rapping
-6xgTrufXcCM_000126	wood thrush calling
-6yBZH5cV7GE_000030	playing electric guitar
-6z_pfZ6Rvfs_000023	playing table tennis
-7-7r-FRwp_w_000041	playing glockenspiel
-72d2TsdeSg8_000000	tap dancing
-75FLwnGZJTc_000125	playing oboe
-75m0cvRBGY0_000030	vehicle horn, car horn, honking
-76F-K-7HUXE_000010	lions roaring
-77rq4-p4vV8_000030	wind noise
-78hdsP0edMg_000030	railroad car, train wagon
-7Ck8cfF2rl0_000200	otter growling
-7ELF2dbWe5w_000010	female singing
-7I_wdG-eOc0_000106	playing hammond organ
-7JT43yyNGkk_000003	black capped chickadee calling
-7JX-Bx0BETQ_000205	rapping
-7LMkG7uISis_000102	playing gong
-7MuetSj86N0_000490	bird squawking
-7NyPcaVKao4_000025	dog growling
-7Odi8SKArQI_000030	playing saxophone
-7P-1-qzwyYA_000055	magpie calling
-7Qr1ncg86N4_000007	lions roaring
-7TMOCRG4EBA_000030	female singing
-7U5V5Teqo8Q_000000	dog barking
-7V6NAsZ86xw_000000	beat boxing
-7VT8p9Er3n8_000020	mynah bird singing
-7Y3u8Aj8UV4_000010	driving motorcycle
-7YGUQYRwnHs_000019	horse neighing
-7YTsyqVSEeI_000006	child singing
-7b__KH3VA_o_000035	people booing
-7caL9c6N1zc_000122	child singing
-7gdSJ30FfNU_000490	people hiccup
-7h7_U2q-VwY_000276	dog baying
-7hdXzJpOXiY_000018	police car (siren)
-7iT77hG1X18_000063	playing erhu
-7kIhqlZok8c_000074	running electric fan
-7kIhqlZok8c_000241	running electric fan
-7lz-THXCwi8_000030	male speech, man speaking
-7ogdSWU90s4_000100	opening or closing drawers
-7pc3c5ZGbwo_000030	ocean burbling
-7rLRSpEqgZk_000253	playing sitar
-7tsuYUeV7_k_000191	airplane flyby
-7vF2Qq0Pg6w_000024	ice cream truck, ice cream van
-7xZxYm27FdA_000020	toilet flushing
-7xaNqQ8FAwI_000100	mynah bird singing
-7yBOHsPAJgw_000040	vacuum cleaner cleaning floors
-7yXROxIZfeo_000053	raining
-80KT6bYCFkg_000077	playing tuning fork
-81ACguOEqoM_000042	electric shaver, electric razor shaving
-82ic2Xisrqg_000030	car engine knocking
-83mmLOdwZlA_000081	air conditioning noise
-85Nd7APr5Os_000028	ice cracking
-85Nd7APr5Os_000052	ice cracking
-87-ZrpDyRHE_000238	cat hissing
-88oLbuKd7Rg_000030	car passing by
-8906Y6i-h10_000102	playing cymbal
-89NzFtLSRSo_000030	engine accelerating, revving, vroom
-8DDro-N5-54_000029	swimming
-8GTkmen1bBg_000110	playing piano
-8IdUE6nhR3E_000030	playing violin, fiddle
-8N0GxZtk9wE_000051	playing didgeridoo
-8NMHjXutgVs_000333	electric shaver, electric razor shaving
-8Rh7NvJDexA_000068	tapping guitar
-8VEqGk0W4xY_000192	playing darts
-8XH7xIWWC6c_000090	people cheering
-8Y9VKxl-1gE_000063	rapping
-8ZWKl-_qHM0_000010	driving buses
-8_xdWIziFpI_000030	baby crying
-8b2ASj5nmos_000251	playing darts
-8c9PJLozdtA_000020	playing bass drum
-8jkr7bOR8ck_000146	playing table tennis
-8lIh0qRN7PE_000220	cattle mooing
-8m7VIFtS4gc_000000	typing on typewriter
-8n76LfbY3qo_000232	chainsawing trees
-8ngu3TPmfZQ_000110	playing drum kit
-8ugZkKeLL7Y_000030	male speech, man speaking
-8vE2wod7rhE_000030	horse neighing
-8ytjUazIdno_000023	playing glockenspiel
-9-N8v-cC0Tg_000002	air horn
-9-xW047dMpk_000125	missile launch
-913ItBzDHLQ_000124	playing synthesizer
-91kWVMnyKxA_000019	bird squawking
-92G0bdxj5ck_000091	mouse pattering
-93a7wS41kLc_000060	splashing water
-93rlsDDmFYo_000110	playing cymbal
-95UKs8K92C4_000110	playing timpani
-96wdXcwIbgk_001238	playing bongo
-97svDuqFctI_000139	playing steel guitar, slide guitar
-98Nc3x8U1JI_000187	playing badminton
-993A2y5lv-s_000030	bird chirping, tweeting
-99WZAe6QKUc_000030	people whispering
-99cGCS0ko2Q_000120	playing saxophone
-99ylFYthGcI_000004	donkey, ass braying
-9A8hgZdD__g_000030	horse clip-clop
-9CFR1VdlIMc_000142	rope skipping
-9CmEsDtIz_Q_000060	playing accordion
-9D9sfe1eaK8_000000	frog croaking
-9HtYErt1moA_000100	playing drum kit
-9IwjfATt51Y_000030	playing french horn
-9JwaE3BmICE_000061	female speech, woman speaking
-9LFGIpAO3NE_000161	tapping guitar
-9LY2BJ2fqts_000090	people gargling
-9Q1RM-pY2yY_000180	playing bongo
-9UvWyax1fEU_000000	people booing
-9Y3ausHODlk_000557	playing electronic organ
-9ZCgk2e7wZM_000269	woodpecker pecking tree
-9ZE18L9NN1Y_000105	striking pool
-9af_fvuAY8E_000167	barn swallow calling
-9exZEq85L1k_000260	playing trombone
-9fvJeyH-4II_000100	fireworks banging
-9gJ4NQYcakk_000030	male speech, man speaking
-9k0OwVahe5Y_000066	playing cymbal
-9pBp5wd9rpw_000043	firing cannon
-9s6jvP1V56w_000080	people crowd
-9t7YT0OKpaM_000130	playing cello
-9tyf9HGsIe4_000000	people finger snapping
-9zriIjvwqJw_000480	people burping
-A08MfFxzmxo_000030	playing accordion
-A0tXM5fSFrw_000062	alligators, crocodiles hissing
-A1-T0wdI8Nw_000070	sloshing water
-A1vf6We9a_Q_000290	mouse pattering
-A2GEU2r5KnQ_000030	playing acoustic guitar
-A551qSirV68_000298	alligators, crocodiles hissing
-A55rHYLkwQk_000000	singing bowl
-A95vqV9oM6g_000081	people marching
-A9pIMNKQWCk_000005	footsteps on snow
-AApTo3l6NfA_000030	train horning
-AFmF56HVvVg_000151	cat purring
-AI3om1uyCH0_000009	fire truck siren
-AJhEl41TC5s_000443	lathe spinning
-AM8-hH1Oahw_000510	driving buses
-AOPgsB4hsH8_000206	electric grinder grinding
-ARFBC4LeCFY_000019	chicken clucking
-ARrb06s5a0Y_000082	donkey, ass braying
-AUufm-TAVg8_000101	playing french horn
-AW-JhveJXFw_000030	typing on typewriter
-AXDomj6KnkE_000141	playing tabla
-AZgG_6NE8j4_000240	parrot talking
-Ac0OxSV8Nqk_000030	female speech, woman speaking
-ActIkLSW20Y_000350	railroad car, train wagon
-AefLmdFYR6k_000136	playing tambourine
-AfrcYQw5mXw_000010	telephone bell ringing
-Agl-AQmIYBE_000030	rowboat, canoe, kayak rowing
-AhUYTb14QZU_000020	chimpanzee pant-hooting
-AiriN8WOgiI_000123	playing bass drum
-AjD1BiY0o8E_000001	pheasant crowing
-AlTNj6IWey4_000112	airplane flyby
-AlebU-Vdy18_000022	playing theremin
-ApMojxDfms0_000273	magpie calling
-AteFCZfJfLY_000011	vehicle horn, car horn, honking
-AvguIvLb0GY_000027	electric shaver, electric razor shaving
-Aw5BwrqdmHc_000010	canary calling
-Aw9arRIoBR4_000030	sliding door
-Aygl9-ur8NU_000001	gibbon howling
-Az1M0iLYjIg_000030	ice cream truck, ice cream van
-B1ax5dX6XrU_000215	wood thrush calling
-B7cF_In3_-c_000030	horse neighing
-B7zgWPjx8hg_000026	wind noise
-B9Mk5n5Zwjg_000240	driving buses
-BBumD37-y80_000110	train horning
-BC4LglYv70Q_000108	playing oboe
-BH8QYqAvO2k_000020	playing vibraphone
-BJ31LCL3Dy4_000100	crow cawing
-BLWCHd07ATw_000080	playing electronic organ
-BM1fw080pSs_000030	female speech, woman speaking
-BM4YyahEm8Q_000078	spraying water
-BO1K4wXy2CI_000299	mosquito buzzing
-BPD7Qj1U_Bo_000131	playing theremin
-BPxW7nP4loQ_000003	eagle screaming
-BTdIM1mncyA_000030	female speech, woman speaking
-BWw6dgq07Qo_000053	playing clarinet
-BYJ2UHIHCLU_000009	hail
-BbUBFko93XE_000031	people sneezing
-BbfDej2cM2I_000001	volcano explosion
-BdIWeKYKIzk_000000	train horning
-BfyYYuE12dw_000006	goose honking
-BkXyLmdb8Yw_000290	playing hammond organ
-BkwStRX3xE0_000010	fireworks banging
-BmsTQHrCwB8_000215	heart sounds, heartbeat
-BnRtoIC87Po_000030	female speech, woman speaking
-BniijKHywXM_000103	playing tambourine
-Bo271H1XM40_000127	arc welding
-BpV7n-YUtos_000248	rope skipping
-BpfM3evN6H8_000009	people eating apple
-BsHgr_sj6ec_000058	playing bongo
-BvdvbeIdUtk_000072	people booing
-C1fdGRZRPtU_000040	barn swallow calling
-C2kKMjYETRQ_000050	playing acoustic guitar
-C5ik_rcugw8_000040	people marching
-C7zLftUgskY_000035	playing harp
-C85lxZAStBk_000056	playing hammond organ
-CA3sbGHEE3c_000001	people screaming
-CFazHdGsxcU_000155	lighting firecrackers
-CG-2XtQI6sM_000000	cat hissing
-CG_qvz_V1Jo_000374	playing gong
-CJjTs72p1gI_000002	alarm clock ringing
-CKwtP-eN1Zk_000014	striking pool
-CUY3hob5V_o_000010	opening or closing car doors
-CUqga7lwvfM_000080	subway, metro, underground
-CVVrs_KA6sU_000030	rowboat, canoe, kayak rowing
-CViouHw-mfQ_000122	playing cello
-CYTTsSPohw0_000122	planing timber
-CdcfD8mg-k4_000030	singing choir
-CeD6RlRSr8M_000099	cat purring
-Cg3XzrFzzpM_000060	playing accordion
-CgevwvZLE3c_000219	running electric fan
-CheeUmf4IhE_000030	fireworks banging
-ChhTVgWMxiI_000030	duck quacking
-CiLwbeRDj8E_000000	people crowd
-CjwqjkkoJHY_000199	car engine starting
-CkYUBci5xEM_000070	using sewing machines
-Cntxv6aE3DY_000030	sliding door
-Co1qXvuwkes_000146	arc welding
-CpW7umx_bi0_000067	playing mandolin
-CqgPmVXNdNQ_000058	striking pool
-Csr7c9uFvQk_000028	dog whimpering
-CvN_oC0AGvM_000340	toilet flushing
-Cvgc82TDNnE_000025	lions roaring
-CvxL2n9DX6w_000251	lighting firecrackers
-CyW4FoAJ1MU_000260	police car (siren)
-CzGGyIj84Hs_000030	pigeon, dove cooing
-D-HXQTcZNGU_000130	female speech, woman speaking
-D109-sQNo1k_000028	sliding door
-D6BCygx6jcs_000000	dog howling
-D7kL3EEOyR4_000050	helicopter
-DAy_bV1d9c4_000046	playing squash
-DGU-HbuX6rs_000230	people crowd
-DJan9OSSF7c_000060	plastic bottle crushing
-DO-9yuU9brk_000028	sea lion barking
-DOi5UxxTknA_000041	driving snowmobile
-DPpo_Whnuqc_000022	missile launch
-DQelhAtUyHY_000030	playing electric guitar
-DR7TdSc2ahQ_000030	sliding door
-DSThhOKXU-c_000250	playing bass guitar
-DSgKhbtDWWo_000400	playing flute
-DX5_AglGFMw_000349	metronome
-DXfTYgSGLac_000177	alligators, crocodiles hissing
-DZo15IMYpmA_000206	vacuum cleaner cleaning floors
-DaMG8zJSkuw_000100	playing trumpet
-DbgRhWmYTJk_000002	frog croaking
-Dbi2L5z8U-w_000020	driving motorcycle
-DdZ6PSUQoQA_000050	female speech, woman speaking
-DfZmOeeF_CI_000024	lathe spinning
-DhaOFNnOC8o_000102	playing steel guitar, slide guitar
-DhxWWDGdF8I_000159	frog croaking
-Dj3sIimPrCk_000330	pigeon, dove cooing
-DmSsL0Xde-I_000005	missile launch
-DpIKdB4c_JU_000030	sloshing water
-DqnMEAN1GVc_000098	baby laughter
-DrPa82cqlSM_000008	playing mandolin
-DroorVxOn5s_000030	engine accelerating, revving, vroom
-DsVtCIaWv-Y_000377	hedge trimmer running
-DtRqBLRUTRo_000069	playing clarinet
-Dtiv9RNaA4U_000106	train horning
-Duk5ikgbUfU_000030	playing violin, fiddle
-DuyL15HJn6M_000036	elk bugling
-DvascfU3OM4_000233	playing bongo
-DwE0cQ3Xz70_000030	chainsawing trees
-Dxxg6NenmBQ_000153	playing oboe
-E0ocfyjk1lw_000129	scuba diving
-E22HBR9rEkI_000030	lawn mowing
-E22UuQ6SRf4_000001	fire truck siren
-E4IHTinI-3k_000010	people whistling
-E4dvhMWr7K0_000140	playing didgeridoo
-E5ICgH7JVFI_000003	driving snowmobile
-E67GhkgB8Jc_000033	cell phone buzzing
-E6tu_4cO7ok_000107	playing cornet
-E8LoFlcAC-M_000051	playing vibraphone
-EDtJ88ZJtWo_000008	playing bagpipes
-EEJp_Ssp0No_000004	dog howling
-EG2bfvkpzjk_000136	playing steelpan
-EGKE_rOo-Gg_000030	playing violin, fiddle
-EHHBn9EAtg4_000040	people booing
-EHHefsog-aM_000069	black capped chickadee calling
-EHkkma0y1T8_000030	people sneezing
-EJudk9RWsZI_000000	car engine starting
-EKkFWhdVAOU_000032	woodpecker pecking tree
-ETcwLdOldMg_000000	blowtorch igniting
-EU3OmHbOUo0_000000	cattle mooing
-EbnPPw9P3MQ_000409	snake rattling
-Ee1Glgpx3YE_000038	scuba diving
-EeUHgSkCSi8_000666	turkey gobbling
-EhDl29RiF74_000085	black capped chickadee calling
-EhaE7gijT78_000119	baby laughter
-EkbcNbEn1Z8_000063	opening or closing car doors
-EoubRuwDlrw_000038	canary calling
-ErdH1gc3ZmU_000003	playing cornet
-F186zkBSFjE_000110	helicopter
-F1ZVQSywml4_000040	skateboarding
-F3yETAYfYZg_000009	playing theremin
-F6xLA2AA2GA_000090	people crowd
-FAdeuN1uc-M_000230	subway, metro, underground
-FCir2lQei8M_000030	playing harpsichord
-FEltES9TUEU_000008	hammering nails
-FGWcwpr_SeM_000133	fire truck siren
-FGoXt7LIK3U_000010	police car (siren)
-FHz8YQy4q5A_000027	tractor digging
-FIPu0jd8I28_000030	people screaming
-FIpCyWCy9Qc_000030	playing violin, fiddle
-FRxNI559-Xs_000280	railroad car, train wagon
-FUVXK29tUwQ_000000	owl hooting
-FWuYLFTe3_8_000000	playing trombone
-FXzP5bUz-Lo_000017	horse clip-clop
-FauD2eg73V8_000030	playing electronic organ
-Fd_SXrGw6ag_000030	playing marimba, xylophone
-Fe9YJozRi78_000148	child singing
-FfpD5XC8b5w_000137	playing bongo
-FglnuP1jpRY_000030	playing cello
-FhHBIlZ_5T8_000035	wood thrush calling
-Fj34VCzy_Og_000030	horse clip-clop
-Fpqf057G_SY_000000	chipmunk chirping
-FpwtNUX45qU_000047	snake hissing
-Frs4_Uf8Tq4_000127	ice cracking
-FtCT62fiyrU_000270	church bell ringing
-FtNV_Gq62l8_000019	cat meowing
-FudSk5EUbAY_000156	playing ukulele
-FvZqgCIbO2Q_000003	hail
-FyszP9lfbDk_000001	playing didgeridoo
-G-5AgMNzjv4_000017	vehicle horn, car horn, honking
-G-Eokh465wM_000030	printer printing
-G-IdABSxeHI_000097	dinosaurs bellowing
-G-jsAK9ITwM_000030	ocean burbling
-G6FhQuR3_88_000000	playing congas
-G6nSnVQCxBQ_000095	elephant trumpeting
-G7E7D2Z_Juo_000070	people burping
-G7F8HVNw1lI_000081	scuba diving
-G9AKWSzZtWI_000030	people eating
-G9F38sObAns_000025	playing harpsichord
-GAFJeF_AqZA_000086	hail
-GBf5DgubSuE_000030	wind noise
-GD8dVFZaWNU_000030	skateboarding
-GDQjuDpqnJI_000030	wind noise
-GL1TqKjpv1Q_000047	playing theremin
-GLA-upuVPSA_000057	police radio chatter
-GLtFkIbCZOY_000140	pigeon, dove cooing
-GMNJCJ0ykfc_000050	male singing
-GOFDdcvXq40_000030	goose honking
-GPl4twCSrLQ_000001	coyote howling
-GS_JqZCyqOc_000050	stream burbling
-GT2frI8BMMM_000013	vehicle horn, car horn, honking
-GTZkjw4aVn0_000030	engine accelerating, revving, vroom
-GUSlicDnqIA_000045	playing congas
-GX4kLN3hW4Y_000149	planing timber
-GXIPKWMIVhs_000072	playing oboe
-GXRHmy5Bqas_000008	vehicle horn, car horn, honking
-GYJCyn2piCc_000329	lip smacking
-GZoVDjx9ltQ_000235	playing erhu
-GZoypVKRpCo_000003	cuckoo bird calling
-G_hP5gvRfNw_000033	cat growling
-GaFqib8bCLM_000019	tapping guitar
-GbTzdC4mOtQ_000030	machine gun shooting
-GbUoljsX3lg_000672	people gargling
-GgUkhedV5e0_000190	female speech, woman speaking
-GhizOxu0ZpI_000060	people belly laughing
-GidBfE5JU3s_000005	vehicle horn, car horn, honking
-GjKjnplphn4_000200	playing acoustic guitar
-Gjide6V8U-E_000039	dog growling
-GoMH9AL7YRA_000050	ambulance siren
-GvPc1ncg0OY_000138	people booing
-Gwp62TNrER0_000014	barn swallow calling
-GxUovR3d2aM_000019	car engine knocking
-GzAdcTtwkM0_000011	missile launch
-H-ZKdWCEhbI_000140	fire crackling
-H-jnsSCa-c8_000090	playing lacrosse
-H-rd3O5haG8_000070	playing bass guitar
-H0zmJjMoV-4_000012	playing squash
-H1lx8lLLceQ_000120	machine gun shooting
-H2r4JHm00Vg_000260	sheep bleating
-H6onyc5r6os_000024	heart sounds, heartbeat
-H6z_gPH8m2A_000055	people crowd
-H7BcUVlPDsg_000026	parrot talking
-HCPKDz63_s4_000000	child speech, kid speaking
-HCuORBJf-Ho_000027	playing cornet
-HL36YvzbFYs_000210	goose honking
-HL_E1j069EI_000030	female speech, woman speaking
-HOD29VAXJD8_000030	car engine knocking
-HOI0ZaKLAMM_000030	fireworks banging
-HOI7KapLzz4_000030	playing violin, fiddle
-HOupeg-QhHk_000073	yodelling
-HQSafj2aCNI_000100	playing banjo
-HQlV2jYCz5k_000030	playing violin, fiddle
-HR35d67Dhts_000108	singing bowl
-HRaGv5q3P3E_000000	opening or closing drawers
-HTRRMT1NQOc_000060	playing saxophone
-HTqUtEGJ0As_000030	people whistling
-HUP72tlgzyE_000066	playing badminton
-HUWvhtKby-A_000033	car engine knocking
-HW2o3t3fE_k_000062	francolin calling
-HX2ccFGAuMU_000163	electric shaver, electric razor shaving
-HX5BeffFwV0_000008	smoke detector beeping
-HaABMNzUOvo_000030	wind rustling leaves
-Hakqd6g2jaY_000110	helicopter
-HcO60nHH4W0_000023	playing bass drum
-HckqMrtU3dg_000133	playing double bass
-HebxWsaO-LA_000115	train whistling
-HkCt4hh_x58_000030	rowboat, canoe, kayak rowing
-HlUvoEXQZYk_000007	playing tambourine
-Hlp5qKMfdYk_000180	playing bass guitar
-Honj-TQHx3U_000129	airplane
-Hqhi7LioGyM_000030	playing marimba, xylophone
-HsCj9l5Barg_000045	fire truck siren
-HsX5XlPFOWI_000380	lawn mowing
-Hum53_V1zw8_000001	wind noise
-Hwp_62TYhDk_000110	playing marimba, xylophone
-I-WMZh-ieC8_000280	playing harp
-I-qeWJGSXuQ_000083	playing bassoon
-I4ffG1Bh-d8_000156	playing oboe
-I5wV1AFabIA_000029	frog croaking
-I6_30m_TQ2o_000000	playing tuning fork
-IBy30oL3yxw_000399	playing harpsichord
-ICajcUYAan8_000410	people babbling
-IEiseWb8Tao_000080	playing acoustic guitar
-IF92YmTMtdk_000089	cattle, bovinae cowbell
-IFGbGcs3bQQ_000034	chinchilla barking
-IHaWOJuekYY_000109	tap dancing
-IINqN6L2NsY_000285	tapping guitar
-IJvYFkrfjBg_000049	tornado roaring
-IKj9E33H8e8_000012	pig oinking
-ILBWV9AFKDU_000115	playing ukulele
-IN-9DFoS3fM_000007	bird squawking
-IWVztd9QsXg_000005	owl hooting
-IWhgJgeUQuA_000090	playing bagpipes
-IYhq5aun18M_000181	police radio chatter
-IZAasx5KIKE_000010	fireworks banging
-IaAKobKeOtU_000271	people marching
-IeD5tKVhuI4_000030	playing synthesizer
-IeK6EDl8Z_k_000033	people clapping
-IeW36MTcnBs_000117	dog growling
-Ieca4fwxfyY_000049	tractor digging
-IicM8tOXAFg_000146	pheasant crowing
-Ik40yoz30vE_000068	woodpecker pecking tree
-Il82kphC6es_000172	dinosaurs bellowing
-IlLCyGNjG3M_000060	playing harp
-InxgcOFzxWY_000070	chicken clucking
-IpDU10kKguU_000311	vacuum cleaner cleaning floors
-IqCRbzhPkvU_000000	lawn mowing
-IrcX151sayY_000098	tapping guitar
-IrkyGrHjygY_000020	tapping guitar
-Irx-WWFsQYU_000667	people eating
-ItnOPd_CktY_000020	people coughing
-IuTgZQVcMBg_000007	sloshing water
-Ivho6H4q1zk_000017	typing on typewriter
-Iylzuk-0j64_000163	slot machine
-J0ZBjy_EEtg_000015	people clapping
-J18R3qBnJtA_000120	waterfall burbling
-J1kAKMeULF8_000500	subway, metro, underground
-J3K5HEX3gko_000030	playing banjo
-J4VeWujsLJg_000030	typing on computer keyboard
-J5ugw2GUbnY_000001	dog whimpering
-J7fVkoC-Ha8_000711	people eating crisps
-J82OaPeyioI_000030	horse clip-clop
-JC33o6YxH9c_000220	playing piano
-JHNBF0WJ-EM_000029	people belly laughing
-JIdUC1zZb9M_000060	rowboat, canoe, kayak rowing
-JK4YikH2myA_000161	playing vibraphone
-JKrghKg6UBU_000260	ocean burbling
-JKxdjXEI9Wc_000015	eagle screaming
-JLPpMZlBOEI_000038	playing accordion
-JQ3bFZbatGk_000030	people running
-JQr-BRXrjN4_000002	airplane
-JVevxopJjU8_000823	playing tabla
-JXi1ZtJecYo_000001	bowling impact
-J_k6z7_YVJU_000090	playing piano
-Jbiig_IQdIo_000282	cap gun shooting
-JcXhB_4B32o_000090	playing clarinet
-JeJGThFGm80_000001	lighting firecrackers
-JfAjUMKjoVI_000460	playing harp
-JfiFq8tn5Pk_000009	playing steel guitar, slide guitar
-Jk-SBbw7Afg_000140	driving buses
-K-B9CIVeQ_U_000030	horse clip-clop
-K-MCXLQmnFA_000004	playing banjo
-K3KsP-m_c5I_000353	basketball bounce
-K5HBK1c7noI_000010	cat meowing
-KBc_FdBzN2U_000017	wind chime
-KFFJI_TZmoY_000047	crow cawing
-KJwga4gMEzU_000239	people slurping
-KJxSJR3v6oE_000013	church bell ringing
-KKd2qSxww1o_000002	typing on typewriter
-KM_VudA7hgo_000030	people running
-KOzRB30gxpE_000362	planing timber
-KQbCjNzlYPs_000082	writing on blackboard with chalk
-KU5WQZsoKRE_000079	child singing
-K_8tBU1LYxU_000000	chicken crowing
-Kbc8ioemPlA_000081	tractor digging
-KdD8xho7ymw_000037	cat purring
-KfqdB93utIg_000000	waterfall burbling
-KfyYM6nq--A_000011	playing vibraphone
-KnwgxGWxp7Y_000025	people whistling
-Kp0W7S-oExs_000030	driving buses
-Kq0Dbp3C4d0_000017	dog howling
-KrUuPSM4LxM_000215	magpie calling
-KsuQWEN0COQ_000199	playing darts
-Kus5SmqOIrA_000024	mynah bird singing
-Kwha8UYndzI_000090	playing didgeridoo
-Kz4Jm9_iFeg_000038	hail
-KzK6d6Qpu_o_000010	dog barking
-KztFbSJPxg0_000197	planing timber
-L4u9LOjcXoE_000000	people sobbing
-LAaJfzvvlTI_000053	lions roaring
-LAx_fanEB_g_000168	arc welding
-LB2EbSmDSKw_000007	baby laughter
-LBH_D9h18bw_000042	rope skipping
-LCcPzeH_Cn4_000160	sailing
-LE49c8e5VMU_000049	mynah bird singing
-LEpzp8DnWyY_000026	sharpen knife
-LGMZ9c7q8tE_000168	cat purring
-LHYHo8wJF74_000342	playing oboe
-LJsSbG5A1y0_000000	lighting firecrackers
-LL618LsL2zY_000030	pig oinking
-LMbyOx04l9E_000036	vehicle horn, car horn, honking
-LNl3ANFth4Y_000021	mynah bird singing
-LOLFOiNiS1o_000067	sharpen knife
-LPTsZZVr06o_000030	people eating
-LSsYBN_RvPc_000122	rapping
-LWrztDg2BGI_000245	playing synthesizer
-LYUkVukRObA_000236	pigeon, dove cooing
-L_Da1Sv1iKU_000028	playing didgeridoo
-L_OvLmH_feU_000021	dog growling
-LaGhL-3ctOc_000048	playing double bass
-LbjkUR-ERQw_000049	opening or closing drawers
-LciaPQ1XV3c_000217	playing badminton
-Lfmcj5VW6VE_000050	playing acoustic guitar
-LgdtTzvKnT4_000030	rowboat, canoe, kayak rowing
-Lj4Ngu0ars8_000138	electric shaver, electric razor shaving
-LlRZR8xPOEw_000021	frog croaking
-Lmp51YN-7wc_000466	people marching
-LtqXpk2YGls_000010	chainsawing trees
-LuxrhiicesU_000000	donkey, ass braying
-LvzMerRGbCE_000099	bouncing on trampoline
-LxdOWpwSzi0_000400	mouse pattering
-Lz8Ytz12MrU_000120	chopping wood
-M0EaEBlx5fk_000126	yodelling
-M9cNmb9HKPc_000110	turkey gobbling
-MApvC99wovc_000159	car engine starting
-MEtdxR3RdEA_000180	playing piano
-MFEhejrPVmw_000040	dog barking
-MMTvsiahcsc_000002	fire crackling
-MMjEIFDYQvc_000117	yodelling
-MQPNvRDVuUs_000100	playing french horn
-MQPggq37uX8_000003	scuba diving
-MQcS6DqCjKQ_000030	playing vibraphone
-MRnnE9MTm64_000052	driving snowmobile
-MTD6-1mrtP8_000072	owl hooting
-MVmJujaAocY_000030	baby crying
-MXmetP4F-EU_000019	door slamming
-MdUG2H5K5eg_000117	roller coaster running
-MenAsca8z6s_000137	slot machine
-Mf6bCl5HKgc_000000	wind chime
-MfSXrFJt6d4_000007	motorboat, speedboat acceleration
-Mhzz75z8mbY_000166	playing ukulele
-Mk8fhA3DAsA_000030	turkey gobbling
-MkrFhq3F_z4_000100	playing accordion
-MlX7I-OZIyk_000062	playing timpani
-Mmyr6Gpclbk_000070	bird chirping, tweeting
-Mnv4KVEt18I_000018	people giggling
-Msh94MTYC6A_000290	chainsawing trees
-MshXUve673A_000363	elk bugling
-Mvn2oFoKxwI_000128	people booing
-Mvue0y_EsDU_000000	orchestra
-MwVghEDjyQM_000030	people sobbing
-MwsoiJOqg_g_000030	duck quacking
-MwyzEfk2xbA_000054	playing double bass
-Mzc3DajWA0k_000030	ice cracking
-Mzgas545UXU_000090	playing snare drum
-N09QFSbvIC4_000150	playing electronic organ
-N2DQWIePoLs_000030	playing violin, fiddle
-N3_jZV1ejnA_000030	crow cawing
-N5CNEOKptjo_000000	splashing water
-N8cNWpCL0Rs_000183	owl hooting
-N9cM9BdATNs_000081	people booing
-NAETplWD64g_000030	playing harpsichord
-NAk-PU3X_DQ_000026	mynah bird singing
-NBeonGAqO84_000032	playing bugle
-NCdkXluu-D8_000350	playing harpsichord
-NFd5Zot-0_c_000006	heart sounds, heartbeat
-NJfJ4E9EVoM_000120	people whispering
-NN6mOUDBjEM_000042	dog howling
-NWSsGcjVRDw_000245	playing tabla
-NZs6RgHZOoI_000013	firing muskets
-NfcCnLiHlqU_000134	playing erhu
-NhO6B0zM9Pc_000030	playing electronic organ
-NjKRF79wl5Y_000110	wind noise
-Nkz9_eGsHKY_000057	people booing
-NmdqThtOVro_000160	beat boxing
-NnNm_oqkG0o_000050	people sobbing
-NniPHshHj9M_000068	playing didgeridoo
-NqxCX4G3N2g_000107	playing volleyball
-NrCNo4V7RVM_000030	lawn mowing
-NrWxMrh7cGw_000210	playing cornet
-NtJQ6W2o0EI_000075	canary calling
-NwIDavS0llk_000010	chicken crowing
-O-C9p_sK_eI_000030	horse clip-clop
-O0QV4_JRM0M_000002	car engine knocking
-O15FUv56iCc_000040	playing cymbal
-O3geFV-GoqM_000031	fire truck siren
-O5LFB39yCA4_000085	missile launch
-O5TMWyFd1DQ_000180	playing vibraphone
-O6_sGC3v96g_000006	wood thrush calling
-O7KCtFRaWck_000080	alarm clock ringing
-OEu8pZpN8ZA_000000	using sewing machines
-OIqUka8BOS8_021217	warbler chirping
-OLtTuBhG-og_000075	playing squash
-OTVFQoNRQTs_000060	playing bass guitar
-OWlCVuOznw0_000019	arc welding
-OZ14CiqpJL8_000010	child speech, kid speaking
-Ocdu7Lz0IuU_000003	francolin calling
-OdGHvGlSUcM_000157	playing timpani
-Of59qi5xxkM_000050	playing drum kit
-OiIaJb68Haw_000050	playing banjo
-OjOQ0K6lza8_000018	playing tuning fork
-Okd7ksWR-fc_000547	swimming
-Oljdv3iSTBc_000047	people eating noodle
-Om-Uc7ia1f0_000320	playing bass guitar
-OrueZOVOAD8_000010	motorboat, speedboat acceleration
-OtDVd-1zaqU_000030	motorboat, speedboat acceleration
-Ow1ZEhmP3qU_000116	typing on typewriter
-OyoJ99jDQdo_000147	playing didgeridoo
-P1eMMIK0cTs_000011	mynah bird singing
-P2taxpwuzcw_000053	wood thrush calling
-P2wbv4C6bBA_000210	barn swallow calling
-P35m_Rn7HbA_000030	motorboat, speedboat acceleration
-P5Y1D-fSVfg_000054	lip smacking
-P6sG1m6C4zI_000081	mouse pattering
-PavKY6YlSl4_000026	people whistling
-PawUc0pqf9M_000260	car engine starting
-PbomocKzqKU_000109	splashing water
-PeJxiP0CPn4_000025	playing didgeridoo
-PfwBOCxEst8_000243	cheetah chirrup
-PjbxRjKvzw4_000030	wind noise
-Pll-TpbHen4_000067	airplane
-Pp61sP7bols_000076	cap gun shooting
-PrIQbadXX74_000692	playing oboe
-PsmihTl5Cx8_000060	pig oinking
-Pu4BCOv6e5Q_000020	fireworks banging
-PvE48Ub_CgA_000034	bird chirping, tweeting
-PvpA8y7-ZC4_000101	people burping
-Pvt8VUQ_Bso_000030	playing vibraphone
-PxEpiEid_c8_000177	slot machine
-Py5s1uL46L0_000100	male singing
-Pz618GchhGI_000001	otter growling
-Pz9BhPMUzv8_000258	lathe spinning
-Q38lPvwj5Gw_000234	swimming
-Q57DFiTwcM4_000221	people eating noodle
-Q5jnMD1z86k_000287	people eating noodle
-Q7X3fyId2U0_000090	tornado roaring
-Q7ZPnRQraJk_000200	playing clarinet
-Q9AvyaxgRRo_000141	playing steel guitar, slide guitar
-QBFaKTDXCCQ_000120	playing acoustic guitar
-QBfcf-k5U28_000007	vehicle horn, car horn, honking
-QCX3H9wXgpo_000053	cap gun shooting
-QEcQtxP1fdg_000056	playing bongo
-QGkUBiVG8-Y_000034	owl hooting
-QH5ZtCI9Hts_000125	chopping wood
-QL6Ws4i07is_000040	goat bleating
-QO9sbXhMq08_000220	people hiccup
-QOFuXRetSLI_000064	arc welding
-QT1nE5lR7wA_000035	cat growling
-QTqN9c6661s_000000	forging swords
-QUMzyZRYpWs_000019	playing steel guitar, slide guitar
-QXEq7sE7dqg_000030	police car (siren)
-QXjaLCotbpY_000055	playing zither
-QZ-cG6VdBHM_000070	helicopter
-QaBmzAFivPQ_000096	people marching
-QcL-X7hJJYQ_000000	people whistling
-Qd9_UMNMhcA_000010	typing on computer keyboard
-QdEYMboSweA_000001	playing oboe
-Qdrcv-ZjC-g_000037	car engine starting
-QgEi6pAW36g_000150	male speech, man speaking
-QgHYiH6ES08_000120	basketball bounce
-QgQKqaMqRgs_000062	playing bongo
-QhRcayuLZ48_000390	playing piano
-Qlj2HEcX05Q_000136	playing saxophone
-QpA1_cezBwA_000123	mouse clicking
-Qr2PeUXBJu4_000197	playing erhu
-Qs8RjZlOcdU_000030	car passing by
-QsHeqaa4Ckc_000072	people whistling
-QsNHM92SIvo_000000	people whistling
-QvEMTs9_RQE_000010	lawn mowing
-Qwxa7ZCEBQs_000006	cricket chirping
-QyRrtn5AoSg_000280	playing saxophone
-QypTigdvLWU_000017	firing cannon
-R0YwusOkMx0_000008	bowling impact
-R1h5rRHM3oI_000000	donkey, ass braying
-R29qwv_mh4E_000018	playing bassoon
-R3VnztSX-k8_000200	ice cream truck, ice cream van
-R7KnzEqUGAc_000040	playing cymbal
-R7bSeIfRG-Y_000590	eating with cutlery
-R8cWq9GoEpE_000037	pheasant crowing
-RBHqcDacio0_000182	beat boxing
-RCIMcizSSZU_000044	francolin calling
-RDuDqEmKucQ_000030	motorboat, speedboat acceleration
-RJNjaPizyKg_000099	playing theremin
-RKZmAYXXWbg_000247	canary calling
-RM6uf-sdVQI_000043	playing bass drum
-RN96eLdMN_I_000005	bull bellowing
-ROsAOQe62gs_000050	playing electronic organ
-RVqCdL7_G2Y_000030	car engine knocking
-RWnvolYKQ2o_000414	lip smacking
-R_SNrPUIa1A_000140	playing bassoon
-R_yW6SKe_-M_000080	people booing
-RaawVrMvP7k_000048	pheasant crowing
-Rb0IEIeJTKY_000002	basketball bounce
-Rc_exQXrUG0_000100	skidding
-ReZUlDwGaLY_000080	playing marimba, xylophone
-Ria-XrpfgsA_000089	people marching
-Rifu8nB2cCs_000043	cat purring
-Riu9TpsQ_mk_000009	pig oinking
-RmGGiQMURcQ_000022	people sniggering
-RoLNzNAv-Ig_000030	motorboat, speedboat acceleration
-RrpMoJrp4AY_000180	people crowd
-RsYAulhucVI_000011	lions roaring
-RtHMCINXA0s_000052	cricket chirping
-Rur-IfwPZho_000051	dog howling
-RwE9JAktTvU_000580	people coughing
-RyV40yhlOeU_000419	people marching
-S29c6T__5HU_000003	playing timpani
-S3Ipyd9HHLk_000185	magpie calling
-S45cdr4x-mc_000080	chainsawing trees
-S9fw7NHd2eo_000380	playing electric guitar
-SBYzwBhUpYs_000166	playing badminton
-SBwOIJoGChM_000116	hammering nails
-SCjdlZSW8nY_000111	playing table tennis
-SGkzdDWFIHI_000085	playing bass drum
-SHebWHn0c2Y_000005	chopping wood
-SPnZIDCnKwM_000030	orchestra
-SS6iMabGB1Y_000020	chimpanzee pant-hooting
-ST33aEP5Hbc_000006	train horning
-SXC13GS87Co_000031	woodpecker pecking tree
-SXHYr-7nPaw_000030	playing drum kit
-SYDQX7Whjm4_000061	woodpecker pecking tree
-SYWqIfMOmGE_000051	hammering nails
-S_0v5j4S100_000039	cat purring
-S_qPgRNSkIw_000370	people clapping
-SbXyRN0DD-g_000080	dog bow-wow
-Sc-Ld96kbN0_000144	playing synthesizer
-SdCzaAUA6Xs_000005	playing djembe
-SeZm-iy9n8M_000150	playing electronic organ
-Sf0aZczIZVU_000040	playing cello
-SgYh5Lb7tlM_000130	playing flute
-SifYJFmSSRw_000123	playing marimba, xylophone
-Sl4weBj8xfc_000030	typing on computer keyboard
-SoVEYhxQabk_000103	canary calling
-Spm_zrjedzk_000392	cap gun shooting
-Sqq2dUA8t3A_000586	playing harmonica
-SvJ0kUY22C8_000055	turkey gobbling
-Sw6qDVMsR5M_000030	playing violin, fiddle
-SwQie7apk78_000198	playing darts
-SyEVBFw_9oE_000120	people screaming
-SyfyWK7dKXA_000021	playing squash
-SzmORuHD4g4_000059	wind chime
-T-AN31N4LD0_000050	people screaming
-T0NMgZC7CDU_000011	chipmunk chirping
-T19Xf5-OTHw_000130	playing piano
-T2zZbnu_NtM_000029	playing table tennis
-T4KEGH_8lY8_000119	playing timpani
-TAdH0kUJj9k_000050	helicopter
-TCUnK4k7QZ0_000000	telephone bell ringing
-TDh8_ixGzIo_000030	printer printing
-TGngN3n7EMw_000024	airplane flyby
-TLSmnnnyhEk_000030	people shuffling
-TMyd50KWyNo_000311	people slurping
-TNCcQfbselM_000120	francolin calling
-TQapWHNS5FE_000024	car engine knocking
-TRW01xXMMqg_000210	playing accordion
-TRt_14JcRWQ_000080	playing bass drum
-TTElms_ZWqI_000428	hair dryer drying
-TTstWFDMmqc_000030	people whistling
-TUPEF6PQxow_000132	rapping
-T_iuImHtqUI_000010	people sobbing
-Ta__Ev0mkBk_000030	chainsawing trees
-TakDv24Tiq0_000032	plastic bottle crushing
-TcN0QofoTvg_000221	playing erhu
-TdkhMZZvdgc_000006	owl hooting
-Tdyh5ziqH-U_000007	lions roaring
-TiaGOZ-ibxw_000411	people booing
-TriRWR9YiNk_000016	frog croaking
-Tse5rzNV5dk_000084	pheasant crowing
-Tze9ybKops4_000020	playing synthesizer
-U3-h9ZARqD4_000264	police radio chatter
-U34oQw93afs_000219	playing tambourine
-U3zsgbf9WHQ_000194	horse neighing
-U4RRMpX2wCU_000010	toilet flushing
-U55bYLMVKiw_000193	pheasant crowing
-U6vVDGaKL3Q_000354	bouncing on trampoline
-U9qUXBqIoZ0_000106	dog howling
-UA62hwIBgGY_000020	chicken clucking
-UFIi1OuMx0o_000302	rope skipping
-UGwl5VOHuaw_000200	playing accordion
-UIFxlzHYPBM_000060	gibbon howling
-UJ1lZOY9LSY_000035	playing didgeridoo
-UM1j8kFaxi8_000020	motorboat, speedboat acceleration
-UOL-hbkzUN4_000010	barn swallow calling
-UOlwg402_r4_000070	people clapping
-UPUwaW8jfhA_000030	ice cream truck, ice cream van
-UQonGRRRpv4_000024	goose honking
-UUKyUUjv8qg_000030	church bell ringing
-UZAB21OSorM_000007	electric shaver, electric razor shaving
-UZYfRXafn9I_000005	ferret dooking
-UZp0AcdimvA_000021	cattle, bovinae cowbell
-UeCkRYU_SuM_000100	playing accordion
-Uf2j1VbOk8c_000055	pheasant crowing
-UfG4dP0szuY_000040	fireworks banging
-UjTYiJ0dm8s_000002	vehicle horn, car horn, honking
-UkdS0cwAGYE_000010	car engine starting
-UnGLtJX29Hc_000043	planing timber
-UoFgJXGWJXA_000111	playing congas
-UpWivODbpIY_000059	owl hooting
-UsJAb6aftq8_000580	playing bagpipes
-UuuQH-TFxMo_000034	missile launch
-UzKZijSs4-A_000004	fox barking
-UzPSMiqeH3Y_000118	singing choir
-V-ZbY0SL2XI_000040	people sniggering
-V1ALglq7_x8_000018	dog growling
-V6lQVpw888U_000590	machine gun shooting
-V6y-jCli4I4_000000	cuckoo bird calling
-V7SGeTSJz9w_000090	skateboarding
-V82SmRI0GHY_000030	playing clarinet
-V83lIhKVraY_000125	playing darts
-VCEicqV_2Xw_000030	ambulance siren
-VDXN0xwWgRA_000083	playing bass guitar
-VDzkPfnI1g4_000093	playing djembe
-VEER910vqMk_000002	duck quacking
-VEhmvrgrZb0_000000	chicken clucking
-VFj1vFMV3dQ_000025	playing darts
-VGrI3TMjWog_000120	playing vibraphone
-VHQjG81NcXE_000030	crow cawing
-VS9R3iOc4Vk_000027	pheasant crowing
-VU9W8Y1E5u4_000030	bouncing on trampoline
-VdxslFvStdo_000370	female speech, woman speaking
-VfXlyIjtfo4_000117	baby babbling
-Vgs_XjEqKl0_000020	people sobbing
-Vh4E5JPTMBM_000146	typing on typewriter
-VhLn9pUFwXw_000039	chopping wood
-VhUG4vTpPUo_000324	ripping paper
-VhsFniEZO-k_000026	mynah bird singing
-Vkbp8VmL3pM_000040	people sobbing
-VkgLWYydiPE_000125	tractor digging
-VlGuwiKwJAM_000027	playing sitar
-VlkgwzKAamE_000051	ripping paper
-Vnnw7lK63rg_000041	playing snare drum
-Vt3qBXzyS5k_000280	eating with cutlery
-VwZ8gzI3qNE_000106	people slapping
-VwqcV76E6Nk_000000	people booing
-VwqqmiiznQU_000028	woodpecker pecking tree
-Vxs0xCJI92Y_000080	driving motorcycle
-Vzb427ZmWvw_000220	fireworks banging
-W0PwVllBxkI_000114	playing steel guitar, slide guitar
-W1o_XgU8lec_000050	skateboarding
-W2_8zRHaEPk_000150	playing vibraphone
-W2gkFTFR8mw_000047	rope skipping
-W4eT7fj-aIA_000201	driving snowmobile
-W5oXrz8dqBk_000030	playing piano
-W5wBkCwEEmY_000140	playing banjo
-W7OJevEgq7w_000000	dog bow-wow
-W7u5kEt-q-8_000000	playing tennis
-W9L5rTbcMFA_000004	people eating noodle
-WABbXpAT_UA_000049	playing bagpipes
-WAhoodHHm2w_000001	playing squash
-WBOqGIqUwGg_000090	people sniggering
-WD0aVtBqoxo_000120	goose honking
-WDmJ4ZtLuNU_000102	playing timbales
-WGHTlOM4-3w_000050	sheep bleating
-WH7LBLKyEkA_000241	playing mandolin
-WIWRYG4vJC4_000020	people burping
-WIZTFH-LGpo_000001	planing timber
-WJQ27fShKvk_000000	playing tennis
-WQFZLDitkkM_000067	eletric blender running
-WQuoH_HyUAk_000030	playing cello
-WRvPzjj5uoE_000134	ice cream truck, ice cream van
-WWzD6E9Wp_k_000260	playing cornet
-WXMt58sLsf8_000028	zebra braying
-WZ568vdA7bU_000070	plastic bottle crushing
-We-E7-Sx3Zo_000260	barn swallow calling
-Wg86ercBjY0_000002	playing clarinet
-Wh8A7CAuLe0_000028	barn swallow calling
-Whjk5Fvue1o_000030	singing choir
-Wj0qIPUjTfE_000008	lions roaring
-WqKP-0cSKgs_000030	dog bow-wow
-WvRkqVmRH0g_000088	playing harp
-WvcM0ueEjfo_000050	people burping
-Ww3CMatNd84_000721	cat purring
-WxQHtaD0Yqg_000028	tractor digging
-X-o1Twh5SFY_000032	playing steelpan
-X0gT3reH8A8_000120	people sniggering
-X17lq90OIO8_000020	dog barking
-X5C9NY9MjA4_000105	train whistling
-X7EGSxA-aCI_000132	child singing
-XBAwcPvVSoA_000068	lathe spinning
-XDMTylVtYx4_000190	race car, auto racing
-XEOUYLlaef4_000003	rope skipping
-XJnKU_SXYlM_000049	playing tabla
-XK4Ws-xvt10_000267	vacuum cleaner cleaning floors
-XKp4HCxVmaI_000017	vehicle horn, car horn, honking
-XLTqSk1Z3D0_000000	police radio chatter
-XM6eeVHjmLk_000001	dog growling
-XNgq-cDV7FI_000101	dinosaurs bellowing
-XOTSovKwxLk_000030	child speech, kid speaking
-XSJzshsMz30_000030	chainsawing trees
-XTDo4OaFapg_000100	hammering nails
-XU8dCEdiGWc_000010	crow cawing
-XUyBxCbiv7A_000073	playing bassoon
-XVveRibUh18_000023	frog croaking
-XWp8qMpnD00_000026	electric shaver, electric razor shaving
-XYZ4Nd4qV-I_000101	people humming
-XdSCT_cQDbE_000010	splashing water
-Xgm17YbPztk_000022	playing didgeridoo
-XiExpKM1Hpo_000160	playing trombone
-XlJ-tAbzzSg_000234	alligators, crocodiles hissing
-XtExs7nIzts_000034	people booing
-Xv4AVT2QYhA_000100	rowboat, canoe, kayak rowing
-Xxq7CElxJLc_000063	singing choir
-Y798EuJZaPU_000017	playing squash
-Y9Oee-VRfVA_000339	airplane
-YC_k4W1YaDw_000030	race car, auto racing
-YD41QET24SM_000125	playing badminton
-YD7jTek7yVU_000206	arc welding
-YEatlg_b0BY_000054	people burping
-YISopDKuQ0k_000050	playing accordion
-YJ5xLJ85AwM_000106	tractor digging
-YOTnbp40tf4_000030	male singing
-YOrImbuhsQ8_000027	lions roaring
-YS_zTwf-FRo_000092	playing ukulele
-YU78jPcU6FI_000070	playing trumpet
-YUXZVAQ1iJ4_000007	volcano explosion
-YUcdJy-rpD8_000590	raining
-YVOmkmjoT40_000030	ocean burbling
-YYgYiO9DjEY_000161	tap dancing
-YbALYr-5WpM_000000	playing harmonica
-YbOztklOkF0_000023	goose honking
-YcvHv44MYiU_000027	barn swallow calling
-YdjsatpizhE_000023	airplane flyby
-Ye72yJyWxs8_000021	airplane flyby
-YeEySSrxwpg_000078	barn swallow calling
-YfZp5C7xrKs_000181	playing bassoon
-YgySYOAi8JQ_000396	skiing
-YhJwTBFij48_000015	motorboat, speedboat acceleration
-YjCLRifFCj0_000010	skateboarding
-YjJioclqdQ8_000150	wind noise
-Ys1P04EjGH4_000196	playing bassoon
-Ys9j6IBcFBo_000024	opening or closing car doors
-YvBCKb1LbCk_000095	fire truck siren
-Yvq8WrFpXhE_000057	people crowd
-YwNdDHEhm2g_000005	duck quacking
-YwTFxcWCac8_000381	electric grinder grinding
-YyqqXEmYPIA_000020	ambulance siren
-YzBaTwjmikc_000018	hammering nails
-Z-V-1iUbMWI_000520	lions growling
-Z1BhAXfiZtU_000037	vacuum cleaner cleaning floors
-Z4QR8uvx_Wk_000169	reversing beeps
-Z5SyUJSDCOA_000562	ripping paper
-Z7Hzc1Yw2aY_000060	sloshing water
-Z93pTtHnDXo_000110	playing vibraphone
-Z9nG2fIh214_000075	chinchilla barking
-ZALP7Di4HaM_000180	playing saxophone
-ZAZZ1wImM9M_000010	singing choir
-ZCA_NapBTlg_000060	dog barking
-ZDDnEdzjyrE_000597	playing tambourine
-ZFGcmmpt1bs_000094	playing bagpipes
-ZL_MxixlnHE_000079	reversing beeps
-ZNboftBNdyY_000406	cap gun shooting
-ZPODO-Ehl_M_000030	male singing
-ZQO_uhrJPNA_000110	playing violin, fiddle
-ZUjum5gZMKM_000140	playing accordion
-Z_Bk_CnpWsY_000198	people sneezing
-Z_sW4UxpbbY_000050	using sewing machines
-ZbtuNDtoyOI_000030	sliding door
-ZcskQV2A2cQ_000030	playing flute
-ZdtaSkUkrIE_000256	police radio chatter
-ZeDa5hT2ffk_000071	police radio chatter
-Zgbuj3y2iuY_000210	cattle mooing
-Zh2whhvFWsM_000016	pigeon, dove cooing
-ZhLwVzOZziA_000368	blowtorch igniting
-Zi3FOnx4nuk_000001	playing table tennis
-Zj73Wh6LEiU_000120	skateboarding
-ZjN9CL7B-9I_000239	playing timbales
-ZkfUo4l9ruc_000090	chainsawing trees
-Zl_ZWSLB8Ic_000024	sheep bleating
-Zs8liAFeuuQ_000058	smoke detector beeping
-ZtPoTqVxVvU_000050	helicopter
-Zu0BpngzT_Q_000007	bowling impact
-ZuwSkX0RQQY_000343	playing tennis
-ZxmKMSUpbvc_000065	car engine idling
-ZxpiZiSAm9I_000060	turkey gobbling
-Zy70U6w0yXw_000088	mynah bird singing
-ZyUqhIDVuNc_000541	scuba diving
-Zz0fhQuHZEE_000012	penguins braying
-_0iRtZRG6UA_000047	woodpecker pecking tree
-_4RRKzDUd60_000079	lathe spinning
-_7GnnuKVVCM_000023	engine accelerating, revving, vroom
-_8FhgH9k7Rw_000120	vacuum cleaner cleaning floors
-_9wN5d1Z1ak_000024	lions roaring
-_CF34A0RrPs_000018	horse neighing
-_Cks36T64zE_000061	striking pool
-_DdVu5sPsjk_000490	people whispering
-_GaEZe-Z73k_000233	fire crackling
-_HRn4aOhjhU_000016	canary calling
-_H_W34UobYU_000459	bouncing on trampoline
-_HcIHVLRzpM_000450	female singing
-_NShiXyBmsY_000270	train wheels squealing
-_Ow1h1eTNk0_000178	playing trombone
-_SfaPFwwJHs_000026	train wheels squealing
-_T0iCBHWKt0_000101	pig oinking
-_T5ZUrmRiQI_000108	playing ukulele
-_Uyw_Legahg_000045	tap dancing
-_VOx5BWJsyQ_000030	raining
-_WQQ3QvGrYw_000340	child speech, kid speaking
-_WUAz2RAZZc_000201	planing timber
-_YF3aFSsgUk_000093	playing steel guitar, slide guitar
-_YhSeML8rQo_000109	alligators, crocodiles hissing
-_aX_UzkXRd0_000140	helicopter
-_cvucKdFb5I_000043	people booing
-_dIzu78Ld2w_000166	lathe spinning
-_gQFB_Utuf0_000077	cat caterwauling
-_j8zzvBts98_000000	splashing water
-_m6lwfMU8Eo_000272	electric shaver, electric razor shaving
-_pSMw5FKHX0_000040	people sobbing
-_pfccpy7Cqc_000180	typing on typewriter
-_ru-n--PRNA_000030	police car (siren)
-_t-Abwz6JG4_000031	baby babbling
-_t259gootxc_000190	female speech, woman speaking
-_u9zUuBdo1k_000000	cat growling
-_vkXDgupDN8_000250	sailing
-_wvB2HlVn1I_000050	engine accelerating, revving, vroom
-_xGLwynjhSs_000010	playing french horn
-_xq-9GZBfrg_000014	pheasant crowing
-_yVgX3hi1OQ_000195	driving snowmobile
-_zTmqhuLwAM_000001	donkey, ass braying
-a0LIemH5Cw0_000010	people clapping
-a3ZAFViNYyk_000000	swimming
-a57DUeBMeHY_000320	rowboat, canoe, kayak rowing
-a6CPpulnJ2A_000420	stream burbling
-a8fa79w2aIQ_000023	lighting firecrackers
-aC3nlLHFOfk_000030	playing violin, fiddle
-aCnLa_H0-P0_000000	magpie calling
-aDXQSTbKlIc_000010	playing cornet
-aE32elV-Jtk_000210	people crowd
-aG1wGSIqGR4_000013	frog croaking
-aHzkCSXsrqg_000038	vacuum cleaner cleaning floors
-aJ41sea1s0U_000080	people farting
-aNArqTW4cbc_000025	vehicle horn, car horn, honking
-aNOELrfjAYY_000000	vehicle horn, car horn, honking
-aRI4l67ZlYQ_000063	planing timber
-aSYCwv_hda8_000030	subway, metro, underground
-aSleAKgkDDk_000000	playing accordion
-aVs2QBhLIhY_000162	playing didgeridoo
-acYp_SYmHs8_000164	running electric fan
-aclGsdr83pM_000400	playing saxophone
-aezIOAga5V8_000070	child speech, kid speaking
-agMolFR_pFc_000075	train whistling
-agrdgrC2cdI_001076	dinosaurs bellowing
-ah5cSy0yXs0_000178	slot machine
-aiTXGmkpfnk_000030	playing trumpet
-ainzK7QuseU_000001	dog whimpering
-aj6kdMafoek_000693	hair dryer drying
-aju2z1N0aOo_000030	wind rustling leaves
-ap3PdrjChdo_000040	playing bassoon
-apTvGua1-FY_000271	playing guiro
-asXWEB_SBEI_000060	playing cello
-atT7DPwTkds_000130	people clapping
-auHL-4XCFAk_000030	driving buses
-b-8lh_tfhLQ_000124	hair dryer drying
-b-gza98ikBo_000020	playing snare drum
-b2bpNgK0Cnc_000250	orchestra
-b4Bu0AHwBWs_000084	woodpecker pecking tree
-b4WK1A7DK18_000018	crow cawing
-b8q6Z7dtRvg_000030	playing flute
-bBMcsO6IeDE_000021	lions roaring
-bF89h31EEzg_000000	golf driving
-bFmIV3pNJPY_000001	basketball bounce
-bI_4_x735PA_000020	typing on computer keyboard
-bJtu55jpzNc_000140	playing violin, fiddle
-bJzkn2kRh8g_000070	helicopter
-bLAz_kbihLE_000147	elk bugling
-bMNcdb3Eeds_000064	civil defense siren
-bN9fXjHalIY_000065	playing timpani
-bPNt6iVmemQ_000504	playing bongo
-bPfP2rjJfDY_000609	playing ukulele
-bQV7q5VRaH0_000174	car engine knocking
-bT8QfAM9NRA_000197	cutting hair with electric trimmers
-bVdI6laTOXI_000480	people screaming
-bVskpqAJF8E_000116	people eating crisps
-bYT-N-_u448_000217	civil defense siren
-bZUN1tQnuDQ_000001	child singing
-b_C-fNIS8aI_000000	cat purring
-baVILr18Y9A_000015	civil defense siren
-bd-swxc3o4w_000260	playing hammond organ
-bo9sSwEqnzs_000030	orchestra
-bokQgOSQ2OA_000001	playing squash
-bpF6KhK8El0_000030	police car (siren)
-bsM-z2joYss_000030	child speech, kid speaking
-bsUBSFHXY0g_000040	helicopter
-bukJZ1FxymQ_000390	male speech, man speaking
-bw3GIZLj6kM_000000	playing piano
-bx5BUbiIXFw_000107	child singing
-bzxjT3h2ir8_000105	lip smacking
-c3UPyEZ1yQY_000070	typing on computer keyboard
-c4M3JIyAPcM_000020	playing bass drum
-c5dPZoWwmC0_000020	driving motorcycle
-c6e4pxgoCls_000105	magpie calling
-c84w0ECD-Lc_000010	ocean burbling
-cAI0pcOwk2g_000346	elk bugling
-cEddS8Y-qZc_000510	people clapping
-cFNcpddGRno_000340	fireworks banging
-cIHKR2E1uiQ_000303	smoke detector beeping
-cJSWXGTJMcc_000018	rowboat, canoe, kayak rowing
-cL_nCiBnlbk_000001	playing bugle
-cMdnie91zp4_000000	playing trombone
-cNUIc68WpD4_000075	people marching
-cRiW0u0QY18_000030	playing trumpet
-cSym5f2jySA_000005	chicken crowing
-cUBHfozbsao_000044	playing harp
-cV4QlanVa9w_000070	basketball bounce
-cVhWB3IniBo_000014	playing tuning fork
-cZfuBCVV6n8_000390	eating with cutlery
-ces9pc_r6Wo_000036	child singing
-ckwEyopmfKs_000024	crow cawing
-cmkEW0KJDYI_000165	arc welding
-cp-ZI_fQ1l0_000154	airplane flyby
-cwQY1bck2G8_000070	playing bagpipes
-cx4QSvep_wE_000009	train horning
-cxFdK2G6wq0_000030	playing bagpipes
-d-UQr-8UEUY_000069	playing saxophone
-d05lXeFKDn0_000275	pheasant crowing
-d4yBeEbVp1Y_000030	typing on computer keyboard
-d5HmVBPY1Qc_000230	playing saxophone
-d66pNyYB6WY_000013	people burping
-d8gWsmBdBhE_000097	playing sitar
-dBivnkxNOOc_000175	playing vibraphone
-dECLS-JHWYA_000000	vacuum cleaner cleaning floors
-dK46EdcZFzg_000030	playing trumpet
-dNMCURn41wU_000179	playing djembe
-dN_EzmXbsu8_000016	playing bass drum
-dSeWq0Qd9Hs_000318	playing tambourine
-dVg4IEbk-l8_000010	cat meowing
-d_OIBYBwexQ_000160	playing accordion
-daHwPM2azrc_000036	wood thrush calling
-dfr1OFz20sI_000000	goat bleating
-dgSOnxqNtFE_000246	people coughing
-dgS_Fy1FiNA_000110	people burping
-dhG_GSGW_RI_000004	volcano explosion
-dlJm9R5t_qg_000030	playing hammond organ
-dlWrMn_RDg0_000120	playing bassoon
-dqymshfwGEE_000030	playing saxophone
-duca08sjlbQ_000001	playing bongo
-dugd_OSzghs_000203	ice cracking
-e0LMGLr-T-I_000029	air conditioning noise
-e3ZJnO3s53o_000016	child singing
-eANsaSAzHm8_000010	driving motorcycle
-eBFPD8YrqiA_000140	driving buses
-eCpA_7B-k94_000030	dog bow-wow
-eDqfHtuB8Hk_000015	snake rattling
-eEUsoUKPxy8_000187	basketball bounce
-eFaLkcfCzos_000140	playing cello
-eK97_rb6BsY_000072	playing gong
-eLyQDSo2NAM_000129	opening or closing drawers
-eOJQsk_kdWI_000032	ice cracking
-eS8Tf1hfwxk_000205	sea waves
-eSEIPV-qSj0_000020	squishing water
-e_3GUZmPFBI_000020	playing erhu
-ebhtW1tIXRY_000002	donkey, ass braying
-ecTDu-EX3WE_000019	car engine knocking
-ecq96FWbCF0_000037	gibbon howling
-ed4wVB_RhHw_000011	baby crying
-ehw6y3_g-8A_000757	ripping paper
-ej6jlkTeobU_000002	car engine knocking
-el3i-oj08Q4_000173	playing oboe
-f-XD-BgLWk0_000000	skidding
-f5c5KuWylig_000343	vacuum cleaner cleaning floors
-f6Wl-9pzib0_000032	cattle mooing
-f8bMURZiPiU_000019	people whistling
-f9U7g3g4voA_000026	golf driving
-f9c9YZ8WgjM_000037	bull bellowing
-fD362l9P3u8_000041	tornado roaring
-fFn2P7ZRIeM_000480	playing clarinet
-fNlGlh1GaeA_000013	heart sounds, heartbeat
-fS17RfJYjS4_000001	pig oinking
-fTT_D_d_5FA_000080	people clapping
-f_55S5G8M2s_000000	playing harmonica
-faFCcN6y-C8_000020	ferret dooking
-fcyUlEGvMdc_000037	playing volleyball
-fj0qlDdWt1M_000158	playing squash
-fknz5hZg_3I_000295	playing darts
-fmc6hwse-IA_000085	skiing
-g-CydtX7btM_000086	eagle screaming
-g-u5YOJu_gY_000230	lawn mowing
-g1n-ZaW0QHQ_000095	reversing beeps
-g5OBeqvOmRU_000001	bathroom ventilation fan running
-g8E9gBfe8B4_000180	female speech, woman speaking
-gH25X_mj6mc_000210	ocean burbling
-gJbMwvsUyA8_000000	driving motorcycle
-gL2i_DTGUEY_000028	cattle, bovinae cowbell
-gLTvwzBktxE_000015	airplane
-gLj93C9rRsg_000055	playing tambourine
-gLokxx-ruH8_000230	playing piano
-gM9WSjAPDVc_000030	people babbling
-gPwtTVH44OY_000030	people shuffling
-gW-1oOsNGJs_000010	playing harp
-g_axbxP7Amc_000071	playing harp
-gaFtxq1hBU4_000118	spraying water
-gcBUpboDmjc_000033	hammering nails
-gfgv17hOPIM_000040	playing marimba, xylophone
-gjJ4nqwlgnE_000010	playing hammond organ
-gm0HkvshnPk_000340	cattle mooing
-goS6rwhPth4_000026	mynah bird singing
-goz-IQ8s6uk_000050	skidding
-gpaX15tTUoc_000017	cat growling
-guvnNwCkhcs_000030	people sniggering
-gwzqjVCFNqA_000040	playing clarinet
-gyioxO7fWzI_000046	lions roaring
-gyjZ7tnnZeA_000132	playing theremin
-gyt54t3R_BU_000032	blowtorch igniting
-gzqq0knK2FA_000003	cat meowing
-h-Z5cTyu4LE_000150	wind rustling leaves
-h0025UfxME0_000308	sharpen knife
-h0V51dolEjA_000194	airplane flyby
-h5Gq0y3qkX0_000063	volcano explosion
-h7EWw2n5D5I_000050	train horning
-h7pz6niHZuw_000006	donkey, ass braying
-hEePXITb26o_000042	playing harp
-hFVd2Em9-cc_000100	toilet flushing
-hQvIg0t546Q_000146	playing vibraphone
-hUlqIdQFuxE_000030	basketball bounce
-hV_CjOK-mME_000030	people sniggering
-hW-RxgLN2l0_000007	owl hooting
-h_tdr4t6unw_000300	typing on typewriter
-ha-5LhgpVmQ_000946	playing tympani
-hc9aQ8VL9o0_000083	playing steel guitar, slide guitar
-hcR4BiG8sZs_000150	playing clarinet
-hdphUn6ihrA_000450	lawn mowing
-hdqCHBTwnuQ_000253	playing badminton
-hgcLJFz2WKQ_000512	police radio chatter
-hlKkLqHpJ_s_000400	chicken crowing
-ht3jNf66nbo_000286	missile launch
-htRB8f0r2rg_000027	playing bassoon
-hvoOSCZo2-E_000030	cattle, bovinae cowbell
-hy87-XUmhkE_000004	playing timbales
-i-SmzP7T_E8_000295	skiing
-i3nEgFq4yfo_000578	heart sounds, heartbeat
-i4IsKRvCLi0_000036	lions roaring
-i5dz5NV4Vpc_000007	crow cawing
-i6BBre7xV-c_000937	dinosaurs bellowing
-i9PvGS9Xr9k_000332	lions roaring
-iAqJ9lPCU4w_000024	chicken crowing
-iD8gRmmiiqU_000130	driving motorcycle
-iDODqIflQ1Q_000061	parrot talking
-iFU48OcnO7k_000000	woodpecker pecking tree
-iLaLf95DcQk_000040	duck quacking
-iQMIGLrKlTI_000260	playing accordion
-iSjvZiygjCQ_000510	playing cello
-iTd7hOI27BE_000048	playing harp
-iXslVMHwkTU_000212	people eating noodle
-iZR9dpO64NA_000011	cap gun shooting
-i_-LCRDriig_000030	ocean burbling
-i_hhSKWxzeU_000038	frog croaking
-ibd7CKcSiTI_000122	playing bass drum
-icK4IQb2KsE_000000	hail
-ieRU5f5P4B8_000350	cap gun shooting
-ieXdQlIBgLk_000030	playing marimba, xylophone
-iiUvfvkeo0c_000237	disc scratching
-ijirbb9m05k_000285	swimming
-ipo5U5Grsno_000020	people cheering
-irUkV1DP7Cs_000030	playing cello
-irhsdhRIUwI_000010	fireworks banging
-itH-fbb9Ook_000250	ambulance siren
-ixv1jovJe3c_000151	playing timpani
-j-GF_0RxUlg_000176	playing bass guitar
-j-hyPaKjCAU_000030	playing accordion
-j0NNSluEaS0_000150	heart sounds, heartbeat
-j15Ldqb_XVw_000020	fireworks banging
-j2OhKQ6sm0o_000077	people eating noodle
-j3A_ekLNu1Y_000008	car passing by
-j4GHwj1Yqz8_000076	ice cream truck, ice cream van
-j5oZYOBOppQ_000003	mouse squeaking
-j6f4pheXNDE_000108	tractor digging
-jB-OcexH1n0_000033	cat caterwauling
-jBCKFPXuFOw_000086	strike lighter
-jBZ1C1ihCIY_000005	playing bongo
-jL4h1-_LECU_000022	church bell ringing
-jQRurvUk2xs_000051	writing on blackboard with chalk
-jVG2LQ2kA1Q_000067	playing glockenspiel
-j_WKRbDVZhs_000071	barn swallow calling
-j_vtU1U9rg0_000042	playing volleyball
-jb92NmGYNbU_000279	police radio chatter
-jd2ENRtbxRQ_000010	people coughing
-ji-27X81tIs_000133	playing bassoon
-ji4T1ArqCz0_000017	fire truck siren
-ji8HeUiTfoU_000030	orchestra
-jld-wHLRUWM_000020	playing accordion
-jmLX2yQ4eKk_000007	hail
-jsw5soBYfsc_000020	people farting
-jt7w_UY4yUI_000040	lions growling
-jxnPU7Okb5U_000043	playing snare drum
-jzw_Wa_TXVo_000018	playing squash
-k-AKVEheu4g_000096	alligators, crocodiles hissing
-k-jDS1jp_AA_000014	firing cannon
-k4h2VtrPwus_000161	rapping
-kAWAs_7SaKw_000000	bird chirping, tweeting
-kBmcp8nL6Kg_000195	playing didgeridoo
-kCsmvK06SCA_000254	playing sitar
-kDwFyUvAi4w_000077	playing bongo
-kEQJJyYkYTY_000200	child speech, kid speaking
-kL6xemyurI8_000140	people eating apple
-kPp7CwFBl1c_000030	playing violin, fiddle
-kPpaeW3DObU_000481	playing castanets
-kPus6xz6fN8_000030	car engine knocking
-kSdqIpAMz_M_000175	playing snare drum
-kSwrdM7UD98_000057	owl hooting
-kTyaqJIhX6Q_000020	playing accordion
-kVMXMaTyEbE_000116	playing theremin
-kVtj0bAYAF8_000000	people sobbing
-kW23iJgtyfk_000002	raining
-k_NIUqHoNz4_000037	playing bassoon
-khZPuH00RNc_000332	yodelling
-kjtZNsHp_a0_000330	lawn mowing
-kkgjiCKHvoY_000449	firing cannon
-kmPmQ6aylRc_000012	reversing beeps
-koTbsmbqyxo_000103	people booing
-kp_7Sd6s0h8_000306	people eating apple
-kqvpyaIls0c_000090	playing cello
-ksaiDSSJeOg_000030	playing cymbal
-ktBzLsiL6l0_000157	playing steelpan
-kxl_ZU3j99A_000415	missile launch
-ky92PHpUpEA_000050	playing accordion
-kyEDPVvDQt4_000040	bird wings flapping
-kz849EPouys_000318	magpie calling
-kzntbWmyWBg_000074	playing squash
-l0DQpxoSr2Q_000040	playing banjo
-l3i-cKkVL-o_000007	car engine knocking
-l3rzkrm98J0_000001	alligators, crocodiles hissing
-l3uGoel_Ats_000000	people crowd
-l4XYVX79H58_000400	people babbling
-l5LnwNRK7Bw_000030	playing cello
-l6uZDuUsdpc_000010	people burping
-l7ELBtiVtQ8_000190	striking pool
-l8bdmlXL-Lk_000197	playing didgeridoo
-l9ple4xWo3w_000193	chopping food
-lAF2dHM7Tyc_000170	playing electric guitar
-lEzMz9odWXM_000058	lathe spinning
-lGsxnfOPaUw_000022	baby crying
-lGtRJjnC4PI_000210	airplane
-lKhe8BxkRnU_000025	wind rustling leaves
-lLme6yedI6w_000040	cricket chirping
-lN2kwc34bo0_000050	train horning
-lP5znTMLevo_000030	playing bagpipes
-lQG8CRumj3g_000560	playing cello
-lUWrhn9z9FI_000096	pumping water
-lXIaZksDY38_000030	people shuffling
-lXwEV2S1rt4_000150	using sewing machines
-lc1QTC0R_CQ_000018	people shuffling
-ld9b7tfnqTE_000109	playing erhu
-ldF2EJCVY3g_000147	playing theremin
-ldvcH7bOy_o_000184	playing french horn
-levuF973w8s_000250	playing french horn
-lg6X9iqcqXI_000233	playing table tennis
-lg7DqdnmkmE_000130	skateboarding
-lj-PczKzEaw_000040	using sewing machines
-ljXTXoBG9rg_000077	pheasant crowing
-lnatlhCU5kI_000420	singing choir
-loMPOYNM66g_000123	playing timpani
-lqVp4OJ4hbY_000044	lions roaring
-lr1RLADQXNg_000110	helicopter
-lrFFGvB03Fw_000071	golf driving
-lsBttXzhPHw_000144	playing sitar
-lt5H2iH9Ln8_000120	chicken crowing
-lwgKXn21ymc_000774	people whispering
-lxFVAc2dHVM_000152	fire crackling
-lzLgjt8VRmU_000000	skateboarding
-m-4-BAv8cCQ_000380	lawn mowing
-m-NpPmAkncw_000030	male singing
-m0g-zWJJClA_000150	playing banjo
-m1lFSuSixy8_000350	people marching
-m1lFSuSixy8_000613	people marching
-m2E4i-EzHIE_000085	people finger snapping
-m4j5XY09HlE_000021	car engine idling
-mCyvq9TF5Ms_000052	typing on typewriter
-mInTDyk6c2A_000012	writing on blackboard with chalk
-mPnRdL1sC48_000240	people eating crisps
-mQ60N4HdDyI_000102	machine gun shooting
-mRCzIaqRG_c_000000	using sewing machines
-mWGLXbNhuB4_000096	hammering nails
-m_7BjYa44lo_000030	child speech, kid speaking
-ma0P7XOsBgE_000030	people running
-ma2RuCUufcI_000036	fox barking
-maUlA8WWTEQ_000004	hail
-maVHGHl01Yc_000034	lathe spinning
-mcVY3xsxgcU_000060	playing bagpipes
-mi9AokZ8m5s_000849	shot football
-mjK1vNF3lKE_000023	playing theremin
-mlihNhHFGTM_000030	playing harpsichord
-mt13n4XleGY_000030	orchestra
-mwu46g-jnac_000170	bird chirping, tweeting
-n-PjT4mDn9Y_000173	playing bagpipes
-n0PnM0u47m4_000042	mynah bird singing
-n0gO6pPICi4_000065	playing mandolin
-n21m6N5UmNk_000002	firing cannon
-n2CgftHGLJ0_000030	driving buses
-n3bX64Z_Yds_000000	playing clarinet
-n4wpVSIu7c0_000087	beat boxing
-n6PQq584nWA_000010	playing trumpet
-n8vhraccEnc_000009	dog howling
-nAtvzIyRwnU_000100	playing saxophone
-nEBUuVsMtGE_000000	church bell ringing
-nGIVQLeZ76E_000103	bowling impact
-nHDsu69zzSA_000000	skidding
-nIHYEEVzuzE_000095	canary calling
-nJ7TBigS5bY_000018	people booing
-nLOOmtvC9Hc_000066	playing steel guitar, slide guitar
-nLVmclZYZMY_000200	people screaming
-nP0vO3Xv10M_000010	dog barking
-nPCYkMhaLYs_000024	roller coaster running
-nTo6W-50CDg_000018	whale calling
-nXc-dHK2A2A_000016	playing theremin
-n_F_tRGGoEA_000107	frog croaking
-ngJ_Us2C19g_000040	police car (siren)
-niYH8Dpt4uE_000140	cattle mooing
-nnyll58-lrA_000009	wind chime
-nowY2-6reIk_000030	pigeon, dove cooing
-nz0qYNbFGD4_000030	people coughing
-o2-6TSqWPCY_000170	people clapping
-o2qd4hsquvE_000056	bird squawking
-o4F5dtUXivA_000034	playing steelpan
-o6kY64rTk2k_000291	singing bowl
-o7mBR043UCs_000014	pig oinking
-o8iHgGRzcTE_000020	people clapping
-o8oMY-WgW9Y_000030	wind rustling leaves
-o9uGfNn4JyU_000062	lions roaring
-oBrRQ5SiJTQ_000210	driving motorcycle
-oCZ3WCK5BZU_000000	driving motorcycle
-oDAI33ybJlo_000029	playing theremin
-oDuiwpaep1k_000035	sliding door
-oEEOscuru6s_000280	playing flute
-oEXqWoSZ9Ww_000024	playing erhu
-oG6EUnQjeF8_000077	swimming
-oIUi8gFI_XY_000178	cat purring
-oIXRSpjo7vk_000170	wood thrush calling
-oJ4m2OvhA8Q_000100	playing cello
-oSsLQCIJjyE_000030	singing bowl
-oVxKyGnz-IA_000230	chainsawing trees
-oXXHkjFLN3E_000237	electric shaver, electric razor shaving
-oX_XdxqTE9Y_000110	bird chirping, tweeting
-oYe46obCJhc_000039	alarm clock ringing
-oZ6l0EStee4_000011	police car (siren)
-oZKVPzRyn50_000432	playing electronic organ
-oad_agP1oJU_000287	playing harpsichord
-od2HXuT_NuI_000100	playing cello
-oePtbOc8Hqs_000000	foghorn
-oeSxlmkPj78_000030	ocean burbling
-ofFtXFnfebQ_000684	cat purring
-ohh7mWALd_k_000473	pheasant crowing
-olZa2vOpbD4_000110	male speech, man speaking
-omiGYobPra4_000100	toilet flushing
-onqGNrWQ7us_000587	machine gun shooting
-or7ikBeUhBg_000020	driving buses
-osA1JXFL2Gk_000021	parrot talking
-otp3r8SfygA_000102	people shuffling
-p-DcPCo7Swo_000086	playing double bass
-p4RWTSRg6Bg_000290	people crowd
-p5LsBog-XRk_000130	playing saxophone
-p5j91ecL43Y_000030	people whispering
-p8HTTAhm5ic_000100	waterfall burbling
-pAe8kcpjZII_000010	playing theremin
-pKUzj3ckXvI_000010	toilet flushing
-pNiB5w3JBVI_000003	spraying water
-pQrnDC-kPHk_000106	sharpen knife
-pRdi3oChUR4_000020	baltimore oriole calling
-pUMZEzdKmPM_000136	owl hooting
-pVJY1Q137cw_000681	cat purring
-pX_Sg3xDAUg_000000	people burping
-p_KsZsJwH0w_000555	sharpen knife
-pdzAs6Be2sY_000139	people gargling
-piYKrS14dxA_000113	mynah bird singing
-pnFtPlslgGw_000019	plastic bottle crushing
-ppDvhlGr5nI_000003	golf driving
-ppLjxFk8C4M_000023	heart sounds, heartbeat
-pqDHX5R4sdg_000220	female singing
-pqElMm80SX8_000025	airplane flyby
-prq7EqBGWaY_000035	playing harpsichord
-psz3LAhSi9U_000001	yodelling
-pu9pO-rCzy4_000153	people farting
-pugRM2Nsnyo_000283	church bell ringing
-pukny4fvbOQ_000040	playing clarinet
-pxpIsajKD-Y_000042	reversing beeps
-pxpIsajKD-Y_000065	reversing beeps
-pyHJrlNMYwo_000350	sheep bleating
-pzixqhh0xG4_000175	golf driving
-q0Hz09My-_E_000018	lions roaring
-q0R8KXxZOZM_000070	people farting
-q0lahEg486Y_000295	tractor digging
-q1oBXqEFXy4_000070	sloshing water
-q5fUdJoUrAE_000257	beat boxing
-q7cvNFoT9nQ_000027	lighting firecrackers
-qA-yeGwsVn4_000018	pheasant crowing
-qBDrrE6LnUo_000103	bird chirping, tweeting
-qBmsSZQ7HNg_000360	railroad car, train wagon
-qCcC7n2mOC0_000074	playing harpsichord
-qIcEYC46zmI_000087	playing cornet
-qJJEBEajF1M_000017	air conditioning noise
-qL-4fJyDGXc_000893	people eating noodle
-qNi5Xlf2ZVY_000510	people clapping
-qORUGCczq74_000042	swimming
-qRm5Yh3JPSg_000016	playing tambourine
-qRwun6pFuNA_000010	playing banjo
-qTRrHj-DNYc_000137	dinosaurs bellowing
-qW9b8qu_KrU_000180	lions growling
-qXFgtkhWLgM_000134	child singing
-q_ZMlkVS740_000222	playing congas
-qbmNcYH52eo_000516	striking pool
-qdl6t1bDb-8_000400	eating with cutlery
-qgv0riPveBQ_000030	bird chirping, tweeting
-qiw2I1oQIVQ_000057	playing snare drum
-qjBkiP7mBNI_000597	ripping paper
-qmjK_Wi0IK8_000080	people cheering
-qoPAdSFZ4f0_000370	chopping wood
-qpjOCvQEHdo_000080	people cheering
-qrNCI310T9Y_000018	chicken clucking
-qsj_OgZZDvQ_000080	tap dancing
-qsrNWdcjwwY_000320	female speech, woman speaking
-quF2HA3u2JY_000101	cupboard opening or closing
-quZSWDeSywg_000040	toilet flushing
-qv51EqZA8eE_000291	train horning
-qxeCxC_zpvU_000202	playing french horn
-r24KMnV5Rrk_000030	people running
-r42dJt0hxro_000010	gibbon howling
-r47N9mdOeXc_000030	playing violin, fiddle
-r4Zm5lEsI-M_000110	vehicle horn, car horn, honking
-r7e4wJy4NP8_000090	motorboat, speedboat acceleration
-r96LZqBtlwg_000050	dog whimpering
-r9uN-AltjDQ_000130	lawn mowing
-rAXnOxWHaLs_000030	playing french horn
-rAth9ueRqM4_000040	whale calling
-rD4zq3CvJSo_000130	people slapping
-rEdr-j9oAN0_000074	playing french horn
-rFA1GBcIGN4_000067	playing ukulele
-rFgrOflwKPg_000290	playing trombone
-rLuNw3Cm7rs_000024	lighting firecrackers
-rMDnGZU7jzE_000001	dog baying
-rQthEYYXM-k_000030	people sniggering
-rRP810El--s_000958	fire truck siren
-rSHvW5dGanw_000150	fireworks banging
-rSWPVWkAbec_000000	bee, wasp, etc. buzzing
-rTNSzUXd3wk_000180	playing double bass
-rVnkDOvLWm8_000180	cap gun shooting
-raz3OUu768k_000068	playing clarinet
-rfqqBv3eriU_000160	stream burbling
-rgdMDo5TBic_000355	playing squash
-rn381TUMxyE_000298	arc welding
-rs2FL8HJfGE_000030	people sniggering
-rwVhTlLcBO0_000099	playing erhu
-rx2lqMvj2Wo_000052	squishing water
-rz9PZZA04z8_000183	playing badminton
-s2QrQdxzLwQ_000074	playing glockenspiel
-s8zSSYQM0Tc_000127	footsteps on snow
-s9gzcUg_nlM_000030	playing drum kit
-sFTyeq295xU_000041	people humming
-sIHApNhq2Ik_000002	bird squawking
-sLEEurjCsAY_000051	typing on typewriter
-sLOjC8EWrHA_000070	driving buses
-sOg4MNTWx_0_000000	skateboarding
-sUHlRRyS2YM_000009	pigeon, dove cooing
-sUs8O9toO4M_000311	dinosaurs bellowing
-sXDJvBEzqjs_000000	dog bow-wow
-sYy0lPjLEXQ_000100	playing cymbal
-s_FLZ-ekB2A_000088	telephone bell ringing
-sa6B5XyFYIg_000040	playing bagpipes
-scm7r0uBepU_000467	mouse clicking
-smBHJiEPCRI_000030	duck quacking
-snbtH1P3MVA_000119	playing timbales
-snyzyJlTBbg_000003	dog baying
-surXSGAnpM0_000000	playing harmonica
-sxiVIGK5AEc_000010	people crowd
-syysO74ja30_000007	playing gong
-szQ-4VQQQsI_000020	railroad car, train wagon
-t0XoS_8YVP4_000728	magpie calling
-t2xJjZp1D1E_000030	dog growling
-t3YfjKEmei4_000080	race car, auto racing
-t3u3ykowlvs_000030	raining
-tD9rMw8YPBI_000030	child speech, kid speaking
-tDayTL0ivzU_000014	playing timbales
-tJChPvDD-hI_000035	parrot talking
-tLFNgY5NBMk_000001	playing bassoon
-tRw0KL6PMFU_000060	skateboarding
-tTePTFQV52M_000030	pig oinking
-tV0sIqEryIY_000037	wind chime
-tWDG6UsiG3s_000090	people babbling
-tYBxgXg8yxw_000046	woodpecker pecking tree
-tYzH5rkbuBQ_000000	frog croaking
-tm9rnG0455k_000010	skidding
-tuqcWxh_mdc_000012	baby crying
-twWBQjLyuxw_000014	bull bellowing
-u1nAQ6GgJ7Y_000154	playing volleyball
-u6AV24u4OMQ_000052	rope skipping
-u6c5tvrkqVA_000187	playing timbales
-u88CrTGAqbo_000000	lawn mowing
-uEPueBOV06U_000109	yodelling
-uGQ0TW02gBo_000004	frog croaking
-uI5eona1hc4_000000	elk bugling
-uIHnphQWVRA_000169	opening or closing drawers
-uIg0I7pAjvM_000030	race car, auto racing
-uJSDmIF4dhE_000260	driving buses
-uK0jcVxT-Pg_000030	driving buses
-uLm5oUt3XG4_000031	playing tabla
-uSmduC6gJxg_000050	rowboat, canoe, kayak rowing
-uWdgdlJqI2Y_000019	basketball bounce
-uWq8Q_cIEwE_000086	playing ukulele
-uZghS49MC1k_000180	skidding
-u_85N9h_cGs_000050	car passing by
-udVSYrFacsc_000072	playing cornet
-ugUyp_keJO4_000022	mouse clicking
-uiPC88KDlW4_000022	engine accelerating, revving, vroom
-unF6DdqG4l8_000050	people whistling
-upZ0sKmaZrI_000167	playing lacrosse
-uvUEfRqpEQU_000145	singing choir
-uyNyWLJIci8_000000	fire truck siren
-v5OdaMw5hhk_000030	playing snare drum
-vADdI9YTMRs_000243	playing timbales
-vJk_Jzr2YIs_000080	playing hammond organ
-vLLiaCDHSPY_000010	dog barking
-vUORRJqXp7A_000036	playing table tennis
-vXupVqDfK34_000116	cricket chirping
-v_cxwPhwaBQ_000000	people farting
-varD0b9CTgs_000020	people belly laughing
-vcwXIa-QB8A_000025	sailing
-vdXavSaj8-M_000070	playing accordion
-vgIgTWqXtms_000023	child singing
-vhqkCDgsuh4_000255	people booing
-vkA-v4DSriM_000229	playing tabla
-vktUwc0Cs7w_000170	playing clarinet
-vpAGr_NrM_w_000050	fireworks banging
-vzoQdjPITKw_000030	pigeon, dove cooing
-w-9xoB74oF0_000004	opening or closing car electric windows
-w-JaJ11OqQY_000345	people slurping
-w3kMt-zQ9t4_000215	playing table tennis
-w5T582MCzlY_000011	running electric fan
-w5vaBVSxgKg_000030	lawn mowing
-w8puug1pEUA_000170	stream burbling
-w9K_AmeWhlo_000071	fire crackling
-wAnqT37UgYY_000034	dog growling
-wEbJ-9cmSaE_000003	playing cornet
-wHdgExbL6dA_000034	playing badminton
-wOYLWY6UCu8_000262	playing ukulele
-wP-96GP6bsU_000000	vehicle horn, car horn, honking
-wT6-Isia2PQ_000149	child singing
-wTQ-1cd8owI_000181	dog bow-wow
-wUNpHu61l7Q_000190	male singing
-wVJ-S2zYxug_000040	playing drum kit
-wX4Ya3D20H8_000039	scuba diving
-wXsrff4No40_000237	playing hockey
-wYZc2-3ViXs_000155	civil defense siren
-wZj294W4RVU_000094	fire crackling
-w_yGhgrow38_000091	eletric blender running
-wdk-RmsGdyw_000310	driving buses
-wdlfOAR03iY_000000	playing glockenspiel
-we-ONoZIkWE_000018	dog howling
-wegIxELjtz4_000334	people eating noodle
-whIS2UodgLI_000002	gibbon howling
-wkwjx0oMAjw_000021	beat boxing
-wnW4qgQQg3g_000050	playing cello
-wrFyu2T1XOo_000000	hail
-wsHPe19Y9Nc_000081	electric shaver, electric razor shaving
-wtyuiWygNTc_000000	zebra braying
-wuAcPWyHMXo_000008	lions roaring
-wwQPX3zjV4s_000028	elk bugling
-x0_AiAhfeV0_000068	eagle screaming
-x0bbH2Tao_0_000000	dog howling
-x1Rt2zN-oXo_000000	dog growling
-x1bXQS9dUAc_000140	playing violin, fiddle
-x2uCcPNM6Nw_000030	pigeon, dove cooing
-x3cLaiaaF0M_000032	skiing
-x68R1rmvKgc_000060	female singing
-x6d8ytnWNDI_000045	barn swallow calling
-x8yymm3DtVA_000022	playing cello
-xK1vy_6H2VM_000010	scuba diving
-xMa1vAUhTfM_000429	ice cream truck, ice cream van
-xN_CePbfjVg_000004	playing bass drum
-xPIhTw0fbzI_000010	train horning
-xQaYumd1O48_000004	lions growling
-xS4brO1qu0g_000591	playing hockey
-xUCKcoE3K6Q_000313	lip smacking
-xVDGIF1pFvQ_000030	driving buses
-xVEXWvj0iWo_000060	rowboat, canoe, kayak rowing
-xWBMt4fI95M_000063	scuba diving
-xWgd4OMcKbs_000263	people nose blowing
-xY9mlbn2IhY_000000	people burping
-xYAHwbhWEgM_000030	playing violin, fiddle
-xbNNxwGRG20_000062	cattle, bovinae cowbell
-xdUbCcEbipM_000290	people crowd
-xeS25F6uHic_000162	airplane flyby
-xetF74UUCGk_000001	ice cream truck, ice cream van
-xf0cheS5wFM_000090	playing piano
-xfT0HF1Pbxk_000003	playing sitar
-xg_3Uas3z40_000240	skateboarding
-xibFeibkfWM_000036	alligators, crocodiles hissing
-xj0Xi47RC88_000200	lawn mowing
-xkUzsvSImy4_000306	people eating crisps
-xm0N3HXnSWc_000361	rope skipping
-xoViga6dJa4_000141	playing steelpan
-xocKilOzrb4_000065	reversing beeps
-xq5kMmAFYx8_000030	playing double bass
-xqv96EPg7so_000200	railroad car, train wagon
-xtvQjd6cwC4_000040	playing bagpipes
-y3TRiYwDbHo_000287	playing oboe
-y6wsRU2aNx4_000040	railroad car, train wagon
-y95ml0IYGr4_000440	chainsawing trees
-yA_63YfQ034_000022	dog growling
-yBwMu2NueR0_000284	rapping
-yE_SP127xy8_000010	people crowd
-yEfhYsMd1yc_000006	playing double bass
-yH3PJfYi_gs_000109	car engine starting
-yJGtoH8INnA_000084	tapping guitar
-yJN5_1tfqXo_000075	magpie calling
-yMMmjb3BRi0_000030	dog bow-wow
-yOhdod2Kg40_000210	playing bassoon
-yPJiPWkeT3U_000254	playing gong
-yPUYU6t3rwo_000370	bee, wasp, etc. buzzing
-yQzzdP-4iBU_000002	planing timber
-yUL9UefoANU_000128	tractor digging
-yVzIaZzLH38_000130	bee, wasp, etc. buzzing
-yYPNrg-s-NI_000060	child singing
-ybnXdQfSNZs_000001	police radio chatter
-ycN30BUfzeo_000070	playing clarinet
-ygOHZ_55jME_000174	electric shaver, electric razor shaving
-yjyZgzYuuSQ_000089	cat purring
-yo5I2MTqv9E_000030	playing marimba, xylophone
-ywD_am3uZh8_000020	splashing water
-ywYLMe6y-S0_000040	playing piano
-z9CCSNKepA8_000537	striking pool
-z9crgUIWcmA_000000	dog barking
-zBgR_gj8NGg_000083	striking pool
-zGbJAz-3Ao8_000070	playing banjo
-zGn9k6j8kVo_000049	rope skipping
-zILE3kr9nIU_000030	mouse pattering
-zJPgE79wkE4_000000	playing tennis
-zMKJFnBr1Gw_000013	reversing beeps
-zPlyG_ryFpg_000006	sliding door
-zRU8A0m9Op8_000145	driving snowmobile
-zVCqTRlc7NU_000020	fire truck siren
-zYPY3Fh1Xjo_000000	skidding
-zcZ0WVQ8t8s_000210	splashing water
-zhPLdAMVAuo_000257	church bell ringing
-zl6hP51zURM_000075	playing oboe
-zlt2EGxum58_000174	bouncing on trampoline
-zmSPCArJHB0_000190	bird squawking
-zpqGedo-jm4_000043	cell phone buzzing
-zrKMC4fAKp0_000202	playing cello
-zsnU7rt_Qq0_000005	baby laughter
-zw7dTh-Lx3o_000074	canary calling
-zzP5qr-ZxHY_000199	people marching
-zzftU8z4aOI_000230	skateboarding

MMAudio/train.py DELETED Viewed

@@ -1,209 +0,0 @@
-import logging
-import math
-import random
-from datetime import timedelta
-from pathlib import Path
-import hydra
-import numpy as np
-import torch
-import torch.distributed as distributed
-from hydra import compose
-from hydra.core.hydra_config import HydraConfig
-from omegaconf import DictConfig, open_dict
-from torch.distributed.elastic.multiprocessing.errors import record
-from mmaudio.data.data_setup import setup_training_datasets, setup_val_datasets
-from mmaudio.model.sequence_config import CONFIG_16K, CONFIG_44K
-from mmaudio.runner import Runner
-from mmaudio.sample import sample
-from mmaudio.utils.dist_utils import info_if_rank_zero, local_rank, world_size
-from mmaudio.utils.logger import TensorboardLogger
-from mmaudio.utils.synthesize_ema import synthesize_ema
-torch.backends.cuda.matmul.allow_tf32 = True
-torch.backends.cudnn.allow_tf32 = True
-log = logging.getLogger()
-def distributed_setup():
-    distributed.init_process_group(backend="nccl", timeout=timedelta(hours=2))
-    log.info(f'Initialized: local_rank={local_rank}, world_size={world_size}')
-    return local_rank, world_size
-@record
-@hydra.main(version_base='1.3.2', config_path='config', config_name='train_config.yaml')
-def train(cfg: DictConfig):
-    # initial setup
-    torch.cuda.set_device(local_rank)
-    torch.backends.cudnn.benchmark = cfg.cudnn_benchmark
-    distributed_setup()
-    num_gpus = world_size
-    run_dir = HydraConfig.get().run.dir
-    # compose early such that it does not rely on future hard disk reading
-    eval_cfg = compose('eval_config', overrides=[f'exp_id={cfg.exp_id}'])
-    # patch data dim
-    if cfg.model.endswith('16k'):
-        seq_cfg = CONFIG_16K
-    elif cfg.model.endswith('44k'):
-        seq_cfg = CONFIG_44K
-    else:
-        raise ValueError(f'Unknown model: {cfg.model}')
-    with open_dict(cfg):
-        cfg.data_dim.latent_seq_len = seq_cfg.latent_seq_len
-        cfg.data_dim.clip_seq_len = seq_cfg.clip_seq_len
-        cfg.data_dim.sync_seq_len = seq_cfg.sync_seq_len
-    # wrap python logger with a tensorboard logger
-    log = TensorboardLogger(cfg.exp_id,
-                            run_dir,
-                            logging.getLogger(),
-                            is_rank0=(local_rank == 0),
-                            enable_email=cfg.enable_email and not cfg.debug)
-    info_if_rank_zero(log, f'All configuration: {cfg}')
-    info_if_rank_zero(log, f'Number of GPUs detected: {num_gpus}')
-    # number of dataloader workers
-    info_if_rank_zero(log, f'Number of dataloader workers (per GPU): {cfg.num_workers}')
-    # Set seeds to ensure the same initialization
-    torch.manual_seed(cfg.seed)
-    np.random.seed(cfg.seed)
-    random.seed(cfg.seed)
-    # setting up configurations
-    info_if_rank_zero(log, f'Training configuration: {cfg}')
-    cfg.batch_size //= num_gpus
-    info_if_rank_zero(log, f'Batch size (per GPU): {cfg.batch_size}')
-    # determine time to change max skip
-    total_iterations = cfg['num_iterations']
-    # setup datasets
-    dataset, sampler, loader = setup_training_datasets(cfg)
-    info_if_rank_zero(log, f'Number of training samples: {len(dataset)}')
-    info_if_rank_zero(log, f'Number of training batches: {len(loader)}')
-    val_dataset, val_loader, eval_loader = setup_val_datasets(cfg)
-    info_if_rank_zero(log, f'Number of val samples: {len(val_dataset)}')
-    val_cfg = cfg.data.ExtractedVGG_val
-    # compute and set mean and std
-    latent_mean, latent_std = dataset.compute_latent_stats()
-    # construct the trainer
-    trainer = Runner(cfg,
-                     log=log,
-                     run_path=run_dir,
-                     for_training=True,
-                     latent_mean=latent_mean,
-                     latent_std=latent_std).enter_train()
-    eval_rng_clone = trainer.rng.graphsafe_get_state()
-    # load previous checkpoint if needed
-    if cfg['checkpoint'] is not None:
-        curr_iter = trainer.load_checkpoint(cfg['checkpoint'])
-        cfg['checkpoint'] = None
-        info_if_rank_zero(log, 'Model checkpoint loaded!')
-    else:
-        # if run_dir exists, load the latest checkpoint
-        checkpoint = trainer.get_latest_checkpoint_path()
-        if checkpoint is not None:
-            curr_iter = trainer.load_checkpoint(checkpoint)
-            info_if_rank_zero(log, 'Latest checkpoint loaded!')
-        else:
-            # load previous network weights if needed
-            curr_iter = 0
-            if cfg['weights'] is not None:
-                info_if_rank_zero(log, 'Loading weights from the disk')
-                trainer.load_weights(cfg['weights'])
-                cfg['weights'] = None
-    # determine max epoch
-    total_epoch = math.ceil(total_iterations / len(loader))
-    current_epoch = curr_iter // len(loader)
-    info_if_rank_zero(log, f'We will approximately use {total_epoch} epochs.')
-    # training loop
-    try:
-        # Need this to select random bases in different workers
-        np.random.seed(np.random.randint(2**30 - 1) + local_rank * 1000)
-        while curr_iter < total_iterations:
-            # Crucial for randomness!
-            sampler.set_epoch(current_epoch)
-            current_epoch += 1
-            log.debug(f'Current epoch: {current_epoch}')
-            trainer.enter_train()
-            trainer.log.data_timer.start()
-            for data in loader:
-                trainer.train_pass(data, curr_iter)
-                if (curr_iter + 1) % cfg.val_interval == 0:
-                    # swap into a eval rng state, i.e., use the same seed for every validation pass
-                    train_rng_snapshot = trainer.rng.graphsafe_get_state()
-                    trainer.rng.graphsafe_set_state(eval_rng_clone)
-                    info_if_rank_zero(log, f'Iteration {curr_iter}: validating')
-                    for data in val_loader:
-                        trainer.validation_pass(data, curr_iter)
-                    distributed.barrier()
-                    trainer.val_integrator.finalize('val', curr_iter, ignore_timer=True)
-                    trainer.rng.graphsafe_set_state(train_rng_snapshot)
-                if (curr_iter + 1) % cfg.eval_interval == 0:
-                    save_eval = (curr_iter + 1) % cfg.save_eval_interval == 0
-                    train_rng_snapshot = trainer.rng.graphsafe_get_state()
-                    trainer.rng.graphsafe_set_state(eval_rng_clone)
-                    info_if_rank_zero(log, f'Iteration {curr_iter}: validating')
-                    for data in eval_loader:
-                        audio_path = trainer.inference_pass(data,
-                                                            curr_iter,
-                                                            val_cfg,
-                                                            save_eval=save_eval)
-                    distributed.barrier()
-                    trainer.rng.graphsafe_set_state(train_rng_snapshot)
-                    trainer.eval(audio_path, curr_iter, val_cfg)
-                curr_iter += 1
-                if curr_iter >= total_iterations:
-                    break
-    except Exception as e:
-        log.error(f'Error occurred at iteration {curr_iter}!')
-        log.critical(e.message if hasattr(e, 'message') else str(e))
-        raise
-    finally:
-        if not cfg.debug:
-            trainer.save_checkpoint(curr_iter)
-            trainer.save_weights(curr_iter)
-    # Inference pass
-    del trainer
-    torch.cuda.empty_cache()
-    # Synthesize EMA
-    if local_rank == 0:
-        log.info(f'Synthesizing EMA with sigma={cfg.ema.default_output_sigma}')
-        ema_sigma = cfg.ema.default_output_sigma
-        state_dict = synthesize_ema(cfg, ema_sigma, step=None)
-        save_dir = Path(run_dir) / f'{cfg.exp_id}_ema_final.pth'
-        torch.save(state_dict, save_dir)
-        log.info(f'Synthesized EMA saved to {save_dir}!')
-    distributed.barrier()
-    log.info(f'Evaluation: {eval_cfg}')
-    sample(eval_cfg)
-    # clean-up
-    log.complete()
-    distributed.barrier()
-    distributed.destroy_process_group()
-if __name__ == '__main__':
-    train()