ray-006 committed on
Commit fc605f9 · verified · 1 Parent(s): e1c7597

Upload 43 files

.gitattributes CHANGED
@@ -35,3 +35,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  Sam_audio/assets/sam_audio_main_model.png filter=lfs diff=lfs merge=lfs -text
37
  Sam_audio/examples/assets/office.mp4 filter=lfs diff=lfs merge=lfs -text
38
+ assets/sam_audio_main_model.png filter=lfs diff=lfs merge=lfs -text
39
+ examples/assets/office.mp4 filter=lfs diff=lfs merge=lfs -text
.github/workflows/ci.yaml ADDED
@@ -0,0 +1,44 @@
1
+ name: CI
2
+ on:
3
+ push:
4
+ pull_request:
5
+
6
+ env:
7
+ CACHE_NUMBER: 0
8
+
9
+ jobs:
10
+ build:
11
+ runs-on: 32-core-ubuntu
12
+ steps:
13
+ - uses: actions/checkout@v4
14
+
15
+ - uses: mamba-org/setup-micromamba@v1.8.1
16
+ with:
17
+ environment-name: sam-audio
18
+ init-shell: bash
19
+ create-args: >-
20
+ python=3.11
21
+ pip=24.2
22
+ ruff
23
+
24
+ - uses: actions/cache@v4
25
+ with:
26
+ path: /home/runner/micromamba/envs/sam-audio
27
+ key: ${{ hashFiles('pyproject.toml') }}-${{ env.CACHE_NUMBER }}
28
+ id: cache
29
+
30
+ - name: Update environment
31
+ shell: bash -l {0}
32
+ run: |
33
+ pip install .
34
+ if: steps.cache.outputs.cache-hit != 'true'
35
+
36
+ - name: Check formatting
37
+ shell: bash -l {0}
38
+ run: |
39
+ ruff format --check .
40
+
41
+ - name: Lint
42
+ shell: bash -l {0}
43
+ run: |
44
+ ruff check .
.gitignore ADDED
@@ -0,0 +1,8 @@
1
+ __pycache__
2
+ *.egg-info
3
+ *.pyc
4
+ *.so
5
+ build
6
+ dist
7
+ .checkpoints
8
+ .ipynb_checkpoints
.pre-commit-config.yaml ADDED
@@ -0,0 +1,15 @@
1
+ repos:
2
+ - repo: https://github.com/pre-commit/pre-commit-hooks
3
+ rev: v4.6.0
4
+ hooks:
5
+ - id: trailing-whitespace
6
+ - id: check-yaml
7
+ args:
8
+ - --allow-multiple-documents
9
+ - id: end-of-file-fixer
10
+ - repo: https://github.com/astral-sh/ruff-pre-commit
11
+ rev: v0.12.0
12
+ hooks:
13
+ - id: ruff
14
+ args: [ --fix ]
15
+ - id: ruff-format
CODE_OF_CONDUCT.md ADDED
@@ -0,0 +1,80 @@
1
+ # Code of Conduct
2
+
3
+ ## Our Pledge
4
+
5
+ In the interest of fostering an open and welcoming environment, we as
6
+ contributors and maintainers pledge to make participation in our project and
7
+ our community a harassment-free experience for everyone, regardless of age, body
8
+ size, disability, ethnicity, sex characteristics, gender identity and expression,
9
+ level of experience, education, socio-economic status, nationality, personal
10
+ appearance, race, religion, or sexual identity and orientation.
11
+
12
+ ## Our Standards
13
+
14
+ Examples of behavior that contributes to creating a positive environment
15
+ include:
16
+
17
+ * Using welcoming and inclusive language
18
+ * Being respectful of differing viewpoints and experiences
19
+ * Gracefully accepting constructive criticism
20
+ * Focusing on what is best for the community
21
+ * Showing empathy towards other community members
22
+
23
+ Examples of unacceptable behavior by participants include:
24
+
25
+ * The use of sexualized language or imagery and unwelcome sexual attention or
26
+ advances
27
+ * Trolling, insulting/derogatory comments, and personal or political attacks
28
+ * Public or private harassment
29
+ * Publishing others' private information, such as a physical or electronic
30
+ address, without explicit permission
31
+ * Other conduct which could reasonably be considered inappropriate in a
32
+ professional setting
33
+
34
+ ## Our Responsibilities
35
+
36
+ Project maintainers are responsible for clarifying the standards of acceptable
37
+ behavior and are expected to take appropriate and fair corrective action in
38
+ response to any instances of unacceptable behavior.
39
+
40
+ Project maintainers have the right and responsibility to remove, edit, or
41
+ reject comments, commits, code, wiki edits, issues, and other contributions
42
+ that are not aligned to this Code of Conduct, or to ban temporarily or
43
+ permanently any contributor for other behaviors that they deem inappropriate,
44
+ threatening, offensive, or harmful.
45
+
46
+ ## Scope
47
+
48
+ This Code of Conduct applies within all project spaces, and it also applies when
49
+ an individual is representing the project or its community in public spaces.
50
+ Examples of representing a project or community include using an official
51
+ project e-mail address, posting via an official social media account, or acting
52
+ as an appointed representative at an online or offline event. Representation of
53
+ a project may be further defined and clarified by project maintainers.
54
+
55
+ This Code of Conduct also applies outside the project spaces when there is a
56
+ reasonable belief that an individual's behavior may have a negative impact on
57
+ the project or its community.
58
+
59
+ ## Enforcement
60
+
61
+ Instances of abusive, harassing, or otherwise unacceptable behavior may be
62
+ reported by contacting the project team at <opensource-conduct@meta.com>. All
63
+ complaints will be reviewed and investigated and will result in a response that
64
+ is deemed necessary and appropriate to the circumstances. The project team is
65
+ obligated to maintain confidentiality with regard to the reporter of an incident.
66
+ Further details of specific enforcement policies may be posted separately.
67
+
68
+ Project maintainers who do not follow or enforce the Code of Conduct in good
69
+ faith may face temporary or permanent repercussions as determined by other
70
+ members of the project's leadership.
71
+
72
+ ## Attribution
73
+
74
+ This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
75
+ available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
76
+
77
+ [homepage]: https://www.contributor-covenant.org
78
+
79
+ For answers to common questions about this code of conduct, see
80
+ https://www.contributor-covenant.org/faq
CONTRIBUTING.md ADDED
@@ -0,0 +1,31 @@
1
+ # Contributing to segment-anything-model-audio
2
+ We want to make contributing to this project as easy and transparent as
3
+ possible.
4
+
5
+ ## Pull Requests
6
+ We actively welcome your pull requests.
7
+
8
+ 1. Fork the repo and create your branch from `main`.
9
+ 2. If you've added code that should be tested, add tests.
10
+ 3. If you've changed APIs, update the documentation.
11
+ 4. Ensure the test suite passes.
12
+ 5. Make sure your code lints.
13
+ 6. If you haven't already, complete the Contributor License Agreement ("CLA").
14
+
15
+ ## Contributor License Agreement ("CLA")
16
+ In order to accept your pull request, we need you to submit a CLA. You only need
17
+ to do this once to work on any of Facebook's open source projects.
18
+
19
+ Complete your CLA here: <https://code.facebook.com/cla>
20
+
21
+ ## Issues
22
+ We use GitHub issues to track public bugs. Please ensure your description is
23
+ clear and has sufficient instructions to be able to reproduce the issue.
24
+
25
+ Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe
26
+ disclosure of security bugs. In those cases, please go through the process
27
+ outlined on that page and do not file a public issue.
28
+
29
+ ## License
30
+ By contributing to segment-anything-model-audio, you agree that your contributions will be licensed
31
+ under the LICENSE file in the root directory of this source tree.
LICENSE ADDED
@@ -0,0 +1,61 @@
1
+ SAM License
2
+ Last Updated: November 19, 2025
3
+
4
+ “Agreement” means the terms and conditions for use, reproduction, distribution and modification of the SAM Materials set forth herein.
5
+
6
+
7
+ “SAM Materials” means, collectively, Documentation and the models, software and algorithms, including machine-learning model code, trained model weights, inference-enabling code, training-enabling code, fine-tuning enabling code, and other elements of the foregoing distributed by Meta and made available under this Agreement.
8
+
9
+ “Documentation” means the specifications, manuals and documentation accompanying
10
+ SAM Materials distributed by Meta.
11
+
12
+
13
+ “Licensee” or “you” means you, or your employer or any other person or entity (if you are entering into this Agreement on such person or entity’s behalf), of the age required under applicable laws, rules or regulations to provide legal consent and that has legal authority to bind your employer or such other person or entity if you are entering in this Agreement on their behalf.
14
+
15
+
16
+ “Meta” or “we” means Meta Platforms Ireland Limited (if you are located in or, if you are an entity, your principal place of business is in the EEA or Switzerland) or Meta Platforms, Inc. (if you are located outside of the EEA or Switzerland).
17
+
18
+
19
+ “Sanctions” means any economic or trade sanctions or restrictions administered or enforced by the United States (including the Office of Foreign Assets Control of the U.S. Department of the Treasury (“OFAC”), the U.S. Department of State and the U.S. Department of Commerce), the United Nations, the European Union, or the United Kingdom.
20
+
21
+
22
+ “Trade Controls” means any of the following: Sanctions and applicable export and import controls.
23
+
24
+ By using or distributing any portion or element of the SAM Materials, you agree to be bound by this Agreement.
25
+
26
+
27
+ 1. License Rights and Redistribution.
28
+
29
+
30
+ a. Grant of Rights. You are granted a non-exclusive, worldwide, non-transferable and royalty-free limited license under Meta’s intellectual property or other rights owned by Meta embodied in the SAM Materials to use, reproduce, distribute, copy, create derivative works of, and make modifications to the SAM Materials.
31
+
32
+ b. Redistribution and Use.
33
+ i. Distribution of SAM Materials, and any derivative works thereof, are subject to the terms of this Agreement. If you distribute or make the SAM Materials, or any derivative works thereof, available to a third party, you may only do so under the terms of this Agreement and you shall provide a copy of this Agreement with any such SAM Materials.
34
+
35
+
36
+ ii. If you submit for publication the results of research you perform on, using, or otherwise in connection with SAM Materials, you must acknowledge the use of SAM Materials in your publication.
37
+
38
+
39
+ iii. Your use of the SAM Materials must comply with applicable laws and regulations, including Trade Control Laws and applicable privacy and data protection laws.
40
+ iv. Your use of the SAM Materials will not involve or encourage others to reverse engineer, decompile or discover the underlying components of the SAM Materials.
41
+ v. You are not the target of Trade Controls and your use of SAM Materials must comply with Trade Controls. You agree not to use, or permit others to use, SAM Materials for any activities subject to the International Traffic in Arms Regulations (ITAR) or end uses prohibited by Trade Controls, including those related to military or warfare purposes, nuclear industries or applications, espionage, or the development or use of guns or illegal weapons.
42
+ 2. User Support. Your use of the SAM Materials is done at your own discretion; Meta does not process any information nor provide any service in relation to such use. Meta is under no obligation to provide any support services for the SAM Materials. Any support provided is “as is”, “with all faults”, and without warranty of any kind.
43
+
44
+
45
+ 3. Disclaimer of Warranty. UNLESS REQUIRED BY APPLICABLE LAW, THE SAM MATERIALS AND ANY OUTPUT AND RESULTS THEREFROM ARE PROVIDED ON AN “AS IS” BASIS, WITHOUT WARRANTIES OF ANY KIND, AND META DISCLAIMS ALL WARRANTIES OF ANY KIND, BOTH EXPRESS AND IMPLIED, INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. YOU ARE SOLELY RESPONSIBLE FOR DETERMINING THE APPROPRIATENESS OF USING OR REDISTRIBUTING THE SAM MATERIALS AND ASSUME ANY RISKS ASSOCIATED WITH YOUR USE OF THE SAM MATERIALS AND ANY OUTPUT AND RESULTS.
46
+
47
+ 4. Limitation of Liability. IN NO EVENT WILL META OR ITS AFFILIATES BE LIABLE UNDER ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, TORT, NEGLIGENCE, PRODUCTS LIABILITY, OR OTHERWISE, ARISING OUT OF THIS AGREEMENT, FOR ANY LOST PROFITS OR ANY DIRECT OR INDIRECT, SPECIAL, CONSEQUENTIAL, INCIDENTAL, EXEMPLARY OR PUNITIVE DAMAGES, EVEN IF META OR ITS AFFILIATES HAVE BEEN ADVISED OF THE POSSIBILITY OF ANY OF THE FOREGOING.
48
+
49
+ 5. Intellectual Property.
50
+
51
+
52
+ a. Subject to Meta’s ownership of SAM Materials and derivatives made by or for Meta, with respect to any derivative works and modifications of the SAM Materials that are made by you, as between you and Meta, you are and will be the owner of such derivative works and modifications.
53
+
54
+ b. If you institute litigation or other proceedings against Meta or any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the SAM Materials, outputs or results, or any portion of any of the foregoing, constitutes infringement of intellectual property or other rights owned or licensable by you, then any licenses granted to you under this Agreement shall terminate as of the date such litigation or claim is filed or instituted. You will indemnify and hold harmless Meta from and against any claim by any third party arising out of or related to your use or distribution of the SAM Materials.
55
+
56
+ 6. Term and Termination. The term of this Agreement will commence upon your acceptance of this Agreement or access to the SAM Materials and will continue in full force and effect until terminated in accordance with the terms and conditions herein. Meta may terminate this Agreement if you are in breach of any term or condition of this Agreement. Upon termination of this Agreement, you shall delete and cease use of the SAM Materials. Sections 3, 4 and 7 shall survive the termination of this Agreement.
57
+
58
+ 7. Governing Law and Jurisdiction. This Agreement will be governed and construed under the laws of the State of California without regard to choice of law principles, and the UN Convention on Contracts for the International Sale of Goods does not apply to this Agreement. The courts of California shall have exclusive jurisdiction of any dispute arising out of this Agreement.
59
+
60
+
61
+ 8. Modifications and Amendments. Meta may modify this Agreement from time to time; provided that they are similar in spirit to the current version of the Agreement, but may differ in detail to address new problems or concerns. All such changes will be effective immediately. Your continued use of the SAM Materials after any modification to this Agreement constitutes your agreement to such modification. Except as provided in this Agreement, no modification or addition to any provision of this Agreement will be binding unless it is in writing and signed by an authorized representative of both you and Meta.
README.md CHANGED
@@ -1,14 +1,137 @@
1
- ---
2
- title: Sample Audio
3
- emoji: 📚
4
- colorFrom: indigo
5
- colorTo: red
6
- sdk: gradio
7
- sdk_version: 6.2.0
8
- app_file: app.py
9
- pinned: false
10
- license: mit
11
- short_description: Sample-Audio
12
- ---
13
-
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
1
+ <div align="center">
2
+
3
+ # SAM-Audio
4
+
5
+ ![CI](https://github.com/facebookresearch/sam-audio/actions/workflows/ci.yaml/badge.svg)
6
+
7
+ ![model_image](assets/sam_audio_main_model.png)
8
+
9
+ </div>
10
+
11
+ Segment Anything Model for Audio [[**Blog**](https://ai.meta.com/blog/sam-audio/)] [[**Paper**](https://ai.meta.com/research/publications/sam-audio-segment-anything-in-audio/)] [[**Demo**](https://aidemos.meta.com/segment-anything/editor/segment-audio)]
12
+
13
+ SAM-Audio is a foundation model for isolating any sound in audio using text, visual, or temporal prompts. It can separate specific sounds from complex audio mixtures based on natural language descriptions, visual cues from video, or time spans.
14
+
15
+ SAM-Audio and the Judge model crucially rely on [Perception-Encoder Audio-Visual (PE-AV)](https://huggingface.co/facebook/pe-av-large), which you can read more about [here](https://ai.meta.com/research/publications/pushing-the-frontier-of-audiovisual-perception-with-large-scale-multimodal-correspondence-learning/).
16
+
17
+ ## Setup
18
+
19
+ **Requirements:**
20
+ - Python >= 3.10
21
+ - CUDA-compatible GPU (recommended)
22
+
23
+ Install dependencies:
24
+
25
+ ```bash
26
+ pip install .
27
+ ```
28
+
29
+ ## Usage
30
+
31
+ ⚠️ Before using SAM Audio, please request access to the checkpoints on the SAM Audio
32
+ Hugging Face [repo](https://huggingface.co/facebook/sam-audio-large). Once accepted, you
33
+ need to be authenticated to download the checkpoints. You can do this by running
34
+ the following [steps](https://huggingface.co/docs/huggingface_hub/en/quick-start#authentication)
35
+ (e.g., `hf auth login` after generating an access token).
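+
+ If you prefer to authenticate from Python rather than the CLI, a minimal equivalent using the standard `huggingface_hub` helper (not a SAM-Audio-specific API) is:
+
+ ```python
+ from huggingface_hub import login
+
+ # Prompts for the access token generated on the Hugging Face website
+ login()
+ ```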
36
+
37
+ ### Basic Text Prompting
38
+
39
+ ```python
40
+ from sam_audio import SAMAudio, SAMAudioProcessor
41
+ import torchaudio
42
+ import torch
43
+
44
+ model = SAMAudio.from_pretrained("facebook/sam-audio-large")
45
+ processor = SAMAudioProcessor.from_pretrained("facebook/sam-audio-large")
46
+ model = model.eval().cuda()
47
+
48
+ file = "<audio file>" # audio file path or torch tensor
49
+ description = "<description>"
50
+
51
+ batch = processor(
52
+ audios=[file],
53
+ descriptions=[description],
54
+ ).to("cuda")
55
+
56
+ with torch.inference_mode():
57
+ # NOTE: `predict_spans` and `reranking_candidates` have a large impact on performance.
58
+ # Setting `predict_spans=True` and `reranking_candidates=8` will give you better results at the cost of
59
+ # latency and memory. See the "Span Prediction" section below for more details
60
+ result = model.separate(batch, predict_spans=False, reranking_candidates=1)
61
+
62
+ # Save separated audio
63
+ sample_rate = processor.audio_sampling_rate
64
+ torchaudio.save("target.wav", result.target.cpu(), sample_rate) # The isolated sound
65
+ torchaudio.save("residual.wav", result.residual.cpu(), sample_rate) # Everything else
66
+ ```
67
+
68
+ ### Prompting Methods
69
+
70
+ SAM-Audio supports three types of prompts:
71
+
72
+ 1. **Text Prompting**: Describe the sound you want to isolate using natural language
73
+ ```python
74
+ processor(audios=[audio], descriptions=["A man speaking"])
75
+ ```
76
+
77
+ 2. **Visual Prompting**: Use video frames and masks to isolate sounds associated with visual objects
78
+ ```python
79
+ processor(audios=[video], descriptions=[""], masked_videos=processor.mask_videos([frames], [mask]))
80
+ ```
81
+
82
+ 3. **Span Prompting**: Specify time ranges where the target sound occurs
83
+ ```python
84
+ processor(audios=[audio], descriptions=["A horn honking"], anchors=[[["+", 6.3, 7.0]]])
85
+ ```
86
+
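+ As a rough end-to-end sketch of visual prompting, the snippet below decodes frames with `torchcodec` and applies an illustrative mask. In practice the per-frame mask would come from a video segmentation model such as SAM; the mask shape and the "left half of the frame" selection here are assumptions for illustration, not the official workflow (see `examples/visual_prompting.ipynb` for that).
+
+ ```python
+ import torch
+ from torchcodec.decoders import VideoDecoder
+ from sam_audio import SAMAudio, SAMAudioProcessor
+
+ model = SAMAudio.from_pretrained("facebook/sam-audio-large").eval().cuda()
+ processor = SAMAudioProcessor.from_pretrained("facebook/sam-audio-large")
+
+ video_path = "examples/assets/office.mp4"
+ frames = VideoDecoder(video_path)[:].data  # (T, C, H, W) video frames
+
+ # Illustrative binary mask: 1 where the sounding object is, 0 elsewhere.
+ # Here we simply select the left half of every frame.
+ mask = torch.zeros(frames.size(0), 1, frames.size(2), frames.size(3))
+ mask[..., : frames.size(3) // 2] = 1.0
+
+ batch = processor(
+     audios=[video_path],  # the audio track is read from the video file
+     descriptions=[""],
+     masked_videos=processor.mask_videos([frames], [mask]),
+ ).to("cuda")
+
+ with torch.inference_mode():
+     result = model.separate(batch)
+ ```
+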
87
+ See the [examples](examples) directory for more detailed examples.
88
+
89
+ ### Span Prediction (Optional for Text Prompting)
90
+
91
+ We also support automatically predicting the spans from the text description, which is especially helpful for separating non-ambience sound events. You can enable this by passing `predict_spans=True` in your call to `separate`.
92
+
93
+ ```python
94
+ with torch.inference_mode():
95
+ outputs = model.separate(batch, predict_spans=True)
96
+
97
+ # To further improve performance (at the expense of latency), you can add candidate re-ranking
98
+ with torch.inference_mode():
99
+ outputs = model.separate(batch, predict_spans=True, reranking_candidates=8)
100
+ ```
101
+
102
+ ### Re-Ranking
103
+
104
+ We provide the following models to assess the quality of the separated audio:
105
+
106
+ - [CLAP](https://github.com/LAION-AI/CLAP): measures the similarity between the target audio and text description
107
+ - [Judge](https://huggingface.co/facebook/sam-audio-judge): measures the overall separation quality across 3 axes: precision, recall, and faithfulness (see the [model card](https://huggingface.co/facebook/sam-audio-judge#output-format) for more details)
108
+ - [ImageBind](https://github.com/facebookresearch/ImageBind): for visual prompting, we measure the imagebind embedding similarity between the separated audio and the masked input video
109
+
110
+ We support generating multiple candidates (by setting `reranking_candidates=<k>` in your call to `separate`): the model generates `k` candidate separations and keeps the one ranked best by the models listed above.
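+
+ For reference, a minimal sketch of scoring a finished separation with the Judge model yourself, mirroring the call pattern in `eval/metrics/judge.py` later in this commit (the file names are placeholders):
+
+ ```python
+ import torch
+ import torchaudio
+ from sam_audio import SAMAudioJudgeModel, SAMAudioJudgeProcessor
+
+ judge = SAMAudioJudgeModel.from_pretrained("facebook/sam-audio-judge").eval().cuda()
+ judge_processor = SAMAudioJudgeProcessor.from_pretrained("facebook/sam-audio-judge")
+
+ mixture, sr = torchaudio.load("mixture.wav")    # original input audio
+ separated, _ = torchaudio.load("target.wav")    # output of model.separate
+
+ inputs = judge_processor(
+     text=["A man speaking"],
+     input_audio=[mixture],
+     separated_audio=[separated],
+     sampling_rate=sr,
+ ).to("cuda")
+
+ with torch.inference_mode():
+     scores = judge(**inputs)
+
+ print(scores.overall, scores.precision, scores.recall, scores.faithfulness)
+ ```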
111
+
112
+ # Models
113
+
114
+ Below is a table of the models we released, along with their overall subjective evaluation scores.
115
+
116
+ | Model | General SFX | Speech | Speaker | Music | Instr(wild) | Instr(pro) |
117
+ |----------|-------------|--------|---------|-------|-------------|------------|
118
+ | [`sam-audio-small`](https://huggingface.co/facebook/sam-audio-small) | 3.62 | 3.99 | 3.12 | 4.11 | 3.56 | 4.24 |
119
+ | [`sam-audio-base`](https://huggingface.co/facebook/sam-audio-base) | 3.28 | 4.25 | 3.57 | 3.87 | 3.66 | 4.27 |
120
+ | [`sam-audio-large`](https://huggingface.co/facebook/sam-audio-large) | 3.50 | 4.03 | 3.60 | 4.22 | 3.66 | 4.49 |
121
+
122
+ We additionally release another variant (in each size) that is specifically better at target-sound correctness and visual prompting:
123
+ - [`sam-audio-small-tv`](https://huggingface.co/facebook/sam-audio-small-tv)
124
+ - [`sam-audio-base-tv`](https://huggingface.co/facebook/sam-audio-base-tv)
125
+ - [`sam-audio-large-tv`](https://huggingface.co/facebook/sam-audio-large-tv)
126
+
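+ Any of these checkpoints can be dropped into the usage example above by swapping the model ID, e.g.:
+
+ ```python
+ from sam_audio import SAMAudio, SAMAudioProcessor
+
+ model = SAMAudio.from_pretrained("facebook/sam-audio-large-tv")
+ processor = SAMAudioProcessor.from_pretrained("facebook/sam-audio-large-tv")
+ ```
+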
127
+ ## Evaluation
128
+
129
+ See the [eval](eval) directory for instructions and scripts to reproduce results from the paper.
130
+
131
+ ## Contributing
132
+
133
+ See [contributing](CONTRIBUTING.md) and [code of conduct](CODE_OF_CONDUCT.md) for more information.
134
+
135
+ ## License
136
+
137
+ This project is licensed under the SAM License - see the [LICENSE](LICENSE) file for details.
assets/sam_audio_main_model.png ADDED

Git LFS Details

  • SHA256: 8dc7bda3f7ad3a910cdcd5d137b3e7e721f2381736ac19e1155732422b818bbd
  • Pointer size: 132 Bytes
  • Size of remote file: 1.55 MB
eval/README.md ADDED
@@ -0,0 +1,100 @@
1
+ # Evaluation
2
+
3
+ This directory contains the evaluation code to reproduce the results from the SAM-Audio paper. The evaluation framework supports multiple datasets, prompting modes (text-only, span, visual), and metrics.
4
+
5
+ ## Setup
6
+
7
+ Before running evaluation, ensure you have:
8
+
9
+ 1. Installed the SAM-Audio package and its dependencies
10
+ 2. Authenticated with Hugging Face to access the model checkpoints (see main [README](../README.md))
11
+
12
+ ## Quick Start
13
+
14
+ Run evaluation on the default setting (instr-pro):
15
+
16
+ ```bash
17
+ python main.py
18
+ ```
19
+
20
+ You can also use multiple GPUs to speed up evaluation:
21
+
22
+ ```bash
23
+ torchrun --nproc_per_node=<ngpus> main.py
24
+ ```
25
+
26
+ Evaluate on a specific setting:
27
+
28
+ ```bash
29
+ python main.py --setting sfx
30
+ ```
31
+
32
+ Evaluate on multiple settings:
33
+
34
+ ```bash
35
+ python main.py --setting sfx speech music
36
+ ```
37
+
38
+ ## Available Evaluation Settings
39
+
40
+ Run `python main.py --help` to see all available settings.
41
+
42
+ ## Command Line Options
43
+
44
+ ```bash
45
+ python main.py [OPTIONS]
46
+ ```
47
+
48
+ ### Options:
49
+
50
+ - `-s, --setting` - Which setting(s) to evaluate (default: `instr-pro`)
51
+ - Choices: See available settings above
52
+ - Can specify multiple settings: `--setting sfx speech music`
53
+
54
+ - `--cache-path` - Where to cache downloaded datasets (default: `~/.cache/sam_audio`)
55
+
56
+ - `-p, --checkpoint-path` - Model checkpoint to evaluate (default: `facebook/sam-audio-large`)
57
+ - Can use local path or Hugging Face model ID
58
+
59
+ - `-b, --batch-size` - Batch size for evaluation (default: `1`)
60
+
61
+ - `-w, --num-workers` - Number of data loading workers (default: `4`)
62
+
63
+ - `-c, --candidates` - Number of reranking candidates (default: `8`)
64
+
65
+ ## Evaluation Metrics
66
+
67
+ The evaluation framework computes the following metrics:
68
+
69
+ - **Judge** - SAM Audio Judge quality assessment metric
70
+ - **Aesthetic** - Aesthetic quality metric
71
+ - **CLAP** - Audio-text alignment metric (CLAP similarity)
72
+ - **ImageBind** - Audio-video alignment metric (for visual settings only)
73
+
74
+ ## Output
75
+
76
+ Results are saved to the `results/` directory as JSON files, one per setting:
77
+
78
+ ```
79
+ results/
80
+ ├── sfx.json
81
+ ├── speech.json
82
+ └── music.json
83
+ ```
84
+
85
+ Each JSON file contains the averaged metric scores across all samples in that setting.
86
+
87
+ Example output:
88
+ ```json
89
+ {
90
+ "JudgeOverall": "4.386",
91
+ "JudgeFaithfulness": "4.708",
92
+ "JudgeRecall": "4.934",
93
+ "JudgePrecision": "4.451",
94
+ "ContentEnjoyment": "5.296",
95
+ "ContentUsefulness": "6.903",
96
+ "ProductionComplexity": "4.301",
97
+ "ProductionQuality": "7.100",
98
+ "CLAPSimilarity": "0.271"
99
+ }
100
+ ```
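+
+ To compare settings side by side, the per-setting JSON files can be loaded into a small table; a sketch (assuming the `results/` layout above and using `pandas`, which the evaluation code already depends on):
+
+ ```python
+ import glob
+ import json
+ import os
+
+ import pandas as pd
+
+ rows = {}
+ for path in glob.glob("results/*.json"):
+     setting = os.path.splitext(os.path.basename(path))[0]
+     with open(path) as fin:
+         rows[setting] = json.load(fin)
+
+ # One row per setting, one column per metric
+ print(pd.DataFrame.from_dict(rows, orient="index"))
+ ```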
eval/dataset/__init__.py ADDED
@@ -0,0 +1,70 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved\n
2
+
3
+ from typing import Callable
4
+
5
+ from .musdb import MUSDB
6
+ from .sam_audio_bench import SAMAudioBench
7
+
8
+ SETTINGS = {
9
+ # Text-only settings
10
+ "sfx": (
11
+ SAMAudioBench,
12
+ {"span": False, "visual": False, "subset": "others-50:text-only"},
13
+ ),
14
+ "speech": (
15
+ SAMAudioBench,
16
+ {"span": False, "visual": False, "subset": "speech-clean-50:text-only"},
17
+ ),
18
+ "speaker": (
19
+ SAMAudioBench,
20
+ {"span": False, "visual": False, "subset": "spk-50:text-only"},
21
+ ),
22
+ "music": (
23
+ SAMAudioBench,
24
+ {"span": False, "visual": False, "subset": "music-clean-50:text-only"},
25
+ ),
26
+ "instr-wild": (
27
+ SAMAudioBench,
28
+ {"span": False, "visual": False, "subset": "instr-50:text-only"},
29
+ ),
30
+ "instr-pro": (MUSDB, {}),
31
+ # Span settings
32
+ "sfx-span": (
33
+ SAMAudioBench,
34
+ {"span": True, "visual": False, "subset": "others-50:text+span"},
35
+ ),
36
+ "speech-span": (
37
+ SAMAudioBench,
38
+ {"span": True, "visual": False, "subset": "speech-clean-50:text+span"},
39
+ ),
40
+ "speaker-span": (
41
+ SAMAudioBench,
42
+ {"span": True, "visual": False, "subset": "spk-50:text+span"},
43
+ ),
44
+ "music-span": (
45
+ SAMAudioBench,
46
+ {"span": True, "visual": False, "subset": "music-clean-50:text+span"},
47
+ ),
48
+ "instr-wild-span": (
49
+ SAMAudioBench,
50
+ {"span": True, "visual": False, "subset": "instr-50:text+span"},
51
+ ),
52
+ # Visual settings
53
+ "sfx-visual": (
54
+ SAMAudioBench,
55
+ {"span": False, "visual": True, "subset": "others-onscreen-50:visual-only"},
56
+ ),
57
+ "speaker-visual": (
58
+ SAMAudioBench,
59
+ {"span": False, "visual": True, "subset": "spk-onscreen-50:visual-only"},
60
+ ),
61
+ "instr-wild-visual": (
62
+ SAMAudioBench,
63
+ {"span": False, "visual": True, "subset": "instr-onscreen-50:visual-only"},
64
+ ),
65
+ }
66
+
67
+
68
+ def make_dataset(setting: str, cache_path: str, collate_fn: Callable):
69
+ dataset, kwargs = SETTINGS[setting]
70
+ return dataset(cache_path=cache_path, collate_fn=collate_fn, **kwargs)
eval/dataset/musdb.py ADDED
@@ -0,0 +1,75 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved\n
2
+
3
+ import os
4
+ from subprocess import check_call
5
+
6
+ import torchaudio
7
+ from datasets import load_dataset
8
+ from torch.utils.data import Dataset
9
+ from torchcodec.decoders import AudioDecoder
10
+
11
+
12
+ def cache_file(url, outfile):
13
+ if not os.path.exists(outfile):
14
+ print("Downloading musdb18hq dataset...")
15
+ os.makedirs(os.path.dirname(outfile), exist_ok=True)
16
+ check_call(["curl", "--url", url, "--output", outfile + ".tmp"])
17
+ os.rename(outfile + ".tmp", outfile)
18
+
19
+
20
+ class MUSDB(Dataset):
21
+ def __init__(
22
+ self,
23
+ collate_fn,
24
+ sample_rate: int = 48_000,
25
+ cache_path: str = os.path.expanduser("~/.cache/sam_audio"),
26
+ ):
27
+ self.cache_path = os.path.join(cache_path, "musdb18hq")
28
+ self.ds = self.get_dataset(cache_path)
29
+ self.captions = ["bass", "drums", "vocals"]
30
+ self.collate_fn = collate_fn
31
+ self.sample_rate = sample_rate
32
+
33
+ @property
34
+ def visual(self):
35
+ return False
36
+
37
+ def get_dataset(self, cache_path):
38
+ zip_file = os.path.join(cache_path, "musdb18hq.zip")
39
+ url = "https://zenodo.org/records/3338373/files/musdb18hq.zip?download=1"
40
+ cache_file(url, zip_file)
41
+ extracted_dir = os.path.join(cache_path, "musdb18hq")
42
+ if not os.path.exists(extracted_dir):
43
+ check_call(["unzip", zip_file, "-d", extracted_dir + ".tmp"])
44
+ os.rename(extracted_dir + ".tmp", extracted_dir)
45
+ return load_dataset("facebook/sam-audio-musdb18hq-test")["test"]
46
+
47
+ def __len__(self):
48
+ return len(self.ds)
49
+
50
+ def collate(self, items):
51
+ audios, descriptions = zip(*items, strict=False)
52
+ return self.collate_fn(
53
+ audios=audios,
54
+ descriptions=descriptions,
55
+ )
56
+
57
+ def __getitem__(self, idx):
58
+ item = self.ds[idx]
59
+ path = os.path.join(self.cache_path, "test", item["id"], "mixture.wav")
60
+ assert os.path.exists(path), f"{path} does not exist!"
61
+ decoder = AudioDecoder(path)
62
+ data = decoder.get_samples_played_in_range(item["start_time"], item["end_time"])
63
+ wav = data.data
64
+ if data.sample_rate != self.sample_rate:
65
+ wav = torchaudio.functional.resample(
66
+ wav, data.sample_rate, self.sample_rate
67
+ )
68
+ wav = wav.mean(0, keepdim=True)
69
+ return wav, item["description"]
70
+
71
+
72
+ if __name__ == "__main__":
73
+ dataset = MUSDB(lambda **kwargs: None)
74
+ print(len(dataset))
75
+ print(dataset[0])
eval/dataset/sam_audio_bench.py ADDED
@@ -0,0 +1,153 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved\n
2
+
3
+ import os
4
+ from dataclasses import dataclass
5
+ from io import BytesIO
6
+ from typing import Optional, Tuple
7
+
8
+ import numpy as np
9
+ import torch
10
+ import torch.nn.functional as F
11
+ import torchaudio
12
+ from datasets import load_dataset
13
+ from torchcodec.decoders import AudioDecoder, VideoDecoder
14
+
15
+
16
+ @dataclass
17
+ class Item:
18
+ anchors: list[Tuple[str, float, float]]
19
+ masked_video_frames: torch.Tensor
20
+ audio_samples: torch.Tensor
21
+ description: str
22
+
23
+
24
+ class SAMAudioBench(torch.utils.data.Dataset):
25
+ def __init__(
26
+ self,
27
+ cache_path,
28
+ collate_fn,
29
+ span: bool = True,
30
+ visual: bool = True,
31
+ subset: Optional[str] = None,
32
+ ):
33
+ self.dataset = load_dataset("facebook/sam-audio-bench")["test"]
34
+ self.subset = subset
35
+ self._span = span
36
+ self._visual = visual
37
+ if subset is not None:
38
+ self.dataset = self.dataset.filter(lambda x: subset in x["paper_eval_sets"])
39
+
40
+ self.cache_path = os.path.join(cache_path, "sam_audio_bench")
41
+ self.collate_fn = collate_fn
42
+ DATA_MSG = (
43
+ f"`SAMAudioBench` requires the user to create a directory named {self.cache_path} "
44
+ "see the README.md file for how to prepare"
45
+ )
46
+ assert os.path.exists(self.cache_path), DATA_MSG
47
+
48
+ @property
49
+ def visual(self):
50
+ return self._visual
51
+
52
+ def __len__(self):
53
+ return len(self.dataset)
54
+
55
+ def _get_path(
56
+ self, video_id: str, source_dataset: str, start_offset: float, end_offset: float
57
+ ) -> Tuple[str, bool]:
58
+ path = f"{self.cache_path}/{source_dataset}/{video_id}.mp4"
59
+ select_frames = True
60
+
61
+ if not os.path.exists(path):
62
+ path = f"{self.cache_path}/{source_dataset}/{video_id}_{int(start_offset * 1000)}_{int(end_offset * 1000)}.mp4"
63
+ select_frames = False
64
+
65
+ if not os.path.exists(path):
66
+ path = f"{self.cache_path}/{source_dataset}/{video_id}_{int(start_offset)}_{int(end_offset)}.mp4"
67
+
68
+ if not os.path.exists(path):
69
+ path = f"{self.cache_path}/{source_dataset}/{video_id}.{int(start_offset * 1000):08d}_{int(end_offset * 1000):08d}.mp4"
70
+
71
+ return path, select_frames
72
+
73
+ def collate(self, items: list[Item]):
74
+ has_video = any(item.masked_video_frames is not None for item in items)
75
+ return self.collate_fn(
76
+ descriptions=[item.description for item in items],
77
+ audios=[item.audio_samples for item in items],
78
+ anchors=[item.anchors for item in items] if self._span else None,
79
+ masked_videos=[item.masked_video_frames for item in items]
80
+ if has_video and self._visual
81
+ else None,
82
+ )
83
+
84
+ def _get_masked_video(self, item, video_path, select_frames):
85
+ if item["mask_bytes"] is None:
86
+ return None
87
+
88
+ mask = torch.from_numpy(np.load(BytesIO(item["mask_bytes"]))["video_masklet"])
89
+
90
+ video_decoder = VideoDecoder(video_path)
91
+ if select_frames:
92
+ video_frames = video_decoder.get_frames_played_in_range(
93
+ item["start_offset"], item["end_offset"]
94
+ ).data
95
+ else:
96
+ video_frames = video_decoder[:].data
97
+
98
+ if mask.size(0) != video_frames.size(0):
99
+ # It's possible that the mask and the video frames differ by a small amount
100
+ # we interpolate the mask frame to match
101
+ idxs = (
102
+ torch.linspace(0, mask.size(0) - 1, video_frames.size(0)).round().long()
103
+ )
104
+ mask = mask[idxs]
105
+
106
+ mask = mask.unsqueeze(1)
107
+
108
+ if mask.shape[-2:] != video_frames.shape[-2:]:
109
+ mask = F.interpolate(mask, size=video_frames.shape[-2:])
110
+
111
+ import torchvision
112
+
113
+ torchvision.io.write_video("test.mp4", video_frames.permute(0, 2, 3, 1), 30)
114
+ torchvision.io.write_video(
115
+ "test_mask.mp4", mask.unsqueeze(-1).expand(-1, -1, -1, 3) * 255, 30
116
+ )
117
+
118
+ return video_frames * mask
119
+
120
+ def __getitem__(self, idx) -> Item:
121
+ item = self.dataset[idx]
122
+
123
+ video_path, select_frames = self._get_path(
124
+ item["video_id"],
125
+ item["source_dataset"],
126
+ item["start_offset"],
127
+ item["end_offset"],
128
+ )
129
+ assert os.path.exists(video_path), f"{video_path} does not exist!"
130
+
131
+ audio_decoder = AudioDecoder(video_path)
132
+ audio_samples = audio_decoder.get_samples_played_in_range(
133
+ start_seconds=item["start_offset"] if select_frames else 0,
134
+ stop_seconds=item["end_offset"] if select_frames else None,
135
+ )
136
+
137
+ if audio_samples.sample_rate != self.collate_fn.audio_sampling_rate:
138
+ resampled_audio = torchaudio.functional.resample(
139
+ audio_samples.data,
140
+ audio_samples.sample_rate,
141
+ self.collate_fn.audio_sampling_rate,
142
+ )
143
+ else:
144
+ resampled_audio = audio_samples.data
145
+
146
+ masked_video_frames = self._get_masked_video(item, video_path, select_frames)
147
+
148
+ return Item(
149
+ description=item["description"],
150
+ anchors=[("+", start, end) for start, end in item["spans"]],
151
+ masked_video_frames=masked_video_frames,
152
+ audio_samples=resampled_audio.mean(0, keepdim=True),
153
+ )
eval/main.py ADDED
@@ -0,0 +1,162 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved\n
2
+
3
+ import argparse
4
+ import json
5
+ import os
6
+
7
+ import pandas as pd
8
+ import torch
9
+ import torch.distributed as dist
10
+ from dataset import SETTINGS, make_dataset
11
+ from metrics import CLAP, Aesthetic, ImageBind, Judge
12
+ from torch.utils.data import DataLoader
13
+ from torch.utils.data.distributed import DistributedSampler
14
+ from tqdm import tqdm
15
+
16
+ from sam_audio import SAMAudio, SAMAudioProcessor
17
+
18
+
19
+ def gather_and_average_results(results, world_size):
20
+ if world_size == 1:
21
+ return json.loads(results.mean().to_json())
22
+
23
+ # 1. Gather all dictionaries to all ranks
24
+ all_results = [None for _ in range(world_size)]
25
+ dist.all_gather_object(
26
+ all_results, {"sum": results.sum().to_json(), "count": len(results)}
27
+ )
28
+
29
+ summed = {}
30
+ counts = 0
31
+
32
+ for res in all_results:
33
+ for k, v in json.loads(res["sum"]).items():
34
+ if k not in summed:
35
+ summed[k] = 0.0
36
+ summed[k] += v
37
+ counts += res["count"]
38
+
39
+ # 3. Compute average for keys that appeared at least once
40
+ averaged = {k: summed[k] / counts for k in summed}
41
+
42
+ return averaged
43
+
44
+
45
+ def main(
46
+ settings: list[str],
47
+ cache_path: str,
48
+ batch_size: int,
49
+ checkpoint_path: str,
50
+ num_workers: int = 4,
51
+ reranking_candidates: int = 8,
52
+ ):
53
+ world_size = int(os.environ.get("WORLD_SIZE", 1))
54
+ rank = int(os.environ.get("RANK", 0))
55
+
56
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
57
+ if world_size > 1:
58
+ torch.distributed.init_process_group(backend="nccl")
59
+ device = torch.device(f"cuda:{rank}")
60
+ torch.cuda.set_device(device)
61
+
62
+ model = SAMAudio.from_pretrained(checkpoint_path)
63
+ model = model.eval().to(device)
64
+ processor = SAMAudioProcessor.from_pretrained(checkpoint_path)
65
+
66
+ judge_metric = Judge(device=device)
67
+ aes_metric = Aesthetic(device=device)
68
+ clap_metric = CLAP(device=device)
69
+ imagebind_metric = ImageBind(device=device)
70
+
71
+ for setting in settings:
72
+ print(f"Evaluating: {setting}")
73
+ dset = make_dataset(setting, cache_path=cache_path, collate_fn=processor)
74
+ sampler = None
75
+ if world_size > 1:
76
+ sampler = DistributedSampler(dset)
77
+
78
+ dl = DataLoader(
79
+ dset,
80
+ batch_size=batch_size,
81
+ shuffle=False,
82
+ collate_fn=dset.collate,
83
+ num_workers=num_workers,
84
+ sampler=sampler,
85
+ )
86
+
87
+ all_metrics = [
88
+ judge_metric,
89
+ aes_metric,
90
+ clap_metric,
91
+ ]
92
+
93
+ if dset.visual:
94
+ all_metrics.append(imagebind_metric)
95
+
96
+ dfs = []
97
+ with torch.inference_mode():
98
+ for batch in tqdm(dl, disable=rank != 0):
99
+ batch = batch.to(device)
100
+ result = model.separate(
101
+ batch, reranking_candidates=reranking_candidates
102
+ )
103
+ mets = {}
104
+ for metric in all_metrics:
105
+ input_wavs = model.unbatch(batch.audios.squeeze(1), batch.wav_sizes)
106
+
107
+ mets.update(
108
+ metric(
109
+ target_wavs=result.target,
110
+ target_wavs_sample_rate=model.sample_rate,
111
+ descriptions=batch.descriptions,
112
+ input_wavs=input_wavs,
113
+ videos=batch.masked_video,
114
+ )
115
+ )
116
+
117
+ dfs.append(pd.DataFrame.from_dict(mets))
118
+
119
+ df = pd.concat(dfs)
120
+ averaged_results = gather_and_average_results(df, world_size)
121
+ if rank == 0:
122
+ results_dict = {k: f"{v:.3f}" for k, v in averaged_results.items()}
123
+ print(json.dumps(results_dict, indent=4))
124
+ os.makedirs("results", exist_ok=True)
125
+ outfile = f"results/{setting}.json"
126
+ with open(outfile, "w") as fout:
127
+ print(json.dumps(results_dict), file=fout)
128
+
129
+
130
+ if __name__ == "__main__":
131
+ parser = argparse.ArgumentParser()
132
+ parser.add_argument(
133
+ "--setting",
134
+ "-s",
135
+ choices=SETTINGS.keys(),
136
+ help=f"Which setting to evaluate. Choices: {SETTINGS.keys()}",
137
+ default=["instr-pro"],
138
+ nargs="+",
139
+ )
140
+ parser.add_argument(
141
+ "--cache-path",
142
+ type=str,
143
+ default=os.path.expanduser("~/.cache/sam_audio"),
144
+ help="Where to cache downloaded datasets",
145
+ )
146
+ parser.add_argument(
147
+ "--checkpoint-path", "-p", type=str, default="facebook/sam-audio-large"
148
+ )
149
+ parser.add_argument("--batch-size", "-b", type=int, default=1, help="Batch size")
150
+ parser.add_argument(
151
+ "--num-workers", "-w", type=int, default=4, help="Number of workers"
152
+ )
153
+ parser.add_argument("--candidates", "-c", type=int, default=8)
154
+ opt = parser.parse_args()
155
+ main(
156
+ settings=opt.setting,
157
+ cache_path=opt.cache_path,
158
+ batch_size=opt.batch_size,
159
+ checkpoint_path=opt.checkpoint_path,
160
+ num_workers=opt.num_workers,
161
+ reranking_candidates=opt.candidates,
162
+ )
eval/metrics/__init__.py ADDED
@@ -0,0 +1,13 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved\n
2
+
3
+ from metrics.aes import Aesthetic
4
+ from metrics.clap import CLAP
5
+ from metrics.imagebind import ImageBind
6
+ from metrics.judge import Judge
7
+
8
+ __all__ = [
9
+ "Aesthetic",
10
+ "CLAP",
11
+ "ImageBind",
12
+ "Judge",
13
+ ]
eval/metrics/aes.py ADDED
@@ -0,0 +1,49 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved\n
2
+
3
+ from typing import Optional
4
+
5
+ import torch
6
+ from audiobox_aesthetics.infer import AesPredictor
7
+
8
+ COLUMN_MAP = {
9
+ "CE": "ContentEnjoyment",
10
+ "CU": "ContentUsefulness",
11
+ "PC": "ProductionComplexity",
12
+ "PQ": "ProductionQuality",
13
+ }
14
+
15
+
16
+ class Aesthetic(torch.nn.Module):
17
+ def __init__(
18
+ self,
19
+ checkpoint: Optional[str] = None,
20
+ device: Optional[torch.device] = None,
21
+ ):
22
+ super().__init__()
23
+ self.model = AesPredictor(
24
+ checkpoint_pth=checkpoint,
25
+ data_col="wav",
26
+ )
27
+ self.device = device or torch.device(
28
+ "cuda" if torch.cuda.is_available() else "cpu"
29
+ )
30
+
31
+ def __call__(
32
+ self,
33
+ target_wavs: list[torch.Tensor],
34
+ target_wavs_sample_rate: int = 48_000,
35
+ **kwargs,
36
+ ) -> dict[str, list[float]]:
37
+ result = self.model.forward(
38
+ [
39
+ {
40
+ "wav": wav[None] if wav.ndim == 1 else wav,
41
+ "sample_rate": target_wavs_sample_rate,
42
+ }
43
+ for wav in target_wavs
44
+ ]
45
+ )
46
+ return {
47
+ long_name: [x[shortname] for x in result]
48
+ for shortname, long_name in COLUMN_MAP.items()
49
+ }
eval/metrics/clap.py ADDED
@@ -0,0 +1,46 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved\n
2
+
3
+ from tempfile import TemporaryDirectory
4
+ from typing import Optional
5
+
6
+ import torch
7
+ from torchcodec.encoders import AudioEncoder
8
+
9
+ from sam_audio.ranking.clap import get_model
10
+
11
+
12
+ class CLAP(torch.nn.Module):
13
+ def __init__(
14
+ self,
15
+ checkpoint: Optional[str] = None,
16
+ device: Optional[torch.device] = None,
17
+ ):
18
+ super().__init__()
19
+ self.model = get_model(device)
20
+ self.device = device or torch.device(
21
+ "cuda" if torch.cuda.is_available() else "cpu"
22
+ )
23
+
24
+ def __call__(
25
+ self,
26
+ target_wavs: list[torch.Tensor],
27
+ descriptions: list[str],
28
+ target_wavs_sample_rate: int = 48_000,
29
+ **kwargs,
30
+ ) -> list[dict[str, float]]:
31
+ with TemporaryDirectory() as tdir, torch.inference_mode():
32
+ file_list = []
33
+ for i, wav in enumerate(target_wavs):
34
+ file_list.append(f"{tdir}/hyp_{i}.wav")
35
+ encoder = AudioEncoder(
36
+ samples=wav.cpu()[None] if wav.ndim == 1 else wav.cpu(),
37
+ sample_rate=target_wavs_sample_rate,
38
+ )
39
+ encoder.to_file(file_list[-1])
40
+ audio_embs = self.model.get_audio_embedding_from_filelist(
41
+ file_list, use_tensor=True
42
+ )
43
+
44
+ text_embs = self.model.get_text_embedding(descriptions, use_tensor=True)
45
+ sims = audio_embs.unsqueeze(1) @ text_embs.unsqueeze(2)
46
+ return {"CLAPSimilarity": sims.cpu()[:, 0, 0].tolist()}
eval/metrics/imagebind.py ADDED
@@ -0,0 +1,52 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved\n
2
+
3
+ from typing import Optional
4
+
5
+ import torch
6
+ from imagebind.models.imagebind_model import ModalityType, imagebind_huge
7
+
8
+ from sam_audio.ranking.imagebind import VideoTransform, load_and_transform_audio_data
9
+
10
+
11
+ class ImageBind(torch.nn.Module):
12
+ def __init__(
13
+ self,
14
+ checkpoint: Optional[str] = None,
15
+ device: Optional[torch.device] = None,
16
+ ):
17
+ super().__init__()
18
+
19
+ self.model = imagebind_huge(pretrained=checkpoint is None)
20
+ if checkpoint is not None:
21
+ self.model.load_state_dict(torch.load(checkpoint, map_location="cpu"))
22
+ self.model = self.model.eval()
23
+ self.video_transform = VideoTransform()
24
+ self.device = device or torch.device(
25
+ "cuda" if torch.cuda.is_available() else "cpu"
26
+ )
27
+ self.model = self.model.to(self.device)
28
+
29
+ def __call__(
30
+ self,
31
+ target_wavs: list[torch.Tensor],
32
+ videos: list[torch.Tensor],
33
+ target_wavs_sample_rate: int = 48_000,
34
+ **kwargs,
35
+ ) -> dict[str, list[float]]:
36
+ audio_data = load_and_transform_audio_data(
37
+ target_wavs, input_sample_rate=target_wavs_sample_rate
38
+ )
39
+ durations = [x.size(-1) / target_wavs_sample_rate for x in target_wavs]
40
+ video_data = self.video_transform(videos, durations, audio_data.device)
41
+
42
+ inputs = {ModalityType.AUDIO: audio_data, ModalityType.VISION: video_data}
43
+ embs = self.model(inputs)
44
+ audio_embs, video_embs = embs[ModalityType.AUDIO], embs[ModalityType.VISION]
45
+ audio_embs, video_embs = (
46
+ audio_embs / ((audio_embs**2).sum(dim=-1, keepdims=True) ** 0.5),
47
+ video_embs / ((video_embs**2).sum(dim=-1, keepdims=True) ** 0.5),
48
+ )
49
+ bsz = len(target_wavs)
50
+ candidates = len(audio_embs) // bsz
51
+ scores = audio_embs.view(bsz, candidates, -1) @ video_embs.view(bsz, -1, 1)
52
+ return {"ImageBind": scores.squeeze(1, 2).cpu().tolist()}
eval/metrics/judge.py ADDED
@@ -0,0 +1,44 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved\n
2
+
3
+ from typing import Optional
4
+
5
+ import torch
6
+
7
+ from sam_audio import SAMAudioJudgeModel, SAMAudioJudgeProcessor
8
+
9
+
10
+ class Judge(torch.nn.Module):
11
+ def __init__(
12
+ self,
13
+ checkpoint: str = "facebook/sam-audio-judge",
14
+ device: Optional[torch.device] = None,
15
+ ):
16
+ super().__init__()
17
+ self.model = SAMAudioJudgeModel.from_pretrained(checkpoint).to(device)
18
+ self.processor = SAMAudioJudgeProcessor.from_pretrained(checkpoint)
19
+ self.device = device or torch.device(
20
+ "cuda" if torch.cuda.is_available() else "cpu"
21
+ )
22
+
23
+ def forward(
24
+ self,
25
+ input_wavs: list[torch.Tensor],
26
+ target_wavs: list[torch.Tensor],
27
+ descriptions: list[str],
28
+ target_wavs_sample_rate: int = 48_000,
29
+ **kwargs,
30
+ ) -> torch.Tensor:
31
+ with torch.inference_mode():
32
+ processed = self.processor(
33
+ text=descriptions,
34
+ input_audio=[x.cpu() for x in input_wavs],
35
+ separated_audio=[x.cpu() for x in target_wavs],
36
+ sampling_rate=target_wavs_sample_rate,
37
+ ).to(self.device)
38
+ result = self.model(**processed)
39
+ return {
40
+ "JudgeOverall": result.overall.squeeze(-1).cpu().tolist(),
41
+ "JudgeFaithfulness": result.faithfulness.squeeze(-1).cpu().tolist(),
42
+ "JudgeRecall": result.recall.squeeze(-1).cpu().tolist(),
43
+ "JudgePrecision": result.precision.squeeze(-1).cpu().tolist(),
44
+ }
examples/assets/office.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c0f583ff34c5fd9d1a83d640e7c0131ad339755bd69e54f104723b707f213c21
3
+ size 4551702
examples/span_prompting.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
examples/text_prompting.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
examples/visual_prompting.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
pyproject.toml ADDED
@@ -0,0 +1,62 @@
1
+ [project]
2
+ name = "sam_audio"
3
+ version = "0.1.0"
4
+ description = "Segment Anything Audio"
5
+ authors = [
6
+ { name="Andros Tjandra", email="androstj@meta.com" },
7
+ { name="Ann Lee", email="annl@meta.com" },
8
+ { name="Bowen Shi", email="bshi@meta.com" },
9
+ { name="Julius Richter", email="jrichter@meta.com" },
10
+ { name="Matt Le", email="mattle@meta.com" },
11
+ { name="Yi-Chiao Wu", email="yichiaowu@meta.com" },
12
+ ]
13
+
14
+ readme = "README.md"
15
+ license = { file="LICENSE" }
16
+ requires-python = ">=3.10"
17
+ dependencies = [
18
+ "dacvae@git+https://github.com/facebookresearch/dacvae.git",
19
+ "audiobox_aesthetics",
20
+ "einops",
21
+ "imagebind@git+https://github.com/facebookresearch/ImageBind.git",
22
+ "laion-clap@git+https://github.com/lematt1991/CLAP.git",
23
+ "numpy",
24
+ "perception-models@git+https://github.com/facebookresearch/perception_models@unpin-deps",
25
+ "pydub",
26
+ "torch",
27
+ "torchaudio",
28
+ "torchcodec",
29
+ "torchdiffeq",
30
+ "torchvision",
31
+ "transformers>=4.54.0",
32
+ ]
33
+
34
+ [tool.setuptools.packages.find]
35
+ include = ["sam_audio*"]
36
+
37
+ [tool.ruff]
38
+
39
+ target-version = "py310"
40
+
41
+ lint.select=[
42
+ "B",
43
+ "C",
44
+ "E",
45
+ "W",
46
+ "F",
47
+ "I",
48
+ ]
49
+ lint.ignore = [
50
+ "E501",
51
+ "E731",
52
+ "C901",
53
+ "B006",
54
+ ]
55
+
56
+ [project.urls]
57
+ Homepage = "https://github.com/facebookresearch/sam-audio"
58
+ Repository = "https://github.com/facebookresearch/sam-audio"
59
+
60
+ [build-system]
61
+ requires = ["setuptools>=61.0", "wheel"]
62
+ build-backend = "setuptools.build_meta"
sam_audio/__init__.py ADDED
@@ -0,0 +1,4 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved\n
2
+
3
+ from .model import * # noqa
4
+ from .processor import * # noqa
sam_audio/model/__init__.py ADDED
@@ -0,0 +1,4 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved\n
2
+
3
+ from .model import * # noqa
4
+ from .judge import * # noqa
sam_audio/model/align.py ADDED
@@ -0,0 +1,50 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved\n
2
+
3
+ from typing import Optional
4
+
5
+ import torch
6
+
7
+
8
+ class AlignModalities(torch.nn.Module):
9
+ def __init__(
10
+ self,
11
+ in_channels: int,
12
+ out_channels: int,
13
+ normalize: bool = True,
14
+ with_gate: bool = True,
15
+ ):
16
+ super().__init__()
17
+ self.conv = torch.nn.Conv1d(
18
+ in_channels=in_channels, out_channels=out_channels, kernel_size=1
19
+ )
20
+ self.normalize = normalize
21
+ if self.normalize:
22
+ self.layer_norm = torch.nn.LayerNorm(out_channels)
23
+
24
+ self.gate = None
25
+ if with_gate:
26
+ self.gate = torch.nn.Parameter(torch.tensor([0.0]))
27
+
28
+ self.out_channels = out_channels
29
+
30
+ def forward(self, anchor: torch.Tensor, tgt: Optional[torch.Tensor] = None):
31
+ """
32
+ Align video features to the input audio features
33
+
34
+ Args:
35
+ anchor (torch.Tensor): Input anchor tensor of shape (B, T, C), where B is batch size, C is channel size, and T is sequence length.
36
+ tgt (Optional[torch.Tensor]): Optional features tensor to be aligned to anchor, expected shape (B, in_channels, T).
37
+ """
38
+ if tgt is None:
39
+ return anchor
40
+
41
+ post_conv = self.conv(tgt)
42
+ post_conv = post_conv.permute(0, 2, 1) # BCT -> BTC
43
+
44
+ if self.normalize:
45
+ post_conv = self.layer_norm(post_conv)
46
+
47
+ if self.gate is None:
48
+ return post_conv
49
+ else:
50
+ return anchor + self.gate.tanh() * post_conv
sam_audio/model/base.py ADDED
@@ -0,0 +1,62 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved\n
2
+
3
+ import json
4
+ import os
5
+ from typing import Callable, Dict, Optional, Union
6
+
7
+ import torch
8
+ from huggingface_hub import ModelHubMixin, snapshot_download
9
+
10
+
11
+ class BaseModel(torch.nn.Module, ModelHubMixin):
12
+ config_cls: Callable
13
+
14
+ def device(self):
15
+ return next(self.parameters()).device
16
+
17
+ @classmethod
18
+ def _from_pretrained(
19
+ cls,
20
+ *,
21
+ model_id: str,
22
+ cache_dir: str,
23
+ force_download: bool,
24
+ proxies: Optional[Dict],
25
+ resume_download: bool,
26
+ local_files_only: bool,
27
+ token: Union[str, bool, None],
28
+ map_location: str = "cpu",
29
+ strict: bool = True,
30
+ revision: Optional[str] = None,
31
+ **model_kwargs,
32
+ ):
33
+ if os.path.isdir(model_id):
34
+ cached_model_dir = model_id
35
+ else:
36
+ cached_model_dir = snapshot_download(
37
+ repo_id=model_id,
38
+ revision=cls.revision,
39
+ cache_dir=cache_dir,
40
+ force_download=force_download,
41
+ proxies=proxies,
42
+ resume_download=resume_download,
43
+ token=token,
44
+ local_files_only=local_files_only,
45
+ )
46
+
47
+ with open(os.path.join(cached_model_dir, "config.json")) as fin:
48
+ config = json.load(fin)
49
+
50
+ for key, value in model_kwargs.items():
51
+ if key in config:
52
+ config[key] = value
53
+
54
+ config = cls.config_cls(**config)
55
+ model = cls(config)
56
+ state_dict = torch.load(
57
+ os.path.join(cached_model_dir, "checkpoint.pt"),
58
+ weights_only=True,
59
+ map_location=map_location,
60
+ )
61
+ model.load_state_dict(state_dict, strict=strict)
62
+ return model
sam_audio/model/codec.py ADDED
@@ -0,0 +1,109 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved\n
2
+
3
+ import math
4
+ from abc import ABCMeta, abstractmethod
5
+ from typing import Union
6
+
7
+ import dacvae
8
+ import torch
9
+
10
+ from sam_audio.model.config import DACVAEConfig
11
+
12
+
13
+ class Encoder(torch.nn.Module, metaclass=ABCMeta):
14
+ @abstractmethod
15
+ def forward(self, waveform: torch.Tensor) -> torch.Tensor: ...
16
+
17
+
18
+ class Codec(Encoder):
19
+ @abstractmethod
20
+ def decode(self, encoded_frames: torch.Tensor) -> torch.Tensor: ...
21
+
22
+ @abstractmethod
23
+ def wav_idx_to_feature_idx(
24
+ self, wav_idx: Union[torch.Tensor, int], sample_rate=None
25
+ ) -> Union[torch.Tensor, int]: ...
26
+
27
+ @abstractmethod
28
+ def feature_idx_to_wav_idx(
29
+ self, feature_idx: Union[torch.Tensor, int], sample_rate=None
30
+ ) -> Union[torch.Tensor, int]: ...
31
+
32
+ @staticmethod
33
+ def cast_to_int(
34
+ x: Union[int, torch.Tensor],
35
+ ) -> Union[int, torch.Tensor]:
36
+ if isinstance(x, torch.Tensor):
37
+ return x.int()
38
+ else:
39
+ return int(x)
40
+
41
+
42
+ class DACVAEEncoder(Encoder):
43
+ def __init__(self, config: DACVAEConfig) -> None:
44
+ super().__init__()
45
+ model = dacvae.DACVAE(
46
+ encoder_dim=config.encoder_dim,
47
+ encoder_rates=config.encoder_rates,
48
+ latent_dim=config.latent_dim,
49
+ decoder_dim=config.decoder_dim,
50
+ decoder_rates=config.decoder_rates,
51
+ n_codebooks=config.n_codebooks,
52
+ codebook_size=config.codebook_size,
53
+ codebook_dim=config.codebook_dim,
54
+ quantizer_dropout=config.quantizer_dropout,
55
+ sample_rate=config.sample_rate,
56
+ ).eval()
57
+ self._setup_model(model)
58
+ self.hop_length = config.hop_length
59
+ self.sample_rate = config.sample_rate
60
+
61
+ def _setup_model(self, model):
62
+ self.encoder = model.encoder
63
+ self.quantizer = model.quantizer
64
+
65
+ def forward(self, waveform: torch.Tensor) -> torch.Tensor:
66
+ with torch.no_grad(), torch.backends.cudnn.flags(enabled=False):
67
+ z = self.encoder(self._pad(waveform))
68
+ mean, _ = self.quantizer.in_proj(z).chunk(2, dim=1)
69
+ encoded_frames = mean
70
+ return encoded_frames
71
+
72
+ def _pad(self, wavs):
73
+ length = wavs.size(-1)
74
+ if length % self.hop_length:
75
+ p1d = (0, self.hop_length - (length % self.hop_length))
76
+ return torch.nn.functional.pad(wavs, p1d, "reflect")
77
+ else:
78
+ return wavs
79
+
80
+
81
+ class DACVAE(DACVAEEncoder, Codec):
82
+ def _setup_model(self, model):
83
+ super()._setup_model(model)
84
+ self.decoder = model.decoder
85
+
86
+ def decode(self, encoded_frames: torch.Tensor) -> torch.Tensor:
87
+ with torch.backends.cudnn.flags(enabled=False):
88
+ emb = self.quantizer.out_proj(encoded_frames)
89
+ return self.decoder(emb)
90
+
91
+ def feature_idx_to_wav_idx(self, feature_idx, sample_rate=None):
92
+ if sample_rate is None:
93
+ sample_rate = self.sample_rate
94
+ orig_freq = sample_rate
95
+ new_freq = self.sample_rate
96
+ wav_chunklen = feature_idx * self.hop_length * (orig_freq / new_freq)
97
+ return self.cast_to_int(wav_chunklen)
98
+
99
+ def wav_idx_to_feature_idx(self, wav_idx, sample_rate=None):
100
+ ceil = math.ceil
101
+ if torch.is_tensor(wav_idx):
102
+ ceil = torch.ceil
103
+ if sample_rate is None:
104
+ sample_rate = self.sample_rate
105
+ orig_freq = sample_rate
106
+ new_freq = self.sample_rate
107
+ target_length = ceil(new_freq * wav_idx / orig_freq)
108
+ res = ceil(target_length / self.hop_length)
109
+ return self.cast_to_int(res)
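
The two index helpers map between waveform samples and latent frames at `hop_length` granularity (ceiling on the way in, an exact multiple on the way out). A worked check under the default DACVAEConfig, where hop_length = 2*8*10*12 = 1920 and the sample rates match so the resampling ratio is 1:

import math

hop_length = 1920                   # prod(encoder_rates) from DACVAEConfig
one_second = 48_000                 # samples at the default 48 kHz

frames = math.ceil(one_second / hop_length)   # wav_idx_to_feature_idx -> 25 frames
samples = frames * hop_length                 # feature_idx_to_wav_idx -> 48_000 samples
assert (frames, samples) == (25, 48_000)
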
sam_audio/model/config.py ADDED
@@ -0,0 +1,251 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
2
+
3
+ from typing import Optional, Tuple
4
+
5
+ import numpy as np
6
+ from core.audio_visual_encoder.config import TransformerConfig as PEAVTransformerConfig
7
+ from transformers import ModernBertConfig
8
+
9
+
10
+ class DACVAEConfig:
11
+ def __init__(
12
+ self,
13
+ encoder_dim: int = 64,
14
+ encoder_rates: list[int] = [2, 8, 10, 12],
15
+ latent_dim: int = 1024,
16
+ decoder_dim: int = 1536,
17
+ decoder_rates: list[int] = [12, 10, 8, 2],
18
+ n_codebooks: int = 16,
19
+ codebook_size: int = 1024,
20
+ codebook_dim: int = 128,
21
+ quantizer_dropout: bool = False,
22
+ sample_rate: int = 48_000,
23
+ mean: float = 0.0,
24
+ std: float = 1.0,
25
+ ):
26
+ self.encoder_dim = encoder_dim
27
+ self.encoder_rates = encoder_rates
28
+ self.latent_dim = latent_dim
29
+ self.decoder_dim = decoder_dim
30
+ self.decoder_rates = decoder_rates
31
+ self.n_codebooks = n_codebooks
32
+ self.codebook_size = codebook_size
33
+ self.codebook_dim = codebook_dim
34
+ self.quantizer_dropout = quantizer_dropout
35
+ self.sample_rate = sample_rate
36
+ self.mean = mean
37
+ self.std = std
38
+
39
+ @property
40
+ def hop_length(self):
41
+ return int(np.prod(self.encoder_rates))
42
+
43
+
44
+ class TextEncoderConfig:
45
+ def __init__(self, dim: int = 768):
46
+ self.dim = dim
47
+
48
+
49
+ class T5EncoderConfig(TextEncoderConfig):
50
+ def __init__(
51
+ self,
52
+ name: str = "t5-base",
53
+ max_length: Optional[int] = 512,
54
+ pad_mode: str = "longest",
55
+ dim: int = 768,
56
+ ):
57
+ super().__init__(dim=dim)
58
+ self.name = name
59
+ self.max_length = max_length
60
+ self.pad_mode = pad_mode
61
+
62
+
63
+ class VisionEncoderConfig:
64
+ def __init__(self, dim: int = 1024, batch_size: int = 300):
65
+ self.dim = dim
66
+ self.batch_size = batch_size
67
+
68
+
69
+ class PerceptionEncoderConfig(VisionEncoderConfig):
70
+ def __init__(
71
+ self,
72
+ dim: int = 1024,
73
+ batch_size: int = 300,
74
+ name: str = "PE-Core-L14-336",
75
+ normalize_feature: bool = True,
76
+ interpolation_mode: str = "BICUBIC",
77
+ image_size: int = 336,
78
+ ):
79
+ super().__init__(dim=dim, batch_size=batch_size)
80
+ self.name = name
81
+ self.normalize_feature = normalize_feature
82
+ self.interpolation_mode = interpolation_mode
83
+ self.image_size = image_size
84
+
85
+
86
+ class TransformerConfig:
87
+ def __init__(
88
+ self,
89
+ dim: int = 2048,
90
+ n_heads: int = 16,
91
+ n_layers: int = 16,
92
+ dropout: float = 0.1,
93
+ norm_eps: float = 1.0e-05,
94
+ qk_norm: bool = True,
95
+ fc_bias: bool = False,
96
+ ffn_exp: int = 4,
97
+ ffn_dim_multiplier: int = 1,
98
+ multiple_of: int = 64,
99
+ non_linearity: str = "swiglu",
100
+ use_rope: bool = True,
101
+ max_positions: int = 10000,
102
+ frequency_embedding_dim: int = 256,
103
+ timestep_non_linearity: str = "swiglu",
104
+ t_block_non_linearity: str = "silu",
105
+ t_block_bias: bool = True,
106
+ context_dim: int = 2048,
107
+ context_non_linearity: str = "swiglu",
108
+ context_embedder_dropout: float = 0.0,
109
+ context_norm: bool = False,
110
+ out_channels: int = 256,
111
+ in_channels: Optional[int] = None,
112
+ ):
113
+ self.dim = dim
114
+ self.n_heads = n_heads
115
+ self.n_layers = n_layers
116
+ self.dropout = dropout
117
+ self.norm_eps = norm_eps
118
+ self.qk_norm = qk_norm
119
+ self.fc_bias = fc_bias
120
+ self.ffn_exp = ffn_exp
121
+ self.ffn_dim_multiplier = ffn_dim_multiplier
122
+ self.multiple_of = multiple_of
123
+ self.non_linearity = non_linearity
124
+ self.use_rope = use_rope
125
+ self.max_positions = max_positions
126
+ self.frequency_embedding_dim = frequency_embedding_dim
127
+ self.timestep_non_linearity = timestep_non_linearity
128
+ self.t_block_non_linearity = t_block_non_linearity
129
+ self.t_block_bias = t_block_bias
130
+ self.context_dim = context_dim
131
+ self.context_non_linearity = context_non_linearity
132
+ self.context_embedder_dropout = context_embedder_dropout
133
+ self.context_norm = context_norm
134
+ self.out_channels = out_channels
135
+ self.in_channels = in_channels
136
+
137
+
138
+ class RankerConfig:
139
+ kind: str
140
+
141
+
142
+ class ImageBindRankerConfig(RankerConfig):
143
+ kind: str = "imagebind"
144
+
145
+ def __init__(self, checkpoint: Optional[str] = None):
146
+ self.checkpoint = checkpoint
147
+
148
+
149
+ class ClapRankerConfig(RankerConfig):
150
+ kind: str = "clap"
151
+
152
+ def __init__(self, checkpoint: Optional[str] = None):
153
+ self.checkpoint = checkpoint
154
+
155
+
156
+ class JudgeRankerConfig(RankerConfig):
157
+ kind: str = "judge"
158
+
159
+ def __init__(self, checkpoint_or_model_id: str = "facebook/sam-audio-judge"):
160
+ self.checkpoint_or_model_id = checkpoint_or_model_id
161
+
162
+
163
+ class SoundActivityRankerConfig(RankerConfig):
164
+ kind: str = "sound_activity"
165
+
166
+ def __init__(
167
+ self,
168
+ threshold_mode: str = "rel_to_max",
169
+ sil_threshold: float = -40,
170
+ metric: str = "iou",
171
+ ):
172
+ self.threshold_mode = threshold_mode
173
+ self.sil_threshold = sil_threshold
174
+ self.metric = metric
175
+
176
+
177
+ class EnsembleRankerConfig(RankerConfig):
178
+ kind: str = "ensemble"
179
+
180
+ def __init__(self, rankers: dict[str, Tuple[RankerConfig, float]]):
181
+ self.rankers = rankers
182
+
183
+
184
+ def parse_ranker_config(config_dict: dict):
185
+ kind = config_dict.pop("kind")
186
+ match kind:
187
+ case ImageBindRankerConfig.kind:
188
+ return ImageBindRankerConfig(**config_dict)
189
+ case ClapRankerConfig.kind:
190
+ return ClapRankerConfig(**config_dict)
191
+ case JudgeRankerConfig.kind:
192
+ return JudgeRankerConfig(**config_dict)
193
+ case SoundActivityRankerConfig.kind:
194
+ return SoundActivityRankerConfig(**config_dict)
195
+ case EnsembleRankerConfig.kind:
196
+ return EnsembleRankerConfig(
197
+ {
198
+ k: (parse_ranker_config(v), w)
199
+ for k, (v, w) in config_dict["rankers"].items()
200
+ }
201
+ )
202
+
203
+
204
+ class SAMAudioConfig:
205
+ def __init__(
206
+ self,
207
+ in_channels: int = 768,
208
+ audio_codec=None,
209
+ text_encoder=None,
210
+ vision_encoder=None,
211
+ transformer=None,
212
+ num_anchors: int = 3,
213
+ anchor_embedding_dim: int = 128,
214
+ visual_ranker=None,
215
+ text_ranker=None,
216
+ span_predictor: Optional[str] = "pe-a-frame-large",
217
+ ):
218
+ self.in_channels = in_channels
219
+ self.audio_codec = DACVAEConfig(**(audio_codec or {}))
220
+ self.text_encoder = T5EncoderConfig(**(text_encoder or {}))
221
+ self.vision_encoder = PerceptionEncoderConfig(**(vision_encoder or {}))
222
+ self.transformer = TransformerConfig(**(transformer or {}))
223
+ self.num_anchors = num_anchors
224
+ self.anchor_embedding_dim = anchor_embedding_dim
225
+ self.visual_ranker = (
226
+ None if visual_ranker is None else parse_ranker_config(visual_ranker)
227
+ )
228
+ self.text_ranker = (
229
+ None if text_ranker is None else parse_ranker_config(text_ranker)
230
+ )
231
+ self.span_predictor = span_predictor
232
+
233
+
234
+ class SAMAudioJudgeConfig:
235
+ def __init__(
236
+ self,
237
+ audio_codec: DACVAEConfig = None,
238
+ transformer: PEAVTransformerConfig = None,
239
+ text_model: ModernBertConfig = None,
240
+ finetune_transformer: PEAVTransformerConfig = None,
241
+ nth_text_layer: int = 22,
242
+ bottleneck_dim: int = 256,
243
+ ):
244
+ self.audio_codec = DACVAEConfig(**(audio_codec or {}))
245
+ self.transformer = PEAVTransformerConfig(**(transformer or {}))
246
+ self.text_model = ModernBertConfig(**(text_model or {}))
247
+ self.finetune_transformer = PEAVTransformerConfig(
248
+ **(finetune_transformer or {})
249
+ )
250
+ self.nth_text_layer = nth_text_layer
251
+ self.bottleneck_dim = bottleneck_dim
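
`parse_ranker_config` dispatches on the `kind` field, and an ensemble nests (config, weight) pairs that are parsed recursively. An illustrative input; the weights are made-up example values:

ensemble = parse_ranker_config(
    {
        "kind": "ensemble",
        "rankers": {
            "judge": ({"kind": "judge"}, 0.7),
            "activity": ({"kind": "sound_activity", "metric": "iou"}, 0.3),
        },
    }
)
# ensemble.rankers["judge"] is (JudgeRankerConfig(...), 0.7), and so on.
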
sam_audio/model/judge.py ADDED
@@ -0,0 +1,135 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
2
+
3
+ from dataclasses import dataclass
4
+ from typing import Optional
5
+
6
+ import torch
7
+ from core.audio_visual_encoder.transformer import BaseModelOutputWithPooling
8
+ from core.audio_visual_encoder.transformer import Transformer as PEAVTransformer
9
+ from transformers import AutoModel
10
+
11
+ from .base import BaseModel
12
+ from .codec import DACVAEEncoder
13
+ from .config import SAMAudioJudgeConfig
14
+
15
+
16
+ @dataclass
17
+ class SAMAudioJudgeOutput:
18
+ r"""
19
+ overall (torch.Tensor, optional): Overall score tensor of shape (batch_size, 1).
20
+ recall (torch.Tensor, optional): Recall score tensor of shape (batch_size, 1).
21
+ precision (torch.Tensor, optional): Precision score tensor of shape (batch_size, 1).
22
+ faithfulness (torch.Tensor, optional): Faithfulness score tensor of shape (batch_size, 1).
23
+ text_model_output (BaseModelOutputWithPooling): Output from the text model.
24
+ audio_model_output (BaseModelOutputWithPooling): Output from the audio model.
25
+ """
26
+
27
+ overall: Optional[torch.Tensor] = None
28
+ recall: Optional[torch.Tensor] = None
29
+ precision: Optional[torch.Tensor] = None
30
+ faithfulness: Optional[torch.Tensor] = None
31
+ text_model_output: BaseModelOutputWithPooling = None
32
+ audio_model_output: BaseModelOutputWithPooling = None
33
+
34
+
35
+ class SAMAudioJudgeModel(BaseModel):
36
+ config_cls = SAMAudioJudgeConfig
37
+ revision = "sam_audio"
38
+
39
+ def __init__(self, config: SAMAudioJudgeConfig):
40
+ super().__init__()
41
+ self.config = config
42
+ self.data_proj = torch.nn.Linear(
43
+ config.audio_codec.codebook_dim, config.transformer.hidden_size
44
+ )
45
+ self.audio_codec = DACVAEEncoder(config.audio_codec)
46
+ self.transformer = PEAVTransformer(config.transformer)
47
+ self.finetune_transformer = PEAVTransformer(config.finetune_transformer)
48
+ self.text_model = AutoModel.from_config(config.text_model)
49
+ self.cat_audio_proj = torch.nn.Linear(
50
+ 2 * config.transformer.hidden_size, config.bottleneck_dim
51
+ )
52
+ self.text_proj1 = torch.nn.Linear(
53
+ in_features=config.text_model.hidden_size,
54
+ out_features=config.transformer.hidden_size,
55
+ bias=False,
56
+ )
57
+ self.text_proj2 = torch.nn.Linear(
58
+ in_features=config.transformer.hidden_size,
59
+ out_features=config.bottleneck_dim,
60
+ )
61
+ self.layer_norm = torch.nn.LayerNorm(config.bottleneck_dim)
62
+ self.proj_audio_and_text = torch.nn.Linear(
63
+ 2 * config.bottleneck_dim, config.bottleneck_dim
64
+ )
65
+ self.finetune_data_proj = torch.nn.Linear(
66
+ config.bottleneck_dim, config.finetune_transformer.hidden_size
67
+ )
68
+ self.head = torch.nn.Linear(
69
+ config.finetune_transformer.hidden_size, 4, bias=False
70
+ )
71
+ self.mean = torch.nn.Parameter(torch.zeros(4, requires_grad=False))
72
+ self.std = torch.nn.Parameter(torch.ones(4, requires_grad=False))
73
+
74
+ def _get_text_output(self, input_ids, attention_mask):
75
+ nth_layer = self.config.nth_text_layer
76
+ output = self.text_model(
77
+ input_ids=input_ids,
78
+ attention_mask=attention_mask,
79
+ output_hidden_states=nth_layer is not None,
80
+ )
81
+ if nth_layer is None:
82
+ text_model_output = output.last_hidden_state
83
+ else:
84
+ text_model_output = output.hidden_states[nth_layer]
85
+
86
+ return BaseModelOutputWithPooling(
87
+ last_hidden_state=text_model_output, pooler_output=text_model_output[:, 0]
88
+ )
89
+
90
+ def forward(
91
+ self,
92
+ input_ids: torch.Tensor, # tokenized text
93
+ input_values: torch.Tensor, # input audio waveform
94
+ separated_values: torch.Tensor, # separated audio waveform
95
+ attention_mask: Optional[torch.Tensor] = None, # text attention mask
96
+ padding_mask: Optional[torch.Tensor] = None, # audio padding mask
97
+ ) -> SAMAudioJudgeOutput:
98
+ text_features = self.text_proj1(
99
+ self._get_text_output(input_ids, attention_mask).pooler_output
100
+ )
101
+ stacked_audios = torch.cat([input_values, separated_values], dim=0)
102
+ stacked_codec_features = self.audio_codec(stacked_audios)
103
+ feature_padding_mask = None
104
+ if padding_mask is not None:
105
+ feature_padding_mask = padding_mask[
106
+ :, :: self.config.audio_codec.hop_length
107
+ ]
108
+ stacked_features = self.transformer(
109
+ self.data_proj(stacked_codec_features.transpose(1, 2)),
110
+ padding_mask=feature_padding_mask,
111
+ )
112
+ input_features, hyp_features = stacked_features.last_hidden_state.chunk(2, 0)
113
+ audio_features = self.cat_audio_proj(
114
+ torch.cat([hyp_features, input_features], dim=2)
115
+ )
116
+ expanded_text = (
117
+ self.layer_norm(self.text_proj2(text_features))
118
+ .unsqueeze(1)
119
+ .expand_as(audio_features)
120
+ )
121
+ audio_and_text = self.proj_audio_and_text(
122
+ torch.cat([audio_features, expanded_text], dim=2)
123
+ )
124
+ finetune_transformer_output = self.finetune_transformer(
125
+ self.finetune_data_proj(audio_and_text), padding_mask=feature_padding_mask
126
+ )
127
+ result = self.head(finetune_transformer_output.last_hidden_state)
128
+ if feature_padding_mask is not None:
129
+ feature_padding_mask = feature_padding_mask.unsqueeze(-1)
130
+ pooled = torch.masked.mean(result, mask=feature_padding_mask, dim=1)
131
+ de_normalized = pooled * self.std + self.mean
132
+ return SAMAudioJudgeOutput(*de_normalized.chunk(4, dim=1))
133
+
134
+
135
+ __all__ = ["SAMAudioJudgeModel", "SAMAudioJudgeOutput"]
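
As a shape-level sketch, the judge takes a tokenized description plus the input mixture and a separated candidate, and returns four (batch, 1) scores. The tokenizer id and the mono (B, 1, T) waveform layout below are assumptions for illustration, not something this file pins down:

import torch
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base")  # assumed tokenizer
text = tokenizer(["a dog barking"], return_tensors="pt")
mixture = torch.randn(1, 1, 48_000)        # placeholder 1 s of 48 kHz audio
candidate = torch.randn(1, 1, 48_000)      # placeholder separated output

out = judge(                               # judge: a loaded SAMAudioJudgeModel
    input_ids=text["input_ids"],
    input_values=mixture,
    separated_values=candidate,
    attention_mask=text["attention_mask"],
)
# out.overall, out.recall, out.precision, out.faithfulness are each (1, 1) tensors.
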
sam_audio/model/model.py ADDED
@@ -0,0 +1,362 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
2
+
3
+ import math
4
+ import re
5
+ from dataclasses import dataclass
6
+ from typing import Any, Dict, Optional
7
+
8
+ import torch
9
+ from core.audio_visual_encoder import PEAudioFrame, PEAudioFrameTransform
10
+ from torchdiffeq import odeint
11
+
12
+ from sam_audio.model.align import AlignModalities
13
+ from sam_audio.model.base import BaseModel
14
+ from sam_audio.model.codec import DACVAE
15
+ from sam_audio.model.config import SAMAudioConfig
16
+ from sam_audio.model.text_encoder import T5TextEncoder
17
+ from sam_audio.model.transformer import DiT
18
+ from sam_audio.model.vision_encoder import PerceptionEncoder
19
+ from sam_audio.processor import Batch
20
+ from sam_audio.ranking import create_ranker
21
+
22
+ DFLT_ODE_OPT = {"method": "midpoint", "options": {"step_size": 2 / 32}}
23
+
24
+
25
+ class SinusoidalEmbedding(torch.nn.Module):
26
+ def __init__(self, dim, theta=10000):
27
+ super().__init__()
28
+ assert (dim % 2) == 0
29
+ half_dim = dim // 2
30
+ inv_freq = torch.exp(
31
+ -math.log(theta) * torch.arange(half_dim).float() / half_dim
32
+ )
33
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
34
+
35
+ def forward(self, x, pos=None):
36
+ if pos is None:
37
+ seq_len, device = x.shape[1], x.device
38
+ pos = torch.arange(seq_len, device=device)
39
+
40
+ emb = torch.einsum("i, j -> i j", pos, self.inv_freq)
41
+ emb = torch.cat((emb.cos(), emb.sin()), dim=-1)
42
+ return emb
43
+
44
+
45
+ class EmbedAnchors(torch.nn.Module):
46
+ def __init__(self, num_embeddings: int, embedding_dim: int, out_dim: int):
47
+ super().__init__()
48
+ self.embed = torch.nn.Embedding(
49
+ num_embeddings + 1, embedding_dim, padding_idx=num_embeddings
50
+ )
51
+ self.gate = torch.nn.Parameter(torch.tensor([0.0]))
52
+ self.proj = torch.nn.Linear(embedding_dim, out_dim, bias=False)
53
+
54
+ def forward(
55
+ self,
56
+ x: torch.Tensor,
57
+ anchor_ids: Optional[torch.Tensor] = None,
58
+ anchor_alignment: Optional[torch.Tensor] = None,
59
+ ):
60
+ if anchor_ids is None:
61
+ return x
62
+
63
+ embs = self.embed(anchor_ids.gather(1, anchor_alignment))
64
+ proj = self.proj(embs)
65
+ return x + self.gate.tanh() * proj
66
+
67
+
68
+ @dataclass
69
+ class SeparationResult:
70
+ target: torch.Tensor
71
+ residual: torch.Tensor
72
+ noise: torch.Tensor
73
+
74
+
75
+ class SAMAudio(BaseModel):
76
+ config_cls = SAMAudioConfig
77
+ revision = None
78
+
79
+ def __init__(self, cfg: SAMAudioConfig):
80
+ super().__init__()
81
+ self.audio_codec = DACVAE(cfg.audio_codec)
82
+ self.text_encoder = T5TextEncoder(cfg.text_encoder)
83
+ self.vision_encoder = PerceptionEncoder(cfg.vision_encoder)
84
+ self.transformer = DiT(cfg.transformer)
85
+ self.proj = torch.nn.Linear(cfg.in_channels, cfg.transformer.dim)
86
+ self.align_masked_video = AlignModalities(
87
+ cfg.vision_encoder.dim, cfg.transformer.dim
88
+ )
89
+ self.embed_anchors = EmbedAnchors(
90
+ cfg.num_anchors, cfg.anchor_embedding_dim, cfg.transformer.dim
91
+ )
92
+ self.memory_proj = torch.nn.Linear(cfg.text_encoder.dim, cfg.transformer.dim)
93
+ self.timestep_emb = SinusoidalEmbedding(cfg.transformer.dim)
94
+ self.visual_ranker = create_ranker(cfg.visual_ranker)
95
+ self.text_ranker = create_ranker(cfg.text_ranker)
96
+ if cfg.span_predictor is not None:
97
+ self.span_predictor = PEAudioFrame.from_config(
98
+ cfg.span_predictor, pretrained=True
99
+ )
100
+ self.span_predictor_transform = PEAudioFrameTransform.from_config(
101
+ cfg.span_predictor
102
+ )
103
+
104
+ @property
105
+ def sample_rate(self):
106
+ return self.audio_codec.sample_rate
107
+
108
+ def align_inputs(
109
+ self,
110
+ noisy_audio,
111
+ audio_features: torch.Tensor,
112
+ masked_video_features: Optional[torch.Tensor] = None,
113
+ anchor_ids: Optional[torch.Tensor] = None,
114
+ anchor_alignment: Optional[torch.Tensor] = None,
115
+ ):
116
+ x = torch.cat(
117
+ [
118
+ noisy_audio,
119
+ torch.zeros_like(audio_features),
120
+ audio_features,
121
+ ],
122
+ dim=2,
123
+ )
124
+
125
+ projected = self.proj(x)
126
+ aligned = self.align_masked_video(projected, masked_video_features)
127
+ aligned = self.embed_anchors(aligned, anchor_ids, anchor_alignment)
128
+ return aligned
129
+
130
+ def forward(
131
+ self,
132
+ noisy_audio: torch.Tensor,
133
+ audio_features: torch.Tensor,
134
+ text_features: torch.Tensor,
135
+ time: torch.Tensor,
136
+ masked_video_features: Optional[torch.Tensor] = None,
137
+ text_mask: Optional[torch.Tensor] = None,
138
+ anchor_ids: Optional[torch.Tensor] = None,
139
+ anchor_alignment: Optional[torch.Tensor] = None,
140
+ audio_pad_mask: Optional[torch.Tensor] = None,
141
+ ):
142
+ """
143
+ Forward pass for the model. Represents one function evaluation of the ODE.
144
+ In the below descriptions, B is batch size, T is sequence length, C is channel size.
145
+ Note that the size of C and T may vary across arguments (ex. text_features vs. audio_features),
146
+ it is used only to designate a Channel or time/sequence-length dimension respectively.
147
+
148
+ Args:
149
+ noisy_audio (torch.Tensor): Noisy audio input tensor (being denoised).
150
+ audio_features (torch.Tensor): Clean audio features [B x T x C].
151
+ text_features (torch.Tensor): Encoded text features tensor [B x T x C].
152
+ time (torch.Tensor): Timestep tensor for positional encoding [B].
153
+ masked_video_features (Optional[torch.Tensor], optional): Masked video features tensor. [B x C x T].
154
+ text_mask (Optional[torch.Tensor], optional): Padding mask for text features. [B x T].
155
+ anchor_ids (Optional[torch.Tensor], optional): Anchor IDs tensor. Defaults to None [B x T].
156
+ anchor_alignment (Optional[torch.Tensor], optional): Anchor alignment tensor. B x T.
157
+ audio_pad_mask (Optional[torch.Tensor], optional): Padding mask for audio input. [B x T].
158
+
159
+ Returns:
160
+ torch.Tensor
161
+ """
162
+ aligned_inputs = self.align_inputs(
163
+ noisy_audio,
164
+ audio_features,
165
+ masked_video_features=masked_video_features,
166
+ anchor_ids=anchor_ids,
167
+ anchor_alignment=anchor_alignment,
168
+ )
169
+
170
+ memory = timestep_emb = self.timestep_emb(time, pos=time).unsqueeze(1)
171
+ if text_features is not None:
172
+ memory = self.memory_proj(text_features) + timestep_emb
173
+
174
+ return self.transformer(
175
+ aligned_inputs,
176
+ time,
177
+ padding_mask=audio_pad_mask,
178
+ memory=memory,
179
+ memory_padding_mask=text_mask,
180
+ )
181
+
182
+ def _get_audio_features(self, audios: torch.Tensor):
183
+ audio_features = self.audio_codec(audios).transpose(1, 2)
184
+ return torch.cat([audio_features, audio_features], dim=2)
185
+
186
+ def _get_video_features(self, video, audio_features):
187
+ B, T, _ = audio_features.shape
188
+ if video is None:
189
+ return audio_features.new_zeros(B, self.vision_encoder.dim, T)
190
+ else:
191
+ return self.vision_encoder(video).transpose(1, 2)
192
+
193
+ def _repeat_for_reranking(self, tensor, candidates):
194
+ if candidates > 1:
195
+ B = tensor.size(0)
196
+ rest = tensor.shape[1:]
197
+ return (
198
+ tensor.unsqueeze(1)
199
+ .expand(B, candidates, *rest)
200
+ .reshape(B * candidates, *rest)
201
+ )
202
+ else:
203
+ return tensor
204
+
205
+ def _unrepeat_from_reranking(self, tensor, candidates):
206
+ return tensor[::candidates]
207
+
208
+ def _get_forward_args(self, batch: Batch, candidates: int = 1):
209
+ audio_features = self._get_audio_features(batch.audios)
210
+ text_features, text_mask = self.text_encoder(batch.descriptions)
211
+ masked_video_features = self._get_video_features(
212
+ batch.masked_video, audio_features
213
+ )
214
+
215
+ return {
216
+ "audio_features": self._repeat_for_reranking(audio_features, candidates),
217
+ "text_features": self._repeat_for_reranking(text_features, candidates),
218
+ "text_mask": self._repeat_for_reranking(text_mask, candidates),
219
+ "masked_video_features": self._repeat_for_reranking(
220
+ masked_video_features, candidates
221
+ ),
222
+ "anchor_ids": self._repeat_for_reranking(batch.anchor_ids, candidates),
223
+ "anchor_alignment": self._repeat_for_reranking(
224
+ batch.anchor_alignment, candidates
225
+ ),
226
+ "audio_pad_mask": self._repeat_for_reranking(
227
+ batch.audio_pad_mask, candidates
228
+ ),
229
+ }
230
+
231
+ def predict_spans(
232
+ self, batch: Batch, audio_features: torch.Tensor, audio_pad_mask: torch.Tensor
233
+ ) -> Batch:
234
+ input = self.span_predictor_transform(text=batch.descriptions).to(
235
+ audio_features.device
236
+ )
237
+ output = self.span_predictor(
238
+ input_features=audio_features[:, :, :128],
239
+ padding_mask=audio_pad_mask,
240
+ return_spans=True,
241
+ **input,
242
+ )
243
+ anchors = [[["+"] + anchor for anchor in anchors] for anchors in output.spans]
244
+ batch.process_anchors(anchors)
245
+ return batch
246
+
247
+ @torch.inference_mode()
248
+ def separate(
249
+ self,
250
+ batch: Batch,
251
+ noise: Optional[torch.Tensor] = None,
252
+ ode_opt: Dict[str, Any] = DFLT_ODE_OPT,
253
+ reranking_candidates: int = 1,
254
+ predict_spans: bool = False,
255
+ ) -> SeparationResult:
256
+ # Encode audio
257
+ forward_args = self._get_forward_args(batch, candidates=reranking_candidates)
258
+
259
+ if predict_spans and hasattr(self, "span_predictor") and batch.anchors is None:
260
+ batch = self.predict_spans(
261
+ batch=batch,
262
+ audio_features=self._unrepeat_from_reranking(
263
+ forward_args["audio_features"], reranking_candidates
264
+ ),
265
+ audio_pad_mask=self._unrepeat_from_reranking(
266
+ forward_args["audio_pad_mask"], reranking_candidates
267
+ ),
268
+ )
269
+
270
+ audio_features = forward_args["audio_features"]
271
+ B, T, C = audio_features.shape
272
+ C = C // 2 # we stack audio_features, so the actual channels is half
273
+
274
+ if noise is None:
275
+ noise = torch.randn_like(audio_features)
276
+
277
+ def vector_field(t, noisy_audio):
278
+ res = self.forward(
279
+ noisy_audio=noisy_audio,
280
+ time=t.expand(noisy_audio.size(0)),
281
+ **forward_args,
282
+ )
283
+ return res
284
+
285
+ states = odeint(
286
+ vector_field,
287
+ noise,
288
+ torch.tensor([0.0, 1.0], device=noise.device),
289
+ **ode_opt,
290
+ )
291
+ generated_features = states[-1].transpose(1, 2)
292
+ # generated_features has shape [B, 2C, T]. Reshape to stack along the batch dimension
293
+ wavs = self.audio_codec.decode(generated_features.reshape(2 * B, C, T)).view(
294
+ B, 2, -1
295
+ )
296
+
297
+ bsz = wavs.size(0) // reranking_candidates
298
+ sizes = self.audio_codec.feature_idx_to_wav_idx(batch.sizes)
299
+ target_wavs = self.unbatch(
300
+ wavs[:, 0].view(bsz, reranking_candidates, -1), sizes
301
+ )
302
+ residual_wavs = self.unbatch(
303
+ wavs[:, 1].view(bsz, reranking_candidates, -1), sizes
304
+ )
305
+
306
+ if (
307
+ reranking_candidates > 1
308
+ and batch.masked_video is not None
309
+ and self.visual_ranker is not None
310
+ ):
311
+ scores = self.visual_ranker(
312
+ extracted_audio=target_wavs,
313
+ videos=batch.masked_video,
314
+ sample_rate=self.audio_codec.sample_rate,
315
+ )
316
+ idxs = scores.argmax(dim=1)
317
+ elif reranking_candidates > 1 and self.text_ranker is not None:
318
+ input_audio = [
319
+ audio[:, :size].expand(reranking_candidates, -1)
320
+ for audio, size in zip(batch.audios, sizes, strict=False)
321
+ ]
322
+ scores = self.text_ranker(
323
+ extracted_audio=target_wavs,
324
+ input_audio=input_audio,
325
+ descriptions=batch.descriptions,
326
+ sample_rate=self.audio_codec.sample_rate,
327
+ )
328
+ idxs = scores.argmax(dim=1)
329
+ else:
330
+ idxs = torch.zeros(bsz, dtype=torch.long, device=noise.device)
331
+
332
+ return SeparationResult(
333
+ target=[wav[idx] for wav, idx in zip(target_wavs, idxs, strict=False)],
334
+ residual=[
335
+ wavs[idx] for wavs, idx in zip(residual_wavs, idxs, strict=False)
336
+ ],
337
+ noise=noise,
338
+ )
339
+
340
+ def unbatch(self, wavs: torch.Tensor, sizes: torch.Tensor, time_dim: int = -1):
341
+ result = []
342
+ for row, size in zip(wavs, sizes, strict=False):
343
+ result.append(row.narrow(dim=time_dim, start=0, length=size))
344
+ return result
345
+
346
+ def load_state_dict(self, state_dict, strict=True):
347
+ if strict:
348
+ missing_keys, unexpected_keys = super().load_state_dict(
349
+ state_dict, strict=False
350
+ )
351
+ # We load this directly from HF, not in checkpoint
352
+ skip_regex = re.compile(
353
+ "(^text_encoder|^visual_ranker|^text_ranker|^span_predictor)"
354
+ )
355
+ missing_keys = [x for x in missing_keys if not re.search(skip_regex, x)]
356
+ if len(missing_keys) > 0 or len(unexpected_keys) > 0:
357
+ raise RuntimeError(
358
+ f"Missing keys: {missing_keys}, unexpected_keys: {unexpected_keys}"
359
+ )
360
+
361
+
362
+ __all__ = ["SAMAudio"]
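
`separate` integrates the learned vector field from t = 0 (noise) to t = 1 with a fixed-step solver; DFLT_ODE_OPT uses the midpoint method with step_size 2/32, and a larger step_size trades integration accuracy for fewer function evaluations. A hedged sketch, assuming a SAMAudio instance `model` and a `Batch` prepared by sam_audio.processor:

fast_opt = {"method": "midpoint", "options": {"step_size": 2 / 8}}   # coarser, faster integration
result = model.separate(batch, ode_opt=fast_opt, reranking_candidates=4)
target_wavs = result.target       # list of per-item target waveforms (best candidate per item)
residual_wavs = result.residual   # list of per-item residual waveforms
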
sam_audio/model/patcher.py ADDED
@@ -0,0 +1,164 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
2
+
3
+ import math
4
+ from typing import Tuple
5
+
6
+ import torch
7
+ import torch.nn.functional as F
8
+ from einops import rearrange
9
+
10
+
11
+ def pad1d(
12
+ x: torch.Tensor,
13
+ paddings: Tuple[int, int],
14
+ mode: str = "constant",
15
+ value: float = 0.0,
16
+ ):
17
+ # Copied from https://github.com/facebookresearch/audiocraft/blob/main/audiocraft/modules/conv.py
18
+ """Tiny wrapper around F.pad, just to allow for reflect padding on small input.
19
+ If this is the case, we insert extra 0 padding to the right before the reflection happen.
20
+ """
21
+ length = x.shape[-1]
22
+ padding_left, padding_right = paddings
23
+ assert padding_left >= 0 and padding_right >= 0, (padding_left, padding_right)
24
+ if mode == "reflect":
25
+ max_pad = max(padding_left, padding_right)
26
+ extra_pad = 0
27
+ if length <= max_pad:
28
+ extra_pad = max_pad - length + 1
29
+ x = F.pad(x, (0, extra_pad))
30
+ padded = F.pad(x, paddings, mode, value)
31
+ end = padded.shape[-1] - extra_pad
32
+ return padded[..., :end]
33
+ else:
34
+ return F.pad(x, paddings, mode, value)
35
+
36
+
37
+ def get_extra_padding_for_conv1d(
38
+ x: torch.Tensor, kernel_size: int, stride: int, padding_total: int = 0
39
+ ) -> int:
40
+ # Copied from https://github.com/facebookresearch/audiocraft/blob/main/audiocraft/modules/conv.py
41
+ """See `pad_for_conv1d`."""
42
+ length = x.shape[-1]
43
+ n_frames = (length - kernel_size + padding_total) / stride + 1
44
+ ideal_length = (math.ceil(n_frames) - 1) * stride + (kernel_size - padding_total)
45
+ return ideal_length - length
46
+
47
+
48
+ class Conv1d(torch.nn.Conv1d):
49
+ def __init__(self, *args, **kwargs):
50
+ super().__init__(*args, **kwargs)
51
+
52
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
53
+ kernel_size = self.kernel_size[0]
54
+ stride = self.stride[0]
55
+ dilation = self.dilation[0]
56
+ kernel_size = (
57
+ kernel_size - 1
58
+ ) * dilation + 1 # effective kernel size with dilations
59
+ padding_total = kernel_size - stride
60
+ extra_padding = get_extra_padding_for_conv1d(
61
+ x, kernel_size, stride, padding_total
62
+ )
63
+ # Asymmetric padding required for odd strides
64
+ padding_right = padding_total // 2
65
+ padding_left = padding_total - padding_right
66
+ x = pad1d(x, (padding_left, padding_right + extra_padding))
67
+ return super().forward(x)
68
+
69
+
70
+ class ConvBlock1d(torch.nn.Module):
71
+ def __init__(
72
+ self,
73
+ in_channels: int,
74
+ out_channels: int,
75
+ *,
76
+ kernel_size: int = 3,
77
+ stride: int = 1,
78
+ dilation: int = 1,
79
+ num_groups: int = 8,
80
+ ) -> None:
81
+ super().__init__()
82
+
83
+ self.groupnorm = torch.nn.GroupNorm(
84
+ num_groups=num_groups, num_channels=in_channels
85
+ )
86
+ self.activation = torch.nn.SiLU()
87
+ self.project = Conv1d(
88
+ in_channels=in_channels,
89
+ out_channels=out_channels,
90
+ kernel_size=kernel_size,
91
+ stride=stride,
92
+ dilation=dilation,
93
+ )
94
+
95
+ def forward(
96
+ self,
97
+ x: torch.Tensor,
98
+ ) -> torch.Tensor:
99
+ x = self.groupnorm(x)
100
+ x = self.activation(x)
101
+ return self.project(x)
102
+
103
+
104
+ class ResnetBlock1d(torch.nn.Module):
105
+ def __init__(
106
+ self,
107
+ in_channels: int,
108
+ out_channels: int,
109
+ *,
110
+ kernel_size: int = 3,
111
+ stride: int = 1,
112
+ dilation: int = 1,
113
+ num_groups: int = 8,
114
+ ) -> None:
115
+ super().__init__()
116
+
117
+ self.block1 = ConvBlock1d(
118
+ in_channels=in_channels,
119
+ out_channels=out_channels,
120
+ kernel_size=kernel_size,
121
+ stride=stride,
122
+ dilation=dilation,
123
+ num_groups=num_groups,
124
+ )
125
+
126
+ self.block2 = ConvBlock1d(
127
+ in_channels=out_channels,
128
+ out_channels=out_channels,
129
+ num_groups=num_groups,
130
+ )
131
+
132
+ self.to_out = (
133
+ Conv1d(in_channels=in_channels, out_channels=out_channels, kernel_size=1)
134
+ if in_channels != out_channels
135
+ else torch.nn.Identity()
136
+ )
137
+
138
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
139
+ h = self.block1(x)
140
+ h = self.block2(h)
141
+ return h + self.to_out(x)
142
+
143
+
144
+ class Patcher(torch.nn.Module):
145
+ def __init__(
146
+ self,
147
+ in_channels: int,
148
+ out_channels: int,
149
+ patch_size: int,
150
+ ):
151
+ super().__init__()
152
+ assert_message = f"out_channels must be divisible by patch_size ({patch_size})"
153
+ assert out_channels % patch_size == 0, assert_message
154
+ self.patch_size = patch_size
155
+ self.block = ResnetBlock1d(
156
+ in_channels=in_channels,
157
+ out_channels=out_channels // patch_size,
158
+ num_groups=1,
159
+ )
160
+
161
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
162
+ x = self.block(x)
163
+ x = rearrange(x, "b c (l p) -> b (c p) l", p=self.patch_size)
164
+ return x
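
The Conv1d wrapper pads asymmetrically so that the last, possibly partial, window is still produced (the output length works out to ceil(length / stride)). A worked check of the helpers above:

import torch
from sam_audio.model.patcher import Conv1d, get_extra_padding_for_conv1d

x = torch.randn(1, 8, 100)                  # (batch, channels, time)
extra = get_extra_padding_for_conv1d(x, kernel_size=4, stride=3, padding_total=1)
assert extra == 2                           # 100 -> 102 so the final window is complete

conv = Conv1d(8, 16, kernel_size=4, stride=3)
assert conv(x).shape == (1, 16, 34)         # 34 == ceil(100 / 3) frames
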
sam_audio/model/rope.py ADDED
@@ -0,0 +1,155 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
2
+
3
+ import math
4
+ from typing import Tuple
5
+
6
+ import torch
7
+
8
+
9
+ def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor, seq_dim: int):
10
+ """
11
+ Reshape frequency tensor for broadcasting it with another tensor.
12
+
13
+ This function reshapes the frequency tensor to have the same shape as the target tensor 'x'
14
+ for the purpose of broadcasting the frequency tensor during element-wise operations.
15
+
16
+ Args:
17
+ freqs_cis (torch.Tensor): Frequency tensor to be reshaped.
18
+ x (torch.Tensor): Target tensor for broadcasting compatibility.
19
+ seq_dim (int): Sequence dimension index.
20
+
21
+ Returns:
22
+ torch.Tensor: Reshaped frequency tensor.
23
+ """
24
+ ndim = x.ndim
25
+ assert 0 <= seq_dim < ndim
26
+ assert freqs_cis.shape == (
27
+ x.shape[seq_dim],
28
+ x.shape[-3],
29
+ 2,
30
+ 2,
31
+ ), f"freqs_cis vs x: {(freqs_cis.shape, x.shape)}"
32
+ shape = [
33
+ d if i == seq_dim or i == ndim - 3 else 1 for i, d in enumerate(x.shape[:-2])
34
+ ] + [2, 2]
35
+ return freqs_cis.view(*shape)
36
+
37
+
38
+ def apply_rotary_emb(
39
+ xq: torch.Tensor,
40
+ xk: torch.Tensor,
41
+ seq_dim: int,
42
+ freqs_cis: torch.Tensor,
43
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
44
+ xq_ = xq.reshape(*xq.shape[:-1], -1, 1, 2) # B S H D -> B S H D/2 1 2
45
+ xk_ = xk.reshape(*xk.shape[:-1], -1, 1, 2) # B S H D -> B S H D/2 1 2
46
+ freqs_cis = reshape_for_broadcast(
47
+ freqs_cis, xq_, seq_dim
48
+ ).float() # S D/2 2 2 -> 1 S 1 D/2 2 2
49
+ xq_out = (xq_ * freqs_cis).sum(5).flatten(3)
50
+ xk_out = (xk_ * freqs_cis).sum(5).flatten(3)
51
+ return xq_out.type_as(xq), xk_out.type_as(xk)
52
+
53
+
54
+ class RotaryEmbedding(torch.nn.Module):
55
+ """
56
+ RotaryEmbedding Module
57
+ """
58
+
59
+ def __init__(
60
+ self,
61
+ theta: float,
62
+ head_dim: int,
63
+ max_seqlen: int = 1024,
64
+ scale_factor: int = 1,
65
+ low_freq_factor: int = 1,
66
+ high_freq_factor: int = 32,
67
+ old_context_len: int = 8192,
68
+ ):
69
+ super().__init__()
70
+
71
+ self.theta = theta
72
+ self.head_dim = head_dim
73
+ self.max_seqlen = max_seqlen
74
+ self.scale_factor = scale_factor
75
+ self.low_freq_factor = low_freq_factor
76
+ self.high_freq_factor = high_freq_factor
77
+ self.old_context_len = old_context_len
78
+ if scale_factor != 1:
79
+ self.low_freq_wavelen = old_context_len / low_freq_factor
80
+ self.high_freq_wavelen = old_context_len / high_freq_factor
81
+ assert self.low_freq_wavelen >= self.high_freq_wavelen
82
+
83
+ def reset_parameters(self):
84
+ freqs_cis = self.precompute_freqs_cis(
85
+ dim=self.head_dim, end=self.max_seqlen, theta=self.theta
86
+ )
87
+ S, D, _, _ = freqs_cis.shape
88
+ # S D 2 2 -> 1 S 1 D 2 2
89
+ freqs_cis = freqs_cis.view(1, S, 1, D, 2, 2)
90
+ self.register_buffer(
91
+ "freqs_cis",
92
+ freqs_cis,
93
+ persistent=False,
94
+ )
95
+
96
+ def apply_scaling(self, freqs):
97
+ if self.scale_factor == 1:
98
+ return freqs
99
+ new_freqs = []
100
+ for freq in freqs:
101
+ wavelen = 2 * math.pi / freq
102
+ if wavelen < self.high_freq_wavelen:
103
+ new_freqs.append(freq)
104
+ elif wavelen > self.low_freq_wavelen:
105
+ new_freqs.append(freq / self.scale_factor)
106
+ else:
107
+ assert self.low_freq_wavelen != self.high_freq_wavelen
108
+ smooth = (self.old_context_len / wavelen - self.low_freq_factor) / (
109
+ self.high_freq_factor - self.low_freq_factor
110
+ )
111
+ new_freqs.append(
112
+ (1 - smooth) * freq / self.scale_factor + smooth * freq
113
+ )
114
+ return torch.tensor(new_freqs, dtype=freqs.dtype, device=freqs.device)
115
+
116
+ def precompute_freqs_cis(
117
+ self,
118
+ dim: int,
119
+ end: int,
120
+ theta: float = 10000.0,
121
+ ):
122
+ """
123
+ Precompute the frequency tensor for complex exponentials (cis) with given dimensions.
124
+
125
+ This function calculates a frequency tensor with complex exponentials using the given dimension 'dim'
126
+ and the end index 'end'. The 'theta' parameter scales the frequencies.
127
+ The returned tensor contains complex values in complex64 data type.
128
+
129
+ Args:
130
+ dim (int): Dimension of the frequency tensor.
131
+ end (int): End index for precomputing frequencies.
132
+ theta (float, optional): Scaling factor for frequency computation. Defaults to 10000.0.
133
+
134
+ Returns:
135
+ torch.Tensor: Precomputed frequency tensor with complex exponentials.
136
+ """
137
+ freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))
138
+ freqs = self.apply_scaling(freqs)
139
+
140
+ t = torch.arange(end, device=freqs.device)
141
+ freqs = torch.outer(t, freqs).float()
142
+
143
+ cos, sin = freqs.cos(), freqs.sin()
144
+
145
+ return torch.stack((cos, -sin, sin, cos), dim=-1).view(*freqs.size(), 2, 2)
146
+
147
+ def forward(self, x: torch.Tensor, bhle: bool = False, **kwargs):
148
+ if bhle:
149
+ x = x.transpose(1, 2) # (B H L E) -> (B L H E)
150
+ seqlen = x.size(1)
151
+ x_ = x.reshape(*x.shape[:-1], -1, 1, 2) # B L H E -> B L H E/2 1 2
152
+ x_out = (x_ * self.freqs_cis[:, :seqlen]).sum(5).flatten(3)
153
+ if bhle:
154
+ x_out = x_out.transpose(1, 2)
155
+ return x_out.type_as(x)
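
For reference, a small shape sketch of the rotary embedding above: the cos/sin pairs are stored as 2x2 rotation blocks and applied per attention head, leaving the tensor shape unchanged.

import torch
from sam_audio.model.rope import RotaryEmbedding

rope = RotaryEmbedding(theta=10_000, head_dim=64, max_seqlen=1024)
rope.reset_parameters()                  # builds freqs_cis with shape (1, 1024, 1, 32, 2, 2)

q = torch.randn(2, 16, 100, 64)          # (batch, heads, seq, head_dim)
q_rot = rope(q, bhle=True)               # rotated queries
assert q_rot.shape == q.shape
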
sam_audio/model/text_encoder.py ADDED
@@ -0,0 +1,37 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
2
+
3
+ from typing import Tuple
4
+
5
+ import torch
6
+ import transformers
7
+
8
+ from sam_audio.model.config import T5EncoderConfig
9
+
10
+
11
+ class T5TextEncoder(torch.nn.Module):
12
+ def __init__(self, cfg: T5EncoderConfig):
13
+ super().__init__()
14
+ self.model = transformers.T5EncoderModel.from_pretrained(cfg.name)
15
+ self.tokenizer = transformers.AutoTokenizer.from_pretrained(cfg.name)
16
+ self.pad_mode = cfg.pad_mode
17
+ self.max_length = cfg.max_length
18
+
19
+ def forward(self, texts: list[str]) -> Tuple[torch.Tensor, torch.Tensor]:
20
+ device = next(self.model.parameters()).device
21
+ encoded = self.tokenizer(
22
+ texts,
23
+ truncation=True,
24
+ max_length=self.max_length,
25
+ padding=self.pad_mode,
26
+ return_tensors="pt",
27
+ )
28
+
29
+ input_ids = encoded["input_ids"].to(device)
30
+ attention_mask = encoded["attention_mask"].to(device)
31
+ res = self.model(
32
+ input_ids=input_ids,
33
+ attention_mask=attention_mask,
34
+ output_hidden_states=True,
35
+ )["last_hidden_state"]
36
+
37
+ return res, attention_mask.bool()
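
A short usage sketch of the text encoder above; it instantiates t5-base from the Hugging Face Hub with the defaults in T5EncoderConfig, so the first call downloads weights:

from sam_audio.model.config import T5EncoderConfig
from sam_audio.model.text_encoder import T5TextEncoder

encoder = T5TextEncoder(T5EncoderConfig())
features, mask = encoder(["a dog barking", "rain on a tin roof"])
# features: (2, T, 768) float tensor, with T set by the longest tokenized description
# mask:     (2, T) boolean attention mask (False on padding)
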
sam_audio/model/transformer.py ADDED
@@ -0,0 +1,524 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
2
+
3
+ import math
4
+ from functools import partial
5
+ from typing import List, Optional, Union
6
+
7
+ import torch
8
+ import torch.nn as nn
9
+ import torch.nn.functional as F
10
+ from einops import rearrange
11
+
12
+ from .config import TransformerConfig
13
+ from .patcher import Patcher
14
+ from .rope import RotaryEmbedding
15
+
16
+
17
+ def gate(x, gate):
18
+ return x * gate
19
+
20
+
21
+ def modulate(x, shift, scale):
22
+ return x * (1 + scale) + shift
23
+
24
+
25
+ def get_nonlinearity(kind: str):
26
+ return {
27
+ "relu": F.relu,
28
+ "gelu": F.gelu,
29
+ "swiglu": None,
30
+ "approx_gelu": partial(F.gelu, approximate="tanh"),
31
+ "srelu": lambda x: F.relu(x) ** 2,
32
+ "silu": F.silu,
33
+ }[kind]
34
+
35
+
36
+ class RMSNorm(torch.nn.Module):
37
+ def __init__(self, dim: int, eps: float = 1e-5):
38
+ super().__init__()
39
+ self.eps = eps
40
+ self.weight = torch.nn.Parameter(torch.ones(dim))
41
+
42
+ def _norm(self, x):
43
+ return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
44
+
45
+ def forward(self, x):
46
+ output = self._norm(x.float())
47
+ return (output * self.weight).type_as(x)
48
+
49
+
50
+ class ProjectionLayer(torch.nn.Module):
51
+ def __init__(
52
+ self,
53
+ in_dim: int,
54
+ out_dim: int,
55
+ non_linearity: str,
56
+ dropout: float,
57
+ fc_bias: bool = False,
58
+ ):
59
+ super().__init__()
60
+
61
+ self.swiglu = non_linearity == "swiglu"
62
+ self.dropout = dropout
63
+ self.w1 = torch.nn.Linear(in_dim, out_dim, bias=fc_bias)
64
+
65
+ self.w2 = torch.nn.Linear(out_dim, out_dim, bias=fc_bias)
66
+ if self.swiglu:
67
+ self.w3 = torch.nn.Linear(in_dim, out_dim, bias=fc_bias)
68
+
69
+ # non-linearity
70
+ self.non_linearity = get_nonlinearity(non_linearity)
71
+
72
+ def forward(self, x):
73
+ hidden1 = self.w1(x)
74
+ if self.swiglu:
75
+ hidden3 = self.w3(x)
76
+ hidden = F.silu(hidden1) * hidden3
77
+ else:
78
+ hidden = self.non_linearity(hidden1)
79
+ hidden = F.dropout(hidden, p=self.dropout, training=self.training)
80
+ return self.w2(hidden)
81
+
82
+
83
+ class Attention(nn.Module):
84
+ def __init__(
85
+ self,
86
+ dim: int,
87
+ head_dim: int,
88
+ n_heads: int,
89
+ n_kv_heads: int,
90
+ norm_eps: float = 1e-5,
91
+ use_qk_norm: bool = False,
92
+ fc_bias: bool = False,
93
+ ):
94
+ super().__init__()
95
+ assert n_heads % n_kv_heads == 0
96
+
97
+ self.head_dim = head_dim
98
+ self.n_heads = n_heads
99
+ self.n_kv_heads = n_kv_heads
100
+ self.use_qk_norm = use_qk_norm
101
+
102
+ self.wq = torch.nn.Linear(dim, n_heads * head_dim, bias=fc_bias)
103
+ self.wk, self.wv = [
104
+ torch.nn.Linear(
105
+ dim,
106
+ n_kv_heads * head_dim,
107
+ bias=fc_bias,
108
+ )
109
+ for _ in range(2)
110
+ ]
111
+ self.wo = torch.nn.Linear(
112
+ n_heads * head_dim,
113
+ dim,
114
+ bias=fc_bias,
115
+ )
116
+
117
+ if self.use_qk_norm is True:
118
+ self.q_norm = RMSNorm(head_dim, eps=norm_eps)
119
+ self.k_norm = RMSNorm(head_dim, eps=norm_eps)
120
+
121
+ def reshape_heads(self, x: torch.Tensor, heads: int) -> torch.Tensor:
122
+ B, T, C = x.shape
123
+ # B x T x C -> B x T x C/H x H
124
+ x = x.reshape(B, T, C // heads, heads)
125
+ # B x T x C/H x H -> B x H x T x C/H
126
+ return x.permute(0, 3, 1, 2)
127
+
128
+ def forward(
129
+ self,
130
+ x: torch.Tensor,
131
+ cross_x: Optional[torch.Tensor] = None,
132
+ key_padding_mask: Optional[torch.Tensor] = None,
133
+ rope: Optional[RotaryEmbedding] = None,
134
+ ):
135
+ # x: B, T, E
136
+ xq = self.wq(x)
137
+ if cross_x is not None:
138
+ xk, xv = self.wk(cross_x), self.wv(cross_x)
139
+ else:
140
+ xk, xv = self.wk(x), self.wv(x)
141
+
142
+ xk = self.reshape_heads(xk, self.n_kv_heads)
143
+ xv = self.reshape_heads(xv, self.n_kv_heads)
144
+ xq = self.reshape_heads(xq, self.n_heads)
145
+ if self.use_qk_norm:
146
+ xq = self.q_norm(xq)
147
+ xk = self.k_norm(xk)
148
+
149
+ if rope is not None:
150
+ xq = rope(xq, bhle=True)
151
+ xk = rope(xk, bhle=True)
152
+
153
+ attn_mask = None
154
+
155
+ if key_padding_mask is not None:
156
+ attn_mask = key_padding_mask[:, None, None, :]
157
+
158
+ output = F.scaled_dot_product_attention(xq, xk, xv, attn_mask=attn_mask)
159
+
160
+ output = rearrange(output, "b h n d -> b n (h d)")
161
+ return self.wo(output)
162
+
163
+
164
+ class FeedForward(torch.nn.Module):
165
+ def __init__(
166
+ self,
167
+ dim: int,
168
+ hidden_dim: int,
169
+ ffn_dim_multiplier: float,
170
+ multiple_of: int,
171
+ dropout: float,
172
+ non_linearity: str = "swiglu",
173
+ fc_bias: bool = False,
174
+ ):
175
+ super().__init__()
176
+ self.dropout = dropout
177
+ self.swiglu = non_linearity == "swiglu"
178
+ # swiglu hidden dim factor multiplier (same #params as relu / gelu)
179
+ if self.swiglu:
180
+ hidden_dim = int(2 * hidden_dim / 3)
181
+
182
+ # custom dim factor multiplier
183
+ hidden_dim = int(ffn_dim_multiplier * hidden_dim)
184
+ # round hidden dimension to `multiple_of`
185
+ hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
186
+ # layers
187
+ self.w1 = torch.nn.Linear(dim, hidden_dim, bias=fc_bias)
188
+ self.w2 = torch.nn.Linear(hidden_dim, dim, bias=fc_bias)
189
+ if self.swiglu:
190
+ self.w3 = torch.nn.Linear(dim, hidden_dim, bias=fc_bias)
191
+
192
+ # non-linearity
193
+ self.non_linearity = get_nonlinearity(non_linearity)
194
+
195
+ def forward(
196
+ self,
197
+ x,
198
+ ):
199
+ hidden1 = self.w1(x)
200
+ if self.swiglu:
201
+ hidden3 = self.w3(x)
202
+ hidden = F.silu(hidden1) * hidden3
203
+ else:
204
+ hidden = self.non_linearity(hidden1)
205
+ hidden = F.dropout(hidden, p=self.dropout, training=self.training)
206
+ return self.w2(hidden)
207
+
208
+
209
+ class TimestepEmbedder(torch.nn.Module):
210
+ def __init__(
211
+ self,
212
+ dim: int,
213
+ frequency_embedding_dim: int,
214
+ non_linearity: str,
215
+ dropout: float,
216
+ fc_bias: bool,
217
+ max_period: int = 10000,
218
+ ):
219
+ super().__init__()
220
+ self.frequency_embedding_size = frequency_embedding_dim
221
+ self.projection = ProjectionLayer(
222
+ in_dim=frequency_embedding_dim,
223
+ out_dim=dim,
224
+ non_linearity=non_linearity,
225
+ dropout=dropout,
226
+ fc_bias=fc_bias,
227
+ )
228
+ half = frequency_embedding_dim // 2
229
+ freqs = torch.exp(
230
+ -math.log(max_period)
231
+ * torch.arange(start=0, end=half, dtype=torch.float32)
232
+ / half
233
+ )
234
+ self.register_buffer("freqs", freqs, persistent=False)
235
+
236
+ def timestep_embedding(self, t, dim):
237
+ """
238
+ Create sinusoidal timestep embeddings.
239
+ :param t: a 1-D Tensor of N indices, one per batch element.
240
+ These may be fractional.
241
+ :param dim: the dimension of the output.
242
+ :param max_period: controls the minimum frequency of the embeddings.
243
+ :return: an (N, D) Tensor of positional embeddings.
244
+ """
245
+ # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
246
+ self.freqs = self.freqs.to(device=t.device)
247
+ args = t[:, None].float() * self.freqs[None]
248
+ embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
249
+ if dim % 2:
250
+ embedding = torch.cat(
251
+ [embedding, torch.zeros_like(embedding[:, :1])], dim=-1
252
+ )
253
+ return embedding.to(t)
254
+
255
+ def forward(self, t):
256
+ x = self.timestep_embedding(t, self.frequency_embedding_size)
257
+ return self.projection(x)
258
+
259
+
260
+ class ContextEmbedder(torch.nn.Module):
261
+ def __init__(
262
+ self,
263
+ in_dim: int,
264
+ out_dim: int,
265
+ non_linearity: str,
266
+ dropout: float,
267
+ fc_bias: bool,
268
+ norm_eps: float = 1e-5,
269
+ context_norm: bool = False,
270
+ ):
271
+ super().__init__()
272
+ self.context_norm = context_norm
273
+ if context_norm:
274
+ self.norm = RMSNorm(in_dim, norm_eps)
275
+
276
+ self.projection = ProjectionLayer(
277
+ in_dim=in_dim,
278
+ out_dim=out_dim,
279
+ non_linearity=non_linearity,
280
+ dropout=dropout,
281
+ fc_bias=fc_bias,
282
+ )
283
+
284
+ def forward(self, x):
285
+ if self.context_norm:
286
+ x = self.norm(x)
287
+ h = self.projection(x)
288
+ return h
289
+
290
+
291
+ class DiTBlock(torch.nn.Module):
292
+ def __init__(
293
+ self,
294
+ dim: int,
295
+ n_heads: int,
296
+ n_kv_heads: Optional[int] = None,
297
+ dropout: float = 0.0,
298
+ norm_eps: float = 1e-5,
299
+ qk_norm: bool = False,
300
+ fc_bias: bool = False,
301
+ ffn_exp: int = 1,
302
+ ffn_dim_multiplier: int = 4,
303
+ multiple_of: int = 64,
304
+ non_linearity: str = "silu",
305
+ no_cross_attention: bool = False,
306
+ ):
307
+ super().__init__()
308
+ assert dim % n_heads == 0
309
+ self.n_heads = n_heads
310
+ self.n_kv_heads = n_heads if n_kv_heads is None else n_kv_heads
311
+ self.dim = dim
312
+ self.dropout = dropout
313
+ self.head_dim = dim // n_heads
314
+
315
+ assert self.n_heads % self.n_kv_heads == 0
316
+
317
+ self.attention = Attention(
318
+ dim=dim,
319
+ head_dim=self.head_dim,
320
+ n_heads=self.n_heads,
321
+ n_kv_heads=self.n_kv_heads,
322
+ norm_eps=norm_eps,
323
+ use_qk_norm=qk_norm,
324
+ fc_bias=fc_bias,
325
+ )
326
+ self.feed_forward = FeedForward(
327
+ dim=dim,
328
+ hidden_dim=int(ffn_exp * dim),
329
+ ffn_dim_multiplier=ffn_dim_multiplier,
330
+ multiple_of=multiple_of,
331
+ dropout=dropout,
332
+ non_linearity=non_linearity,
333
+ fc_bias=fc_bias,
334
+ )
335
+
336
+ self.attention_norm, self.ffn_norm = [RMSNorm(dim, norm_eps) for _ in range(2)]
337
+
338
+ self.cross_attention = None
339
+ if not no_cross_attention:
340
+ self.cross_attention = Attention(
341
+ dim=dim,
342
+ head_dim=self.head_dim,
343
+ n_heads=self.n_heads,
344
+ n_kv_heads=self.n_heads,
345
+ norm_eps=norm_eps,
346
+ use_qk_norm=qk_norm,
347
+ fc_bias=fc_bias,
348
+ )
349
+
350
+ self.scale_shift_table = nn.Parameter(
351
+ torch.randn(6, self.dim) / self.dim**0.5,
352
+ )
353
+
354
+ def forward(
355
+ self,
356
+ x: torch.Tensor,
357
+ cross_x: Optional[torch.Tensor],
358
+ t: torch.Tensor,
359
+ padding_mask: Optional[torch.Tensor],
360
+ memory_padding_mask: Optional[torch.Tensor],
361
+ rope: Optional[RotaryEmbedding] = None,
362
+ ):
363
+ biases = self.scale_shift_table[None] + t.reshape(x.size(0), 6, -1)
364
+ (
365
+ shift_msa,
366
+ scale_msa,
367
+ gate_msa,
368
+ shift_mlp,
369
+ scale_mlp,
370
+ gate_mlp,
371
+ ) = biases.chunk(6, dim=1)
372
+
373
+ assert self.attention is not None and self.attention_norm is not None
374
+ h_attn = self.attention(
375
+ modulate(self.attention_norm(x), shift_msa, scale_msa),
376
+ key_padding_mask=padding_mask,
377
+ rope=rope,
378
+ )
379
+
380
+ h = x + gate(h_attn, gate_msa)
381
+
382
+ if self.cross_attention is not None:
383
+ h_cross = self.cross_attention(
384
+ x=h,
385
+ cross_x=cross_x,
386
+ key_padding_mask=memory_padding_mask,
387
+ )
388
+ h = h + h_cross # residual
389
+ h_ff = self.feed_forward(modulate(self.ffn_norm(h), shift_mlp, scale_mlp))
390
+ out = h + gate(h_ff, gate_mlp)
391
+ return out
392
+
393
+
394
+ class DiT(torch.nn.Module):
395
+ def __init__(self, config: TransformerConfig):
396
+ super().__init__()
397
+ self.dropout = config.dropout
398
+ if config.in_channels is not None:
399
+ self.data_proj = torch.nn.Linear(config.in_channels, config.dim)
400
+
401
+ # embeddings
402
+ self.rope_embeddings = None
403
+ # rotary embeddings
404
+ if config.use_rope:
405
+ self.rope_embeddings = RotaryEmbedding(
406
+ theta=max(10000, 2 * config.max_positions),
407
+ head_dim=config.dim // config.n_heads,
408
+ max_seqlen=config.max_positions,
409
+ )
410
+ self.rope_embeddings.reset_parameters()
411
+
412
+ # transformer blocks
413
+ self.layers = nn.ModuleList()
414
+ for _ in range(config.n_layers):
415
+ self.layers.append(
416
+ DiTBlock(
417
+ dim=config.dim,
418
+ n_heads=config.n_heads,
419
+ dropout=config.dropout,
420
+ norm_eps=config.norm_eps,
421
+ qk_norm=config.qk_norm,
422
+ fc_bias=config.fc_bias,
423
+ ffn_exp=config.ffn_exp,
424
+ ffn_dim_multiplier=config.ffn_dim_multiplier,
425
+ multiple_of=config.multiple_of,
426
+ non_linearity=config.non_linearity,
427
+ )
428
+ )
429
+
430
+ self.norm = RMSNorm(config.dim, config.norm_eps)
431
+
432
+ # output layer
433
+ self.output = torch.nn.Linear(
434
+ config.dim, config.out_channels, bias=config.fc_bias
435
+ )
436
+
437
+ self.x_embedder = Patcher(
438
+ in_channels=config.dim,
439
+ out_channels=config.dim,
440
+ patch_size=1,
441
+ )
442
+
443
+ self.y_embedder = ContextEmbedder(
444
+ in_dim=config.context_dim,
445
+ out_dim=config.dim,
446
+ non_linearity=config.context_non_linearity,
447
+ dropout=config.context_embedder_dropout,
448
+ fc_bias=config.fc_bias,
449
+ norm_eps=config.norm_eps,
450
+ context_norm=config.context_norm,
451
+ )
452
+
453
+ self.t_embedder = TimestepEmbedder(
454
+ config.dim,
455
+ config.frequency_embedding_dim,
456
+ non_linearity=config.timestep_non_linearity,
457
+ dropout=config.dropout,
458
+ fc_bias=config.fc_bias,
459
+ max_period=10000,
460
+ )
461
+
462
+ self.t_block_non_linearity = get_nonlinearity(config.t_block_non_linearity)
463
+ self.t_block = torch.nn.Linear(
464
+ config.dim,
465
+ config.dim * 6,
466
+ bias=config.t_block_bias,
467
+ )
468
+
469
+ self.final_layer_scale_shift_table = nn.Parameter(
470
+ torch.randn(2, config.dim) / config.dim**0.5,
471
+ )
472
+
473
+ def forward(
474
+ self,
475
+ x: torch.Tensor,
476
+ time: torch.Tensor,
477
+ *,
478
+ padding_mask: Optional[torch.Tensor] = None,
479
+ memory: Optional[torch.Tensor] = None,
480
+ memory_padding_mask: Optional[torch.Tensor] = None,
481
+ ) -> Union[torch.Tensor, List[torch.Tensor]]:
482
+ x = rearrange(x, "b l c -> b c l")
483
+ h = self.x_embedder(x)
484
+ h = rearrange(h, "b c l -> b l c")
485
+ original_N = h.shape[1]
486
+ N = h.shape[1]
487
+
488
+ h = F.dropout(h, p=self.dropout, training=self.training)
489
+
490
+ t = self.t_embedder(time) # B -> B D
491
+
492
+ t0 = self.t_block_non_linearity(t)
493
+ t0 = self.t_block(t0) # B D -> B 6D
494
+
495
+ y = self.y_embedder(memory)
496
+
497
+ for layer in self.layers:
498
+ h = layer(
499
+ x=h,
500
+ cross_x=y,
501
+ t=t0,
502
+ padding_mask=padding_mask,
503
+ memory_padding_mask=memory_padding_mask,
504
+ rope=self.rope_embeddings,
505
+ )
506
+
507
+ shift, scale = (self.final_layer_scale_shift_table[None] + t[:, None]).chunk(
508
+ 2, dim=1
509
+ )
510
+
511
+ # output layer
512
+ if self.norm is not None:
513
+ h = self.norm(h)
514
+
515
+ h = modulate(h, shift, scale)
516
+
517
+ h = F.dropout(h, p=self.dropout, training=self.training)
518
+
519
+ output = self.output(h)
520
+
521
+ N = output.shape[1]
522
+ if original_N != N:
523
+ output = output[:, -original_N:]
524
+ return output
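For context, the conditioning above follows the standard DiT-style adaLN recipe: t_block projects the timestep embedding into six per-channel vectors (shift/scale/gate for the attention branch and for the MLP branch), and a learned scale/shift table modulates the final layer. Below is a minimal standalone sketch of that modulation, assuming modulate(x, shift, scale) = x * (1 + scale) + shift and gate(h, g) = g * h with broadcasting over the sequence dimension; the actual helpers are defined elsewhere in this upload and may differ in detail.

import torch

def modulate(x, shift, scale):
    # x: (B, L, D); shift/scale: (B, 1, D), broadcast over the sequence length L
    return x * (1 + scale) + shift

def gate(h, g):
    # g: (B, 1, D) gates the residual branch h: (B, L, D)
    return g * h

B, L, D = 2, 16, 32
t0 = torch.randn(B, 6 * D)                      # stand-in for the t_block output (B -> B, 6D)
shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = [
    c.unsqueeze(1) for c in t0.chunk(6, dim=1)  # six (B, 1, D) conditioning vectors
]
x = torch.randn(B, L, D)
attn_out = torch.randn(B, L, D)                 # stand-in for the self-attention output
h = x + gate(attn_out, gate_msa)                # gated attention residual
# In the real block the modulated h goes through ffn_norm + feed_forward before gating.
out = h + gate(modulate(h, shift_mlp, scale_mlp), gate_mlp)
print(out.shape)                                # torch.Size([2, 16, 32])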
sam_audio/model/vision_encoder.py ADDED
@@ -0,0 +1,113 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
2
+
3
+ from abc import ABCMeta, abstractmethod
4
+
5
+ import torch
6
+ import torchvision
7
+ from core.vision_encoder import pe
8
+ from torch.nn.utils.rnn import pad_sequence
9
+
10
+ from sam_audio.model.config import (
11
+ PerceptionEncoderConfig,
12
+ VisionEncoderConfig,
13
+ )
14
+
15
+
16
+ class RescaleTransform(object):
17
+ """Rescale the image in a sample to a given size.
18
+
19
+ Args:
20
+ output_size (tuple or int): Desired output size. If tuple, output is
21
+ matched to output_size. If int, the output is resized to
22
+ (output_size, output_size); the aspect ratio is not preserved.
23
+ """
24
+
25
+ def __init__(self, output_size, interpolation):
26
+ assert isinstance(output_size, (int, tuple))
27
+ self.output_size = output_size
28
+ if isinstance(output_size, int):
29
+ self.output_size = (output_size, output_size)
30
+ self.interpolation = interpolation
31
+
32
+ def __call__(self, sample):
33
+ # sample: [T, C, H, W]
34
+ sample = torch.nn.functional.interpolate(
35
+ sample.float(), size=self.output_size, mode=self.interpolation.value
36
+ )
37
+ return sample
38
+
39
+
40
+ class VisionEncoder(torch.nn.Module, metaclass=ABCMeta):
41
+ def __init__(self, cfg: VisionEncoderConfig):
42
+ super().__init__()
43
+ self.batch_size = cfg.batch_size
44
+ self.dim = cfg.dim
45
+ self.transform = self.get_transform()
46
+
47
+ @torch.no_grad()
48
+ def forward(self, videos: list[torch.Tensor]) -> torch.Tensor:
49
+ """
50
+ Encodes a list of input videos. Each element of the list is a video represented
51
+ as a tensor [T, C, H, W]
52
+ Args:
53
+ videos (list[torch.Tensor]): List of input video tensors to be processed.
54
+
55
+ Returns:
56
+ torch.Tensor: Encoded feature representations of the input tensors.
57
+ The output is padded along the time dimension for variable length videos
58
+ """
59
+ result = []
60
+ for video in videos:
61
+ video = self.transform(video)
62
+ if self.batch_size > 0 and video.size(0) > self.batch_size:
63
+ res = []
64
+ for i in range(0, video.size(0), self.batch_size):
65
+ res.append(self.encode(video[i : i + self.batch_size]))
66
+ result.append(torch.cat(res, dim=0))
67
+ else:
68
+ result.append(self.encode(video))
69
+ return pad_sequence(result, batch_first=True, padding_value=0.0)
70
+
71
+ @abstractmethod
72
+ def encode(self, x: torch.Tensor) -> torch.Tensor:
73
+ pass
74
+
75
+ @abstractmethod
76
+ def get_transform(self):
77
+ pass
78
+
79
+
80
+ class PerceptionEncoder(VisionEncoder):
81
+ def __init__(self, cfg: PerceptionEncoderConfig):
82
+ self.normalize_feature = cfg.normalize_feature
83
+ self.interpolation_mode = cfg.interpolation_mode
84
+ self.image_size = cfg.image_size
85
+ super().__init__(cfg)
86
+ self.model = pe.CLIP.from_config(cfg.name)
87
+
88
+ def encode(self, x):
89
+ image_features = self.model.encode_image(x, normalize=self.normalize_feature)
90
+ return image_features
91
+
92
+ def get_transform(self):
93
+ T = torchvision.transforms
94
+ try:
95
+ interp = getattr(T.InterpolationMode, self.interpolation_mode.upper())
96
+ except AttributeError as err:
97
+ raise ValueError(
98
+ f"Unsupported interpolation_mode: {self.interpolation_mode}"
99
+ ) from err
100
+ crop = [
101
+ T.Resize(
102
+ (self.image_size, self.image_size),
103
+ interpolation=interp,
104
+ )
105
+ ]
106
+
107
+ return T.Compose(
108
+ crop
109
+ + [
110
+ T.Lambda(lambda x: x.float() / 255.0),
111
+ T.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5], inplace=True),
112
+ ]
113
+ )
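A quick, self-contained check of the batching contract in VisionEncoder.forward above: variable-length videos are encoded chunk-by-chunk (honoring batch_size) and padded along the time dimension. The SimpleNamespace below stands in for VisionEncoderConfig (defined in sam_audio/model/config.py, not part of this hunk); only batch_size and dim are read by the base class, and the toy encode/transform are placeholders.

from types import SimpleNamespace

import torch

from sam_audio.model.vision_encoder import VisionEncoder

class ToyEncoder(VisionEncoder):
    def encode(self, x):
        # [T, C, H, W] -> [T, dim]: a trivial per-frame feature, for illustration only
        return x.flatten(1).mean(dim=1, keepdim=True).repeat(1, self.dim)

    def get_transform(self):
        return lambda video: video  # identity transform for the toy case

enc = ToyEncoder(SimpleNamespace(batch_size=2, dim=4))
videos = [torch.rand(5, 3, 8, 8), torch.rand(3, 3, 8, 8)]  # two videos, 5 and 3 frames
features = enc(videos)
print(features.shape)  # torch.Size([2, 5, 4]): padded to the longest video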
sam_audio/processor.py ADDED
@@ -0,0 +1,382 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
2
+
3
+ import json
4
+ import logging
5
+ import math
6
+ import os
7
+ from typing import Callable, List, Optional, Tuple
8
+
9
+ import torch
10
+ import torchaudio
11
+ from huggingface_hub import hf_hub_download
12
+ from torch.nn.utils.rnn import pad_sequence
13
+ from torchcodec.decoders import AudioDecoder, VideoDecoder
14
+ from transformers import AutoTokenizer, BatchFeature
15
+
16
+ from sam_audio.model.config import SAMAudioConfig, SAMAudioJudgeConfig
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+ Anchor = Tuple[str, float, float]
21
+
22
+
23
+ def batch_audio(
24
+ audios: list[str | torch.Tensor], audio_sampling_rate: int = 48_000
25
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
26
+ wavs = []
27
+ for audio in audios:
28
+ if isinstance(audio, str):
29
+ wav, sr = torchaudio.load(audio)
30
+ if sr != audio_sampling_rate:
31
+ wav = torchaudio.functional.resample(wav, sr, audio_sampling_rate)
32
+ else:
33
+ wav = audio
34
+ wavs.append(wav.mean(0))
35
+ sizes = torch.tensor([wav.size(-1) for wav in wavs])
36
+ return pad_sequence(wavs, batch_first=True).unsqueeze(1), sizes
37
+
38
+
39
+ class Batch:
40
+ def __init__(
41
+ self,
42
+ audios: torch.Tensor,
43
+ sizes: torch.Tensor,
44
+ wav_sizes: torch.Tensor,
45
+ descriptions: list[str],
46
+ hop_length: int,
47
+ audio_sampling_rate: int,
48
+ anchors: Optional[list[list[Anchor]]] = None,
49
+ audio_pad_mask: Optional[torch.Tensor] = None,
50
+ masked_video: Optional[torch.Tensor] = None,
51
+ ):
52
+ self.audios = audios
53
+ self.sizes = sizes
54
+ self.wav_sizes = wav_sizes
55
+ self.descriptions = descriptions
56
+ self.audio_pad_mask = audio_pad_mask
57
+ self.masked_video = masked_video
58
+ self.hop_length = hop_length
59
+ self.audio_sampling_rate = audio_sampling_rate
60
+ self.process_anchors(anchors)
61
+ assert self.audios.size(0) == len(self.descriptions)
62
+
63
+ def _wav_to_feature_idx(self, wav_idx: int):
64
+ return math.ceil(wav_idx / self.hop_length)
65
+
66
+ def to(self, device: torch.device):
67
+ self.audios = self.audios.to(device)
68
+ self.anchor_ids = self.anchor_ids.to(device)
69
+ self.anchor_alignment = self.anchor_alignment.to(device)
70
+ self.sizes = self.sizes.to(device)
71
+ self.wav_sizes = self.wav_sizes.to(device)
72
+ if self.audio_pad_mask is not None:
73
+ self.audio_pad_mask = self.audio_pad_mask.to(device)
74
+ if self.masked_video is not None:
75
+ self.masked_video = [v.to(device) for v in self.masked_video]
76
+ return self
77
+
78
+ def process_anchors(self, anchors: Optional[list[list[Anchor]]]):
79
+ batch_size = len(self.audios)
80
+ anchor_dict = {"<null>": 0, "+": 1, "-": 2, "<pad>": 3}
81
+ if anchors is None:
82
+ anchor_ids = torch.full(
83
+ (batch_size, 2), anchor_dict["<null>"], dtype=torch.long
84
+ )
85
+ anchor_ids[:, 1] = anchor_dict["<pad>"]
86
+ anchor_alignment = torch.full(
87
+ (
88
+ batch_size,
89
+ self.audio_pad_mask.size(-1),
90
+ ),
91
+ 0,
92
+ dtype=torch.long,
93
+ )
94
+ anchor_alignment[~self.audio_pad_mask] = 1 # point to pad token
95
+ else:
96
+ anchor_alignment = torch.full(
97
+ (
98
+ batch_size,
99
+ self.audio_pad_mask.size(-1),
100
+ ),
101
+ 0,
102
+ dtype=torch.long,
103
+ )
104
+ anchor_alignment[~self.audio_pad_mask] = 1 # point to pad token
105
+ ids = []
106
+
107
+ for i, anchor_list in enumerate(anchors):
108
+ current = [anchor_dict["<null>"], anchor_dict["<pad>"]]
109
+ for token, start_time, end_time in anchor_list:
110
+ start_idx = self._wav_to_feature_idx(
111
+ start_time * self.audio_sampling_rate
112
+ )
113
+ end_idx = self._wav_to_feature_idx(
114
+ end_time * self.audio_sampling_rate
115
+ )
116
+ anchor_alignment[i, start_idx:end_idx] = len(current)
117
+ current.append(anchor_dict[token])
118
+ ids.append(torch.tensor(current))
119
+ anchor_ids = pad_sequence(
120
+ ids, batch_first=True, padding_value=anchor_dict["<pad>"]
121
+ )
122
+ self.anchor_ids = anchor_ids
123
+ self.anchor_alignment = anchor_alignment
124
+ self.anchors = anchors
125
+
126
+
127
+ def mask_from_sizes(sizes: torch.Tensor) -> torch.Tensor:
128
+ return torch.arange(sizes.max()).expand(len(sizes), -1) < sizes.unsqueeze(1)
129
+
130
+
131
+ def load_video(
132
+ sizes: torch.Tensor,
133
+ videos: List[str],
134
+ feature_idx_to_wav_idx: Callable[[torch.Tensor], torch.Tensor],
135
+ audio_sampling_rate: int,
136
+ ) -> list[torch.Tensor]:
137
+ all_frames = []
138
+ for size, video in zip(sizes, videos, strict=False):
139
+ audio_timestamps = (
140
+ feature_idx_to_wav_idx(torch.arange(size)) / audio_sampling_rate
141
+ )
142
+ if isinstance(video, str):
143
+ decoder = VideoDecoder(video, dimension_order="NCHW")
144
+ data = decoder.get_frames_in_range(0, len(decoder))
145
+ diffs = (audio_timestamps[None] - data.pts_seconds[:, None]).abs()
146
+ frame_idxs = diffs.argmin(dim=0)
147
+ frames = data.data[frame_idxs]
148
+ else:
149
+ assert video.size(1) == 3, (
150
+ f"Expected video tensor to be in NCHW format, but found {video.size(1)} channels"
151
+ )
152
+ idx = torch.linspace(0, video.size(0) - 1, int(size)).round().long()
153
+ frames = video[idx]
154
+ all_frames.append(frames)
155
+ return all_frames
156
+
157
+
158
+ class Processor:
159
+ config_cls: Callable
160
+
161
+ def __init__(self, audio_hop_length: int, audio_sampling_rate: int):
162
+ self.audio_hop_length = audio_hop_length
163
+ self.audio_sampling_rate = audio_sampling_rate
164
+
165
+ @classmethod
166
+ def _get_config(cls, model_name_or_path: str):
167
+ if os.path.exists(model_name_or_path):
168
+ config_path = os.path.join(model_name_or_path, "config.json")
169
+ else:
170
+ config_path = hf_hub_download(
171
+ repo_id=model_name_or_path,
172
+ filename="config.json",
173
+ revision=cls.revision,
174
+ )
175
+ with open(config_path) as fin:
176
+ config = cls.config_cls(**json.load(fin))
177
+ return config
178
+
179
+ @classmethod
180
+ def from_pretrained(cls, model_name_or_path: str) -> "Processor":
181
+ config = cls._get_config(model_name_or_path)
182
+ return cls(
183
+ audio_hop_length=config.audio_codec.hop_length,
184
+ audio_sampling_rate=config.audio_codec.sample_rate,
185
+ )
186
+
187
+ def feature_to_wav_idx(self, feature_idx):
188
+ return feature_idx * self.audio_hop_length
189
+
190
+ def wav_to_feature_idx(self, wav_idx):
191
+ if torch.is_tensor(wav_idx):
192
+ ceil = torch.ceil
193
+ else:
194
+ ceil = math.ceil
195
+ return ceil(wav_idx / self.audio_hop_length)
196
+
197
+ def mask_videos(
198
+ self,
199
+ videos: List[str | torch.Tensor],
200
+ masks: List[str | torch.Tensor],
201
+ ) -> list[torch.Tensor]:
202
+ video = [VideoDecoder(v)[:] if isinstance(v, str) else v for v in videos]
203
+ video_mask = [VideoDecoder(v)[:] if isinstance(v, str) else v for v in masks]
204
+ return [v * m.eq(0) for v, m in zip(video, video_mask, strict=False)]
205
+
206
+
207
+ class SAMAudioProcessor(Processor):
208
+ config_cls = SAMAudioConfig
209
+ revision = None
210
+
211
+ def __call__(
212
+ self,
213
+ descriptions: list[str],
214
+ audios: list[str | torch.Tensor],
215
+ anchors: Optional[list[list[Anchor]]] = None,
216
+ masked_videos: Optional[list[str | torch.Tensor]] = None,
217
+ ):
218
+ """
219
+ Processes input data for the model.
220
+
221
+ Args:
222
+ descriptions (list[str]): List of text descriptions corresponding to each audio sample.
223
+ audios (list[str | torch.Tensor]): List of audio file paths or tensors.
224
+ If a tensor:
225
+ - should have shape (channels, time) where channels=1 for mono and 2 for stereo.
226
+ - should be resampled to 48_000 Hz
227
+ anchors (Optional[list[list[Anchor]]], optional): List of anchors for each sample,
228
+ where each anchor is a tuple (token, start_time, end_time).
229
+ masked_videos (Optional[list[str | torch.Tensor]], optional): List of masked video file paths or tensors.
230
+ If a tensor, should have shape (N, C, H, W)
231
+
232
+ Returns:
233
+ Batch: A Batch object containing processed audio, sizes, descriptions, anchor ids, anchor alignment, audio pad mask, and optionally masked video.
234
+ """
235
+
236
+ assert len(descriptions) == len(audios)
237
+ assert anchors is None or len(descriptions) == len(anchors)
238
+ assert masked_videos is None or len(descriptions) == len(masked_videos)
239
+
240
+ audios, wav_sizes = batch_audio(audios, self.audio_sampling_rate)
241
+
242
+ sizes = self.wav_to_feature_idx(wav_sizes)
243
+ audio_pad_mask = mask_from_sizes(sizes)
244
+ masked_video = None
245
+ if masked_videos is not None:
246
+ masked_video = load_video(
247
+ sizes, masked_videos, self.feature_to_wav_idx, self.audio_sampling_rate
248
+ )
249
+
250
+ return Batch(
251
+ audios=audios,
252
+ sizes=sizes,
253
+ descriptions=descriptions,
254
+ audio_pad_mask=audio_pad_mask,
255
+ anchors=anchors,
256
+ masked_video=masked_video,
257
+ hop_length=self.audio_hop_length,
258
+ audio_sampling_rate=self.audio_sampling_rate,
259
+ wav_sizes=wav_sizes,
260
+ )
261
+
262
+
263
+ class SAMAudioJudgeProcessor(Processor):
264
+ config_cls = SAMAudioJudgeConfig
265
+ revision = "sam_audio"
266
+
267
+ def __init__(
268
+ self,
269
+ audio_hop_length: int,
270
+ audio_sampling_rate: int,
271
+ tokenizer: AutoTokenizer,
272
+ ):
273
+ super().__init__(audio_hop_length, audio_sampling_rate)
274
+ self.tokenizer = tokenizer
275
+
276
+ @classmethod
277
+ def from_pretrained(cls, model_name_or_path: str) -> "SAMAudioJudgeProcessor":
278
+ config = cls._get_config(model_name_or_path)
279
+ tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
280
+ return cls(
281
+ audio_hop_length=config.audio_codec.hop_length,
282
+ audio_sampling_rate=config.audio_codec.sample_rate,
283
+ tokenizer=tokenizer,
284
+ )
285
+
286
+ def _reflect_pad(self, wav):
287
+ if wav.ndim == 1:
288
+ wav = wav.unsqueeze(0)
289
+ if wav.size(-1) % self.audio_hop_length == 0:
290
+ return wav
291
+ p1d = (0, self.audio_hop_length - (wav.size(-1) % self.audio_hop_length))
292
+ return torch.nn.functional.pad(wav, p1d, mode="reflect")
293
+
294
+ def _load_audio(self, path: str):
295
+ ad = AudioDecoder(path, sample_rate=self.audio_sampling_rate, num_channels=1)
296
+ return ad.get_all_samples().data
297
+
298
+ def _process_audio(
299
+ self,
300
+ raw_audio,
301
+ sampling_rate: Optional[int] = None,
302
+ ):
303
+ from_file = False
304
+ if isinstance(raw_audio, str):
305
+ raw_audio = [raw_audio]
306
+
307
+ if isinstance(raw_audio, (list, tuple)) and isinstance(raw_audio[0], str):
308
+ loaded = []
309
+ for audio_file in raw_audio:
310
+ loaded.append(self._load_audio(audio_file))
311
+ raw_audio = loaded
312
+ from_file = True
313
+
314
+ if sampling_rate is not None:
315
+ if sampling_rate != self.audio_sampling_rate:
316
+ raise ValueError(
317
+ f"The model corresponding to this feature extractor: {self} was trained using a sampling rate of"
318
+ f" {self.audio_sampling_rate}. Please make sure that the provided audio input was sampled with"
319
+ f" {self.audio_sampling_rate} and not {sampling_rate}."
320
+ )
321
+ elif not from_file:
322
+ logger.warning(
323
+ f"It is strongly recommended to pass the `sampling_rate` argument to `{self.__class__.__name__}()`. "
324
+ "Failing to do so can result in silent errors that might be hard to debug."
325
+ )
326
+
327
+ if isinstance(raw_audio, list):
328
+ raw_audio = [self._reflect_pad(x).T for x in raw_audio]
329
+ else:
330
+ raw_audio = self._reflect_pad(raw_audio).T
331
+
332
+ # verify inputs are valid
333
+ for example in raw_audio:
334
+ if example.ndim > 2:
335
+ raise ValueError(
336
+ f"Expected input shape (channels, num_samples), but got shape ({example.shape})"
337
+ )
338
+
339
+ lengths = torch.tensor([x.size(0) for x in raw_audio])
340
+ input_values = pad_sequence(raw_audio, batch_first=True).transpose(1, 2)
341
+ padding_mask = torch.arange(lengths.max())[None] < lengths[:, None]
342
+
343
+ return BatchFeature(
344
+ {"input_values": input_values, "padding_mask": padding_mask}
345
+ )
346
+
347
+ def __call__(
348
+ self,
349
+ text: Optional[str] = None,
350
+ input_audio: Optional[
351
+ str | list[str] | torch.Tensor | list[torch.Tensor]
352
+ ] = None,
353
+ separated_audio: Optional[
354
+ str | list[str] | torch.Tensor | list[torch.Tensor]
355
+ ] = None,
356
+ sampling_rate: Optional[int] = None,
357
+ **kwargs,
358
+ ):
359
+ batch = BatchFeature()
360
+ if text is not None:
361
+ batch.update(
362
+ self.tokenizer(
363
+ text,
364
+ return_tensors="pt",
365
+ padding="longest",
366
+ max_length=512,
367
+ truncation=True,
368
+ )
369
+ )
370
+
371
+ if input_audio is not None:
372
+ batch.update(self._process_audio(input_audio, sampling_rate))
373
+
374
+ if separated_audio is not None:
375
+ batch["separated_values"] = self._process_audio(
376
+ separated_audio, sampling_rate
377
+ )["input_values"]
378
+
379
+ return batch
380
+
381
+
382
+ __all__ = ["SAMAudioProcessor", "SAMAudioJudgeProcessor", "Batch"]
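A minimal usage sketch for SAMAudioProcessor above. The hop length and sample rate normally come from the model's config.json via from_pretrained; the values used here (hop 960 at 48 kHz) and the anchor timings are assumptions for illustration only.

import torch

from sam_audio.processor import SAMAudioProcessor

# Assumed values; real ones are read from config.json by SAMAudioProcessor.from_pretrained(...).
proc = SAMAudioProcessor(audio_hop_length=960, audio_sampling_rate=48_000)

audios = [torch.randn(1, 48_000 * 2), torch.randn(2, 48_000 * 3)]  # mono 2 s, stereo 3 s, 48 kHz
batch = proc(
    descriptions=["a dog barking", "rain on a window"],
    audios=audios,
    anchors=[[("+", 0.0, 1.0)], [("-", 1.0, 2.0)]],  # (token, start_seconds, end_seconds)
)
print(batch.audios.shape)            # (2, 1, padded_num_samples)
print(batch.anchor_ids)              # per-sample anchor token ids, padded
print(batch.anchor_alignment.shape)  # (2, num_audio_features)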
sam_audio/ranking/__init__.py ADDED
@@ -0,0 +1,30 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
2
+
3
+ from sam_audio.model.config import (
4
+ ClapRankerConfig,
5
+ EnsembleRankerConfig,
6
+ ImageBindRankerConfig,
7
+ JudgeRankerConfig,
8
+ )
9
+ from sam_audio.ranking.clap import ClapRanker
10
+ from sam_audio.ranking.imagebind import ImageBindRanker
11
+ from sam_audio.ranking.judge import JudgeRanker
12
+ from sam_audio.ranking.ranker import EnsembleRanker
13
+
14
+
15
+ def create_ranker(config):
16
+ if isinstance(config, ImageBindRankerConfig):
17
+ return ImageBindRanker(config)
18
+ elif isinstance(config, ClapRankerConfig):
19
+ return ClapRanker(config)
20
+ elif isinstance(config, JudgeRankerConfig):
21
+ return JudgeRanker(config)
22
+ elif isinstance(config, EnsembleRankerConfig):
23
+ ranker_cfgs, weights = zip(*config.rankers.values(), strict=False)
24
+ return EnsembleRanker(
25
+ rankers=[create_ranker(cfg) for cfg in ranker_cfgs],
26
+ weights=weights,
27
+ )
28
+ else:
29
+ assert config is None
30
+ return None
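A note on the EnsembleRankerConfig branch above: config.rankers is assumed to map ranker names to (sub_config, weight) pairs, which the zip(*...) call unpacks into parallel tuples. In isolation, with placeholder strings instead of real config objects:

rankers = {"clap": ("<ClapRankerConfig>", 0.6), "judge": ("<JudgeRankerConfig>", 0.4)}
ranker_cfgs, weights = zip(*rankers.values(), strict=False)
print(ranker_cfgs)  # ('<ClapRankerConfig>', '<JudgeRankerConfig>')
print(weights)      # (0.6, 0.4)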
sam_audio/ranking/clap.py ADDED
@@ -0,0 +1,86 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
2
+
3
+ import torch
4
+ import torchaudio
5
+ from huggingface_hub import hf_hub_download
6
+
7
+ from sam_audio.model.config import ClapRankerConfig
8
+ from sam_audio.ranking.ranker import Ranker
9
+
10
+
11
+ def get_model(checkpoint_file=None, device="cpu"):
12
+ import laion_clap
13
+
14
+ model = laion_clap.CLAP_Module(enable_fusion=False, amodel="HTSAT-tiny").to(device)
15
+
16
+ if checkpoint_file is None:
17
+ checkpoint_file = hf_hub_download(
18
+ repo_id="lukewys/laion_clap", filename="630k-best.pt"
19
+ )
20
+ state_dict = torch.load(checkpoint_file, map_location=device, weights_only=False)[
21
+ "state_dict"
22
+ ]
23
+ if next(iter(state_dict.items()))[0].startswith("module"):
24
+ state_dict = {k[7:]: v for k, v in state_dict.items()}
25
+
26
+ if "text_branch.embeddings.position_ids" in state_dict:
27
+ del state_dict["text_branch.embeddings.position_ids"]
28
+
29
+ model.model.load_state_dict(state_dict)
30
+ return model.eval()
31
+
32
+
33
+ class ClapRanker(Ranker):
34
+ def __init__(self, config: ClapRankerConfig):
35
+ from laion_clap.training import data
36
+
37
+ self.laion_data_module = data
38
+ super().__init__()
39
+ self.config = config
40
+ self.model = get_model(checkpoint_file=config.checkpoint)
41
+
42
+ def _prepare_audio(self, audio, sample_rate):
43
+ audio_features = []
44
+ for candidates in audio:
45
+ if sample_rate != 48_000:
46
+ candidates = torchaudio.functional.resample(
47
+ candidates, sample_rate, 48000
48
+ )
49
+
50
+ quantized = self.laion_data_module.int16_to_float32_torch(
51
+ self.laion_data_module.float32_to_int16_torch(candidates)
52
+ ).float()
53
+ for sample in quantized:
54
+ temp_dict = {}
55
+ temp_dict = self.laion_data_module.get_audio_features(
56
+ temp_dict,
57
+ sample,
58
+ 480000,
59
+ data_truncating=(
60
+ "fusion" if self.model.enable_fusion else "rand_trunc"
61
+ ),
62
+ data_filling="repeatpad",
63
+ audio_cfg=self.model.model_cfg["audio_cfg"],
64
+ require_grad=False,
65
+ )
66
+ audio_features.append(temp_dict)
67
+ return audio_features
68
+
69
+ @torch.inference_mode()
70
+ def forward(
71
+ self,
72
+ extracted_audio: list[torch.Tensor],
73
+ descriptions: list[str],
74
+ sample_rate: int = 48_000,
75
+ **kwargs,
76
+ ):
77
+ audio_embed = self.model.model.get_audio_embedding(
78
+ self._prepare_audio(extracted_audio, sample_rate)
79
+ )
80
+ text_embed = self.model.get_text_embedding(descriptions, use_tensor=True)
81
+ bsz = len(extracted_audio)
82
+ candidates = len(audio_embed) // bsz
83
+ audio_embed = audio_embed.reshape(bsz, candidates, -1)
84
+ text_embed = text_embed.reshape(bsz, -1, 1)
85
+ scores = audio_embed @ text_embed
86
+ return scores.squeeze(-1)
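A shape walk-through of the scoring at the end of ClapRanker.forward: CLAP produces one embedding per candidate clip (flattened across the batch) and one per description, and the reshape plus batched matmul yields one score per (sample, candidate). Toy tensors below, assuming a 512-dimensional CLAP embedding:

import torch

bsz, candidates, dim = 2, 4, 512
audio_embed = torch.randn(bsz * candidates, dim)  # all candidates, flattened
text_embed = torch.randn(bsz, dim)                # one embedding per description

scores = audio_embed.reshape(bsz, candidates, -1) @ text_embed.reshape(bsz, -1, 1)
print(scores.squeeze(-1).shape)  # torch.Size([2, 4]): one score per (sample, candidate)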
sam_audio/ranking/imagebind.py ADDED
@@ -0,0 +1,197 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
2
+
3
+ import math
4
+ from typing import List, Union
5
+
6
+ import torch
7
+ import torchaudio
8
+
9
+ from sam_audio.model.config import ImageBindRankerConfig
10
+ from sam_audio.ranking.ranker import Ranker
11
+
12
+ try:
13
+ from imagebind.data import (
14
+ ConstantClipsPerVideoSampler,
15
+ NormalizeVideo,
16
+ SpatialCrop,
17
+ get_clip_timepoints,
18
+ load_and_transform_video_data,
19
+ pv_transforms,
20
+ transforms,
21
+ waveform2melspec,
22
+ )
23
+ from imagebind.models.imagebind_model import ModalityType, imagebind_huge
24
+
25
+ __imagebind_exists__ = True
26
+ except ImportError:
27
+ __imagebind_exists__ = False
28
+
29
+
30
+ def load_and_transform_audio_data(
31
+ audios: List[Union[str, torch.Tensor]],
32
+ input_sample_rate=None,
33
+ num_mel_bins=128,
34
+ target_length=204,
35
+ sample_rate=16000,
36
+ clip_duration=2,
37
+ clips_per_video=3,
38
+ mean=-4.268,
39
+ std=9.138,
40
+ device=None,
41
+ ):
42
+ if audios is None:
43
+ return None
44
+
45
+ audio_outputs = []
46
+ clip_sampler = ConstantClipsPerVideoSampler(
47
+ clip_duration=clip_duration, clips_per_video=clips_per_video
48
+ )
49
+
50
+ for audio in audios:
51
+ if isinstance(audio, str):
52
+ waveform, input_sample_rate = torchaudio.load(audio)
53
+ else:
54
+ assert torch.is_tensor(audio)
55
+ assert sample_rate is not None
56
+ # Preprocessing needs to be done in full precision
57
+ waveform = audio.float()
58
+ if waveform.ndim == 1:
59
+ waveform = waveform[None]
60
+ if sample_rate != input_sample_rate:
61
+ waveform = torchaudio.functional.resample(
62
+ waveform, orig_freq=input_sample_rate, new_freq=sample_rate
63
+ )
64
+ all_clips_timepoints = get_clip_timepoints(
65
+ clip_sampler, waveform.size(1) / sample_rate
66
+ )
67
+ all_clips = []
68
+ for clip_timepoints in all_clips_timepoints:
69
+ waveform_clip = waveform[
70
+ :,
71
+ int(clip_timepoints[0] * sample_rate) : int(
72
+ clip_timepoints[1] * sample_rate
73
+ ),
74
+ ]
75
+ waveform_melspec = waveform2melspec(
76
+ waveform_clip, sample_rate, num_mel_bins, target_length
77
+ )
78
+ all_clips.append(waveform_melspec)
79
+
80
+ normalize = transforms.Normalize(mean=mean, std=std)
81
+ all_clips = [normalize(ac).to(device) for ac in all_clips]
82
+
83
+ all_clips = torch.stack(all_clips, dim=0)
84
+ audio_outputs.append(all_clips)
85
+
86
+ return torch.stack(audio_outputs, dim=0)
87
+
88
+
89
+ class VideoTransform:
90
+ def __init__(self, clip_duration=2, clips_per_video=5):
91
+ self.clip_duration = clip_duration
92
+ self.clips_per_video = clips_per_video
93
+ self.clip_sampler = ConstantClipsPerVideoSampler(
94
+ clip_duration=clip_duration, clips_per_video=clips_per_video
95
+ )
96
+ self.video_transform = transforms.Compose(
97
+ [
98
+ pv_transforms.ShortSideScale(224),
99
+ NormalizeVideo(
100
+ mean=(0.48145466, 0.4578275, 0.40821073),
101
+ std=(0.26862954, 0.26130258, 0.27577711),
102
+ ),
103
+ ]
104
+ )
105
+ self.spatial_crop = SpatialCrop(224, num_crops=3)
106
+
107
+ def load_video_fast(self, videos, durations, **kwargs):
108
+ result = []
109
+ for video, duration in zip(videos, durations, strict=False):
110
+ nframes = video.size(0)
111
+ fps = video.size(0) / duration
112
+ timepoints = get_clip_timepoints(
113
+ self.clip_sampler,
114
+ duration,
115
+ )
116
+ # Instead of loading 5 2s clips, and then sub-sampling frames, we figure
117
+ # Out the indices of the final clips we want and only decode those.
118
+ all_idxs = []
119
+ for start_time, end_time in timepoints:
120
+ idxs = torch.arange(
121
+ min(int(math.ceil(fps * start_time)), nframes - 1),
122
+ min(int(math.ceil(fps * end_time)), nframes),
123
+ )
124
+ ts = (
125
+ torch.linspace(0, idxs.size(0) - 1, self.clip_duration)
126
+ .clamp(max=idxs.size(0) - 1)
127
+ .long()
128
+ )
129
+ all_idxs.append(idxs[ts])
130
+ all_idxs = torch.cat(all_idxs)
131
+ fast_frames = video[all_idxs].transpose(0, 1)
132
+ result.append(fast_frames.chunk(self.clips_per_video, dim=1))
133
+ return result
134
+
135
+ def transform_video(self, batch, device=None):
136
+ device = device or torch.device("cpu")
137
+ video_outputs = []
138
+ for all_video in batch:
139
+ all_video = [
140
+ self.video_transform(clip.to(device) / 255.0) for clip in all_video
141
+ ]
142
+ all_video = self.spatial_crop(all_video)
143
+ all_video = torch.stack(all_video, dim=0)
144
+ video_outputs.append(all_video)
145
+ return torch.stack(video_outputs, dim=0)
146
+
147
+ def __call__(self, videos, durations, device=None):
148
+ return self.transform_video(
149
+ self.load_video_fast(videos, durations), device=device
150
+ )
151
+
152
+
153
+ class ImageBindRanker(Ranker):
154
+ def __init__(self, cfg: ImageBindRankerConfig):
155
+ super().__init__()
156
+ assert __imagebind_exists__, (
157
+ "Install ImageBind in order to use this ranker: https://github.com/facebookresearch/ImageBind/tree/main"
158
+ )
159
+
160
+ self.model = imagebind_huge(pretrained=cfg.checkpoint is None)
161
+ if cfg.checkpoint is not None:
162
+ self.model.load_state_dict(torch.load(cfg.checkpoint, map_location="cpu"))
163
+ self.model = self.model.eval()
164
+ self.video_transform = VideoTransform()
165
+
166
+ @torch.inference_mode()
167
+ def forward(
168
+ self,
169
+ extracted_audio: list[torch.Tensor],
170
+ videos: list[torch.Tensor | str],
171
+ sample_rate: int = 48_000,
172
+ **kwargs,
173
+ ):
174
+ audio_data = torch.cat(
175
+ [
176
+ load_and_transform_audio_data(x, input_sample_rate=sample_rate)
177
+ for x in extracted_audio
178
+ ],
179
+ dim=0,
180
+ )
181
+ if isinstance(videos[0], str):
182
+ video_data = load_and_transform_video_data(videos)
183
+ else:
184
+ durations = [x.size(-1) / sample_rate for x in extracted_audio]
185
+ video_data = self.video_transform(videos, durations, audio_data.device)
186
+
187
+ inputs = {ModalityType.AUDIO: audio_data, ModalityType.VISION: video_data}
188
+ embs = self.model(inputs)
189
+ audio_embs, video_embs = embs[ModalityType.AUDIO], embs[ModalityType.VISION]
190
+ audio_embs, video_embs = (
191
+ audio_embs / ((audio_embs**2).sum(dim=-1, keepdims=True) ** 0.5),
192
+ video_embs / ((video_embs**2).sum(dim=-1, keepdims=True) ** 0.5),
193
+ )
194
+ bsz = len(extracted_audio)
195
+ candidates = len(audio_embs) // bsz
196
+ scores = audio_embs.view(bsz, candidates, -1) @ video_embs.view(bsz, -1, 1)
197
+ return scores
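The frame-selection trick in VideoTransform.load_video_fast above, shown in isolation: for each (start, end) clip the frame indices are computed up front and only clip_duration frames per clip are gathered, instead of materializing whole 2-second clips and subsampling afterwards. The video length, frame rate, and timepoints below are made-up values (timepoints normally come from get_clip_timepoints):

import math

import torch

nframes, duration, clip_duration = 90, 6.0, 2       # assumed 15 fps, 6 s video
fps = nframes / duration
timepoints = [(0.0, 2.0), (2.0, 4.0), (4.0, 6.0)]   # stand-in for get_clip_timepoints(...)

all_idxs = []
for start_time, end_time in timepoints:
    idxs = torch.arange(
        min(int(math.ceil(fps * start_time)), nframes - 1),
        min(int(math.ceil(fps * end_time)), nframes),
    )
    ts = (
        torch.linspace(0, idxs.size(0) - 1, clip_duration)
        .clamp(max=idxs.size(0) - 1)
        .long()
    )
    all_idxs.append(idxs[ts])
print(torch.cat(all_idxs))  # the only frame indices actually gathered from the decoded video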
sam_audio/ranking/judge.py ADDED
@@ -0,0 +1,42 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
2
+
3
+ import torch
4
+
5
+ from ..model.config import JudgeRankerConfig
6
+ from ..model.judge import SAMAudioJudgeModel
7
+ from ..processor import SAMAudioJudgeProcessor
8
+ from .ranker import Ranker
9
+
10
+
11
+ class JudgeRanker(Ranker):
12
+ def __init__(self, config: JudgeRankerConfig):
13
+ super().__init__()
14
+ self.config = config
15
+ self.model = SAMAudioJudgeModel.from_pretrained(config.checkpoint_or_model_id)
16
+ self.processor = SAMAudioJudgeProcessor.from_pretrained(
17
+ config.checkpoint_or_model_id
18
+ )
19
+
20
+ @torch.inference_mode()
21
+ def forward(
22
+ self,
23
+ input_audio: list[torch.Tensor],
24
+ extracted_audio: list[torch.Tensor],
25
+ descriptions: list[str],
26
+ sample_rate: int = 48_000,
27
+ **kwargs,
28
+ ):
29
+ bsz, ncandidates = len(input_audio), len(input_audio[0])
30
+ input_seqs = [x[None] for candidates in input_audio for x in candidates]
31
+ extracted_seqs = [x[None] for candidates in extracted_audio for x in candidates]
32
+ repeated_descriptions = [x for x in descriptions for _ in range(ncandidates)]
33
+ processed = self.processor(
34
+ text=repeated_descriptions,
35
+ input_audio=input_seqs,
36
+ separated_audio=extracted_seqs,
37
+ return_tensors="pt",
38
+ padding=True,
39
+ sampling_rate=sample_rate,
40
+ )
41
+ res = self.model(**processed.to(input_audio[0].device))
42
+ return res.overall.view(bsz, ncandidates)
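The candidate flattening done by JudgeRanker.forward, in isolation: each sample's candidate separations are flattened into one judge batch with the description repeated per candidate, and the scalar scores are reshaped back to (batch, num_candidates). The random tensor below stands in for the judge model's overall score:

import torch

input_audio = [torch.randn(3, 48_000), torch.randn(3, 48_000)]  # 2 samples x 3 candidates
descriptions = ["a dog barking", "rain on a window"]

bsz, ncandidates = len(input_audio), len(input_audio[0])
input_seqs = [x[None] for candidates in input_audio for x in candidates]
repeated_descriptions = [d for d in descriptions for _ in range(ncandidates)]
assert len(input_seqs) == len(repeated_descriptions) == bsz * ncandidates

overall = torch.randn(bsz * ncandidates)     # stand-in for res.overall from the judge model
print(overall.view(bsz, ncandidates).shape)  # torch.Size([2, 3])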
sam_audio/ranking/ranker.py ADDED
@@ -0,0 +1,36 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
2
+
3
+ from abc import ABCMeta, abstractmethod
4
+ from typing import List
5
+
6
+ import torch
7
+
8
+
9
+ class Ranker(torch.nn.Module, metaclass=ABCMeta):
10
+ @abstractmethod
11
+ def forward(self, audio: list[torch.Tensor], **kwargs) -> torch.Tensor:
12
+ """
13
+ Args:
14
+ audio: (list[torch.Tensor]) where each element in the list corresponds to
15
+ the candidates for the i'th generation (num_candidates, num_frames)
16
+ Returns:
17
+ (torch.Tensor) of shape (batch_size, num_candidates) corresponding to the ranking scores
18
+ """
19
+ pass
20
+
21
+
22
+ class EnsembleRanker(Ranker):
23
+ def __init__(self, rankers: List[Ranker], weights: List[float]):
24
+ super().__init__()
25
+ assert len(rankers) == len(weights)
26
+ self.rankers = torch.nn.ModuleList(rankers)
27
+ self.weights = weights
28
+
29
+ def forward(self, **kwargs) -> torch.Tensor:
30
+ result = None
31
+ for weight, ranker in zip(self.weights, self.rankers, strict=False):
32
+ if result is None:
33
+ result = weight * ranker(**kwargs)
34
+ else:
35
+ result += weight * ranker(**kwargs)
36
+ return result
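A toy check of EnsembleRanker's weighted sum, using two stub rankers that return constant scores; real rankers (CLAP, ImageBind, judge) plug in the same way through the keyword-only forward(**kwargs) interface. ConstantRanker here is purely illustrative:

import torch

from sam_audio.ranking.ranker import EnsembleRanker, Ranker

class ConstantRanker(Ranker):
    """Stub ranker returning the same score for every candidate (testing only)."""

    def __init__(self, value: float):
        super().__init__()
        self.value = value

    def forward(self, extracted_audio, **kwargs):
        num_candidates = extracted_audio[0].size(0)
        return torch.full((len(extracted_audio), num_candidates), self.value)

ensemble = EnsembleRanker(
    rankers=[ConstantRanker(1.0), ConstantRanker(3.0)], weights=[0.25, 0.75]
)
scores = ensemble(extracted_audio=[torch.randn(4, 48_000)])  # 1 sample, 4 candidates
print(scores)  # 0.25 * 1.0 + 0.75 * 3.0 = 2.5 for every candidate, shape (1, 4)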
sam_audio/ranking/sound_activity.py ADDED
@@ -0,0 +1,129 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
2
+
3
+ from io import BytesIO
4
+ from typing import Tuple, Union
5
+
6
+ import torch
7
+ from torchcodec.encoders import AudioEncoder
8
+
9
+ from ..model.config import SoundActivityRankerConfig
10
+ from .ranker import Ranker
11
+
12
+ try:
13
+ import pydub
14
+ except ImportError:
15
+ pydub = None
16
+
17
+
18
+ def get_peak_rms(audio, win_ms=250, hop_ms=100):
19
+ """
20
+ win_ms and hop_ms are in milliseconds
21
+ """
22
+ last_slice_start = len(audio) - win_ms
23
+ slice_starts = range(0, last_slice_start + 1, hop_ms)
24
+ peak_rms = -1
25
+ for i in slice_starts:
26
+ audio_slice = audio[i : i + win_ms]
27
+ peak_rms = max(peak_rms, audio_slice.rms / audio.max_possible_amplitude)
28
+ # Ensure peak_rms is positive
29
+ peak_rms = max(peak_rms, 0)
30
+ return peak_rms
31
+
32
+
33
+ def torch_tensor_to_pydub(wav: torch.Tensor, sample_rate: int):
34
+ bytesio = BytesIO()
35
+ encoder = AudioEncoder(wav, sample_rate=sample_rate)
36
+ encoder.to_file_like(bytesio, format="wav")
37
+ bytesio.seek(0)
38
+ audio = pydub.AudioSegment.from_file(bytesio, format="wav")
39
+ return audio
40
+
41
+
42
+ def detect_nonsilent(
43
+ path: Union[str, Tuple[torch.Tensor, int]], # either a file path or pair wav & sr
44
+ min_sil_ms=250,
45
+ sil_threshold=-40,
46
+ threshold_mode="rel_to_max",
47
+ ):
48
+ TH_MODES = {"abs", "rel_to_max"}
49
+ SAMPLE_RATE = 24_000
50
+ assert threshold_mode in TH_MODES, f"{threshold_mode=} not in {TH_MODES}"
51
+ if isinstance(path, str):
52
+ audio = pydub.AudioSegment.from_file(path)
53
+ else: # tuple of (tensor, sr)
54
+ audio = torch_tensor_to_pydub(path[0], path[1])
55
+ audio = audio.set_frame_rate(SAMPLE_RATE)
56
+ if threshold_mode == "rel_to_max":
57
+ peak_rms = get_peak_rms(audio)
58
+ sil_threshold = sil_threshold + pydub.utils.ratio_to_db(
59
+ peak_rms
60
+ ) # convert to absolute db threshold
61
+ elif threshold_mode == "abs":
62
+ pass
63
+ else:
64
+ raise NotImplementedError(f"Unknown threshold_mode '{threshold_mode}'")
65
+ spans = pydub.silence.detect_nonsilent(
66
+ audio, min_silence_len=min_sil_ms, silence_thresh=sil_threshold, seek_step=10
67
+ )
68
+ spans = [(round(start / 1000, 3), round(end / 1000, 3)) for start, end in spans]
69
+ return spans
70
+
71
+
72
+ def compute_iou_recall_precision(hyp_spans, ref_spans):
73
+ def span_length(span):
74
+ return span[1] - span[0]
75
+
76
+ def intersection_length(span1, span2):
77
+ return max(0, min(span1[1], span2[1]) - max(span1[0], span2[0]))
78
+
79
+ total_hyp_length = sum(span_length(span) for span in hyp_spans)
80
+ total_ref_length = sum(span_length(span) for span in ref_spans)
81
+ total_intersection = 0
82
+ for hyp_span in hyp_spans:
83
+ for ref_span in ref_spans:
84
+ total_intersection += intersection_length(hyp_span, ref_span)
85
+
86
+ union_spans = hyp_spans + ref_spans # Combine both lists to compute union
87
+ union_length = sum(span_length(span) for span in union_spans) - total_intersection
88
+
89
+ iou = total_intersection / union_length if union_length > 0 else 0
90
+ recall = total_intersection / total_ref_length if total_ref_length > 0 else 0
91
+ precision = total_intersection / total_hyp_length if total_hyp_length > 0 else 0
92
+
93
+ return {"iou": iou, "recall": recall, "precision": precision}
94
+
95
+
96
+ class SoundActivityRanker(Ranker):
97
+ def __init__(self, config: SoundActivityRankerConfig):
98
+ if pydub is None:
99
+ raise ImportError(
100
+ 'Install reranking dependencies: `pip install "sam-audio[reranking]"`'
101
+ )
102
+ super().__init__()
103
+ self.config = config
104
+
105
+ @torch.inference_mode()
106
+ def forward(
107
+ self,
108
+ extracted_audio: list[torch.Tensor],
109
+ spans: list[list[list[float]]],
110
+ sample_rate: int = 48_000,
111
+ **kwargs,
112
+ ):
113
+ device = extracted_audio[0].device
114
+ scores = []
115
+ for wav, current_spans in zip(extracted_audio, spans, strict=True):
116
+ wav = wav.to(torch.float32).cpu()
117
+ # get non-silent spans
118
+ hyp_spans = detect_nonsilent(
119
+ (wav, sample_rate),
120
+ sil_threshold=self.config.sil_threshold,
121
+ threshold_mode=self.config.threshold_mode,
122
+ )
123
+ timestamps = [[span[1], span[2]] for span in current_spans]
124
+ result = compute_iou_recall_precision(hyp_spans, timestamps)
125
+ scores.append(result[self.config.metric])
126
+
127
+ # convert to tensor
128
+ scores = torch.tensor(scores, device=device)
129
+ return scores
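A worked example for compute_iou_recall_precision above, with a single hypothesis span overlapping one of two reference spans:

from sam_audio.ranking.sound_activity import compute_iou_recall_precision

hyp = [(0.0, 2.0)]                 # detected non-silent span
ref = [(1.0, 3.0), (5.0, 6.0)]     # reference activity spans
# intersection = 1.0 s, hyp total = 2.0 s, ref total = 3.0 s, union = 2 + 3 - 1 = 4.0 s
print(compute_iou_recall_precision(hyp, ref))
# {'iou': 0.25, 'recall': 0.3333333333333333, 'precision': 0.5}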