niobures commited on
Commit
49f2b3f
·
verified ·
1 Parent(s): db916fe

Audio-Flamingo (code, models, paper)

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +23 -0
  2. Audio Flamingo 2. An Audio-Language Model with Long-Audio Understanding and Expert Reasoning Abilities.pdf +3 -0
  3. Audio Flamingo 3. Advancing Audio Intelligence with Fully Open Large Audio Language Models.pdf +3 -0
  4. Audio Flamingo Sound-CoT Technical Report. Improving Chain-of-Thought Reasoning in Sound Understanding.pdf +3 -0
  5. Audio Flamingo. A Novel Audio Language Model with Few-Shot Learning and Dialogue Abilities.pdf +3 -0
  6. NVIDIA представила модель, которая анализирует звук, речь и музыку.pdf +3 -0
  7. code/Audio-Flamingo-3-Pinokio.zip +3 -0
  8. code/Audio-Flamingo-3.zip +3 -0
  9. code/AudioFlamingo.zip +3 -0
  10. code/audio-flamingo-3-chat-hf.zip +3 -0
  11. code/audio-flamingo-3-hf.zip +3 -0
  12. code/audio-flamingo-audio_flamingo_2.zip +3 -0
  13. code/audio-flamingo-audio_flamingo_3.zip +3 -0
  14. code/audio-flamingo-soundCoT.zip +3 -0
  15. code/audio-flamingo.zip +3 -0
  16. code/audio_flamingo.zip +3 -0
  17. code/cog-nvidia-audio-flamingo-3.zip +3 -0
  18. models/audio-flamingo-1/.gitattributes +2 -0
  19. models/audio-flamingo-1/.gitignore +5 -0
  20. models/audio-flamingo-1/LICENSE +21 -0
  21. models/audio-flamingo-1/README.md +64 -0
  22. models/audio-flamingo-1/assets/AudioFlamingo_ICML2024_poster.pdf +3 -0
  23. models/audio-flamingo-1/assets/audio_flamingo_arch.png +3 -0
  24. models/audio-flamingo-1/audio flamingo model card.md +115 -0
  25. models/audio-flamingo-1/chat/README.md +65 -0
  26. models/audio-flamingo-1/chat/clap_modified_code/CLAPWrapper.py +463 -0
  27. models/audio-flamingo-1/chat/configs/chat.yaml +80 -0
  28. models/audio-flamingo-1/chat/data/README.md +19 -0
  29. models/audio-flamingo-1/chat/data/data.py +481 -0
  30. models/audio-flamingo-1/chat/data/prepare_each_dataset.py +253 -0
  31. models/audio-flamingo-1/chat/src/__init__.py +2 -0
  32. models/audio-flamingo-1/chat/src/factory.py +219 -0
  33. models/audio-flamingo-1/chat/src/flamingo.py +260 -0
  34. models/audio-flamingo-1/chat/src/flamingo_lm.py +177 -0
  35. models/audio-flamingo-1/chat/src/helpers.py +380 -0
  36. models/audio-flamingo-1/chat/src/utils.py +54 -0
  37. models/audio-flamingo-1/chat/train/distributed.py +150 -0
  38. models/audio-flamingo-1/chat/train/train.py +376 -0
  39. models/audio-flamingo-1/chat/train/train_utils.py +351 -0
  40. models/audio-flamingo-1/checkpoints/chat_part1.pt +3 -0
  41. models/audio-flamingo-1/checkpoints/chat_part2.pt +3 -0
  42. models/audio-flamingo-1/checkpoints/chat_part3.pt +3 -0
  43. models/audio-flamingo-1/checkpoints/chat_part4.pt +3 -0
  44. models/audio-flamingo-1/checkpoints/chat_part5.pt +3 -0
  45. models/audio-flamingo-1/checkpoints/checkpoint_utils.py +19 -0
  46. models/audio-flamingo-1/checkpoints/foundation_part1.pt +3 -0
  47. models/audio-flamingo-1/checkpoints/foundation_part2.pt +3 -0
  48. models/audio-flamingo-1/checkpoints/foundation_part3.pt +3 -0
  49. models/audio-flamingo-1/checkpoints/foundation_part4.pt +3 -0
  50. models/audio-flamingo-1/checkpoints/foundation_part5.pt +3 -0
.gitattributes CHANGED
@@ -33,3 +33,26 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ Audio[[:space:]]Flamingo[[:space:]]2.[[:space:]]An[[:space:]]Audio-Language[[:space:]]Model[[:space:]]with[[:space:]]Long-Audio[[:space:]]Understanding[[:space:]]and[[:space:]]Expert[[:space:]]Reasoning[[:space:]]Abilities.pdf filter=lfs diff=lfs merge=lfs -text
37
+ Audio[[:space:]]Flamingo[[:space:]]3.[[:space:]]Advancing[[:space:]]Audio[[:space:]]Intelligence[[:space:]]with[[:space:]]Fully[[:space:]]Open[[:space:]]Large[[:space:]]Audio[[:space:]]Language[[:space:]]Models.pdf filter=lfs diff=lfs merge=lfs -text
38
+ Audio[[:space:]]Flamingo[[:space:]]Sound-CoT[[:space:]]Technical[[:space:]]Report.[[:space:]]Improving[[:space:]]Chain-of-Thought[[:space:]]Reasoning[[:space:]]in[[:space:]]Sound[[:space:]]Understanding.pdf filter=lfs diff=lfs merge=lfs -text
39
+ Audio[[:space:]]Flamingo.[[:space:]]A[[:space:]]Novel[[:space:]]Audio[[:space:]]Language[[:space:]]Model[[:space:]]with[[:space:]]Few-Shot[[:space:]]Learning[[:space:]]and[[:space:]]Dialogue[[:space:]]Abilities.pdf filter=lfs diff=lfs merge=lfs -text
40
+ models/audio-flamingo-1/assets/audio_flamingo_arch.png filter=lfs diff=lfs merge=lfs -text
41
+ models/audio-flamingo-1/assets/AudioFlamingo_ICML2024_poster.pdf filter=lfs diff=lfs merge=lfs -text
42
+ models/audio-flamingo-1/labeling_machine/AF-AudioSet.json filter=lfs diff=lfs merge=lfs -text
43
+ models/audio-flamingo-3-chat/llm/tokenizer.json filter=lfs diff=lfs merge=lfs -text
44
+ models/audio-flamingo-3-chat/static/af3_main_diagram-1.png filter=lfs diff=lfs merge=lfs -text
45
+ models/audio-flamingo-3-chat/static/af3_radial-1.png filter=lfs diff=lfs merge=lfs -text
46
+ models/audio-flamingo-3-chat/static/af3_sota.png filter=lfs diff=lfs merge=lfs -text
47
+ models/audio-flamingo-3-chat/static/logo-no-bg.png filter=lfs diff=lfs merge=lfs -text
48
+ models/audio-flamingo-3-hf/static/af3_main_diagram-1.png filter=lfs diff=lfs merge=lfs -text
49
+ models/audio-flamingo-3-hf/static/af3_radial-1.png filter=lfs diff=lfs merge=lfs -text
50
+ models/audio-flamingo-3-hf/static/af3_sota.png filter=lfs diff=lfs merge=lfs -text
51
+ models/audio-flamingo-3-hf/static/logo-no-bg.png filter=lfs diff=lfs merge=lfs -text
52
+ models/audio-flamingo-3-hf/tokenizer.json filter=lfs diff=lfs merge=lfs -text
53
+ models/audio-flamingo-3/llm/tokenizer.json filter=lfs diff=lfs merge=lfs -text
54
+ models/audio-flamingo-3/static/af3_main_diagram-1.png filter=lfs diff=lfs merge=lfs -text
55
+ models/audio-flamingo-3/static/af3_radial-1.png filter=lfs diff=lfs merge=lfs -text
56
+ models/audio-flamingo-3/static/af3_sota.png filter=lfs diff=lfs merge=lfs -text
57
+ models/audio-flamingo-3/static/logo-no-bg.png filter=lfs diff=lfs merge=lfs -text
58
+ NVIDIA[[:space:]]представила[[:space:]]модель,[[:space:]]которая[[:space:]]анализирует[[:space:]]звук,[[:space:]]речь[[:space:]]и[[:space:]]музыку.pdf filter=lfs diff=lfs merge=lfs -text
Audio Flamingo 2. An Audio-Language Model with Long-Audio Understanding and Expert Reasoning Abilities.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:08d544cf57f324020ee5d9ff916c17d53aced283c09d38be09f9bc020a9ba171
3
+ size 10739247
Audio Flamingo 3. Advancing Audio Intelligence with Fully Open Large Audio Language Models.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f966955628f247ed76c7207b7b86048a1790794cc3b5cea47287ec14417f3508
3
+ size 6985793
Audio Flamingo Sound-CoT Technical Report. Improving Chain-of-Thought Reasoning in Sound Understanding.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:20cf54c8128ca96298a342f066239370a122e42642ae3ded1f67b76cd8f80a4d
3
+ size 585189
Audio Flamingo. A Novel Audio Language Model with Few-Shot Learning and Dialogue Abilities.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:31451a317ebb7d0f4445500134cf5178b63c617d7f6583e0eac4f3a4c3d0000d
3
+ size 1444685
NVIDIA представила модель, которая анализирует звук, речь и музыку.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e96f68efb1fade752565268797ac90417bd142450eb8c8245c89f94994c22d09
3
+ size 2983908
code/Audio-Flamingo-3-Pinokio.zip ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7d740f122a84c86b6e55574c8f6ce7145a8518ccb00a874661810124d5bf1f71
3
+ size 1365689
code/Audio-Flamingo-3.zip ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bb1f6c63f18a25cc0db01c146783c651286fc53bb064d003b11518b62e7f59c2
3
+ size 6721741
code/AudioFlamingo.zip ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:40d2bac296affd39f8f63066c9bc4a2ac0f6ec982c542c3f3c8a961e1ef68ca3
3
+ size 2578624
code/audio-flamingo-3-chat-hf.zip ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:afd4af51c0c1a6e2e3b54323dea6b34872c3221826ea969ff40a6e055e3de0e4
3
+ size 1395827
code/audio-flamingo-3-hf.zip ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3157b02422c02ccf87f00b99d0db9ad6ba78101fe43985461db101f418a4e1b4
3
+ size 1443418
code/audio-flamingo-audio_flamingo_2.zip ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d71ee0ac516346df1cfc497da306b729cbe52c1f88c327a0d32ae36f22111450
3
+ size 5672326
code/audio-flamingo-audio_flamingo_3.zip ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:db2c7f1847f5f2380f58bd78aa93326a1262b98bc2ff179206a26f67d7c2b371
3
+ size 3445237
code/audio-flamingo-soundCoT.zip ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a7e130b11f7d96aca17b9d6feda693d4b9e65ad7b2a35d374451eb24875ac820
3
+ size 12563876
code/audio-flamingo.zip ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d8cc93bf1b574c278112642af6e930485d065b3818e40392fc08b6cbd621f6f1
3
+ size 2484492
code/audio_flamingo.zip ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:971305b4acb2b932be39abe6b376a6d3c52dece06fa6873220631c48a486ba81
3
+ size 11632722
code/cog-nvidia-audio-flamingo-3.zip ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5921a9348dea2c34db7c1542e66461e392dfe92410ad4072001b675ebb87e2eb
3
+ size 17389826
models/audio-flamingo-1/.gitattributes ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ *.pt filter=lfs diff=lfs merge=lfs -text
2
+ AF-AudioSet.json filter=lfs diff=lfs merge=lfs -text
models/audio-flamingo-1/.gitignore ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ __pycache__/
2
+ *.pyc
3
+ .DS_Store
4
+ foundation.pt
5
+ chat.pt
models/audio-flamingo-1/LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2024 NVIDIA CORPORATION.
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
models/audio-flamingo-1/README.md ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # PyTorch Implementation of Audio Flamingo
2
+
3
+ **Zhifeng Kong, Arushi Goel, Rohan Badlani, Wei Ping, Rafael Valle, Bryan Catanzaro**
4
+
5
+ [[Demo website]](https://audioflamingo.github.io/) [[Demo video]](https://www.youtube.com/watch?v=ucttuS28RVE) [[ICML poster]](assets/AudioFlamingo_ICML2024_poster.pdf)
6
+
7
+ This repo contains the PyTorch implementation of [Audio Flamingo: A Novel Audio Language Model with Few-Shot Learning and Dialogue Abilities](https://arxiv.org/abs/2402.01831) (ICML 2024). Audio Flamingo is a novel audio-understanding language model with
8
+ - strong audio understanding abilities,
9
+ - the ability to quickly adapt to unseen tasks via in-context learning and retrieval, and
10
+ - strong multi-turn dialogue abilities.
11
+
12
+ We introduce a series of training techniques, architecture design, and data strategies to enhance our model with these abilities. Extensive evaluations across various audio understanding tasks confirm the efficacy of our method, setting new state-of-the-art benchmarks.
13
+
14
+ ![](assets/audio_flamingo_arch.png)
15
+
16
+ ## Code Structure
17
+
18
+ - The folder ```foundation/``` contains training code for the foundation model.
19
+ - The folder ```chat/``` contains training code for the chat model, which can perform multi-turn dialogues.
20
+ - The folder ```inference/``` contains inference code for both the foundation and chat models.
21
+
22
+ Within each folder, the structure is highly based on the [Open Flamingo](https://github.com/mlfoundations/open_flamingo) repo (commit ```a05dcba```). Each folder is self-contained and we expect no cross dependencies between these folders.
23
+
24
+ ## Preparation
25
+
26
+ - Download source code of Laion-CLAP from their [official repo](https://github.com/LAION-AI/CLAP). Rename the folder to ```my_laion_clap/``` and copy the folder to under each of ```foundation/, chat/, inference/```. Download their pretrained checkpoints to ```YOUR_DATA_ROOT_DIR/audio-flamingo-data/laion-clap-pretrained/laion_clap/```.
27
+ - Download source code of Microsoft-CLAP from their [official repo](https://github.com/microsoft/CLAP). Rename the folder to ```my_ms_clap/``` and copy the folder to under each of ```foundation/, chat/, inference/```. In each of these, replace the ```my_ms_clap/msclap/CLAPWrapper.py``` with ```clap_modified_code/CLAPWrapper.py```, which adds some processing functions and removes some bugs for clapcap. Download their pretrained checkpoints to ```YOUR_DATA_ROOT_DIR/audio-flamingo-data/clap/```.
28
+ - Download raw training and evaluation datasets from their original sources. Refer to ```foundation/data/README.md``` and ```chat/data/README.md``` for specific instructions to prepare data.
29
+
30
+ ## Running the Code
31
+
32
+ We refer to ```foundation/README.md```, ```chat/README.md```, and ```inference/README.md``` for the specific instructions for training the foundation model, training the chat model, and running inference, as they require different setups. We used 8 A100 GPUs to train our models.
33
+
34
+ ## Checkpoints
35
+ - The folder ```checkpoints/``` contains foundation and chat model checkpoints.
36
+ - Each model is about 17GB. Due to ```git lfs``` constraints we split each model into 5 parts. After downloading, go to ```checkpoints/``` and run ```python checkpoint_utils.py``` to merge the parts.
37
+ - Alternatively, the model checkpoints are also on HuggingFace (which is easier to download): [https://huggingface.co/nvidia/audio-flamingo](https://huggingface.co/nvidia/audio-flamingo). One can either ```git clone``` this project or use the ```huggingface_hub.hf_hub_download``` function to download: ```checkpoint_path = hf_hub_download(repo_id="nvidia/audio-flamingo", filename="foundation(or chat).pt")```.
38
+ - If you would like to run inference with these checkpoints, remember to modify the absolute paths in ```inference/configs/*.yaml``` and ```inference/inference_examples.py``` to properly load model checkpoints and data (see ```inference/README.md```).
39
+ - The foundation model is pretrained with ```foundation/configs/foundation_pretrain.yaml``` and then finetuned with ```foundation/configs/foundation_sft_8_shot.yaml```.
40
+ - The chat model is pretrained with ```foundation/configs/foundation_pretrain.yaml```, then finetuned with ```foundation/configs/foundation_sft_4_shot.yaml```, and finally finetuned with ```chat/configs/chat.yaml```.
41
+
42
+
43
+ ## Downstream applications
44
+ - We use Audio Flamingo as a data labeling machine for synthetic captions. See ```labeling_machine/``` for details of the synthetic dataset and license descriptions.
45
+
46
+ ## References
47
+
48
+ The main training and inferencing code within each folder (```foundation/```, ```chat/```, ```inference/```), including ```train/```, ```src/```, ```data/```, and ```configs/```, are modified from [Open Flamingo](https://github.com/mlfoundations/open_flamingo) (commit ```a05dcba```) (MIT license), which borrows from [flamingo-pytorch](https://github.com/lucidrains/flamingo-pytorch) (MIT license), [flamingo-mini](https://github.com/dhansmair/flamingo-mini) (MIT license), and [open_clip](https://github.com/mlfoundations/open_clip) (MIT license). ```src/helpers.py``` also includes self-attention implementations based on [attention-is-all-you-need-pytorch](https://github.com/jadore801120/attention-is-all-you-need-pytorch) (MIT license), which borrows from [OpenNMT-py](https://github.com/OpenNMT/OpenNMT-py) (MIT license). Our code also relies on [LAION-AI/CLAP](https://github.com/LAION-AI/CLAP) (CC0-1.0 license) and [microsoft/CLAP](https://github.com/microsoft/CLAP) (MIT license). In ```chat/data/prepare_each_dataset.py```, the filtering keywords are based on the [LLARK](https://arxiv.org/abs/2310.07160) paper (CC-BY-4.0 license) and the [LTU](https://arxiv.org/abs/2305.10790) paper (CC-BY-4.0 license).
49
+
50
+ ## License
51
+
52
+ - The code in this repo is under MIT license (see ```LICENSE```).
53
+ - The checkpoints in this repo (```checkpoints/*.pt```) are for non-commercial use only. They are subject to the [OPT-IML](https://huggingface.co/facebook/opt-iml-1.3b/blob/main/LICENSE.md) license, the [Terms of Use](https://openai.com/policies/terms-of-use) of the data generated by OpenAI, and the original licenses accompanying each training dataset.
54
+
55
+
56
+ ## Citation
57
+ ```
58
+ @article{kong2024audio,
59
+ title={Audio Flamingo: A Novel Audio Language Model with Few-Shot Learning and Dialogue Abilities},
60
+ author={Kong, Zhifeng and Goel, Arushi and Badlani, Rohan and Ping, Wei and Valle, Rafael and Catanzaro, Bryan},
61
+ journal={arXiv preprint arXiv:2402.01831},
62
+ year={2024}
63
+ }
64
+ ```
models/audio-flamingo-1/assets/AudioFlamingo_ICML2024_poster.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3f63fccc267408123e6119d0293ff81a6dfbe6979293d451ac14a2a3cc9abe98
3
+ size 1170996
models/audio-flamingo-1/assets/audio_flamingo_arch.png ADDED

Git LFS Details

  • SHA256: 12e09cd22361ec76fb00a23da064d6961da4271cc1673046068101d5054db7fc
  • Pointer size: 131 Bytes
  • Size of remote file: 492 kB
models/audio-flamingo-1/audio flamingo model card.md ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Model Overview
2
+
3
+ ## Description:
4
+ Audio Flamingo is a novel audio-understanding language model for
5
+
6
+ - understanding audio,
7
+ - quickly adapting to unseen tasks via in-context learning and retrieval, and
8
+ - understanding and responding to multi-turn dialogues
9
+
10
+ We introduce a series of training techniques, architecture design, and data strategies to enhance our model with these abilities. Extensive evaluations across various audio understanding tasks confirm the efficacy of our method, setting new state-of-the-art benchmarks.
11
+
12
+ <center><img src="https://github.com/NVIDIA/audio-flamingo/raw/main/assets/audio_flamingo_arch.png" width="800"></center>
13
+
14
+ **This model is ready for non-commercial research use only.**
15
+ <br>
16
+
17
+
18
+ ## References(s):
19
+ * [Audio Flamingo: A Novel Audio Language Model with Few-Shot Learning and Dialogue Abilities](https://arxiv.org/abs/2402.01831) <br>
20
+ * [Project Page](https://github.com/NVIDIA/audio-flamingo) <br>
21
+ * [Demo Website](https://audioflamingo.github.io/) <br>
22
+
23
+ ## Model Architecture:
24
+ **Architecture Type:** Transformer <br>
25
+ **Network Architecture:** Audio Flamingo
26
+
27
+ Audio Flamingo is a Flamingo-style architecture with frozen audio feature extractor, trainable transformation layers and xattn-dense layers, and language model layers.
28
+
29
+ ## Input:
30
+ **Input Types:** Audio, Text <br>
31
+ **Input Format:** Wav/MP3/Flac, String <br>
32
+ **Input Parameters:** None <br>
33
+ **Maximum Audio Input Lengths:** 33.25 seconds <br>
34
+ **Maximum Text Input Lengths:** 512 tokens <br>
35
+
36
+ ## Output:
37
+ **Output Type:** Text <br>
38
+ **Output Format:** String <br>
39
+ **Output Parameters:** None <br>
40
+
41
+ ## Software Integration:
42
+ **Runtime Engine(s):** PyTorch
43
+
44
+ **Supported Hardware Microarchitecture Compatibility:**
45
+ * NVIDIA Ampere <br>
46
+ * NVIDIA Hopper <br>
47
+
48
+ ## Preferred/Supported Operating System(s):
49
+ * Linux
50
+
51
+
52
+ ## Model Version(s):
53
+ * v1.0
54
+
55
+ ## Training, Testing, and Evaluation Datasets:
56
+
57
+ ### Training Dataset:
58
+ Audio Flamingo is trained with **publicly available** datasets under various licenses, with the most restricted ones being non-commercial/research-only. The dataset contains diverse audio types including speech, environmental sounds, and music.
59
+
60
+
61
+ * [OpenAQA ](https://github.com/YuanGongND/ltu?tab=readme-ov-file): Data collection method - [Human]; Labeling method - [Synthetic]
62
+ * [Laion630K ](https://github.com/LAION-AI/audio-dataset/blob/main/laion-audio-630k/README.md)
63
+ * [LP-MusicCaps ](https://github.com/seungheondoh/lp-music-caps)
64
+ * [SoundDescs ](https://github.com/akoepke/audio-retrieval-benchmark)
65
+ * [WavCaps](https://github.com/XinhaoMei/WavCaps)
66
+ * [AudioSet ](https://research.google.com/audioset/download.html)
67
+ * [AudioSet Strong Labeled ](https://research.google.com/audioset/download_strong.html)
68
+ * [WavText5K ](https://github.com/microsoft/WavText5K)
69
+ * [MSP-Podcast ](https://ecs.utdallas.edu/research/researchlabs/msp-lab/MSP-Podcast.html)
70
+ * [ClothoAQA ](https://zenodo.org/records/6473207)
71
+ * [Clotho-v2 ](https://github.com/audio-captioning/clotho-dataset/tree/master)
72
+ * [MACS ](https://zenodo.org/records/5114771)
73
+ * [FSD50k ](https://zenodo.org/records/4060432)
74
+ * [CochlScene ](https://github.com/cochlearai/cochlscene)
75
+ * [NonSpeech 7k ](https://zenodo.org/records/6967442)
76
+ * [Chime-home ](https://code.soundsoftware.ac.uk/projects/chime-home-dataset-annotation-and-baseline-evaluation-code)
77
+ * [Sonyc-UST ](https://zenodo.org/records/3966543)
78
+ * [Emov-DB ](https://github.com/numediart/EmoV-DB)
79
+ * [JL-Corpus ](https://github.com/tli725/JL-Corpus)
80
+ * [Tess ](https://www.kaggle.com/datasets/ejlok1/toronto-emotional-speech-set-tess)
81
+ * [OMGEmotion ](https://github.com/knowledgetechnologyuhh/OMGEmotionChallenge)
82
+ * [MELD ](https://github.com/declare-lab/MELD)
83
+ * [MusicAVQA ](https://gewu-lab.github.io/MUSIC-AVQA/)
84
+ * [MusicQA ](https://github.com/shansongliu/MU-LLaMA?tab=readme-ov-file)
85
+ * [MusicCaps ](https://www.kaggle.com/datasets/googleai/musiccaps)
86
+ * [NSynth ](https://magenta.tensorflow.org/datasets/nsynth)
87
+ * [MTG-Jamendo ](https://github.com/MTG/mtg-jamendo-dataset)
88
+ * [MusDB-HQ ](https://zenodo.org/records/3338373)
89
+ * [FMA ](https://github.com/mdeff/fma)
90
+
91
+ For all of these datasets, the data collection method is [human]. For OpenAQA, Laion630k, LP-MusicCaps, WavCaps, MusicQA, the data labeling method is [synthetic]. For the rest, the data labeling method is [human].
92
+
93
+ ### Evaluating Dataset:
94
+ Audio Flamingo is evaluated on the test split of the following datasets.
95
+
96
+ * [ClothoAQA ](https://zenodo.org/records/6473207)
97
+ * [MusicAVQA ](https://gewu-lab.github.io/MUSIC-AVQA/)
98
+ * [Clotho-v2 ](https://github.com/audio-captioning/clotho-dataset/tree/master)
99
+ * [FSD50k ](https://zenodo.org/records/4060432)
100
+ * [CochlScene ](https://github.com/cochlearai/cochlscene)
101
+ * [NonSpeech 7k ](https://zenodo.org/records/6967442)
102
+ * [NSynth ](https://magenta.tensorflow.org/datasets/nsynth)
103
+ * [AudioCaps ](https://github.com/cdjkim/audiocaps)
104
+ * [CREMA-D ](https://github.com/CheyneyComputerScience/CREMA-D)
105
+ * [Ravdess ](https://zenodo.org/records/1188976)
106
+ * [US8K ](https://urbansounddataset.weebly.com/urbansound8k.html)
107
+ * [GTZAN ](https://www.tensorflow.org/datasets/catalog/gtzan)
108
+ * [Medley-solos-DB ](https://zenodo.org/records/3464194)
109
+
110
+ For all of these datasets, the data collection method is [human] and the data labeling method is [human].
111
+
112
+ ## Inference
113
+
114
+ **Engine:** HuggingFace Transformers <br>
115
+ **Test Hardware [Name the specific test hardware model]:** A100 80GB <br>
models/audio-flamingo-1/chat/README.md ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Audio Flamingo Training (Chat Model)
2
+
3
+ ## Get data ready
4
+
5
+ Please read ```data/README.md``` for instructions on data preparation.
6
+
7
+ ## Get paths ready
8
+
9
+ Let ```YOUR_REPO_ROOT_DIR``` be the absolute path to this repo. We use the following structure
10
+
11
+ ```
12
+ YOUR_REPO_ROOT_DIR/
13
+ - foundation/
14
+ - chat/ # you are here
15
+ - inference/
16
+ ```
17
+
18
+ Replace ```YOUR_REPO_ROOT_DIR``` with your absolute path in the following places:
19
+ - ```configs/*.yaml --> clap_config --> config_root```
20
+
21
+
22
+ Let ```YOUR_DATA_ROOT_DIR``` be the absolute path to store all data, checkpoints, etc. We use the following structure
23
+ ```
24
+ YOUR_DATA_ROOT_DIR/
25
+ - datasets/
26
+ - <dataset_name_i>/
27
+ - files: raw data of this dataset, including raw waveforms, metadata, etc.
28
+
29
+ - audio-flamingo-data/
30
+ - dataset_files/
31
+ - <dataset_name_i>-<flamingo_task_i>/
32
+ - files: dataset manifests, precomputed embeddings, etc.
33
+
34
+ - checkpoint/
35
+ - <experiment_name>/ # same as the config file name, and train_config --> run_name in each config
36
+ - tensorboard/
37
+ - checkpoint_xxx.pt
38
+ - other cached files
39
+
40
+ - clap/
41
+ - files: pretrained Microsoft-CLAP checkpoints
42
+
43
+ - laion-clap-pretrained/laion_clap
44
+ - files: pretrained Laion-CLAP checkpoints
45
+
46
+ - LLM_pretrained/.cache/ # place to store HuggingFace cache instead of the default ~/.cache
47
+ ```
48
+
49
+ Replace ```YOUR_DATA_ROOT_DIR``` with your absolute path in the following places:
50
+ - ```configs/*.yaml```
51
+ - ```prepare_each_dataset.py --> __main__```
52
+
53
+ ## Training
54
+
55
+ The following code is tested on 1 node (8 GPUs per node) of A100 (80G) GPUs.
56
+
57
+ Set ```configs/chat.yaml --> sft_config --> pretrained_path``` and ```pretrained_ckpt``` to be the checkpoint of the pretrained model.
58
+ ```
59
+ export NCCL_IB_SL=1
60
+ export CUDA_DEVICE_MAX_CONNECTIONS=1
61
+ cd train/
62
+ torchrun --nproc_per_node 8 train.py -c ../configs/chat.yaml
63
+ ```
64
+
65
+
models/audio-flamingo-1/chat/clap_modified_code/CLAPWrapper.py ADDED
@@ -0,0 +1,463 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2024 NVIDIA CORPORATION.
2
+ # Licensed under the MIT license.
3
+
4
+ # Adapted from https://github.com/microsoft/CLAP under the MIT license.
5
+ # LICENSE is in incl_licenses directory.
6
+
7
+ import warnings
8
+ warnings.filterwarnings("ignore")
9
+ import random
10
+ import torchaudio
11
+ # from torch._six import string_classes
12
+ import collections
13
+ import re
14
+ import numpy as np
15
+ from transformers import AutoTokenizer, logging
16
+ try:
17
+ from models.clap import CLAP
18
+ from models.mapper import get_clapcap
19
+ except:
20
+ from .models.clap import CLAP
21
+ from .models.mapper import get_clapcap
22
+ import math
23
+ import torchaudio.transforms as T
24
+ import os
25
+ import torch
26
+ from importlib_resources import files
27
+ import argparse
28
+ import yaml
29
+ import sys
30
+ logging.set_verbosity_error()
31
+
32
+
33
+ class CLAPWrapper():
34
+ """
35
+ A class for interfacing CLAP model.
36
+ """
37
+
38
def __init__(self, model_fp, config_root, version, use_cuda=False):
    """Wrap a pretrained CLAP (or CLAPCap) model for inference.

    Args:
        model_fp: filesystem path to the pretrained checkpoint.
        config_root: directory holding the ``config_<version>.yml`` files.
        version: model variant; must be one of '2022', '2023', 'clapcap'.
        use_cuda: if True, move the model to GPU when CUDA is available.
    """
    self.supported_versions = ['2022', '2023', 'clapcap']
    # Pattern used to reject string/object numpy arrays during collation.
    self.np_str_obj_array_pattern = re.compile(r'[SaUO]')
    self.file_path = os.path.realpath(__file__)
    self.default_collate_err_msg_format = (
        "default_collate: batch must contain tensors, numpy arrays, numbers, "
        "dicts or lists; found {}")
    self.model_fp = model_fp
    self.config_root = config_root
    self.use_cuda = use_cuda
    self.version = version
    # Resolve the YAML config for this version (raises on unknown versions);
    # requires self.config_root to be set first.
    self.config_as_str = self.get_config_path(version)
    # CLAPCap variants carry a caption decoder head; plain CLAP does not.
    if 'clapcap' in self.version:
        self.clapcap, self.tokenizer, self.args = self.load_clapcap()
    else:
        self.clap, self.tokenizer, self.args = self.load_clap()
55
def get_config_path(self, version):
    """Return the YAML config path for *version*.

    Raises:
        ValueError: if *version* is not one of ``self.supported_versions``.
    """
    # Guard clause: reject unknown versions up front.
    if version not in self.supported_versions:
        raise ValueError(f"The specific version is not supported. The supported versions are {str(self.supported_versions)}")
    return f"{self.config_root}/config_{version}.yml"
60
+
61
def read_config_as_args(self, config_path, args=None, is_config_str=False):
    """Parse a YAML config into an ``argparse.Namespace``.

    Args:
        config_path: path to a YAML file, or the raw YAML text itself when
            ``is_config_str`` is True. May be None to skip loading entirely.
        args: optional existing Namespace; keys from the YAML that match an
            existing attribute overwrite it, unknown keys are reported on
            stderr and ignored.
        is_config_str: treat ``config_path`` as YAML text, not a file path.

    Returns:
        argparse.Namespace holding the merged configuration (the same
        object as *args* when one was supplied).
    """
    return_dict = {}

    if config_path is not None:
        if is_config_str:
            # NOTE(review): FullLoader can construct arbitrary Python objects;
            # only feed it trusted configs.
            yml_config = yaml.load(config_path, Loader=yaml.FullLoader)
        else:
            with open(config_path, "r") as f:
                yml_config = yaml.load(f, Loader=yaml.FullLoader)

        if args is not None:
            # Merge into the provided namespace, warning on unknown keys.
            for k, v in yml_config.items():
                if k in args.__dict__:
                    args.__dict__[k] = v
                else:
                    sys.stderr.write("Ignored unknown parameter {} in yaml.\n".format(k))
        else:
            return_dict = dict(yml_config)

    # BUG FIX: the original ended with `argparse.Namespace(**args)` even when
    # `args` was already a Namespace, which always raises TypeError because a
    # Namespace is not a mapping. Return the updated Namespace directly.
    if args is not None:
        return args
    return argparse.Namespace(**return_dict)
83
+
84
def load_clap(self):
    r"""Instantiate CLAP, load pretrained weights, and build its tokenizer.

    Returns:
        (clap, tokenizer, args): the eval-mode CLAP module, the HuggingFace
        tokenizer for its text branch, and the parsed config namespace.
    """
    args = self.read_config_as_args(self.config_as_str, is_config_str=False)

    # roberta/clip/gpt tokenizers emit no token_type_ids; plain BERT does.
    if 'roberta' in args.text_model or 'clip' in args.text_model or 'gpt' in args.text_model:
        self.token_keys = ['input_ids', 'attention_mask']
    elif 'bert' in args.text_model:
        self.token_keys = ['input_ids', 'token_type_ids', 'attention_mask']

    clap = CLAP(
        audioenc_name=args.audioenc_name,
        sample_rate=args.sampling_rate,
        window_size=args.window_size,
        hop_size=args.hop_size,
        mel_bins=args.mel_bins,
        fmin=args.fmin,
        fmax=args.fmax,
        classes_num=args.num_classes,
        out_emb=args.out_emb,
        text_model=args.text_model,
        transformer_embed_dim=args.transformer_embed_dim,
        d_proj=args.d_proj
    )

    # Checkpoints were saved after unwrapping the DDP model, so the state
    # dict loads directly without stripping any `module.` prefixes.
    # Reference: https://discuss.pytorch.org/t/how-to-load-dataparallel-model-which-trained-using-multiple-gpus/146005
    state_dict = torch.load(self.model_fp, map_location=torch.device('cpu'))['model']
    clap.load_state_dict(state_dict)
    clap.eval()  # inference only

    tokenizer = AutoTokenizer.from_pretrained(args.text_model)
    if 'gpt' in args.text_model:
        # GPT tokenizers ship without a pad token; reuse '!' as padding.
        tokenizer.add_special_tokens({'pad_token': '!'})

    if self.use_cuda and torch.cuda.is_available():
        clap = clap.cuda()

    return clap, tokenizer, args
125
+
126
    def load_clapcap(self):
        r"""Load the CLAPCap captioning model with args from config file.

        Returns:
            (clapcap, tokenizer, args): the captioning model in eval mode, the
            text-decoder tokenizer, and the parsed config namespace.
        """

        args = self.read_config_as_args(self.config_as_str, is_config_str=False)
        args.prefix_dim = args.d_proj
        # Keep the original text encoder for CLAP; args.text_model is repointed
        # to the text decoder so the tokenizer below is the decoder's.
        text_model = args.text_model
        args.text_model = args.text_decoder
        args.cross_attention = True if 'cross' in args.clapcap_model.lower() else False

        # Tokenizer output keys depend on the (decoder) model family.
        if 'roberta' in args.text_model or 'clip' in args.text_model or 'gpt' in args.text_model:
            self.token_keys = ['input_ids', 'attention_mask']
        elif 'bert' in args.text_model:
            self.token_keys = ['input_ids', 'token_type_ids', 'attention_mask']

        clap = CLAP(
            audioenc_name=args.audioenc_name,
            sample_rate=args.sampling_rate,
            window_size=args.window_size,
            hop_size=args.hop_size,
            mel_bins=args.mel_bins,
            fmin=args.fmin,
            fmax=args.fmax,
            classes_num=args.num_classes,
            out_emb=args.out_emb,
            text_model=text_model,
            transformer_embed_dim=args.transformer_embed_dim,
            d_proj=args.d_proj
        )

        clapcap = get_clapcap(args.clapcap_model)(clap, args.text_decoder, args.prefix_length, args.prefix_length_clip, args.prefix_dim,
                                                  args.num_layers, args.normalize_prefix, args.mapping_type, True, True)

        # Checkpoint stores weights under the 'model' key.
        model_state_dict = torch.load(self.model_fp, map_location=torch.device('cpu'))['model']
        clapcap.load_state_dict(model_state_dict)

        clapcap.eval()  # set clap in eval mode
        tokenizer = AutoTokenizer.from_pretrained(args.text_model)
        if 'gpt' in args.text_model:
            # GPT-style tokenizers have no pad token by default; '!' is used as padding.
            tokenizer.add_special_tokens({'pad_token': '!'})

        if self.use_cuda and torch.cuda.is_available():
            clapcap = clapcap.cuda()

        return clapcap, tokenizer, args
171
    def default_collate(self, batch):
        r"""Puts each data field into a tensor with outer dimension batch size.

        Mirrors torch.utils.data's default_collate: recursively stacks tensors,
        numpy arrays and numbers, and descends into mappings, namedtuples and
        sequences. Raises TypeError for unsupported element types.
        """
        elem = batch[0]
        elem_type = type(elem)
        if isinstance(elem, torch.Tensor):
            out = None
            if torch.utils.data.get_worker_info() is not None:
                # If we're in a background process, concatenate directly into a
                # shared memory tensor to avoid an extra copy
                numel = sum([x.numel() for x in batch])
                storage = elem.storage()._new_shared(numel)
                out = elem.new(storage)
            return torch.stack(batch, 0, out=out)
        elif elem_type.__module__ == 'numpy' and elem_type.__name__ != 'str_' \
                and elem_type.__name__ != 'string_':
            if elem_type.__name__ == 'ndarray' or elem_type.__name__ == 'memmap':
                # array of string classes and object dtypes cannot be collated
                if self.np_str_obj_array_pattern.search(elem.dtype.str) is not None:
                    raise TypeError(
                        self.default_collate_err_msg_format.format(elem.dtype))

                return self.default_collate([torch.as_tensor(b) for b in batch])
            elif elem.shape == ():  # scalars
                return torch.as_tensor(batch)
        elif isinstance(elem, float):
            return torch.tensor(batch, dtype=torch.float64)
        elif isinstance(elem, int):
            return torch.tensor(batch)
        # elif isinstance(elem, string_classes):
        #     return batch
        elif isinstance(elem, collections.abc.Mapping):
            # Collate each key across the batch independently.
            return {key: self.default_collate([d[key] for d in batch]) for key in elem}
        elif isinstance(elem, tuple) and hasattr(elem, '_fields'):  # namedtuple
            return elem_type(*(self.default_collate(samples) for samples in zip(*batch)))
        elif isinstance(elem, collections.abc.Sequence):
            # check to make sure that the elements in batch have consistent size
            it = iter(batch)
            elem_size = len(next(it))
            if not all(len(elem) == elem_size for elem in it):
                raise RuntimeError(
                    'each element in list of batch should be of equal size')
            transposed = zip(*batch)
            return [self.default_collate(samples) for samples in transposed]

        raise TypeError(self.default_collate_err_msg_format.format(elem_type))
217
+ def read_audio(self, audio_path, resample=False):
218
+ r"""Loads audio file or array and returns a torch tensor"""
219
+ # Randomly sample a segment of audio_duration from the clip or pad to match duration
220
+ audio_time_series, sample_rate = torchaudio.load(audio_path)
221
+
222
+ resample_rate = self.args.sampling_rate
223
+ if resample:
224
+ resampler = T.Resample(sample_rate, resample_rate)
225
+ audio_time_series = resampler(audio_time_series)
226
+ return audio_time_series, sample_rate
227
+
228
+ def load_audio_into_tensor(self, audio_path, audio_duration, resample=False):
229
+ r"""Loads audio file and returns raw audio."""
230
+ # Randomly sample a segment of audio_duration from the clip or pad to match duration
231
+ audio_time_series, sample_rate = self.read_audio(audio_path, resample=False)
232
+ audio_time_series = audio_time_series.reshape(-1)
233
+
234
+ # audio_time_series is shorter than predefined audio duration,
235
+ # so audio_time_series is extended
236
+ if audio_duration*sample_rate >= audio_time_series.shape[0]:
237
+ repeat_factor = int(np.ceil((audio_duration*sample_rate) /
238
+ audio_time_series.shape[0]))
239
+ # Repeat audio_time_series by repeat_factor to match audio_duration
240
+ audio_time_series = audio_time_series.repeat(repeat_factor)
241
+ # remove excess part of audio_time_series
242
+ audio_time_series = audio_time_series[0:audio_duration*sample_rate]
243
+ else:
244
+ # audio_time_series is longer than predefined audio duration,
245
+ # so audio_time_series is trimmed
246
+ start_index = random.randrange(
247
+ audio_time_series.shape[0] - audio_duration*sample_rate)
248
+ audio_time_series = audio_time_series[start_index:start_index +
249
+ audio_duration*sample_rate]
250
+ return torch.FloatTensor(audio_time_series)
251
+
252
+ # Modified
253
+ def load_audio_clip_into_tensor(self, audio_clip, audio_duration, resample=False):
254
+ r"""Loads audio clip and returns raw audio."""
255
+ # Randomly sample a segment of audio_duration from the clip or pad to match duration
256
+ sample_rate = 44100
257
+ audio_time_series = audio_clip.reshape(-1)
258
+
259
+ # audio_time_series is shorter than predefined audio duration,
260
+ # so audio_time_series is extended
261
+ assert audio_duration * sample_rate >= audio_time_series.shape[0], \
262
+ 'dur * sr = {} should be larger than len = {}'.format(audio_duration * sample_rate, audio_time_series.shape[0])
263
+ repeat_factor = int(np.ceil((audio_duration*sample_rate) /
264
+ audio_time_series.shape[0]))
265
+ # Repeat audio_time_series by repeat_factor to match audio_duration
266
+ audio_time_series = audio_time_series.repeat(repeat_factor)
267
+ # remove excess part of audio_time_series
268
+ audio_time_series = audio_time_series[0:audio_duration*sample_rate]
269
+
270
+ # return torch.FloatTensor(audio_time_series)
271
+ return audio_time_series # already on cuda device
272
+
273
+ def preprocess_audio(self, audio_files, resample):
274
+ r"""Load list of audio files and return raw audio"""
275
+ audio_tensors = []
276
+ for audio_file in audio_files:
277
+ audio_tensor = self.load_audio_into_tensor(
278
+ audio_file, self.args.duration, resample)
279
+ audio_tensor = audio_tensor.reshape(
280
+ 1, -1).cuda() if self.use_cuda and torch.cuda.is_available() else audio_tensor.reshape(1, -1)
281
+ audio_tensors.append(audio_tensor)
282
+ return self.default_collate(audio_tensors)
283
+
284
+ # Modified
285
+ def preprocess_audio_clips(self, audio_clips, resample=False):
286
+ r"""Load list of audio clips and return raw audio"""
287
+ audio_tensors = []
288
+ for audio_clip in audio_clips:
289
+ audio_tensor = self.load_audio_clip_into_tensor(
290
+ audio_clip, self.args.duration, resample=False)
291
+ audio_tensor = audio_tensor.reshape(
292
+ 1, -1).cuda() if self.use_cuda and torch.cuda.is_available() else audio_tensor.reshape(1, -1)
293
+ audio_tensors.append(audio_tensor)
294
+ return self.default_collate(audio_tensors)
295
+
296
+ def preprocess_text(self, text_queries):
297
+ r"""Load list of class labels and return tokenized text"""
298
+ tokenized_texts = []
299
+ for ttext in text_queries:
300
+ if 'gpt' in self.args.text_model:
301
+ ttext = ttext + ' <|endoftext|>'
302
+ tok = self.tokenizer.encode_plus(
303
+ text=ttext, add_special_tokens=True, max_length=self.args.text_len, padding='max_length', return_tensors="pt")
304
+ for key in self.token_keys:
305
+ tok[key] = tok[key].reshape(-1).cuda() if self.use_cuda and torch.cuda.is_available() else tok[key].reshape(-1)
306
+ tokenized_texts.append(tok)
307
+ return self.default_collate(tokenized_texts)
308
+
309
+ def get_text_embeddings(self, class_labels):
310
+ r"""Load list of class labels and return text embeddings"""
311
+ preprocessed_text = self.preprocess_text(class_labels)
312
+ return self._get_text_embeddings(preprocessed_text)
313
+
314
+ def get_audio_embeddings(self, audio_files, resample):
315
+ r"""Load list of audio files and return a audio embeddings"""
316
+ preprocessed_audio = self.preprocess_audio(audio_files, resample)
317
+ return self._get_audio_embeddings(preprocessed_audio)
318
+
319
+ # Modified
320
+ def get_audio_embeddings_from_clips(self, audio_clips, resample=False):
321
+ r"""Load list of audio files and return a audio embeddings"""
322
+ preprocessed_audio = self.preprocess_audio_clips(audio_clips, resample)
323
+ return self._get_audio_embeddings(preprocessed_audio)
324
+
325
+ def _get_text_embeddings(self, preprocessed_text):
326
+ r"""Load preprocessed text and return text embeddings"""
327
+ with torch.no_grad():
328
+ return self.clap.caption_encoder(preprocessed_text)
329
+
330
    # Modified
    def _get_audio_embeddings(self, preprocessed_audio):
        r"""Encode a preprocessed raw-audio batch into audio embeddings (no gradients)."""
        with torch.no_grad():
            # Collated batch is (B, 1, T); drop the singleton middle dimension.
            preprocessed_audio = preprocessed_audio.reshape(
                preprocessed_audio.shape[0], preprocessed_audio.shape[2])
            # Index [0] is the audio embedding; [1] holds output class probabilities.
            if 'clapcap' in self.version:
                return self.clapcap.clap(preprocessed_audio)[0]
            else:
                return self.clap.audio_encoder(preprocessed_audio)[0]
342
+ def _generic_batch_inference(self, func, *args):
343
+ r"""Process audio and/or text per batch"""
344
+ input_tmp = args[0]
345
+ batch_size = args[-1]
346
+ # args[0] has audio_files, args[1] has class_labels
347
+ inputs = [args[0], args[1]] if len(args) == 3 else [args[0]]
348
+ args0_len = len(args[0])
349
+ # compute text_embeddings once for all the audio_files batches
350
+ if len(inputs) == 2:
351
+ text_embeddings = self.get_text_embeddings(args[1])
352
+ inputs = [args[0], args[1], text_embeddings]
353
+ dataset_idx = 0
354
+ for _ in range(math.ceil(args0_len/batch_size)):
355
+ next_batch_idx = dataset_idx + batch_size
356
+ # batch size is bigger than available audio/text items
357
+ if next_batch_idx >= args0_len:
358
+ inputs[0] = input_tmp[dataset_idx:]
359
+ return func(*tuple(inputs))
360
+ else:
361
+ inputs[0] = input_tmp[dataset_idx:next_batch_idx]
362
+ yield func(*tuple(inputs))
363
+ dataset_idx = next_batch_idx
364
+
365
+ def get_audio_embeddings_per_batch(self, audio_files, batch_size):
366
+ r"""Load preprocessed audio and return a audio embeddings per batch"""
367
+ return self._generic_batch_inference(self.get_audio_embeddings, audio_files, batch_size)
368
+
369
+ def get_text_embeddings_per_batch(self, class_labels, batch_size):
370
+ r"""Load preprocessed text and return text embeddings per batch"""
371
+ return self._generic_batch_inference(self.get_text_embeddings, class_labels, batch_size)
372
+
373
+ def compute_similarity(self, audio_embeddings, text_embeddings):
374
+ r"""Compute similarity between text and audio embeddings"""
375
+ audio_embeddings = audio_embeddings/torch.norm(audio_embeddings, dim=-1, keepdim=True)
376
+ text_embeddings = text_embeddings/torch.norm(text_embeddings, dim=-1, keepdim=True)
377
+
378
+ logit_scale = self.clap.logit_scale.exp()
379
+ similarity = logit_scale*text_embeddings @ audio_embeddings.T
380
+ return similarity.T
381
+
382
+ def classify_audio_files_per_batch(self, audio_files, class_labels, batch_size):
383
+ r"""Compute classification probabilities for each audio recording in a batch and each class label"""
384
+ return self._generic_batch_inference(self.classify_audio_files, audio_files, class_labels, batch_size)
385
+
386
    def generate_caption(self, audio_files, resample=True, beam_size: int = 5, entry_length=67, temperature=1.):
        r"""Generate audio captions for each audio recording in a batch.

        Args:
            audio_files: list of audio file paths.
            resample: resample audio to the model rate before encoding.
            beam_size: beam width for decoding.
            entry_length: maximum number of generated tokens per caption.
            temperature: softmax temperature for decoding.

        Returns:
            List of capitalized caption strings, one per input file.
        """
        captions = []
        audio_tensors = self.preprocess_audio(audio_files, resample)

        with torch.no_grad():
            # The CLAP audio embedding serves as the caption prefix.
            prefix = self.clapcap.clap(audio_tensors.squeeze(1))[0]
            if self.args.normalize_prefix:
                prefix = prefix / prefix.norm(2, -1).reshape(-1,1)
            # Project the prefix into the GPT token-embedding space.
            prefix_embed = self.clapcap.clap_project(prefix).view(-1, self.args.prefix_length, self.clapcap.gpt.transformer.wte.weight.shape[1])

            # Decode one caption per input; [0] is the best beam.
            for i in range(len(audio_tensors)):
                gen_caption = self._generate_beam(embed=prefix_embed[i].unsqueeze(0),\
                                                  beam_size=beam_size,\
                                                  entry_length=entry_length,\
                                                  temperature=temperature)[0]
                captions.append(gen_caption.capitalize())
        return captions
405
    def _generate_beam(self, beam_size: int = 5, prompt=None, embed=None,
                       entry_length=67, temperature=1., stop_token: str = ' <|endoftext|>'):
        r"""Generate captions by beam search decoding.

        Either *embed* (a precomputed prefix embedding) or *prompt* (text to
        encode) seeds the generation. Returns the candidate captions sorted by
        average per-token log-probability, best first.
        """
        self.clapcap.eval()
        stop_token_index = self.tokenizer.encode(stop_token)[0]
        tokens = None
        scores = None
        device = next(self.clapcap.parameters()).device
        # Per-beam generated length and "has emitted stop token" flags.
        seq_lengths = torch.ones(beam_size, device=device)
        is_stopped = torch.zeros(beam_size, device=device, dtype=torch.bool)
        with torch.no_grad():
            if embed is not None:
                generated = embed
            else:
                if tokens is None:
                    tokens = torch.tensor(self.tokenizer.encode(prompt))
                    tokens = tokens.unsqueeze(0).to(device)
                    generated = self.clapcap.gpt.transformer.wte(tokens)
            for i in range(entry_length):
                outputs = self.clapcap.gpt(inputs_embeds=generated)
                logits = outputs.logits
                logits = logits[:, -1, :] / (temperature if temperature > 0 else 1.0)
                logits = logits.softmax(-1).log()
                if scores is None:
                    # First step: fan the single prefix out into beam_size beams.
                    scores, next_tokens = logits.topk(beam_size, -1)
                    generated = generated.expand(beam_size, *generated.shape[1:])
                    next_tokens, scores = next_tokens.permute(1, 0), scores.squeeze(0)
                    if tokens is None:
                        tokens = next_tokens
                    else:
                        tokens = tokens.expand(beam_size, *tokens.shape[1:])
                        tokens = torch.cat((tokens, next_tokens), dim=1)
                else:
                    # Finished beams may only extend with token 0 at zero cost.
                    logits[is_stopped] = -float(np.inf)
                    logits[is_stopped, 0] = 0
                    scores_sum = scores[:, None] + logits
                    seq_lengths[~is_stopped] += 1
                    # Rank all (beam, token) candidates by length-normalized score.
                    scores_sum_average = scores_sum / seq_lengths[:, None]
                    scores_sum_average, next_tokens = scores_sum_average.view(-1).topk(beam_size, -1)
                    # Recover which source beam each flattened candidate came from.
                    next_tokens_source = next_tokens // scores_sum.shape[1]
                    seq_lengths = seq_lengths[next_tokens_source]
                    next_tokens = next_tokens % scores_sum.shape[1]
                    next_tokens = next_tokens.unsqueeze(1)
                    tokens = tokens[next_tokens_source]
                    tokens = torch.cat((tokens, next_tokens), dim=1)
                    generated = generated[next_tokens_source]
                    scores = scores_sum_average * seq_lengths
                    is_stopped = is_stopped[next_tokens_source]
                # Append the embedding of the chosen token to every beam.
                next_token_embed = self.clapcap.gpt.transformer.wte(next_tokens.squeeze()).view(generated.shape[0], 1, -1)
                generated = torch.cat((generated, next_token_embed), dim=1)
                is_stopped = is_stopped + next_tokens.eq(stop_token_index).squeeze()
                if is_stopped.all():
                    break
        scores = scores / seq_lengths
        output_list = tokens.cpu().numpy()
        output_texts = [self.tokenizer.decode(output[:int(length)]) for output, length in zip(output_list, seq_lengths)]
        # Best (highest average log-prob) caption first.
        order = scores.argsort(descending=True)
        output_texts = [output_texts[i] for i in order]
        return output_texts
models/audio-flamingo-1/chat/configs/chat.yaml ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ train_config:
2
+ expdir: YOUR_DATA_ROOT_DIR/audio-flamingo-data/checkpoint
3
+ run_name: chat
4
+ delete_previous_checkpoint: false
5
+ batch_size: 4
6
+ gradient_accumulation_steps: 4 # global batchsize = 128
7
+ seed: 42
8
+ learning_rate: 0.00002
9
+ lr_scheduler: constant
10
+ loss_multiplier: 1.0
11
+ warmup_steps: 1875
12
+ weight_decay: 0.1
13
+ precision: fp32
14
+ gradient_checkpointing: False
15
+ num_epochs: 1
16
+ offline: false
17
+ freeze_lm_embeddings: false
18
+ logging_steps: 10
19
+ dist_backend: nccl
20
+ dist_url: env://
21
+ no_set_device_rank: false
22
+ fsdp: true
23
+ fsdp_use_orig_params: false # Passed into the FSDP constructor. Enables param_groups and gradient masking for weight_decay. Does not work with OPT.
24
+ fsdp_sharding_strategy: full # full, hybrid
25
+ horovod: false
26
+
27
+ # Chat SFT hparams
28
+ sft_config:
29
+ pretrained_path: YOUR_DATA_ROOT_DIR/audio-flamingo-data/checkpoint/foundation_sft_4_shot/
30
+ pretrained_ckpt: checkpoint_99.pt
31
+ unfreeze_full_lm: true
32
+
33
+ data_config:
34
+ dataset_blending_global_weight: 1.0
35
+
36
+ dataset_blending_config:
37
+ dialog_AudioSetSL-Dialog/train:
38
+ weight: 1.0
39
+ prefix_prob: 1.0
40
+
41
+ dialog_MusicCaps-Dialog/train:
42
+ weight: 5.0
43
+ prefix_prob: 1.0
44
+
45
+ dataset_file_root: YOUR_DATA_ROOT_DIR/audio-flamingo-data/dataset_files
46
+ data_root: YOUR_DATA_ROOT_DIR/datasets
47
+ dataset_blending_output: dataset_blending.json
48
+ max_tokens: 512
49
+ num_workers: 4
50
+
51
+ clap_config:
52
+ # method: laion-clap
53
+ # audio_embed_dim: 512
54
+ # model_name: 630k-fusion-best
55
+ # checkpoint: YOUR_DATA_ROOT_DIR/audio-flamingo-data/laion-clap-pretrained/laion_clap/630k-fusion-best.pt
56
+
57
+ method: microsoft-clap
58
+ audio_embed_dim: 1024
59
+ config_root: YOUR_REPO_ROOT_DIR/chat/my_ms_clap/src/configs
60
+ model_name: 'clapcap'
61
+ checkpoint: YOUR_DATA_ROOT_DIR/audio-flamingo-data/clap/clapcap_weights_2023.pth
62
+
63
+ window_length: 7.0 # seconds
64
+ window_overlap: 5.25 # seconds
65
+ max_num_window: 16 # total = 33.25 seconds
66
+ max_num_fewshot: 4 # number of fewshot samples
67
+
68
+ model_config:
69
+ cache_dir: YOUR_DATA_ROOT_DIR/audio-flamingo-data/LLM_pretrained/.cache
70
+
71
+ lang_encoder_path: facebook/opt-iml-max-1.3b
72
+ tokenizer_path: facebook/opt-iml-max-1.3b
73
+ cross_attn_every_n_layers: 1
74
+ audio_transformer_kwargs: {
75
+ n_head: 8,
76
+ n_layers: 3,
77
+ d_inner: 2048,
78
+ max_num_media: 128, # must be >= max_num_window * num_fewshot_samples (4)
79
+ max_window_per_audio: 16, # must = max_num_window
80
+ }
models/audio-flamingo-1/chat/data/README.md ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Data Preparation
2
+
3
+ Data preparation and loading is a challenging part in this codebase as complex formats are used. Below are the instructions to prepare dataset manifests.
4
+
5
+ ## Step 1: Download raw datasets
6
+
7
+ Download datasets from their original sources, or prepare your own datasets. For simplicity, in this repo, we assume datasets are stored under ```YOUR_DATA_ROOT_DIR/datasets/<dataset_name>```.
8
+
9
+ ## Step 2: Prepare dialogues
10
+
11
+ Follow the instructions in Appendix B in our paper to generate dialogues from rich metadata and filter for quality.
12
+
13
+ ## Step 3: Prepare raw datasets into manifests
14
+
15
+ - Modify the ```prepare_files()``` function in ```prepare_each_dataset.py``` based on your raw dataset files.
16
+ - For each dataset, this function generates manifests for each split (train/val/test). The manifest is stored under ```YOUR_DATA_ROOT_DIR/audio-flamingo-data/dataset_files/```. The filenames are in the format of ```<dataset_name>-Dialog/train.json```.
17
+ - The ```<dataset_name>``` used in Audio Flamingo can be found in ```configs/*.yaml``` --> data_config --> dataset_blending_config.
18
+ - The structure of manifests can be found within the ```prepare_files()``` function.
19
+ - Usage: ```python prepare_each_dataset.py -d <dataset_name>```.
models/audio-flamingo-1/chat/data/data.py ADDED
@@ -0,0 +1,481 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2024 NVIDIA CORPORATION.
2
+ # Licensed under the MIT license.
3
+
4
+ # Adapted from https://github.com/mlfoundations/open_flamingo under the MIT license.
5
+ # LICENSE is in incl_licenses directory.
6
+
7
+ import functools
8
+ import io
9
+ import json
10
+ import math
11
+ import os
12
+ os.environ["TOKENIZERS_PARALLELISM"] = "false" # disable the tokenizer parallelism warning
13
+ import random
14
+ import re
15
+ import string
16
+ import subprocess
17
+ import sys
18
+ import yaml
19
+
20
+ import numpy as np
21
+
22
+ from collections import defaultdict
23
+ from copy import deepcopy
24
+ from dataclasses import dataclass
25
+ from functools import partial
26
+ from pydub import AudioSegment
27
+ from tqdm import tqdm
28
+
29
+ import torch
30
+ import torchvision
31
+ from torch.utils.data import DataLoader, Dataset, get_worker_info
32
+ from torch.utils.data.distributed import DistributedSampler
33
+
34
+
35
+ from transformers import AutoTokenizer
36
+
37
+ import librosa
38
+ import soundfile as sf
39
+
40
+
41
def int16_to_float32(x):
    """Convert int16 PCM samples to float32 in [-1, 1]."""
    scaled = x / 32767.0
    return scaled.astype(np.float32)
+
45
def float32_to_int16(x):
    """Clip float samples to [-1, 1] and convert to int16 PCM (truncating)."""
    clipped = np.clip(x, a_min=-1., a_max=1.)
    return (clipped * 32767.).astype(np.int16)
class DataCollator:
    """Collate (filename, audio, audio_mask, input_ids, attention_mask) samples.

    Token ids are right-padded with the tokenizer's pad id and attention masks
    with zeros, up to the longest sequence in the batch; audio tensors are
    stacked along a new batch dimension.
    """

    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def __call__(self, batch):
        filenames, audio_clips, audio_embed_mask, input_ids, attention_masks = zip(*batch)

        audio_clips = torch.stack(audio_clips, dim=0)
        audio_embed_mask = torch.stack(audio_embed_mask, dim=0)

        max_length = max(ids.shape[1] for ids in input_ids)

        pad_id = self.tokenizer.pad_token_id
        padded_ids, padded_masks = [], []
        for ids, mask in zip(input_ids, attention_masks):
            deficit = max_length - ids.shape[1]
            if deficit > 0:
                ids = torch.cat([ids, torch.LongTensor([pad_id] * deficit).unsqueeze(0)], dim=1)
                mask = torch.cat([mask, torch.LongTensor([0] * deficit).unsqueeze(0)], dim=1)
            padded_ids.append(ids)
            padded_masks.append(mask)

        return dict(
            filenames=filenames,
            audio_clips=audio_clips,
            audio_embed_mask=audio_embed_mask,
            input_ids=torch.cat(padded_ids, dim=0),
            attention_mask=torch.cat(padded_masks, dim=0).bool(),
        )
89
+ class AudioTextData(torch.utils.data.Dataset):
90
+ def __init__(
91
+ self,
92
+ dataset_file_root: str,
93
+ data_root: str,
94
+ clap_config: dict,
95
+ dataset_blending_global_weight: float,
96
+ dataset_blending_config: dict,
97
+ dataset_blending_output: str,
98
+ tokenizer,
99
+ max_tokens: int,
100
+ split: str = 'train',
101
+ epoch: int = 0,
102
+ force_reblend: bool = False,
103
+ **kwargs
104
+ ):
105
+ self.dataset_file_root = dataset_file_root
106
+ self.data_root = data_root
107
+ self.clap_config = clap_config
108
+ self.dataset_blending_global_weight = dataset_blending_global_weight
109
+ self.dataset_blending_config = dataset_blending_config
110
+
111
+ self.split = split
112
+ self.epoch = epoch
113
+ self.force_reblend = force_reblend
114
+
115
+ assert self.split == 'train'
116
+ self.data = self.blend_dataset(dataset_blending_config, dataset_blending_output)
117
+
118
+ self.tokenizer = tokenizer
119
+ self.tokenizer.padding_side = "right"
120
+ self.max_tokens = max_tokens
121
+
122
+ @staticmethod
123
+ def shuffle_dict_fixed_rand(dic, seed=0):
124
+ print('randomly shuffling key-value pairs')
125
+
126
+ local_random = np.random.default_rng(seed)
127
+ original_keys = list(dic.keys())
128
+ shuffled_keys = deepcopy(original_keys)
129
+ local_random.shuffle(shuffled_keys)
130
+ shuffling_mapping = {x: y for (x, y) in zip(original_keys, shuffled_keys)}
131
+
132
+ shuffled_dic = {}
133
+ for idx in original_keys:
134
+ shuffled_idx = shuffling_mapping[idx]
135
+ shuffled_dic[idx] = dic[shuffled_idx]
136
+ return shuffled_dic
137
+
138
+ @staticmethod
139
+ def is_broken_file(audiopath):
140
+ # write your broken file paths here
141
+ BROKEN_FILES = []
142
+ return audiopath in BROKEN_FILES
143
+
144
+ def _read_dataset_file(self, dataset_file):
145
+ print("reading", dataset_file)
146
+ with open(dataset_file) as f:
147
+ contents = f.read()
148
+ contents = json.loads(contents)
149
+
150
+ assert contents["dataset_path"].startswith(self.data_root)
151
+ rel_path = contents["dataset_path"][len(self.data_root):]
152
+ if rel_path.startswith('/'):
153
+ rel_path = rel_path[1:]
154
+ if contents['split_path'] is not None:
155
+ rel_path = os.path.join(rel_path, contents['split_path'])
156
+
157
+ """
158
+ contents["data"] = {
159
+ "0": {'name': name (xxx.wav), 'dialogue': [
160
+ {"user": question 1, "assistant": answer 1},
161
+ ...
162
+ {"user": question k, "assistant": answer k}
163
+ ]
164
+ },
165
+ "1": {'name': name (xxx.wav), 'dialogue': [
166
+ {"user": question 1, "assistant": answer 1},
167
+ ...
168
+ {"user": question k, "assistant": answer k}
169
+ ]
170
+ },
171
+ ...
172
+ "total_num-1": {'name': name (xxx.wav), 'dialogue': [
173
+ {"user": question 1, "assistant": answer 1},
174
+ ...
175
+ {"user": question k, "assistant": answer k}
176
+ ]
177
+ }
178
+ }
179
+ """
180
+
181
+ for idx in contents["data"]:
182
+ contents["data"][idx]['task'] = contents["flamingo_task"]
183
+ contents["data"][idx]['name'] = os.path.join(
184
+ rel_path, contents["data"][idx]['name']
185
+ )
186
+ return contents
187
+
188
    def blend_dataset(self, dataset_blending_config, dataset_blending_output):
        """Build (or load) the blended sample index across all configured datasets.

        If *dataset_blending_output* already exists and reblending is not
        forced, it is loaded directly. Otherwise each dataset manifest is read,
        deterministically shuffled, sampled over an epoch-dependent window
        proportional to its blending weight, and the result is written back to
        disk (with a '-reblended.json' suffix when force_reblend is set).
        """
        if os.path.exists(dataset_blending_output) and not self.force_reblend:
            print("loading blended dataset file from:", dataset_blending_output)
            with open(dataset_blending_output) as f:
                contents = f.read()
                self_data = json.loads(contents)

        else:
            if not self.force_reblend:
                print("no blended dataset file found; reading all dataset files")
            else:
                print("force reblending dataset at epoch {}; reading all dataset files".format(self.epoch))

            all_data = {}
            for dataset_name in dataset_blending_config:
                dataset_file = os.path.join(self.dataset_file_root, '{}.json'.format(dataset_name))
                contents = self._read_dataset_file(dataset_file)
                # Deterministic shuffle, seeded by the dataset name.
                contents['data'] = self.shuffle_dict_fixed_rand(
                    contents['data'],
                    seed=sum(list(map(ord, dataset_name)))
                )

                # Effective sampling weight = global weight * per-dataset weight.
                weight_global = float(self.dataset_blending_global_weight)
                weight_dataset = float(dataset_blending_config[dataset_name]["weight"])
                weight = weight_global * weight_dataset

                all_data[dataset_name] = {
                    "contents": contents,
                    "weight": weight
                }

            self_data = {
                "dataset_path": self.data_root,
                "split_path": None,
                "total_num": 0,
                "data": {}
            }

            for dataset_name in all_data:
                print('blending {}'.format(dataset_name))

                contents = all_data[dataset_name]["contents"]
                shuffled_contents_data = contents['data']
                weight = all_data[dataset_name]["weight"]
                assert type(weight) == float and weight > 0.0

                # Sample an epoch-dependent window of weight * total_num items,
                # wrapping around (with a fresh shuffle) at each dataset epoch.
                dataset_total_num = contents['total_num']
                start_idx = int(self.epoch * dataset_total_num * weight)
                end_idx = int((self.epoch + 1) * dataset_total_num * weight)

                for idx in range(start_idx, end_idx):
                    if idx > 0 and idx % dataset_total_num == 0:
                        print('force shuffling at new epoch {} for dataset {}'.format(idx // dataset_total_num, dataset_name))
                        shuffled_contents_data = self.shuffle_dict_fixed_rand(
                            contents['data'],
                            seed=sum(list(map(ord, '{}-epoch-{}'.format(dataset_name, idx // dataset_total_num))))
                        )

                    key = str(idx % dataset_total_num)
                    item = shuffled_contents_data[key]

                    # Skip files known to be unreadable.
                    found_broken = False
                    assert type(item['name']) is str
                    audiopath = os.path.join(self.data_root, item['name'])
                    if self.is_broken_file(audiopath):
                        print('cannot read {}'.format(audiopath))
                        found_broken = True

                    if found_broken:
                        continue

                    self_data['data'][self_data['total_num']] = item
                    self_data['total_num'] += 1

            if not self.force_reblend:
                print('writing blended dataset file to:', dataset_blending_output)
                with open(dataset_blending_output, 'w') as json_file:
                    json.dump(self_data, json_file)
            else:
                print('writing reblended dataset file to:', dataset_blending_output.replace('.json', '-reblended.json'))
                with open(dataset_blending_output.replace('.json', '-reblended.json'), 'w') as json_file:
                    json.dump(self_data, json_file)

        return self_data
+ def get_num_windows(self, T, sr):
274
+ clap_config = self.clap_config
275
+ window_length = int(float(clap_config["window_length"]) * sr)
276
+ window_overlap = int(float(clap_config["window_overlap"]) * sr)
277
+ max_num_window = int(clap_config["max_num_window"])
278
+
279
+ num_windows = 1
280
+ if T <= window_length:
281
+ num_windows = 1
282
+ full_length = window_length
283
+ elif T >= (max_num_window * window_length - (max_num_window - 1) * window_overlap):
284
+ num_windows = max_num_window
285
+ full_length = (max_num_window * window_length - (max_num_window - 1) * window_overlap)
286
+ else:
287
+ num_windows = 1 + int(np.ceil((T - window_length) / float(window_length - window_overlap)))
288
+ full_length = num_windows * window_length - (num_windows - 1) * window_overlap
289
+
290
+ return num_windows, full_length
291
+
292
    def load_audio(self, file_path, target_sr=44100, duration=30.0, start=0.0):
        """Load up to `duration` seconds of mono audio starting at `start` seconds.

        mp3 files go through pydub (sample-accurate seeking is unreliable for mp3);
        everything else goes through soundfile. The result is a 1-D float numpy
        array at `target_sr`, normalized into [-1, 1].
        """
        if file_path.endswith('.mp3'):
            audio = AudioSegment.from_file(file_path)
            # pydub slices are in milliseconds.
            if len(audio) > (start + duration) * 1000:
                audio = audio[start * 1000:(start + duration) * 1000]

            if audio.frame_rate != target_sr:
                audio = audio.set_frame_rate(target_sr)

            # Downmix to mono.
            if audio.channels > 1:
                audio = audio.set_channels(1)

            data = np.array(audio.get_array_of_samples())
            # Scale integer PCM to floats in [-1, 1] based on bit depth.
            if audio.sample_width == 2:
                data = data.astype(np.float32) / np.iinfo(np.int16).max
            elif audio.sample_width == 4:
                data = data.astype(np.float32) / np.iinfo(np.int32).max
            else:
                raise ValueError("Unsupported bit depth: {}".format(audio.sample_width))

        else:
            with sf.SoundFile(file_path) as audio:
                original_sr = audio.samplerate
                channels = audio.channels

                # NOTE(review): frames_to_read counts from the seek position, so
                # using (start + duration) * sr reads `start` seconds extra at most
                # when the file is long enough — confirm intended.
                max_frames = int((start + duration) * original_sr)

                audio.seek(int(start * original_sr))
                frames_to_read = min(max_frames, len(audio))
                data = audio.read(frames_to_read)

                # Clamp out-of-range samples by peak normalization.
                if data.max() > 1 or data.min() < -1:
                    data = data / max(abs(data.max()), abs(data.min()))

            if original_sr != target_sr:
                if channels == 1:
                    data = librosa.resample(data.flatten(), orig_sr=original_sr, target_sr=target_sr)
                else:
                    # Multichannel: resample then keep only the first channel.
                    data = librosa.resample(data.T, orig_sr=original_sr, target_sr=target_sr)[0]
            else:
                if channels != 1:
                    data = data.T[0]

        # Final normalization into [-1, 1]; non-negative signals are re-centered.
        if data.min() >= 0:
            data = 2 * data / abs(data.max()) - 1.0
        else:
            data = data / max(abs(data.max()), abs(data.min()))

        assert len(data.shape) == 1, data.shape
        return data
342
+
343
    def compute_sliding_window(self, audio_file, audio_start=0.0):
        """Cut one audio file into overlapping CLAP windows.

        Returns:
            audio_clips: (max_num_window, window_length * sr) float tensor; unused
                trailing windows are zero-filled.
            audio_embed_mask: (max_num_window,) tensor with 1 for real windows.
        """
        if type(audio_start) == str:
            audio_start = float(audio_start)

        clap_config = self.clap_config

        # Sample rate is fixed by the CLAP variant's training setup.
        if clap_config["method"] == 'laion-clap':
            sr = 48000
        elif clap_config["method"] == 'microsoft-clap':
            sr = 44100
        else:
            raise NotImplementedError

        window_length = int(float(clap_config["window_length"]) * sr)
        window_overlap = int(float(clap_config["window_overlap"]) * sr)
        max_num_window = int(clap_config["max_num_window"])
        # Longest duration (seconds) the max window budget can cover.
        duration = max_num_window * (clap_config["window_length"] - clap_config["window_overlap"]) + clap_config["window_overlap"]

        audio_data = self.load_audio(os.path.join(self.data_root, audio_file), sr, duration, audio_start)
        T = len(audio_data)
        num_windows, full_length = self.get_num_windows(T, sr)

        # Zero-pad so the windows tile the signal exactly.
        if full_length > T:
            audio_data = np.append(audio_data, np.zeros(full_length - T))
        audio_data = audio_data.reshape(1, -1)
        # Round-trip through int16 to match CLAP's expected quantization.
        audio_data_tensor = torch.from_numpy(int16_to_float32(float32_to_int16(audio_data))).float()

        audio_clips = []
        audio_embed_mask = torch.zeros(max_num_window)
        for i in range(num_windows):
            start = i * (window_length - window_overlap)
            audio_clips.append(audio_data_tensor[:, start:start+window_length])
            audio_embed_mask[i] = 1

        assert sum(audio_embed_mask) == num_windows

        # Pad with silent windows up to the fixed budget.
        if num_windows < max_num_window:
            for _ in range(max_num_window - num_windows):
                audio_clips.append(torch.zeros_like(audio_clips[-1]))

        audio_clips = torch.cat(audio_clips)  # (max_num_window, window_length * sr) cuda tensor

        return audio_clips, audio_embed_mask
386
+
387
+ def preprocess_string_for_eval(self, x):
388
+ x = x.rstrip().lstrip()
389
+ x = x.lower()
390
+ return x
391
+
392
    def __getitem__(self, i):
        """Build one training sample: fixed-size audio windows plus a tokenized dialogue.

        Returns (name, audio_clips, audio_embed_mask, input_ids, attention_mask).
        """
        # Manifest keys are strings after JSON round-trip, ints before.
        try:
            item = self.data['data'][str(i)]
        except:
            item = self.data['data'][i]

        assert type(item['name']) is str
        audio_files = [os.path.join(self.data_root, item['name'])]
        audio_starts = [0 if 'audio_start' not in item else float(item['audio_start'])]

        audio_clips, audio_embed_mask = [], []
        for audio_file, audio_start in zip(audio_files, audio_starts):
            this_audio_clips, this_audio_embed_mask = self.compute_sliding_window(audio_file, audio_start)
            audio_clips.append(this_audio_clips)
            audio_embed_mask.append(this_audio_embed_mask)

        audio_clips = torch.cat(audio_clips)
        audio_embed_mask = torch.cat(audio_embed_mask)

        # Pad the window dimension to the fixed budget expected by the model
        # (max_num_window windows per audio times max_num_fewshot audios).
        correct_num_windows = int(self.clap_config["max_num_window"]) * int(self.clap_config["max_num_fewshot"])
        if len(audio_clips) < correct_num_windows:
            audio_clips = torch.cat([
                audio_clips,
                torch.zeros(correct_num_windows - len(audio_clips), audio_clips.shape[1])
            ])
            audio_embed_mask = torch.cat([
                audio_embed_mask,
                torch.zeros(correct_num_windows - len(audio_embed_mask))
            ])

        audio_clips.requires_grad = False
        audio_embed_mask.requires_grad = False

        # Flatten the multi-turn dialogue into the Flamingo chat prompt format:
        # <bos><prefix><audio> then "user: ... assistant: <sep>...<|endofchunk|><eos>" per turn.
        assert 'dialogue' in item
        dialogue = item['dialogue']
        prefix = 'The task is dialog. '
        sample = f"{self.tokenizer.bos_token}{prefix}<audio>"
        for each_round in dialogue:
            user_content, assistant_content = each_round['user'], each_round['assistant']
            sample = sample + f"user: {user_content} \nassistant: {self.tokenizer.sep_token}{assistant_content}<|endofchunk|>{self.tokenizer.eos_token}\n"

        text = self.tokenizer(
            sample,
            max_length=self.max_tokens,
            padding="longest",
            truncation="only_first",
            return_tensors="pt"
        )

        return (item['name'], audio_clips, audio_embed_mask, text["input_ids"], text["attention_mask"])
442
+
443
+ def __len__(self):
444
+ return len(list(self.data['data'].keys()))
445
+
446
+
447
@dataclass
class DataInfo:
    """Bundle of a dataset, its dataloader, and (optionally) its distributed sampler."""
    dataset: Dataset
    dataloader: DataLoader
    sampler: DistributedSampler = None

    def set_epoch(self, epoch):
        # Forward the epoch to a DistributedSampler so per-epoch shuffling differs;
        # isinstance(None, ...) is False, so a missing sampler is a no-op.
        if isinstance(self.sampler, DistributedSampler):
            self.sampler.set_epoch(epoch)
456
+
457
+
458
def get_audiotext_dataloader(data_config, clap_config, text_tokenizer, batch_size, split='train', epoch=0, force_reblend=False):
    """Construct the training DataLoader over the blended audio-text dataset.

    Only the 'train' split is supported. Shuffling is delegated to the
    DistributedSampler (the DataLoader's own shuffle stays False, as required
    when a sampler is supplied). Returns a DataInfo bundle.
    """
    assert split == 'train'

    data_collator = DataCollator(text_tokenizer)
    dataloader_shuffle = False

    trainset = AudioTextData(
        **data_config,
        clap_config=clap_config,
        tokenizer=text_tokenizer,
        split=split,
        epoch=epoch,
        force_reblend=force_reblend
    )
    sampler = DistributedSampler(trainset, shuffle=True)
    trainloader = DataLoader(
        trainset,
        sampler=sampler,
        batch_size=batch_size,
        shuffle=dataloader_shuffle,
        collate_fn=data_collator,
        num_workers=data_config["num_workers"]
    )
    return DataInfo(dataset=trainset, dataloader=trainloader, sampler=sampler)
models/audio-flamingo-1/chat/data/prepare_each_dataset.py ADDED
@@ -0,0 +1,253 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2024 NVIDIA CORPORATION.
2
+ # Licensed under the MIT license.
3
+
4
+ import os
5
+ import json
6
+ import csv
7
+ import yaml
8
+ from collections import defaultdict
9
+ import pickle
10
+ import glob
11
+ import math
12
+ from functools import partial
13
+ import sys
14
+ import io
15
+ import warnings
16
+ import random
17
+
18
+ import numpy as np
19
+ import torch
20
+
21
+ import librosa
22
+ from pydub import AudioSegment
23
+ import soundfile as sf
24
+
25
+ import faiss
26
+
27
+ import multiprocessing
28
+ multiprocessing.set_start_method('spawn', force=True)
29
+
30
try:
    from tqdm import tqdm
except ImportError:
    # tqdm is optional: fall back to a no-op progress wrapper.
    # (Was a bare `except:`, which would also have hidden unrelated errors.)
    def tqdm(iterable):
        return iterable
34
+
35
+
36
def filter_file(file_path, file_list, filename):
    """Return True when `filename` should be skipped.

    A file is skipped when it is absent (from `file_list` if given, otherwise
    from disk under `file_path`) or smaller than 16000 bytes (roughly under
    0.5-1 second of audio).
    """
    full_path = os.path.join(file_path, filename)

    if file_list is not None:
        if filename not in file_list:
            print(filename, 'not exist')
            return True
    elif not os.path.exists(full_path):
        print(filename, 'not exist')
        return True

    if os.path.getsize(full_path) < 16000:
        print(filename, 'less than 0.5 to 1 second')
        return True

    return False
51
+
52
+
53
def filter_response(response):
    """Return True when a generated assistant response contains any blacklisted phrase.

    Phrase lists follow LLARK and LTU data-cleaning heuristics plus a few of our
    own; matching is case-insensitive substring containment. Several entries
    contain fused words ("theprovided") because the source captions were
    extracted from PDFs.
    """
    filter_phrases_LLARK = [
        'metadata', 'is not provided', 'based on theprovided metadata',
        'based on the providedbeat', 'based on the provided chord',
        'basedon the provided information', 'based on theprovided annotations',
        'no specific mood,there is no mention of',
        'there is no specificmention of any', 'as an ai assistant',
        'iam unable to', 'as an ai assistant', 'i donot',
        'it is difficult to determine', 'it isnot possible to determine',
        'no informationis available about the album', 'cannotdetermine',
        'violin 1', 'violin 2', 'violin 3,viola 1', 'viola 2', 'viola 3', 'pack'
    ]

    filter_phrases_LTU = [
        'cannot determine', 'not provided', 'cannot be determined', 'sorry', 'i cannot',
        'without more information', 'enough information',
        'not possible', 'more context', 'enough', 'impossible', 'cannot be determined',
        'without additional information',
        'unclear', 'cannot', 'not clear', 'do not provide sufficient', 'does not provide',
        'difficult to determine', 'no information provided',
        "can't infer", "difficult to infer", "not specified", "no specific", "no information",
        "without additional", 'it is difficult to',
        "no indication"
    ]

    filter_phrases_ours = ["doesn't provide", "doesn't specify", "doesn't indicate", "based on"]

    lowered = response.lower()
    return any(
        phrase in lowered
        for phrase in filter_phrases_LLARK + filter_phrases_LTU + filter_phrases_ours
    )
84
+
85
+
86
+ # !!!Important!!! please write your own code to create dataset manifests based on your stored datasets
87
+ # The list of dataset_name and flamingo_task can be found in configs/*.yaml --> data_config --> dataset_blending_config
88
def prepare_files(dataset_name, dataset_path, split, flamingo_task, output_file):
    """Build a dialogue-dataset manifest JSON for one dataset.

    The output manifest has the shape::

        {
            "dataset_path": YOUR_DATA_ROOT_DIR/datasets/dataset_name/,
            "split": "train" or "test",
            "split_path": "./",
            "flamingo_task": "<dataset_name>-Dialog",
            "total_num": <number of samples>,
            "data": {
                "0": {"name": "xxx.wav",
                      "dialogue": [{"user": ..., "assistant": ...}, ...]},
                ...
            }
        }

    os.path.join(dataset_path, split_path, name) is the absolute path to each
    audio file. Audio is not restricted to wav, but mp3 is not recommended due
    to a different seeking mechanism.

    Raises AssertionError when `output_file` already exists or the expected
    dataset directory/split is missing.
    """
    assert not os.path.exists(output_file)
    dataset_dic = {
        "dataset_path": dataset_path,
        "split": split,
        "split_path": None,
        "flamingo_task": "{}-{}".format(dataset_name, flamingo_task),
        "total_num": 0,
        "data": {}
    }

    # The two supported datasets differ only in their source JSON filename;
    # the previous implementation duplicated the whole branch body verbatim.
    source_json = {
        'dialog_AudioSetSL': 'dialogues_audioset_thresholded.json',
        'dialog_MusicCaps': 'dialogues_musiccaps_thresholded.json',
    }

    if dataset_name in source_json:
        assert flamingo_task == "Dialog"
        assert split == 'train'
        split_path = './'
        file_path = os.path.join(dataset_path, split_path)
        assert os.path.exists(file_path), '{} not exist'.format(file_path)

        dataset_dic["split_path"] = split_path
        file_list = None

        with open(os.path.join(dataset_path, source_json[dataset_name])) as f:
            data_list = json.load(f)

        for data in tqdm(data_list):
            filename = data["audio_id"]
            # Skip missing/too-short audio files.
            if filter_file(file_path, file_list, filename):
                continue

            dialogue = data['dialogue']

            # Discard the whole dialogue if any assistant turn hits the blacklist.
            if any(filter_response(each_round['assistant']) for each_round in dialogue):
                continue

            dataset_dic["data"][dataset_dic["total_num"]] = {
                "name": filename,
                "dialogue": dialogue
            }
            dataset_dic["total_num"] += 1

    with open(output_file, 'w') as json_file:
        json.dump(dataset_dic, json_file)
218
+
219
+
220
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--dataset_name', type=str, help='dataset name')
    parser.add_argument('-f', '--flamingo_task', type=str, default='Dialog', help='flamingo task')
    args = parser.parse_args()

    # NOTE: the original `global DATA_ROOT_DIR` at module level was a no-op.
    DATA_ROOT_DIR = "YOUR_DATA_ROOT_DIR"
    dataset_root = os.path.join(DATA_ROOT_DIR, "datasets")
    output_root = os.path.join(DATA_ROOT_DIR, "audio-flamingo-data/dataset_files")
    os.makedirs(output_root, exist_ok=True)

    dataset_name = args.dataset_name  # dialog_AudioSetSL, dialog_MusicCaps
    flamingo_task = args.flamingo_task  # Dialog

    split = 'train'
    dataset_path = os.path.join(dataset_root, dataset_name)

    output_folder = '{}-{}'.format(dataset_name, flamingo_task)
    os.makedirs(os.path.join(output_root, output_folder), exist_ok=True)

    dataset_file = os.path.join(output_root, output_folder, '{}.json'.format(split))
    if not os.path.exists(dataset_file):
        try:
            prepare_files(dataset_name, dataset_path, split, flamingo_task, dataset_file)
        except AssertionError as e:
            # Bug fix: the original had `continue` here, outside any loop,
            # which is a SyntaxError. Logging the failure is sufficient.
            print('split {} not exist for {}: {}'.format(split, dataset_name, e))
    else:
        print('{} exists; exiting'.format(dataset_file))
252
+
253
+
models/audio-flamingo-1/chat/src/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ # Copyright (c) 2024 NVIDIA CORPORATION.
2
+ # Licensed under the MIT license.
models/audio-flamingo-1/chat/src/factory.py ADDED
@@ -0,0 +1,219 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2024 NVIDIA CORPORATION.
2
+ # Licensed under the MIT license.
3
+
4
+ # Adapted from https://github.com/mlfoundations/open_flamingo under the MIT license.
5
+ # LICENSE is in incl_licenses directory.
6
+
7
+ import sys
8
+ sys.path.append('../')
9
+
10
+ from typing import Optional
11
+ from copy import deepcopy
12
+
13
+ from transformers import AutoModelForCausalLM, AutoTokenizer
14
+ from my_laion_clap.CLAP.src.laion_clap.hook import CLAP_Module
15
+ from my_ms_clap.src.CLAPWrapper import CLAPWrapper
16
+
17
+ import torch
18
+ from torch import nn
19
+
20
+ try:
21
+ from .flamingo import Flamingo
22
+ from .flamingo_lm import FlamingoLMMixin
23
+ from .utils import extend_instance
24
+ except:
25
+ from flamingo import Flamingo
26
+ from flamingo_lm import FlamingoLMMixin
27
+ from utils import extend_instance
28
+
29
+
30
+ class CLAP(nn.Module):
31
+ def __init__(self, clap_config):
32
+ super(CLAP, self).__init__()
33
+ self.method = clap_config["method"]
34
+ device_id = f'cuda:{torch.cuda.current_device()}'
35
+
36
+ if ('finetune' in clap_config) and clap_config['finetune']:
37
+ self.finetune = True
38
+ print('Finetuning CLAP encoder as well!')
39
+ else:
40
+ self.finetune = False
41
+
42
+ if self.method == 'laion-clap':
43
+ # https://github.com/LAION-AI/CLAP
44
+ if clap_config["model_name"] in ['630k-audioset-best', '630k-best', '630k-audioset-fusion-best', '630k-fusion-best']:
45
+ amodel = 'HTSAT-tiny'
46
+ elif clap_config["model_name"] in ['music_speech_audioset_epoch_15_esc_89.98']:
47
+ amodel = 'HTSAT-base'
48
+ else:
49
+ raise NotImplementedError
50
+
51
+ enable_fusion = 'fusion' in clap_config["model_name"].lower()
52
+ self.laion_clap = CLAP_Module(enable_fusion=enable_fusion, amodel=amodel, device=device_id)
53
+ self.laion_clap.load_ckpt(ckpt=clap_config["checkpoint"])
54
+
55
+
56
+ for param in self.laion_clap.parameters():
57
+ param.requires_grad = self.finetune
58
+
59
+ if self.finetune:
60
+ self.laion_clap.train()
61
+ else:
62
+ self.laion_clap.eval()
63
+
64
+ print('loaded laion-clap model: {}'.format(clap_config["checkpoint"]))
65
+
66
+ elif self.method == 'microsoft-clap':
67
+ # https://github.com/microsoft/CLAP
68
+ self.ms_clap = CLAPWrapper(
69
+ clap_config["checkpoint"],
70
+ config_root=clap_config["config_root"],
71
+ version=clap_config['model_name'],
72
+ use_cuda=True
73
+ )
74
+
75
+ if clap_config['model_name'] in ['2022', '2023']:
76
+ for param in self.ms_clap.clap.parameters():
77
+ param.requires_grad = self.finetune
78
+ if self.finetune:
79
+ self.ms_clap.clap.train()
80
+ else:
81
+ self.ms_clap.clap.eval()
82
+ else:
83
+ for param in self.ms_clap.clapcap.parameters():
84
+ param.requires_grad = self.finetune
85
+ if self.finetune:
86
+ self.ms_clap.clapcap.train()
87
+ else:
88
+ self.ms_clap.clapcap.eval()
89
+
90
+ print('loaded microsoft-clap model: {}'.format(clap_config["checkpoint"]))
91
+
92
+ else:
93
+ raise NotImplementedError
94
+
95
+ def forward(self, audio_clips):
96
+
97
+ if len(audio_clips.shape) == 2:
98
+ audio_clips = audio_clips.unsqueeze(0)
99
+ assert len(audio_clips.shape) == 3
100
+
101
+ audio_embeds = []
102
+ for x in audio_clips:
103
+ if self.method == 'laion-clap':
104
+ audio_embed = self.laion_clap.get_audio_embedding_from_data(x=x, use_tensor=True)
105
+ elif self.method == 'microsoft-clap':
106
+ audio_embed = self.ms_clap.get_audio_embeddings_from_clips(x)
107
+
108
+ audio_embeds.append(audio_embed)
109
+
110
+ audio_embeds = torch.stack(audio_embeds, dim=0)
111
+ audio_embeds.requires_grad = self.finetune
112
+
113
+ return audio_embeds
114
+
115
+
116
+ def create_model_and_transforms(
117
+ clap_config: dict,
118
+ lang_encoder_path: str,
119
+ tokenizer_path: str,
120
+ audio_transformer_kwargs: dict,
121
+ cross_attn_every_n_layers: int = 1,
122
+ use_local_files: bool = False,
123
+ decoder_layers_attr_name: str = None,
124
+ freeze_lm_embeddings: bool = False,
125
+ unfreeze_full_lm: bool = False,
126
+ cache_dir: Optional[str] = None,
127
+ **flamingo_kwargs,
128
+ ):
129
+ clap = CLAP(clap_config)
130
+
131
+ text_tokenizer = AutoTokenizer.from_pretrained(
132
+ tokenizer_path,
133
+ local_files_only=use_local_files,
134
+ trust_remote_code=True,
135
+ cache_dir=cache_dir,
136
+ )
137
+ text_tokenizer.add_special_tokens(
138
+ {"additional_special_tokens": ["<audio>", "<|endofchunk|>"]}
139
+ )
140
+ if text_tokenizer.pad_token is None:
141
+ text_tokenizer.add_special_tokens({"pad_token": "<PAD>"})
142
+ if text_tokenizer.sep_token is None:
143
+ text_tokenizer.add_special_tokens({"sep_token": "<SEP>"})
144
+
145
+ lang_encoder = AutoModelForCausalLM.from_pretrained(
146
+ lang_encoder_path,
147
+ local_files_only=use_local_files,
148
+ trust_remote_code=True,
149
+ cache_dir=cache_dir,
150
+ )
151
+
152
+ extend_instance(lang_encoder, FlamingoLMMixin)
153
+
154
+ if decoder_layers_attr_name is None:
155
+ decoder_layers_attr_name = _infer_decoder_layers_attr_name(lang_encoder)
156
+ lang_encoder.set_decoder_layers_attr_name(decoder_layers_attr_name)
157
+ lang_encoder.resize_token_embeddings(len(text_tokenizer))
158
+
159
+ if ('finetune' in clap_config) and clap_config['finetune']:
160
+ unfreeze_clap = True
161
+ else:
162
+ unfreeze_clap = False
163
+
164
+ model = Flamingo(
165
+ clap,
166
+ unfreeze_clap,
167
+ lang_encoder,
168
+ text_tokenizer.encode("<|endofchunk|>")[-1],
169
+ text_tokenizer.encode("<audio>")[-1],
170
+ text_tokenizer.sep_token_id,
171
+ audio_embed_dim=clap_config["audio_embed_dim"],
172
+ audio_transformer_kwargs=audio_transformer_kwargs,
173
+ cross_attn_every_n_layers=cross_attn_every_n_layers,
174
+ **flamingo_kwargs,
175
+ )
176
+
177
+ model.requires_grad_(False)
178
+ assert sum(p.numel() for p in model.parameters() if p.requires_grad) == 0
179
+
180
+ model.audio_transformer.requires_grad_(True)
181
+ model.lang_encoder.gated_cross_attn_layers.requires_grad_(True)
182
+ if not freeze_lm_embeddings:
183
+ model.lang_encoder.get_input_embeddings().requires_grad_(True)
184
+
185
+ if unfreeze_full_lm:
186
+ model.lang_encoder.requires_grad_(True)
187
+
188
+ if unfreeze_clap:
189
+ model.clap.requires_grad_(True)
190
+
191
+ print("Flamingo model initialized with {:,} trainable parameters (audio transformer has {:,}, LM has {:,})".format(
192
+ sum(p.numel() for p in model.parameters() if p.requires_grad),
193
+ sum(p.numel() for p in model.audio_transformer.parameters() if p.requires_grad),
194
+ sum(p.numel() for p in model.lang_encoder.parameters() if p.requires_grad)
195
+ ))
196
+
197
+ return model, text_tokenizer
198
+
199
+
200
+ def _infer_decoder_layers_attr_name(model):
201
+ for k in __KNOWN_DECODER_LAYERS_ATTR_NAMES:
202
+ if k.lower() in model.__class__.__name__.lower():
203
+ return __KNOWN_DECODER_LAYERS_ATTR_NAMES[k]
204
+
205
+ raise ValueError(
206
+ f"We require the attribute name for the nn.ModuleList in the decoder storing the transformer block layers. Please supply this string manually."
207
+ )
208
+
209
+
210
+ __KNOWN_DECODER_LAYERS_ATTR_NAMES = {
211
+ "opt": "model.decoder.layers",
212
+ "gptj": "transformer.h",
213
+ "gpt-j": "transformer.h",
214
+ "pythia": "gpt_neox.layers",
215
+ "llama": "model.layers",
216
+ "gptneoxforcausallm": "gpt_neox.layers",
217
+ "mpt": "transformer.blocks",
218
+ "mosaicgpt": "transformer.blocks",
219
+ }
models/audio-flamingo-1/chat/src/flamingo.py ADDED
@@ -0,0 +1,260 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2024 NVIDIA CORPORATION.
2
+ # Licensed under the MIT license.
3
+
4
+ # Adapted from https://github.com/mlfoundations/open_flamingo under the MIT license.
5
+ # LICENSE is in incl_licenses directory.
6
+
7
+ import torch
8
+ from einops import rearrange
9
+ from torch import nn
10
+
11
+ from torch.distributed.fsdp.wrap import (
12
+ enable_wrap,
13
+ wrap,
14
+ )
15
+ from transformers.modeling_outputs import CausalLMOutputWithPast
16
+ from torch.distributed.fsdp import (
17
+ FullyShardedDataParallel as FSDP,
18
+ )
19
+
20
+ try:
21
+ from .helpers import TransformerEncoder
22
+ from .utils import apply_with_stopping_condition
23
+ except:
24
+ from helpers import TransformerEncoder
25
+ from utils import apply_with_stopping_condition
26
+
27
+
28
class Flamingo(nn.Module):
    """Audio Flamingo: a CLAP audio encoder bridged into a causal LM.

    Audio windows are embedded by CLAP, contextualized by a small transformer,
    and injected into the LM through gated cross-attention layers placed every
    `cross_attn_every_n_layers` decoder blocks.
    """

    def __init__(
        self,
        clap: nn.Module,
        unfreeze_clap: bool,
        lang_encoder: nn.Module,
        eoc_token_id: int,
        media_token_id: int,
        sep_token_id: int,
        audio_embed_dim: int,
        audio_transformer_kwargs: dict,
        cross_attn_every_n_layers: int = 1,
        gradient_checkpointing: bool = False,
    ):
        super().__init__()

        self.eoc_token_id = eoc_token_id
        self.media_token_id = media_token_id
        self.sep_token_id = sep_token_id
        self.audio_embed_dim = audio_embed_dim
        self.clap = clap # .to(torch.cuda.current_device())
        self.unfreeze_clap = unfreeze_clap
        self.clap.requires_grad_(unfreeze_clap)

        if hasattr(lang_encoder.config, "d_model"):
            self.lang_dim = lang_encoder.config.d_model # mpt uses d_model
        else:
            self.lang_dim = lang_encoder.config.hidden_size

        n_head = audio_transformer_kwargs["n_head"]
        n_layers = audio_transformer_kwargs["n_layers"]
        d_inner = audio_transformer_kwargs["d_inner"]
        max_num_media = audio_transformer_kwargs["max_num_media"]
        max_window_per_audio = audio_transformer_kwargs["max_window_per_audio"]
        assert audio_embed_dim % n_head == 0

        # Small transformer that contextualizes the per-window CLAP embeddings.
        self.audio_transformer = TransformerEncoder(
            d_word_vec=audio_embed_dim,
            n_layers=n_layers,
            n_head=n_head,
            d_k=audio_embed_dim // n_head,
            d_v=audio_embed_dim // n_head,
            d_model=audio_embed_dim,
            d_inner=d_inner,
            dropout=0.0,
            n_position=max_num_media,
            scale_emb=True
        )

        self.lang_encoder = lang_encoder
        self.lang_encoder.init_flamingo(
            media_token_id=media_token_id,
            lang_hidden_size=self.lang_dim,
            audio_hidden_size=self.audio_embed_dim,
            max_window_per_audio=max_window_per_audio,
            cross_attn_every_n_layers=cross_attn_every_n_layers,
            gradient_checkpointing=gradient_checkpointing,
        )

        self._use_gradient_checkpointing = gradient_checkpointing
        self.audio_transformer._use_gradient_checkpointing = gradient_checkpointing
        self.clap._use_gradient_checkpointing = gradient_checkpointing

    def forward(
        self,
        audio_x: torch.Tensor,
        audio_x_mask: torch.Tensor,
        lang_x: torch.Tensor,
        attention_mask: torch.Tensor = None,
        labels: torch.Tensor = None,
        clear_conditioned_layers: bool = True,
        past_key_values=None,
        use_cache: bool = False,
    ):
        """Condition the LM on audio (unless pre-cached) and run it on `lang_x`.

        Returns the LM's CausalLMOutputWithPast. When `clear_conditioned_layers`
        is True the cached audio conditioning is dropped afterwards.
        """
        assert (
            self.lang_encoder.initialized_flamingo
        ), "Flamingo layers are not initialized. Please call `init_flamingo` first."

        assert (
            self.lang_encoder._use_cached_audio_x or audio_x is not None
        ), "Must provide either audio_x or have precached media using cache_media()."

        if self.lang_encoder._use_cached_audio_x:
            assert (
                audio_x is None
            ), "Expect audio_x to be None when media has been cached using cache_media(). Try uncache_media() first."
            assert self.lang_encoder.is_conditioned()

        else:
            self._encode_audio_x(audio_x=audio_x, audio_x_mask=audio_x_mask)
            self._condition_media_locations(input_ids=lang_x)

        output = self.lang_encoder(
            input_ids=lang_x,
            attention_mask=attention_mask,
            labels=labels,
            past_key_values=past_key_values,
            use_cache=use_cache,
        )

        if clear_conditioned_layers:
            self.lang_encoder.clear_conditioned_layers()

        return output

    def generate(
        self,
        audio_x: torch.Tensor,
        audio_x_mask: torch.Tensor,
        lang_x: torch.Tensor,
        attention_mask: torch.Tensor = None,
        **kwargs,
    ):
        """Autoregressive generation conditioned on audio.

        Audio is encoded once and cached across decoding steps; the cache is
        cleared before returning. Defaults eos to the <|endofchunk|> token.
        """
        num_beams = kwargs.pop("num_beams", 1)
        if num_beams > 1:
            # Beam search duplicates each batch element num_beams times.
            audio_x = audio_x.repeat_interleave(num_beams, dim=0)

        self.lang_encoder._use_cached_audio_x = True
        self._encode_audio_x(audio_x=audio_x, audio_x_mask=audio_x_mask)

        eos_token_id = kwargs.pop("eos_token_id", self.eoc_token_id)
        output = self.lang_encoder.generate(
            input_ids=lang_x,
            attention_mask=attention_mask,
            eos_token_id=eos_token_id,
            num_beams=num_beams,
            **kwargs,
        )

        self.lang_encoder.clear_conditioned_layers()
        self.lang_encoder._use_cached_audio_x = False
        return output

    def _encode_audio_x(self, audio_x: torch.Tensor, audio_x_mask: torch.Tensor):
        """
        rearrange code based on https://github.com/dhansmair/flamingo-mini

        Encode raw audio windows with CLAP (no_grad), contextualize them with
        the audio transformer, and attach the result + mask to every decoder
        layer via condition_audio_x.
        """

        assert audio_x.ndim == 3, "audio_x should be of shape (B, num_window, window_length)"

        # NOTE(review): CLAP runs under no_grad here even when finetuning was
        # requested in the config — confirm this is intended.
        with torch.no_grad():
            audio_embeds = self.clap(audio_x)
        B, L, D = audio_embeds.shape # L is number of windows, D is feature dim
        assert D == self.audio_embed_dim

        assert audio_x_mask.ndim == 2, "audio_x_mask should be of shape (B, L)"

        # Broadcast a single mask row across the batch if needed.
        if B > 1 and audio_x_mask.shape[0] == 1:
            audio_x_mask = audio_x_mask.repeat(B, 1)

        assert audio_x_mask.shape[0] == B and audio_x_mask.shape[1] == L, "{} != ({},{})".format(audio_x_mask.shape, B, L)

        audio_x_out = self.audio_transformer(audio_embeds) # B, L, D
        audio_x_out = audio_x_out.unsqueeze(2) # B, L, n=1, D
        audio_x_mask = audio_x_mask.unsqueeze(2) # B, L, n=1

        for layer in self.lang_encoder._get_decoder_layers():
            layer.condition_audio_x(audio_x_out, audio_x_mask)

    def wrap_fsdp(self, wrapper_kwargs, device_id):
        """Wrap trainable submodules in FSDP and place the rest on `device_id`."""
        # unfreeze the decoder layers
        for block in self.lang_encoder.old_decoder_blocks:
            block.requires_grad_(True)

        # wrap in FSDP
        with enable_wrap(wrapper_cls=FSDP, **wrapper_kwargs):
            self.audio_transformer = wrap(wrap(self.audio_transformer))
            self.lang_encoder.old_decoder_blocks = nn.ModuleList(
                wrap(wrap(block)) for block in self.lang_encoder.old_decoder_blocks
            )
            self.lang_encoder.gated_cross_attn_layers = nn.ModuleList(
                wrap(wrap(layer)) if layer is not None else None
                for layer in self.lang_encoder.gated_cross_attn_layers
            )
            self.lang_encoder.init_flamingo_layers(self._use_gradient_checkpointing)
            self.lang_encoder.set_input_embeddings(
                wrap(wrap(self.lang_encoder.get_input_embeddings()))
            )

            if hasattr(self.lang_encoder, 'set_output_embeddings'):
                self.lang_encoder.set_output_embeddings(
                    wrap(wrap(self.lang_encoder.get_output_embeddings()))
                )
            else:
                print('skip wrapping output embeddings')

        # manually move non-FSDP managed parameters to device_id
        # these are all in lang_encoder
        apply_with_stopping_condition(
            module=self.lang_encoder,
            apply_fn=lambda m: m.to(device_id),
            apply_condition=lambda m: len(list(m.children())) == 0,
            stopping_condition=lambda m: isinstance(m, FSDP),
        )

        # clap shouldn't be wrapped; should be on each gpu
        if self.unfreeze_clap:
            apply_with_stopping_condition(
                module=self.clap,
                apply_fn=lambda m: m.to(device_id),
                apply_condition=lambda m: len(list(m.children())) == 0,
                stopping_condition=lambda m: isinstance(m, FSDP),
            )

        # exclude the original decoder layers from the optimizer
        for block in self.lang_encoder.old_decoder_blocks:
            for p in block.parameters():
                p.exclude_from_optimizer = True

        # set up clip_grad_norm_ function
        def clip_grad_norm_(max_norm):
            self.audio_transformer.clip_grad_norm_(max_norm)
            for layer in self.lang_encoder.gated_cross_attn_layers:
                if layer is not None:
                    layer.clip_grad_norm_(max_norm)
            self.lang_encoder.get_input_embeddings().clip_grad_norm_(max_norm)

        self.clip_grad_norm_ = clip_grad_norm_

    def _condition_media_locations(self, input_ids: torch.Tensor):
        """Mark <audio> token positions in every decoder layer."""
        media_locations = (input_ids == self.media_token_id)

        for layer in self.lang_encoder._get_decoder_layers():
            layer.condition_media_locations(media_locations)

    def cache_media(self, input_ids: torch.Tensor, audio_x: torch.Tensor, audio_x_mask: torch.Tensor):
        """Pre-compute and cache audio conditioning for subsequent forward calls."""
        self._encode_audio_x(audio_x=audio_x, audio_x_mask=audio_x_mask)
        self._condition_media_locations(input_ids=input_ids)
        self.lang_encoder._use_cached_audio_x = True

    def uncache_media(self):
        """Drop any cached audio conditioning."""
        self.lang_encoder.clear_conditioned_layers()
        self.lang_encoder._use_cached_audio_x = False
models/audio-flamingo-1/chat/src/flamingo_lm.py ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2024 NVIDIA CORPORATION.
2
+ # Licensed under the MIT license.
3
+
4
+ # Adapted from https://github.com/mlfoundations/open_flamingo under the MIT license.
5
+ # LICENSE is in incl_licenses directory.
6
+
7
+ import torch.nn as nn
8
+
9
+ try:
10
+ from .helpers import GatedCrossAttentionBlock
11
+ from .utils import getattr_recursive, setattr_recursive
12
+ except:
13
+ from helpers import GatedCrossAttentionBlock
14
+ from utils import getattr_recursive, setattr_recursive
15
+
16
+
17
class FlamingoLayer(nn.Module):
    """
    FlamingoLayer is a wrapper around the GatedCrossAttentionBlock and DecoderLayer.

    The gated cross-attention block (None for layers without one) runs first,
    attending from text to the conditioned audio features, then the wrapped
    decoder layer runs as usual.
    """

    def __init__(
        self, gated_cross_attn_layer, decoder_layer, gradient_checkpointing=False
    ):
        super().__init__()
        self.gated_cross_attn_layer = gated_cross_attn_layer
        self.decoder_layer = decoder_layer
        # Conditioning state, filled in by the condition_* setters below.
        self.audio_x = None
        self.audio_x_mask = None
        self.few_shot_mask = None
        self.media_locations = None
        # Fix: initialize explicitly so forward() cannot hit an AttributeError
        # when the cross-attn path runs before condition_use_cached_media()
        # was ever called on this layer.
        self.use_cached_media = False
        if self.gated_cross_attn_layer is not None:
            self.gated_cross_attn_layer._use_gradient_checkpointing = (
                gradient_checkpointing
            )
        self.decoder_layer._use_gradient_checkpointing = gradient_checkpointing

    def is_conditioned(self) -> bool:
        """Check whether the layer is conditioned."""
        return (self.audio_x is not None) and (self.audio_x_mask is not None) and (self.media_locations is not None)

    def condition_audio_x(self, audio_x, audio_x_mask):
        self.audio_x = audio_x
        self.audio_x_mask = audio_x_mask

    def condition_media_locations(self, media_locations):
        self.media_locations = media_locations

    def condition_use_cached_media(self, use_cached_media):
        self.use_cached_media = use_cached_media

    def forward(
        self,
        lang_x,
        attention_mask=None,
        **decoder_layer_kwargs,
    ):
        """Run gated cross-attention (if present) then the decoder layer.

        Raises:
            ValueError: if audio features or media locations were not
                conditioned before a cross-attention forward pass.
        """
        if self.gated_cross_attn_layer is not None:
            if self.audio_x is None:
                raise ValueError("audio_x must be conditioned before forward pass")

            if self.media_locations is None:
                raise ValueError(
                    "media_locations must be conditioned before forward pass"
                )

            lang_x = self.gated_cross_attn_layer(
                lang_x,
                self.audio_x,
                self.audio_x_mask,
                media_locations=self.media_locations,
                use_cached_media=self.use_cached_media,
            )

        # Normal decoder layer
        lang_x = self.decoder_layer(
            lang_x, attention_mask=attention_mask, **decoder_layer_kwargs
        )
        return lang_x
80
+
81
+
82
class FlamingoLMMixin(nn.Module):
    """
    Mixin to add cross-attention layers to a language model.
    """

    def set_decoder_layers_attr_name(self, decoder_layers_attr_name):
        # Dotted attribute path (e.g. "model.layers") to the decoder layer list.
        self.decoder_layers_attr_name = decoder_layers_attr_name

    def _get_decoder_layers(self):
        return getattr_recursive(self, self.decoder_layers_attr_name)

    def _set_decoder_layers(self, value):
        setattr_recursive(self, self.decoder_layers_attr_name, value)

    def init_flamingo(
        self,
        media_token_id,
        lang_hidden_size,
        audio_hidden_size,
        max_window_per_audio,
        cross_attn_every_n_layers,
        gradient_checkpointing,
    ):
        """
        Initialize Flamingo by adding a new gated cross attn to the decoder.
        Store the media token id for computing the media locations.
        """
        self.old_decoder_blocks = self._get_decoder_layers()
        # A gated cross-attn block every `cross_attn_every_n_layers` layers,
        # None for every other position.
        cross_attn_layers = []
        for layer_idx in range(len(self.old_decoder_blocks)):
            if (layer_idx + 1) % cross_attn_every_n_layers == 0:
                cross_attn_layers.append(
                    GatedCrossAttentionBlock(
                        dim=lang_hidden_size,
                        dim_audio=audio_hidden_size,
                        max_window_per_audio=max_window_per_audio,
                        only_attend_immediate_media=False,
                    )
                )
            else:
                cross_attn_layers.append(None)
        self.gated_cross_attn_layers = nn.ModuleList(cross_attn_layers)
        self.init_flamingo_layers(gradient_checkpointing)
        self.media_token_id = media_token_id
        self.initialized_flamingo = True
        self._use_cached_audio_x = False

    def init_flamingo_layers(self, gradient_checkpointing):
        """
        Re-initializes the FlamingoLayers.
        Propagates any changes made to self.gated_cross_attn_layers or
        self.old_decoder_blocks.
        """
        wrapped_layers = [
            FlamingoLayer(cross_attn, decoder, gradient_checkpointing)
            for cross_attn, decoder in zip(
                self.gated_cross_attn_layers, self.old_decoder_blocks
            )
        ]
        self._set_decoder_layers(nn.ModuleList(wrapped_layers))

    def forward(self, input_ids, attention_mask, **kwargs):
        """Condition the Flamingo layers on the media locations before forward()"""
        if not self.initialized_flamingo:
            raise ValueError(
                "Flamingo layers are not initialized. Please call `init_flamingo` first."
            )

        media_locations = input_ids == self.media_token_id

        # During cached generation the prompt's media tokens are no longer in
        # input_ids; fall back to the previously conditioned locations then.
        use_cached_media_locations = (
            self._use_cached_audio_x
            and self.is_conditioned()
            and not media_locations.any()
        )

        for layer in self._get_decoder_layers():
            if not use_cached_media_locations:
                layer.condition_media_locations(media_locations)
            layer.condition_use_cached_media(use_cached_media_locations)

        kwargs["input_ids"] = input_ids
        kwargs["attention_mask"] = attention_mask
        return super().forward(**kwargs)

    def is_conditioned(self) -> bool:
        """Check whether all decoder layers are already conditioned."""
        return all(layer.is_conditioned() for layer in self._get_decoder_layers())

    def clear_conditioned_layers(self):
        """Reset every layer's conditioning state."""
        for layer in self._get_decoder_layers():
            layer.condition_audio_x(None, None)
            layer.condition_media_locations(None)
            layer.condition_use_cached_media(None)
+ layer.condition_use_cached_media(None)
models/audio-flamingo-1/chat/src/helpers.py ADDED
@@ -0,0 +1,380 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2024 NVIDIA CORPORATION.
2
+ # Licensed under the MIT license.
3
+
4
+ # Adapted from https://github.com/mlfoundations/open_flamingo under the MIT license.
5
+ # LICENSE is in incl_licenses directory.
6
+
7
+ # Adapted from https://github.com/lucidrains/flamingo-pytorch under the MIT license.
8
+ # LICENSE is in incl_licenses directory.
9
+
10
+ # Adapted from https://github.com/jadore801120/attention-is-all-you-need-pytorch under the MIT license.
11
+ # LICENSE is in incl_licenses directory.
12
+
13
+ from einops import rearrange, repeat
14
+ from einops_exts import rearrange_many
15
+
16
+ import numpy as np
17
+
18
+ import torch
19
+ from torch import einsum, nn
20
+ import torch.nn.functional as F
21
+
22
def exists(val):
    """True iff *val* is something other than None."""
    if val is None:
        return False
    return True
24
+
25
def FeedForward(dim, mult=4):
    """Pre-norm MLP: LayerNorm -> Linear(dim -> dim*mult) -> GELU -> Linear(-> dim)."""
    hidden_dim = int(dim * mult)
    layers = [
        nn.LayerNorm(dim),
        nn.Linear(dim, hidden_dim, bias=False),
        nn.GELU(),
        nn.Linear(hidden_dim, dim, bias=False),
    ]
    return nn.Sequential(*layers)
33
+
34
+ # Transformer (encoder) https://github.com/jadore801120/attention-is-all-you-need-pytorch
35
+ # Original Copyright 2017 Victor Huang
36
+ # MIT License (https://opensource.org/licenses/MIT)
37
+
38
class ScaledDotProductAttention(nn.Module):
    """Scaled dot-product attention: softmax(q @ k^T / temperature) @ v."""

    def __init__(self, temperature, attn_dropout=0.1):
        super().__init__()
        self.temperature = temperature
        self.dropout = nn.Dropout(attn_dropout)

    def forward(self, q, k, v, mask=None):
        # (b, h, lq, dk) x (b, h, dk, lk) -> (b, h, lq, lk)
        scores = torch.matmul(q / self.temperature, k.transpose(2, 3))

        if mask is not None:
            # Positions where mask == 0 are excluded from attention.
            scores = scores.masked_fill(mask == 0, -1e9)

        weights = self.dropout(F.softmax(scores, dim=-1))
        context = torch.matmul(weights, v)
        return context, weights
57
+
58
+
59
class MultiHeadAttention(nn.Module):
    """Multi-head attention with residual connection and post-LayerNorm."""

    def __init__(self, n_head, d_model, d_k, d_v, dropout=0.1):
        super().__init__()
        self.n_head = n_head
        self.d_k = d_k
        self.d_v = d_v
        # Query/key/value projections and the final head-merging projection.
        self.w_qs = nn.Linear(d_model, n_head * d_k, bias=False)
        self.w_ks = nn.Linear(d_model, n_head * d_k, bias=False)
        self.w_vs = nn.Linear(d_model, n_head * d_v, bias=False)
        self.fc = nn.Linear(n_head * d_v, d_model, bias=False)
        self.attention = ScaledDotProductAttention(temperature=d_k ** 0.5)
        self.dropout = nn.Dropout(dropout)
        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)

    def forward(self, q, k, v, mask=None):
        n_head, d_k, d_v = self.n_head, self.d_k, self.d_v
        batch = q.size(0)
        len_q, len_k, len_v = q.size(1), k.size(1), v.size(1)

        residual = q

        # Project, split heads, then move the head axis forward:
        # b x len x (n*d)  ->  b x n x len x d
        q = self.w_qs(q).view(batch, len_q, n_head, d_k).transpose(1, 2)
        k = self.w_ks(k).view(batch, len_k, n_head, d_k).transpose(1, 2)
        v = self.w_vs(v).view(batch, len_v, n_head, d_v).transpose(1, 2)

        if mask is not None:
            mask = mask.unsqueeze(1)  # broadcast over the head axis

        q, attn = self.attention(q, k, v, mask=mask)

        # Merge heads back: b x n x len x d -> b x len x (n*d)
        q = q.transpose(1, 2).contiguous().view(batch, len_q, -1)
        q = self.dropout(self.fc(q))
        q += residual

        q = self.layer_norm(q)
        return q, attn
110
+
111
+
112
class PositionwiseFeedForward(nn.Module):
    """Two-layer position-wise FFN with residual and post-LayerNorm."""

    def __init__(self, d_in, d_hid, dropout=0.1):
        super().__init__()
        # NOTE: module creation order (w_1, w_2, layer_norm, dropout) is kept
        # so seeded parameter initialization stays reproducible.
        self.w_1 = nn.Linear(d_in, d_hid)  # position-wise
        self.w_2 = nn.Linear(d_hid, d_in)  # position-wise
        self.layer_norm = nn.LayerNorm(d_in, eps=1e-6)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        residual = x
        hidden = F.relu(self.w_1(x))
        out = self.dropout(self.w_2(hidden))
        out = out + residual
        return self.layer_norm(out)
133
+
134
+
135
class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding added onto the input embeddings."""

    def __init__(self, d_hid, n_position=200):
        super().__init__()
        # Buffer (not a parameter): moves with the module, never trained.
        self.register_buffer('pos_table', self._get_sinusoid_encoding_table(n_position, d_hid))

    def _get_sinusoid_encoding_table(self, n_position, d_hid):
        # angle[pos, j] = pos / 10000^(2*(j//2)/d_hid), vectorized over both axes.
        positions = np.arange(n_position)[:, None]
        dims = np.arange(d_hid)[None, :]
        table = positions / np.power(10000, 2 * (dims // 2) / d_hid)
        table[:, 0::2] = np.sin(table[:, 0::2])  # dim 2i
        table[:, 1::2] = np.cos(table[:, 1::2])  # dim 2i+1
        return torch.FloatTensor(table).unsqueeze(0)

    def forward(self, x):
        # Add (and detach) the encodings for the first x.size(1) positions.
        return x + self.pos_table[:, :x.size(1)].clone().detach()
154
+
155
+
156
class EncoderLayer(nn.Module):
    """Transformer encoder layer: self-attention followed by a position-wise FFN."""

    def __init__(self, d_model, d_inner, n_head, d_k, d_v, dropout=0.0):
        super().__init__()
        self.slf_attn = MultiHeadAttention(n_head, d_model, d_k, d_v, dropout=dropout)
        self.pos_ffn = PositionwiseFeedForward(d_model, d_inner, dropout=dropout)

    def forward(self, enc_input, slf_attn_mask=None):
        attended, attn_weights = self.slf_attn(
            enc_input, enc_input, enc_input, mask=slf_attn_mask
        )
        return self.pos_ffn(attended), attn_weights
169
+
170
+
171
class TransformerEncoder(nn.Module):
    """An encoder model with self attention mechanism."""

    def __init__(
            self, d_word_vec=512, n_layers=6, n_head=8, d_k=64, d_v=64,
            d_model=512, d_inner=2048, dropout=0.0, n_position=16, scale_emb=True):
        super().__init__()
        # n_position <= 0 disables positional encoding entirely.
        if n_position > 0:
            self.position_enc = PositionalEncoding(d_word_vec, n_position=n_position)
        else:
            self.position_enc = lambda x: x
        self.dropout = nn.Dropout(p=dropout)
        self.layer_stack = nn.ModuleList(
            EncoderLayer(d_model, d_inner, n_head, d_k, d_v, dropout=dropout)
            for _ in range(n_layers)
        )
        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
        self.scale_emb = scale_emb
        self.d_model = d_model

    def forward(self, src_seq, return_attns=False):
        # Accept (B, D) inputs by treating them as length-1 sequences.
        if src_seq.dim() == 2:
            src_seq = src_seq.unsqueeze(1)

        attn_maps = []

        out = src_seq
        if self.scale_emb:
            out = out * self.d_model ** 0.5
        out = self.dropout(self.position_enc(out))
        out = self.layer_norm(out)

        # No causal masking: full bidirectional self-attention.
        for block in self.layer_stack:
            out, attn = block(out, slf_attn_mask=None)
            if return_attns:
                attn_maps.append(attn)

        if return_attns:
            return out, attn_maps
        return out
214
+
215
+
216
# gated cross attention
class MaskedCrossAttention(nn.Module):
    """Cross-attention from text tokens to windowed audio embeddings.

    Text queries attend to audio keys/values. Padded audio windows are
    masked out via ``media_mask``; ``media_locations`` additionally restricts
    which text span can see which audio windows.
    """

    def __init__(
        self,
        *,
        dim,
        dim_audio,
        max_window_per_audio,
        dim_head=64,
        heads=8,
        only_attend_immediate_media=True,
    ):
        super().__init__()
        self.max_window_per_audio = max_window_per_audio
        self.scale = dim_head**-0.5
        self.heads = heads
        inner_dim = dim_head * heads

        self.norm = nn.LayerNorm(dim)

        # Queries come from text (dim); keys/values from audio (dim_audio).
        self.to_q = nn.Linear(dim, inner_dim, bias=False)
        self.to_kv = nn.Linear(dim_audio, inner_dim * 2, bias=False)
        self.to_out = nn.Linear(inner_dim, dim, bias=False)

        self.only_attend_immediate_media = only_attend_immediate_media

    def forward(
        self,
        x,
        media, media_mask,
        media_locations=None,
        use_cached_media=False
    ):
        # x: (B, T_txt, dim) text hidden states.
        # media: (B, L, 1, dim_audio) audio window embeddings (extra dim of 1).
        # media_mask: (B, L, 1) validity mask for padded audio windows.
        # media_locations: (B, T_txt) bool mask of <audio> token positions.

        if not use_cached_media:
            assert (
                media_locations.shape[1] == x.shape[1]
            ), f"media_location.shape is {media_locations.shape} but x.shape is {x.shape}"

        T_txt = x.shape[1]
        B, L = media.shape[:2]
        assert media.shape[2] == 1  # extra dim
        assert L % self.max_window_per_audio == 0  # should be 4 or 8 times
        h = self.heads

        x = self.norm(x)

        q = self.to_q(x)
        media = rearrange(media, "b t n d -> b (t n) d")

        k, v = self.to_kv(media).chunk(2, dim=-1)
        q, k, v = rearrange_many((q, k, v), "b n (h d) -> b h n d", h=h)

        q = q * self.scale

        # Similarity logits: (B, h, T_txt, L).
        sim = einsum("... i d, ... j d -> ... i j", q, k)

        # mask padded audio embeddings
        media_mask = rearrange(media_mask, "b i n -> b 1 1 (i n)").bool()  # n = 1 is extra dim
        sim = sim.masked_fill(~media_mask, -torch.finfo(sim.dtype).max)

        # This module is only used with only_attend_immediate_media=False
        # (see the dead branch below, which would not work otherwise).
        assert self.only_attend_immediate_media is False

        # mask media locations
        if exists(media_locations):
            # few_shot_mask[b, t, l] is True when text position t may attend
            # to audio window l. Each text span between consecutive <audio>
            # tokens is paired with the window range of its preceding audio.
            few_shot_mask = torch.zeros(B, T_txt, L).bool().to(sim.device)
            for batch_idx in range(B):
                media_locations_b = media_locations[batch_idx].nonzero()  # locations of <audio>
                if len(media_locations_b.shape) > 1:
                    media_locations_b = media_locations_b.squeeze(-1)

                # i == -1 covers the text before the first <audio> token.
                for i in range(-1, len(media_locations_b)):
                    if i == -1:
                        if len(media_locations_b) == 1:
                            text_start, text_end = 0, T_txt
                        else:
                            text_start, text_end = 0, media_locations_b[i+1]

                    elif i == len(media_locations_b) - 1:
                        text_start, text_end = media_locations_b[i], T_txt

                    else:
                        text_start, text_end = media_locations_b[i], media_locations_b[i+1]

                    if self.only_attend_immediate_media:
                        look_at_window_start = max(i,0) * self.max_window_per_audio
                    else:
                        look_at_window_start = 0
                    look_at_window_end = (max(i,0) + 1) * self.max_window_per_audio

                    few_shot_mask[batch_idx, text_start:text_end, look_at_window_start:look_at_window_end] = True

            sim = sim.masked_fill(~few_shot_mask.unsqueeze(1), -torch.finfo(sim.dtype).max)

        # Subtract the row max for numerical stability before softmax.
        sim = sim - sim.amax(dim=-1, keepdim=True).detach()
        attn = sim.softmax(dim=-1)

        # NOTE(review): this branch is unreachable (the assert above forces
        # only_attend_immediate_media is False) and `text_time` is undefined
        # here — it would raise NameError if ever taken. Dead code inherited
        # from open_flamingo; confirm before enabling immediate-media mode.
        if exists(media_locations) and self.only_attend_immediate_media:
            text_without_media_mask = text_time == 0
            text_without_media_mask = rearrange(
                text_without_media_mask, "b i -> b 1 i 1"
            )
            attn = attn.masked_fill(text_without_media_mask, 0.0)

        out = einsum("... i j, ... j d -> ... i d", attn, v)
        out = rearrange(out, "b h n d -> b n (h d)")
        return self.to_out(out)
323
+
324
+
325
class GatedCrossAttentionBlock(nn.Module):
    """Tanh-gated cross-attention + feed-forward block (Flamingo-style).

    Both gates start at zero, so the whole block is an identity mapping at
    initialization and is learned to open gradually during training.
    """

    def __init__(
        self,
        *,
        dim,
        dim_audio,
        max_window_per_audio,
        dim_head=64,
        heads=8,
        ff_mult=4,
        only_attend_immediate_media=True,
    ):
        super().__init__()
        self.attn = MaskedCrossAttention(
            dim=dim,
            dim_audio=dim_audio,
            max_window_per_audio=max_window_per_audio,
            dim_head=dim_head,
            heads=heads,
            only_attend_immediate_media=only_attend_immediate_media,
        )
        self.attn_gate = nn.Parameter(torch.tensor([0.0]))

        self.ff = FeedForward(dim, mult=ff_mult)
        self.ff_gate = nn.Parameter(torch.tensor([0.0]))

    def forward(
        self,
        x,
        media,
        media_mask,
        media_locations=None,
        use_cached_media=False,
    ):
        # Gated cross-attention sublayer with residual connection.
        attn_out = self.attn(
            x,
            media,
            media_mask,
            media_locations=media_locations,
            use_cached_media=use_cached_media,
        )
        x = x + attn_out * self.attn_gate.tanh()

        # Gated feed-forward sublayer with residual connection.
        x = x + self.ff(x) * self.ff_gate.tanh()

        return x
373
+
374
+
375
if __name__ == '__main__':
    # Quick GPU smoke test: a (4, 512) input is treated as a batch of
    # length-1 sequences, so the output shape should be (4, 1, 512).
    # Requires a CUDA device.
    enc = TransformerEncoder().cuda()
    x = torch.randn(4, 512).cuda()
    output = enc(x)
    enc._use_gradient_checkpointing = True
    print(output.shape)
models/audio-flamingo-1/chat/src/utils.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2024 NVIDIA CORPORATION.
2
+ # Licensed under the MIT license.
3
+
4
+ # Adapted from https://github.com/mlfoundations/open_flamingo under the MIT license.
5
+ # LICENSE is in incl_licenses directory.
6
+
7
def extend_instance(obj, mixin):
    """Apply mixins to a class instance after creation.

    Rebinds obj.__class__ to a freshly created subclass of
    (mixin, original class) that keeps the original class name. The mixin
    is listed first so its methods win in the MRO — needed for our
    forward() override logic.
    """
    original_cls = obj.__class__
    obj.__class__ = type(original_cls.__name__, (mixin, original_cls), {})
14
+
15
+
16
def getattr_recursive(obj, att):
    """
    Return nested attribute of obj
    Example: getattr_recursive(obj, 'a.b.c') is equivalent to obj.a.b.c
    An empty attribute path returns obj itself.
    """
    current = obj
    if att:
        for part in att.split("."):
            current = getattr(current, part)
    return current
28
+
29
+
30
def setattr_recursive(obj, att, val):
    """
    Set nested attribute of obj
    Example: setattr_recursive(obj, 'a.b.c', val) is equivalent to obj.a.b.c = val
    """
    parent_path, _, leaf = att.rpartition(".")
    # Walk to the parent object only when the path is actually nested.
    target = getattr_recursive(obj, parent_path) if parent_path else obj
    setattr(target, leaf, val)
38
+
39
+
40
def apply_with_stopping_condition(
    module, apply_fn, apply_condition=None, stopping_condition=None, **other_args
):
    """Recursively apply *apply_fn* over a module tree (pre-order DFS).

    A subtree is skipped entirely — including its root — as soon as
    stopping_condition(module) is true; apply_fn runs only on nodes where
    apply_condition(module) holds. Extra kwargs are forwarded to apply_fn.
    """
    if stopping_condition(module):
        return
    if apply_condition(module):
        apply_fn(module, **other_args)
    for submodule in module.children():
        apply_with_stopping_condition(
            submodule,
            apply_fn,
            apply_condition=apply_condition,
            stopping_condition=stopping_condition,
            **other_args
        )
models/audio-flamingo-1/chat/train/distributed.py ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2024 NVIDIA CORPORATION.
2
+ # Licensed under the MIT license.
3
+
4
+ # Adapted from https://github.com/mlfoundations/open_flamingo under the MIT license.
5
+ # LICENSE is in incl_licenses directory.
6
+
7
+ # Adapted from https://github.com/mlfoundations/open_clip under the MIT license.
8
+ # LICENSE is in incl_licenses directory.
9
+
10
+ import os
11
+ import torch
12
+
13
+ try:
14
+ import horovod.torch as hvd
15
+ except ImportError:
16
+ hvd = None
17
+
18
+
19
def is_global_master(args):
    """True on the rank-0 process of the whole job."""
    return args.rank == 0


def is_local_master(args):
    """True on the rank-0 process of the current node."""
    return args.local_rank == 0


def is_master(args, local=False):
    """Return node-local master status when *local* is set, else global."""
    if local:
        return is_local_master(args)
    return is_global_master(args)
29
+
30
+
31
def is_using_horovod():
    # NOTE w/ horovod run, OMPI vars should be set, but w/ SLURM PMI vars will be set
    # Differentiating between horovod and DDP use via SLURM may not be possible, so horovod arg still required...
    ompi_vars = ["OMPI_COMM_WORLD_RANK", "OMPI_COMM_WORLD_SIZE"]
    pmi_vars = ["PMI_RANK", "PMI_SIZE"]
    has_ompi = all(var in os.environ for var in ompi_vars)
    has_pmi = all(var in os.environ for var in pmi_vars)
    return has_ompi or has_pmi
42
+
43
+
44
def is_using_distributed():
    """True when env vars report more than one process.

    WORLD_SIZE takes precedence over SLURM_NTASKS; absent both, assume
    single-process.
    """
    for var in ("WORLD_SIZE", "SLURM_NTASKS"):
        if var in os.environ:
            return int(os.environ[var]) > 1
    return False
50
+
51
+
52
def world_info_from_env():
    """Read (local_rank, global_rank, world_size) from launcher env vars.

    Each value is taken from the first variable present in a fixed priority
    order (torchrun, MPI, SLURM, OpenMPI); defaults are the single-process
    values (0, 0, 1).
    """
    def _first_int(names, default):
        for name in names:
            if name in os.environ:
                return int(os.environ[name])
        return default

    local_rank = _first_int(
        (
            "LOCAL_RANK",
            "MPI_LOCALRANKID",
            "SLURM_LOCALID",
            "OMPI_COMM_WORLD_LOCAL_RANK",
        ),
        0,
    )
    global_rank = _first_int(
        ("RANK", "PMI_RANK", "SLURM_PROCID", "OMPI_COMM_WORLD_RANK"), 0
    )
    world_size = _first_int(
        ("WORLD_SIZE", "PMI_SIZE", "SLURM_NTASKS", "OMPI_COMM_WORLD_SIZE"), 1
    )
    return local_rank, global_rank, world_size
75
+
76
+
77
def init_distributed_device(args):
    """Initialize torch.distributed (or horovod) and select the compute device.

    Mutates *args* in place — sets args.distributed, args.world_size,
    args.rank, args.local_rank and args.device — and returns the selected
    torch.device. Expects args.horovod, args.dist_url, args.dist_backend and
    args.no_set_device_rank to be present on *args*.
    """
    # Distributed training = training on more than one GPU.
    # Works in both single and multi-node scenarios.
    args.distributed = False
    args.world_size = 1
    args.rank = 0  # global rank
    args.local_rank = 0

    if args.horovod:
        assert hvd is not None, "Horovod is not installed"
        print('using horovod')
        hvd.init()
        args.local_rank = int(hvd.local_rank())
        args.rank = hvd.rank()
        args.world_size = hvd.size()
        args.distributed = True
        # Mirror the horovod topology into the torch.distributed env vars.
        os.environ["LOCAL_RANK"] = str(args.local_rank)
        os.environ["RANK"] = str(args.rank)
        os.environ["WORLD_SIZE"] = str(args.world_size)

    elif is_using_distributed():
        if "SLURM_PROCID" in os.environ:
            print('DDP via SLURM')
            args.local_rank, args.rank, args.world_size = world_info_from_env()

            # SLURM var -> torch.distributed vars in case needed
            os.environ["LOCAL_RANK"] = str(args.local_rank)
            os.environ["RANK"] = str(args.rank)
            os.environ["WORLD_SIZE"] = str(args.world_size)

            init_method = args.dist_url

            # # master_ip = os.getenv('MASTER_ADDR', 'localhost')
            # # master_port = os.getenv('MASTER_PORT', '7000')
            # print("DDP RANK %d WORLD_SIZE %d" % (args.rank, args.world_size))
            # # init_method = f'tcp://{master_ip}:{master_port}'
            # init_method = 'tcp://localhost:54323'
            # print("Init method: %s" % (init_method))

            torch.distributed.init_process_group(
                backend=args.dist_backend,
                init_method=init_method,
                world_size=args.world_size,
                rank=args.rank,
            )
        else:
            # torchrun / torch.distributed.launch export RANK & WORLD_SIZE,
            # so init_process_group can discover them on its own.
            print('DDP via torchrun, torch.distributed.launch')
            args.local_rank, _, _ = world_info_from_env()
            torch.distributed.init_process_group(
                backend=args.dist_backend, init_method=args.dist_url
            )
            args.world_size = torch.distributed.get_world_size()
            args.rank = torch.distributed.get_rank()
        args.distributed = True
    else:
        # Single process: a 1-rank process group is still created so the rest
        # of the code can use torch.distributed collectives unconditionally.
        print('needed to run on single gpu')
        torch.distributed.init_process_group(
            backend=args.dist_backend,
            init_method=args.dist_url,
            world_size=1,
            rank=0,
        )

    if torch.cuda.is_available():
        if args.distributed and not args.no_set_device_rank:
            # Bind this process to its node-local GPU.
            device = "cuda:%d" % args.local_rank
        else:
            device = "cuda:0"
        torch.cuda.set_device(device)
    else:
        device = "cpu"
    args.device = device
    device = torch.device(device)
    return device
models/audio-flamingo-1/chat/train/train.py ADDED
@@ -0,0 +1,376 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2024 NVIDIA CORPORATION.
2
+ # Licensed under the MIT license.
3
+
4
+ # Adapted from https://github.com/mlfoundations/open_flamingo under the MIT license.
5
+ # LICENSE is in incl_licenses directory.
6
+
7
+ import argparse
8
+ import functools
9
+ import glob
10
+ import os
11
+ os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"
12
+ import random
13
+ import shutil
14
+ import sys
15
+ sys.path.append('../')
16
+ import yaml
17
+ import time
18
+
19
+ import numpy as np
20
+ import torch
21
+ from torch.utils.tensorboard import SummaryWriter
22
+ from torch.nn.parallel import DistributedDataParallel as DDP
23
+ from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
24
+ from torch.distributed.fsdp import (
25
+ CPUOffload,
26
+ MixedPrecision,
27
+ ShardingStrategy,
28
+ BackwardPrefetch,
29
+ )
30
+ from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
31
+ checkpoint_wrapper,
32
+ CheckpointWrapper,
33
+ CheckpointImpl,
34
+ apply_activation_checkpointing,
35
+ )
36
+ from torch.distributed.fsdp._init_utils import _init_intra_and_inter_node_groups
37
+ from torch.distributed.distributed_c10d import _get_default_group
38
+ torch.cuda.empty_cache()
39
+
40
+ from transformers import (
41
+ get_constant_schedule_with_warmup,
42
+ get_cosine_schedule_with_warmup,
43
+ get_linear_schedule_with_warmup,
44
+ )
45
+
46
+ from data.data import get_audiotext_dataloader # AudioTextData, DataCollator
47
+ from distributed import init_distributed_device, world_info_from_env
48
+ from train_utils import (
49
+ train_one_epoch,
50
+ get_mp_policy_dtype,
51
+ save_checkpoint,
52
+ Dict2Class,
53
+ get_autocast,
54
+ get_cast_dtype
55
+ )
56
+ from src.factory import create_model_and_transforms
57
+
58
+
59
def random_seed(seed=42, rank=0):
    """Seed torch, numpy and Python's RNGs, offset by the process rank.

    The rank offset gives each distributed worker a distinct but
    reproducible random stream.
    """
    effective_seed = seed + rank
    torch.manual_seed(effective_seed)
    np.random.seed(effective_seed)
    random.seed(effective_seed)
+
64
+
65
+ def main():
66
+ parser = argparse.ArgumentParser()
67
+ parser.add_argument('-c', '--config', type=str, default='../config/config.yaml', help='yaml config path')
68
+ parsed_args = parser.parse_args()
69
+
70
+ config = yaml.load(open(parsed_args.config), Loader=yaml.FullLoader)
71
+ data_config = config['data_config']
72
+ model_config = config['model_config']
73
+ clap_config = config["clap_config"]
74
+ args = Dict2Class(config['train_config'])
75
+
76
+ if 'sft_config' in config:
77
+ sft_config = config['sft_config']
78
+ unfreeze_full_lm = sft_config['unfreeze_full_lm']
79
+ else:
80
+ sft_config = None
81
+ unfreeze_full_lm = False
82
+
83
+ # get paths done
84
+ exp_path = os.path.join(args.expdir, args.run_name)
85
+ os.makedirs(exp_path, exist_ok=True)
86
+ print('exp_path:', exp_path)
87
+ shutil.copy(parsed_args.config, os.path.join(exp_path, 'config.yaml'))
88
+ data_config["dataset_blending_output"] = os.path.join(exp_path, data_config["dataset_blending_output"])
89
+
90
+ # Validate args
91
+ if args.fsdp and not args.fsdp_use_orig_params:
92
+ print(
93
+ "Warning: FSDP is running without fsdp_use_orig_params flag. "
94
+ + "This is not recommended because it means we will use uniform weight decay"
95
+ + " and train all embeddings, not just the newly added ones. "
96
+ + "Note: OPT models are not compatible with fsdp_use_orig_params flag."
97
+ )
98
+
99
+ if args.fsdp and args.fsdp_sharding_strategy == "hybrid":
100
+ print(
101
+ "Warning: As of torch=2.0.1, the FSDP logic for optim_state_dict() is broken for hybrid sharding."
102
+ + "To make this method work, we need to modify torch.distributed.fsdp._optim_utils.py"
103
+ + "Copy and paste the code from the _optim_utils.py in this repo into the torch file."
104
+ + "The main issue was the missing group kwarg on line 1596 in _all_gather_optim_state."
105
+ )
106
+
107
+ # Set up distributed training
108
+ print('initializing distributed environment')
109
+ if args.offline:
110
+ os.environ["TRANSFORMERS_OFFLINE"] = "1"
111
+ args.local_rank, args.rank, args.world_size = world_info_from_env()
112
+ device_id = init_distributed_device(args)
113
+ random_seed(args.seed)
114
+
115
+ # Initialize model
116
+ print('creating model')
117
+ os.environ["TOKENIZERS_PARALLELISM"] = "false" # disable the tokenizer parallelism warning
118
+ model, tokenizer = create_model_and_transforms(
119
+ **model_config,
120
+ clap_config=clap_config,
121
+ use_local_files=args.offline,
122
+ gradient_checkpointing=args.gradient_checkpointing,
123
+ freeze_lm_embeddings=args.freeze_lm_embeddings,
124
+ unfreeze_full_lm=unfreeze_full_lm
125
+ )
126
+ random_seed(args.seed, args.rank)
127
+
128
+ # Initialize logging
129
+ print(f"Start running training on rank {args.rank}.")
130
+
131
+ # Load model checkpoint on CPU
132
+ checkpoint_list = glob.glob(f"{args.expdir}/{args.run_name}/checkpoint_*.pt")
133
+ if len(checkpoint_list) == 0:
134
+ print(f"Found no checkpoints for run {args.run_name}.")
135
+ resume_from_checkpoint = None
136
+ else:
137
+ resume_from_checkpoint = sorted(
138
+ checkpoint_list, key=lambda x: int(x.split("_")[-1].split(".")[0])
139
+ )[-1]
140
+ print(
141
+ f"Found checkpoint {resume_from_checkpoint} for run {args.run_name}."
142
+ )
143
+
144
+ # load pretrained model
145
+ resume_from_epoch = 0
146
+ if (resume_from_checkpoint is None) and (sft_config is not None):
147
+ # just started SFT
148
+ pretrained_path = os.path.join(
149
+ sft_config['pretrained_path'],
150
+ sft_config['pretrained_ckpt']
151
+ )
152
+ if args.rank == 0:
153
+ print(f"Loading checkpoint from {pretrained_path}")
154
+ checkpoint = torch.load(pretrained_path, map_location="cpu")
155
+ msd = checkpoint["model_state_dict"]
156
+ msd = {k.replace("module.", ""): v for k, v in msd.items()}
157
+
158
+ # for fsdp, only one rank needs to load the state dict
159
+ if not args.fsdp or args.rank == 0:
160
+ model.load_state_dict(msd, False)
161
+ del checkpoint["model_state_dict"]
162
+ del msd
163
+
164
+
165
+ elif resume_from_checkpoint is not None:
166
+ # continue training (either pretraining or STF)
167
+ if args.rank == 0:
168
+ print(f"Loading checkpoint from {resume_from_checkpoint}")
169
+ checkpoint = torch.load(resume_from_checkpoint, map_location="cpu")
170
+ msd = checkpoint["model_state_dict"]
171
+ msd = {k.replace("module.", ""): v for k, v in msd.items()}
172
+ resume_from_epoch = checkpoint["epoch"] + 1
173
+
174
+ # for fsdp, only one rank needs to load the state dict
175
+ if not args.fsdp or args.rank == 0:
176
+ model.load_state_dict(msd, False)
177
+ del checkpoint["model_state_dict"]
178
+ del msd
179
+
180
+ else:
181
+ pass
182
+
183
+ # Initialize FSDP / DDP, and ensure the model is on GPU
184
+ print(f"Initializing distributed training with {args.world_size} GPUs.")
185
+ if args.fsdp:
186
+ print(
187
+ f"Before FSDP parameter num: {sum(p.numel() for p in model.parameters())} on rank {args.rank}"
188
+ )
189
+
190
+ # init MixedPrecision
191
+ if args.precision != "fp32":
192
+ cast_dtype = get_mp_policy_dtype(args.precision)
193
+ mp_policy = MixedPrecision(
194
+ param_dtype=torch.float32,
195
+ reduce_dtype=cast_dtype, # gradient communication
196
+ buffer_dtype=cast_dtype,
197
+ )
198
+ else:
199
+ mp_policy = None
200
+
201
+ # init process groups
202
+ if args.fsdp_sharding_strategy == "hybrid":
203
+ intra_node_group, inter_node_group = _init_intra_and_inter_node_groups(
204
+ _get_default_group()
205
+ )
206
+ args.my_group = intra_node_group # for optimizer saving
207
+ process_group = (intra_node_group, inter_node_group) # for FSDP init
208
+ else:
209
+ args.my_group = None # for optimizer saving
210
+ process_group = None # for FSDP init
211
+
212
+ # init FSDP
213
+ wrapper_kwargs = dict(
214
+ process_group=process_group,
215
+ cpu_offload=CPUOffload(offload_params=False),
216
+ device_id=device_id,
217
+ sync_module_states=True, # broadcast loaded ckpt from rank 0 -> all ranks
218
+ sharding_strategy=ShardingStrategy.FULL_SHARD
219
+ if args.fsdp_sharding_strategy == "full"
220
+ else ShardingStrategy.HYBRID_SHARD,
221
+ use_orig_params=args.fsdp_use_orig_params,
222
+ mixed_precision=mp_policy,
223
+ forward_prefetch=True,
224
+ backward_prefetch=BackwardPrefetch.BACKWARD_PRE,
225
+ limit_all_gathers=True,
226
+ )
227
+ model.wrap_fsdp(wrapper_kwargs, device_id)
228
+ ddp_model = model
229
+
230
+ print(
231
+ f"After FSDP parameter num: {sum(p.numel() for p in model.parameters())} on rank {args.rank}"
232
+ )
233
+ print(
234
+ f"After FSDP {torch.cuda.memory_allocated()/1024**3:.3} GB on rank {args.rank}"
235
+ )
236
+
237
+ else:
238
+ model = model.to(device_id)
239
+ ddp_model = DDP(model, device_ids=[device_id])
240
+
241
+ # Initialize gradient checkpointing
242
+ if args.gradient_checkpointing:
243
+ non_reentrant_wrapper = functools.partial(
244
+ checkpoint_wrapper,
245
+ offload_to_cpu=True,
246
+ checkpoint_impl=CheckpointImpl.NO_REENTRANT,
247
+ )
248
+ apply_activation_checkpointing(
249
+ ddp_model,
250
+ checkpoint_wrapper_fn=non_reentrant_wrapper,
251
+ check_fn=lambda m: getattr(m, "_use_gradient_checkpointing", False)
252
+ and not isinstance(m, FSDP)
253
+ and not isinstance(m, CheckpointWrapper),
254
+ )
255
+
256
+ # Initialize optimizer
257
+ params_to_optimize = ddp_model.named_parameters()
258
+ params_to_optimize = list(
259
+ filter(
260
+ lambda x: x[1].requires_grad
261
+ and not getattr(x[1], "exclude_from_optimizer", False),
262
+ params_to_optimize,
263
+ )
264
+ )
265
+ if not args.fsdp or args.fsdp_use_orig_params:
266
+ # apply weight decay only to params in the xattn layers
267
+ def get_grouped_params(model):
268
+ params_with_wd, params_without_wd = [], []
269
+ for n, p in params_to_optimize:
270
+ if "gated_cross_attn" in n:
271
+ params_with_wd.append(p)
272
+ else:
273
+ params_without_wd.append(p)
274
+ return [
275
+ {"params": params_with_wd, "weight_decay": args.weight_decay},
276
+ {"params": params_without_wd, "weight_decay": 0.0},
277
+ ]
278
+
279
+ optimizer = torch.optim.AdamW(
280
+ get_grouped_params(params_to_optimize), lr=args.learning_rate
281
+ )
282
+ else:
283
+ # unclear if we should be using no weight decay or small weight decay for all parameters
284
+ optimizer = torch.optim.AdamW(
285
+ (p for _, p in params_to_optimize),
286
+ lr=args.learning_rate,
287
+ weight_decay=args.weight_decay,
288
+ )
289
+
290
+ # load optimizer checkpoint
291
+ if resume_from_checkpoint is not None:
292
+ osd = checkpoint["optimizer_state_dict"]
293
+ if args.fsdp:
294
+ osd = FSDP.optim_state_dict_to_load(osd, ddp_model, optimizer)
295
+ optimizer.load_state_dict(osd)
296
+ del checkpoint["optimizer_state_dict"]
297
+ del osd
298
+
299
+ # Initialize data loaders
300
+ AudioTextDataInfo = get_audiotext_dataloader(
301
+ data_config, clap_config, tokenizer, args.batch_size, split='train',
302
+ epoch=0, force_reblend=True
303
+ )
304
+
305
+ total_training_steps = (
306
+ len(AudioTextDataInfo.dataset) // (args.batch_size * args.world_size)
307
+ ) * args.num_epochs
308
+
309
+ if args.rank == 0:
310
+ print(f"Total training steps: {total_training_steps}")
311
+ tb = SummaryWriter(os.path.join(exp_path, 'tensorboard'))
312
+ else:
313
+ tb = None
314
+
315
+ # Initialize lr scheduler
316
+ if args.lr_scheduler == "linear":
317
+ lr_scheduler = get_linear_schedule_with_warmup(
318
+ optimizer,
319
+ num_warmup_steps=args.warmup_steps,
320
+ num_training_steps=total_training_steps,
321
+ )
322
+ elif args.lr_scheduler == "cosine":
323
+ lr_scheduler = get_cosine_schedule_with_warmup(
324
+ optimizer,
325
+ num_warmup_steps=args.warmup_steps,
326
+ num_training_steps=total_training_steps,
327
+ )
328
+ else:
329
+ lr_scheduler = get_constant_schedule_with_warmup(
330
+ optimizer, num_warmup_steps=args.warmup_steps
331
+ )
332
+
333
+ # load lr scheduler checkpoint
334
+ if resume_from_checkpoint is not None:
335
+ lr_scheduler.load_state_dict(checkpoint["lr_scheduler_state_dict"])
336
+ del checkpoint["lr_scheduler_state_dict"]
337
+
338
+ # Start training!
339
+ ddp_model.train()
340
+
341
+ print('start training from epoch {}'.format(resume_from_epoch))
342
+ for epoch in range(resume_from_epoch, args.num_epochs):
343
+ # force reblending dataset for every epoch
344
+ if epoch > 0:
345
+ AudioTextDataInfo = get_audiotext_dataloader(
346
+ data_config, clap_config, tokenizer, args.batch_size, split='train',
347
+ epoch=epoch, force_reblend=True
348
+ )
349
+ AudioTextDataInfo.set_epoch(epoch)
350
+ trainloader = AudioTextDataInfo.dataloader
351
+
352
+ # train one epoch
353
+ train_one_epoch(
354
+ args=args,
355
+ model=ddp_model,
356
+ epoch=epoch,
357
+ tokenizer=tokenizer,
358
+ optimizer=optimizer,
359
+ lr_scheduler=lr_scheduler,
360
+ trainloader=trainloader,
361
+ device_id=device_id,
362
+ tb=tb
363
+ )
364
+
365
+ # save checkpoint
366
+ save_checkpoint(ddp_model, optimizer, lr_scheduler, epoch, args)
367
+ time.sleep(1.0)
368
+
369
+ # save final checkpoint
370
+ save_checkpoint(ddp_model, optimizer, lr_scheduler, epoch, args)
371
+ if args.rank == 0:
372
+ tb.close()
373
+
374
+
375
+ if __name__ == "__main__":
376
+ main()
models/audio-flamingo-1/chat/train/train_utils.py ADDED
@@ -0,0 +1,351 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2024 NVIDIA CORPORATION.
2
+ # Licensed under the MIT license.
3
+
4
+ # Adapted from https://github.com/mlfoundations/open_flamingo under the MIT license.
5
+ # LICENSE is in incl_licenses directory.
6
+
7
+ import time
8
+ import os
9
+ from tqdm import tqdm
10
+ import sys
11
+ from copy import deepcopy
12
+
13
+ from contextlib import suppress
14
+ import torch
15
+ from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
16
+ from torch.distributed.fsdp import (
17
+ FullStateDictConfig,
18
+ StateDictType,
19
+ )
20
+ from torch.distributed.fsdp.api import FullOptimStateDictConfig
21
+ from einops import rearrange
22
+
23
+
24
class Dict2Class:
    """Expose the entries of a plain dict as object attributes."""

    def __init__(self, data_dict):
        # Bind each key of the mapping as an attribute on this instance.
        for name in data_dict:
            setattr(self, name, data_dict[name])
28
+
29
+
30
class SysLogger(object):
    """Tee-style logger mirroring writes to both the terminal and a log file.

    Intended to replace ``sys.stdout`` so that everything printed is also
    appended to ``filename``. The file is opened in append mode so repeated
    runs accumulate in one log.
    """

    def __init__(self, filename="../log/log.log"):
        self.terminal = sys.stdout
        self.log = open(filename, "a")

    def write(self, message):
        # Fix: pass the message through unmodified. The original appended an
        # extra '\n' to the terminal copy, which doubled every newline when
        # this object replaced sys.stdout (print() already emits its own '\n').
        self.terminal.write(message)
        self.log.write(message)

    def flush(self):
        # Fix: a sys.stdout replacement must expose flush(); callers such as
        # print(..., flush=True) or tqdm would otherwise hit AttributeError.
        self.terminal.flush()
        self.log.flush()
38
+
39
+
40
def get_cast_dtype(precision: str):
    """Map a precision string to the torch dtype used for casting inputs.

    Returns torch.bfloat16 for "bf16", torch.float16 for "fp16", and None
    for any other precision string (meaning: keep the default dtype).
    """
    dtype_by_precision = {
        "bf16": torch.bfloat16,
        "fp16": torch.float16,
    }
    return dtype_by_precision.get(precision)
47
+
48
+
49
def get_mp_policy_dtype(precision: str):
    """Resolve the dtype used for the FSDP MixedPrecision policy.

    Any precision string mentioning bfloat16 (e.g. "bf16", "amp_bfloat16")
    maps to torch.bfloat16; exactly "fp16" maps to torch.float16; everything
    else falls back to full float32.
    """
    if any(tag in precision for tag in ("bfloat16", "bf16")):
        return torch.bfloat16
    if precision == "fp16":
        return torch.float16
    return torch.float32
56
+
57
+
58
def get_autocast(precision, cache_enabled=True):
    """Return a zero-argument factory producing the autocast context.

    Callers invoke the result as ``with autocast():`` (see train_one_epoch),
    so every branch must return a *callable*, never a context-manager
    instance.

    - "amp": CUDA autocast at the default (fp16) dtype.
    - "amp_bfloat16" / "amp_bf16": CUDA autocast at bfloat16.
    - anything else: contextlib.suppress, a no-op context factory.
    """
    if precision == "amp":
        # Fix: the original returned an autocast *instance* here, so the
        # caller's `autocast()` raised TypeError (calling an autocast object
        # expects a function to decorate). Return a factory like the other
        # branches instead.
        return lambda: torch.cuda.amp.autocast(cache_enabled=cache_enabled)
    elif precision == "amp_bfloat16" or precision == "amp_bf16":
        return lambda: torch.cuda.amp.autocast(
            dtype=torch.bfloat16, cache_enabled=cache_enabled
        )
    else:
        return suppress
67
+
68
+
69
def train_one_epoch(
    args,
    model,
    epoch,
    trainloader,
    tokenizer,
    optimizer,
    lr_scheduler,
    device_id,
    tb
):
    """Run one full pass over ``trainloader``, updating ``model`` in place.

    Handles mixed-precision autocast, label masking so that only assistant
    answers (tokens between <SEP> and <|endofchunk|>) contribute to the loss,
    gradient accumulation, embedding-gradient masking, FSDP-aware gradient
    clipping, and TensorBoard/console logging on rank 0.

    Args:
        args: training options namespace (precision, fsdp, batch_size,
            gradient_accumulation_steps, logging_steps, ...).
        model: the DDP- or FSDP-wrapped Flamingo model.
        epoch: current epoch index; used to offset the global step.
        trainloader: dataloader yielding dicts with "audio_clips",
            "audio_embed_mask", "input_ids", and "attention_mask".
        tokenizer: tokenizer providing the special-token ids used for masking.
        optimizer: stepped once per gradient-accumulation window.
        lr_scheduler: stepped alongside the optimizer.
        device_id: target device for the batch tensors.
        tb: SummaryWriter on rank 0, None on other ranks.
    """
    # setup loaders
    num_batches_per_epoch = len(trainloader)
    total_training_steps = num_batches_per_epoch * args.num_epochs
    print('num_batches_per_epoch={}, total_training_steps={}'.format(num_batches_per_epoch, total_training_steps))

    autocast = get_autocast(
        args.precision, cache_enabled=(not args.fsdp)
    )  # if fsdp, disable cache to save memory
    cast_dtype = get_cast_dtype(args.precision)

    # setup model
    # Special-token ids: <audio> marks where audio embeddings attach in the
    # token stream; <|endofchunk|> terminates an assistant answer.
    media_token_id = tokenizer("<audio>", add_special_tokens=False)["input_ids"][-1]
    assert media_token_id == tokenizer.encode("<audio>")[-1]
    endofchunk_token_id = tokenizer("<|endofchunk|>", add_special_tokens=False)["input_ids"][-1]
    model.train()

    # setup logging
    step_time_m = AverageMeter()
    data_time_m = AverageMeter()
    end = time.time()

    # loop through dataloader
    for num_steps, batch in tqdm(
        enumerate(trainloader),
        disable=args.rank != 0,
        total=total_training_steps,
        initial=(epoch * num_batches_per_epoch)
    ):

        data_time_m.update(time.time() - end)
        global_step = num_steps + epoch * num_batches_per_epoch

        #### FORWARD PASS ####
        audio_clips = batch["audio_clips"].to(device_id, dtype=cast_dtype, non_blocking=True) # (B, N_WINDOWS, WINDOW_LENGTH)
        audio_embed_mask = batch["audio_embed_mask"].to(device_id, dtype=cast_dtype, non_blocking=True) # (B, N_WINDOWS)
        # NOTE(review): input_ids/attention_mask are cast to cast_dtype
        # (possibly a float dtype) rather than kept integer — presumably the
        # model casts them back internally; confirm against the model's
        # forward() before changing.
        input_ids = batch["input_ids"].to(device_id, dtype=cast_dtype, non_blocking=True) # (B, N_TOKENS)
        attention_mask = batch["attention_mask"].to(device_id, dtype=cast_dtype, non_blocking=True) # (B, N_TOKENS)

        # set up labels; language model is expected to handle shifting
        labels = input_ids.clone()
        # -100 is the ignore_index of the cross-entropy loss: pad tokens,
        # the first two positions, and <audio> placeholders never get a loss.
        labels[labels == tokenizer.pad_token_id] = -100
        labels[:, :2] = -100
        labels[labels == tokenizer.encode("<audio>")[-1]] = -100

        # mask all prompts except for between <SEP> and <|endofchunk|>
        sep_locations = labels == tokenizer.sep_token_id
        eoc_locations = labels == endofchunk_token_id

        # A truncated sequence can end mid-answer, leaving an unmatched <SEP>.
        if not all(sep_locations.sum(dim=1) == eoc_locations.sum(dim=1)):
            print("Warning: <SEP>-<EoC> pairing mismatch at step {} due to max_token limit.".format(num_steps))

        for i in range(labels.shape[0]):
            # Walk each sequence with a state flag: mask while outside a
            # <SEP>...<|endofchunk|> answer span, keep tokens while inside it.
            shouldmask = True
            for j in range(labels.shape[1]):
                if shouldmask and (labels[i][j] != tokenizer.eos_token_id):
                    masked_value = -100
                else:
                    masked_value = labels[i][j]

                # State transitions are decided on the ORIGINAL token before
                # it is overwritten below.
                if labels[i][j] == tokenizer.sep_token_id:
                    shouldmask = False
                elif labels[i][j] == endofchunk_token_id:
                    shouldmask = True

                labels[i][j] = masked_value

            # If the sequence was truncated inside an answer (last label is a
            # live token), mask the dangling partial answer from the right.
            if labels[i][-1] not in [-100, tokenizer.eos_token_id, tokenizer.pad_token_id, endofchunk_token_id]:
                for j in range(labels.shape[1]-1, -1, -1):
                    if labels[i][j] not in [-100, tokenizer.eos_token_id, endofchunk_token_id]:
                        labels[i][j] = -100
                    else:
                        break

        labels = labels.to(device_id)

        # gradient accumulation w/ fsdp cpu offloading requires a no_sync context manager
        with autocast():
            output = model(
                audio_x=audio_clips,
                audio_x_mask=audio_embed_mask,
                lang_x=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            loss = output.loss

        # Scale the loss for accumulation; backward on every micro-batch,
        # optimizer.step() only once per accumulation window (below).
        divided_loss = loss / args.gradient_accumulation_steps
        train_loss = divided_loss * args.loss_multiplier
        train_loss.backward()

        if (not args.freeze_lm_embeddings) and (
            not args.fsdp or args.fsdp_use_orig_params
        ):
            # Mask gradients for input embeddings s.t. we only update the added tokens <audio> and <|endofchunk|>
            # Under FSDP the model is not wrapped in DDP, so there is no
            # `.module` indirection.
            if args.fsdp:
                embed_grad = model.lang_encoder.get_input_embeddings().weight.grad
            else:
                embed_grad = (
                    model.module.lang_encoder.get_input_embeddings().weight.grad
                )
            zero_mask = torch.zeros_like(embed_grad)
            zero_mask[media_token_id] = torch.ones_like(zero_mask[media_token_id])
            zero_mask[endofchunk_token_id] = torch.ones_like(
                zero_mask[endofchunk_token_id]
            )
            if args.fsdp:
                model.lang_encoder.get_input_embeddings().weight.grad = (
                    embed_grad * zero_mask
                )
            else:
                model.module.lang_encoder.get_input_embeddings().weight.grad = (
                    embed_grad * zero_mask
                )

        # clip gradient norm
        if args.fsdp:
            """
            The way we clip gradients with FSDP is different than the non-FSDP case,
            because during FSDP, gradient norms are computed over certain submodules,
            rather than the entire model.
            At least for OPT-125M, this didn't seem to make a difference in performance.
            """
            model.clip_grad_norm_(1.0)
        else:
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # step optimizer and log
        # Step at the end of each accumulation window, and also on the final
        # (possibly partial) window of the epoch.
        if (((num_steps + 1) % args.gradient_accumulation_steps) == 0) or (
            num_steps == num_batches_per_epoch - 1
        ):
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad(set_to_none=True)

            # step time and reset end outside of rank 0
            step_time_m.update(time.time() - end)
            end = time.time()

            # rank 0 logging
            if args.rank == 0:
                samples_per_second = (
                    args.gradient_accumulation_steps
                    * args.batch_size
                    * args.world_size
                    / step_time_m.val
                )
                samples_per_second_per_gpu = (
                    args.gradient_accumulation_steps
                    * args.batch_size
                    / step_time_m.val
                )
                log_dict = {
                    "data_time": data_time_m.avg,
                    "step_time": step_time_m.avg,
                    "samples_per_second": samples_per_second,
                    "samples_per_second_per_gpu": samples_per_second_per_gpu,
                    "lr": optimizer.param_groups[0]["lr"],
                    "loss": loss.item()
                }

                if ((num_steps + 1) % args.logging_steps == 0):
                    for key in log_dict:
                        tb.add_scalar("Train/{}".format(key), log_dict[key], global_step)

                # Meters are reset after every optimizer step, so the logged
                # averages cover only the last accumulation window.
                step_time_m.reset()
                data_time_m.reset()

                # Log loss to console
                if ((num_steps + 1) % args.logging_steps == 0):
                    print(
                        f"Step {num_steps+1}/{num_batches_per_epoch} of epoch {epoch+1}/{args.num_epochs} complete. Loss: {loss.item():.3f}\n"
                    )
252
+
253
+
254
class AverageMeter(object):
    """Tracks the latest value, running sum, count, and mean of a metric."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear all statistics back to zero."""
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        """Record ``val`` observed ``n`` times and refresh the running mean."""
        self.val = val
        self.sum = self.sum + val * n
        self.count = self.count + n
        self.avg = self.sum / self.count
271
+
272
+
273
def filter_state_dict_to_trainable(model, state_dict):
    """
    Remove non-trainable parameters from model state dict.
    Exception: Embeddings will not be removed, even if frozen.
    This is because we need the new <audio> <|endofchunk|> tokens to
    be consistent across initializations.
    """
    # won't work for fsdp + use_orig_params=False
    for param_name, param in model.named_parameters():
        if "fsdp" in param_name:
            continue
        if "embed" in param_name or isinstance(param, torch.nn.Embedding):
            continue
        if param.requires_grad:
            continue
        # Frozen parameter: drop it from the checkpoint (after stripping the
        # activation-checkpoint wrapper segment from its name).
        key = param_name.replace("._checkpoint_wrapped_module", "")
        if key in state_dict:
            del state_dict[key]
        else:
            print(f"WARNING: filtering but {key} not in state_dict")

    # also remove the keys in state_dict generated from
    # lang_encoder.old_decoder_blocks and lang_encoder.gated_cross_attn_layers
    # because these are already saved in lang_encoder.model...
    redundant = [
        key
        for key in state_dict.keys()
        if "lang_encoder.old_decoder_blocks" in key
        or "lang_encoder.gated_cross_attn_layers" in key
        or "vision_encoder" in key
    ]
    for key in redundant:
        del state_dict[key]
    return state_dict
308
+
309
+
310
def save_checkpoint(model, optimizer, lr_scheduler, epoch, args):
    """
    Save training checkpoint with model, optimizer, and lr_scheduler state.

    Under FSDP, state dicts are gathered as FULL_STATE_DICT with
    rank0_only=True, so only rank 0 materializes (and writes) the full
    checkpoint; other ranks get empty dicts. The file is written to
    ``{args.expdir}/{args.run_name}/checkpoint_{epoch}.pt``.
    """
    if args.fsdp:
        # Configure gathering BEFORE calling state_dict(): full (unsharded)
        # tensors, assembled only on rank 0 and offloaded to CPU to bound
        # GPU memory. args.my_group is the intra-node group set up in main()
        # for hybrid sharding (None otherwise).
        FSDP.set_state_dict_type(
            model,
            StateDictType.FULL_STATE_DICT,
            FullStateDictConfig(rank0_only=True, offload_to_cpu=True),
            FullOptimStateDictConfig(rank0_only=True),
        )
        model_state = model.state_dict()
        optim_state = FSDP.optim_state_dict(model, optimizer, group=args.my_group)

    else:
        model_state = model.state_dict()
        optim_state = optimizer.state_dict()

    if args.rank == 0:
        # With fsdp_use_orig_params=False, named_parameters() does not line
        # up with the flattened state dict, so filtering is skipped.
        if not (args.fsdp and not args.fsdp_use_orig_params):
            model_state = filter_state_dict_to_trainable(model, model_state)

        checkpoint_dir = os.path.join(args.expdir, args.run_name)
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)

        checkpoint_dict = {
            "epoch": epoch,
            "model_state_dict": model_state,
            "optimizer_state_dict": optim_state,
            "lr_scheduler_state_dict": lr_scheduler.state_dict(),
        }

        print(f"Saving checkpoint to {checkpoint_dir}/checkpoint_{epoch}.pt")
        torch.save(checkpoint_dict, f"{checkpoint_dir}/checkpoint_{epoch}.pt")

        if args.delete_previous_checkpoint:
            # Keep every 20th epoch as a permanent snapshot; delete the
            # immediately preceding checkpoint otherwise. Best-effort: the
            # previous file may already be gone.
            if epoch > 0 and epoch % 20 != 0:
                try:
                    os.remove(f"{checkpoint_dir}/checkpoint_{epoch-1}.pt")
                except:
                    pass
models/audio-flamingo-1/checkpoints/chat_part1.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d5673d1541cd5764d6dcc89b3bdc331b768c1159ef685a373c7f4deb9e1ddaef
3
+ size 3328734458
models/audio-flamingo-1/checkpoints/chat_part2.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ea792e60d95deacc75244af0b23c5b75b8de3aef617b392e633cd67a5f20c5aa
3
+ size 3482749306
models/audio-flamingo-1/checkpoints/chat_part3.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a8828e7e5db7014259d746025274dd752fe39f959eb6d7e1380796a838c2983c
3
+ size 3898925434
models/audio-flamingo-1/checkpoints/chat_part4.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d0445a8f663774e16df616f8e20abb5ceff61a40b2ee5a8b12a83a641971f8e1
3
+ size 3357325242
models/audio-flamingo-1/checkpoints/chat_part5.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e54e84938e4143a3d2fc8b090a1ab238287a0a41236efcc4a5df5d476291d96f
3
+ size 3591230906
models/audio-flamingo-1/checkpoints/checkpoint_utils.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2024 NVIDIA CORPORATION.
2
+ # Licensed under the MIT license.
3
+
4
+ import torch
5
+
6
def merge_checkpoints(checkpoint_path, num_parts=5):
    """Reassemble a checkpoint that was split into ``num_parts`` shard files.

    Part files are named by inserting ``_part{i}`` before the ``.pt`` suffix
    of ``checkpoint_path``. Their ``model_state_dict`` entries are merged
    (later parts override duplicate keys) and the combined state dict is
    written back to ``checkpoint_path``.

    Args:
        checkpoint_path: destination path ending in ``.pt``.
        num_parts: number of ``_part{i}.pt`` shards to merge (1-indexed).
    """
    combined_state_dict = {}
    for i in range(1, num_parts + 1):
        part_path = checkpoint_path.replace('.pt', '_part{}.pt'.format(i))
        # Fix: map_location="cpu" so shards saved from GPU workers can be
        # merged on CPU-only machines instead of failing with a CUDA
        # deserialization error.
        part_checkpoint = torch.load(part_path, map_location="cpu")
        part_state_dict = part_checkpoint['model_state_dict']
        combined_state_dict.update(part_state_dict)

    full_checkpoint = {'model_state_dict': combined_state_dict}
    torch.save(full_checkpoint, checkpoint_path)
    print('merging {}: finished'.format(checkpoint_path))
17
+
18
+ merge_checkpoints('foundation.pt', num_parts=5)
19
+ merge_checkpoints('chat.pt', num_parts=5)
models/audio-flamingo-1/checkpoints/foundation_part1.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b5921b6167a0e0d27a4732dc77899505db454bcb769942da3079c8b821e54711
3
+ size 3328736090
models/audio-flamingo-1/checkpoints/foundation_part2.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fa325ee55d333ff4155679af58e901096b89ffd8097092b602cdf54f8b989791
3
+ size 3482750938
models/audio-flamingo-1/checkpoints/foundation_part3.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dfec3fcfe53582e71c8ae1bc8832bbce5b15168878cf5e85a32f1c717bacdc78
3
+ size 3898927066
models/audio-flamingo-1/checkpoints/foundation_part4.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9b4b3e3011ea05da7ef29e8b906141af05a7eb2fa9604b65f21638db142a192d
3
+ size 3357326874
models/audio-flamingo-1/checkpoints/foundation_part5.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e52ea8848b540816e4b7e46ed705ec84965b3718d77b3e8a4ded0b64668fd168
3
+ size 3591232538