mrfakename committed on
Commit 0929406 · verified · 1 Parent(s): fa77ee6

Delete MuCodec

Files changed (37)
  1. MuCodec/.DS_Store +0 -0
  2. MuCodec/.gitattributes +0 -2
  3. MuCodec/.gitignore +0 -3
  4. MuCodec/LICENSE +0 -21
  5. MuCodec/LICENSE_weights +0 -399
  6. MuCodec/configs/models/transformer2D.json +0 -25
  7. MuCodec/configs/scheduler/stable_diffusion_2.1_largenoise_sample.json +0 -14
  8. MuCodec/generate.py +0 -248
  9. MuCodec/libs/rvq/descript_quantize3.py +0 -298
  10. MuCodec/model.py +0 -367
  11. MuCodec/models/attention.py +0 -682
  12. MuCodec/models/transformer_2d_flow.py +0 -545
  13. MuCodec/muq_dev/muq_fairseq/data/__init__.py +0 -1
  14. MuCodec/muq_dev/muq_fairseq/data/ark_dataset.py +0 -71
  15. MuCodec/muq_dev/muq_fairseq/data/mert_dataset.py +0 -295
  16. MuCodec/muq_dev/muq_fairseq/data/utils/data_utils.py +0 -535
  17. MuCodec/muq_dev/muq_fairseq/models/muq/__init__.py +0 -1
  18. MuCodec/muq_dev/muq_fairseq/models/muq/model/__init__.py +0 -2
  19. MuCodec/muq_dev/muq_fairseq/models/muq/model/muq.py +0 -520
  20. MuCodec/muq_dev/muq_fairseq/models/muq/model/pred_ark_target_with_model.py +0 -151
  21. MuCodec/muq_dev/muq_fairseq/models/muq/model/rvq.py +0 -459
  22. MuCodec/muq_dev/muq_fairseq/models/muq/model/rvq_muq.py +0 -394
  23. MuCodec/muq_dev/muq_fairseq/models/muq/model/w2v2_config.json +0 -113
  24. MuCodec/muq_dev/muq_fairseq/models/muq/modules/__init__.py +0 -2
  25. MuCodec/muq_dev/muq_fairseq/models/muq/modules/conv.py +0 -77
  26. MuCodec/muq_dev/muq_fairseq/models/muq/modules/features.py +0 -67
  27. MuCodec/muq_dev/muq_fairseq/models/muq/modules/flash_conformer.py +0 -2114
  28. MuCodec/muq_dev/muq_fairseq/models/muq/modules/random_quantizer.py +0 -68
  29. MuCodec/muq_dev/muq_fairseq/models/muq/muq_model.py +0 -139
  30. MuCodec/muq_dev/muq_fairseq/tasks/muq_pretraining.py +0 -354
  31. MuCodec/muq_dev/test.py +0 -22
  32. MuCodec/readme.md +0 -67
  33. MuCodec/reconstructed/test.wav +0 -3
  34. MuCodec/requirements.txt +0 -335
  35. MuCodec/test_wav/test.wav +0 -3
  36. MuCodec/tools/get_melvaehifigan48k.py +0 -1551
  37. MuCodec/tools/torch_tools.py +0 -100
MuCodec/.DS_Store DELETED
Binary file (8.2 kB)
 
MuCodec/.gitattributes DELETED
@@ -1,2 +0,0 @@
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
 
MuCodec/.gitignore DELETED
@@ -1,3 +0,0 @@
- __pycache__
- *.pt
- *.pth
 
MuCodec/LICENSE DELETED
@@ -1,21 +0,0 @@
- MIT License
-
- Copyright (c) Meta Platforms, Inc. and affiliates.
-
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
-
- The above copyright notice and this permission notice shall be included in all
- copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- SOFTWARE.
 
MuCodec/LICENSE_weights DELETED
@@ -1,399 +0,0 @@
1
- Attribution-NonCommercial 4.0 International
2
-
3
- =======================================================================
4
-
5
- Creative Commons Corporation ("Creative Commons") is not a law firm and
6
- does not provide legal services or legal advice. Distribution of
7
- Creative Commons public licenses does not create a lawyer-client or
8
- other relationship. Creative Commons makes its licenses and related
9
- information available on an "as-is" basis. Creative Commons gives no
10
- warranties regarding its licenses, any material licensed under their
11
- terms and conditions, or any related information. Creative Commons
12
- disclaims all liability for damages resulting from their use to the
13
- fullest extent possible.
14
-
15
- Using Creative Commons Public Licenses
16
-
17
- Creative Commons public licenses provide a standard set of terms and
18
- conditions that creators and other rights holders may use to share
19
- original works of authorship and other material subject to copyright
20
- and certain other rights specified in the public license below. The
21
- following considerations are for informational purposes only, are not
22
- exhaustive, and do not form part of our licenses.
23
-
24
- Considerations for licensors: Our public licenses are
25
- intended for use by those authorized to give the public
26
- permission to use material in ways otherwise restricted by
27
- copyright and certain other rights. Our licenses are
28
- irrevocable. Licensors should read and understand the terms
29
- and conditions of the license they choose before applying it.
30
- Licensors should also secure all rights necessary before
31
- applying our licenses so that the public can reuse the
32
- material as expected. Licensors should clearly mark any
33
- material not subject to the license. This includes other CC-
34
- licensed material, or material used under an exception or
35
- limitation to copyright. More considerations for licensors:
36
- wiki.creativecommons.org/Considerations_for_licensors
37
-
38
- Considerations for the public: By using one of our public
39
- licenses, a licensor grants the public permission to use the
40
- licensed material under specified terms and conditions. If
41
- the licensor's permission is not necessary for any reason--for
42
- example, because of any applicable exception or limitation to
43
- copyright--then that use is not regulated by the license. Our
44
- licenses grant only permissions under copyright and certain
45
- other rights that a licensor has authority to grant. Use of
46
- the licensed material may still be restricted for other
47
- reasons, including because others have copyright or other
48
- rights in the material. A licensor may make special requests,
49
- such as asking that all changes be marked or described.
50
- Although not required by our licenses, you are encouraged to
51
- respect those requests where reasonable. More_considerations
52
- for the public:
53
- wiki.creativecommons.org/Considerations_for_licensees
54
-
55
- =======================================================================
56
-
57
- Creative Commons Attribution-NonCommercial 4.0 International Public
58
- License
59
-
60
- By exercising the Licensed Rights (defined below), You accept and agree
61
- to be bound by the terms and conditions of this Creative Commons
62
- Attribution-NonCommercial 4.0 International Public License ("Public
63
- License"). To the extent this Public License may be interpreted as a
64
- contract, You are granted the Licensed Rights in consideration of Your
65
- acceptance of these terms and conditions, and the Licensor grants You
66
- such rights in consideration of benefits the Licensor receives from
67
- making the Licensed Material available under these terms and
68
- conditions.
69
-
70
- Section 1 -- Definitions.
71
-
72
- a. Adapted Material means material subject to Copyright and Similar
73
- Rights that is derived from or based upon the Licensed Material
74
- and in which the Licensed Material is translated, altered,
75
- arranged, transformed, or otherwise modified in a manner requiring
76
- permission under the Copyright and Similar Rights held by the
77
- Licensor. For purposes of this Public License, where the Licensed
78
- Material is a musical work, performance, or sound recording,
79
- Adapted Material is always produced where the Licensed Material is
80
- synched in timed relation with a moving image.
81
-
82
- b. Adapter's License means the license You apply to Your Copyright
83
- and Similar Rights in Your contributions to Adapted Material in
84
- accordance with the terms and conditions of this Public License.
85
-
86
- c. Copyright and Similar Rights means copyright and/or similar rights
87
- closely related to copyright including, without limitation,
88
- performance, broadcast, sound recording, and Sui Generis Database
89
- Rights, without regard to how the rights are labeled or
90
- categorized. For purposes of this Public License, the rights
91
- specified in Section 2(b)(1)-(2) are not Copyright and Similar
92
- Rights.
93
- d. Effective Technological Measures means those measures that, in the
94
- absence of proper authority, may not be circumvented under laws
95
- fulfilling obligations under Article 11 of the WIPO Copyright
96
- Treaty adopted on December 20, 1996, and/or similar international
97
- agreements.
98
-
99
- e. Exceptions and Limitations means fair use, fair dealing, and/or
100
- any other exception or limitation to Copyright and Similar Rights
101
- that applies to Your use of the Licensed Material.
102
-
103
- f. Licensed Material means the artistic or literary work, database,
104
- or other material to which the Licensor applied this Public
105
- License.
106
-
107
- g. Licensed Rights means the rights granted to You subject to the
108
- terms and conditions of this Public License, which are limited to
109
- all Copyright and Similar Rights that apply to Your use of the
110
- Licensed Material and that the Licensor has authority to license.
111
-
112
- h. Licensor means the individual(s) or entity(ies) granting rights
113
- under this Public License.
114
-
115
- i. NonCommercial means not primarily intended for or directed towards
116
- commercial advantage or monetary compensation. For purposes of
117
- this Public License, the exchange of the Licensed Material for
118
- other material subject to Copyright and Similar Rights by digital
119
- file-sharing or similar means is NonCommercial provided there is
120
- no payment of monetary compensation in connection with the
121
- exchange.
122
-
123
- j. Share means to provide material to the public by any means or
124
- process that requires permission under the Licensed Rights, such
125
- as reproduction, public display, public performance, distribution,
126
- dissemination, communication, or importation, and to make material
127
- available to the public including in ways that members of the
128
- public may access the material from a place and at a time
129
- individually chosen by them.
130
-
131
- k. Sui Generis Database Rights means rights other than copyright
132
- resulting from Directive 96/9/EC of the European Parliament and of
133
- the Council of 11 March 1996 on the legal protection of databases,
134
- as amended and/or succeeded, as well as other essentially
135
- equivalent rights anywhere in the world.
136
-
137
- l. You means the individual or entity exercising the Licensed Rights
138
- under this Public License. Your has a corresponding meaning.
139
-
140
- Section 2 -- Scope.
141
-
142
- a. License grant.
143
-
144
- 1. Subject to the terms and conditions of this Public License,
145
- the Licensor hereby grants You a worldwide, royalty-free,
146
- non-sublicensable, non-exclusive, irrevocable license to
147
- exercise the Licensed Rights in the Licensed Material to:
148
-
149
- a. reproduce and Share the Licensed Material, in whole or
150
- in part, for NonCommercial purposes only; and
151
-
152
- b. produce, reproduce, and Share Adapted Material for
153
- NonCommercial purposes only.
154
-
155
- 2. Exceptions and Limitations. For the avoidance of doubt, where
156
- Exceptions and Limitations apply to Your use, this Public
157
- License does not apply, and You do not need to comply with
158
- its terms and conditions.
159
-
160
- 3. Term. The term of this Public License is specified in Section
161
- 6(a).
162
-
163
- 4. Media and formats; technical modifications allowed. The
164
- Licensor authorizes You to exercise the Licensed Rights in
165
- all media and formats whether now known or hereafter created,
166
- and to make technical modifications necessary to do so. The
167
- Licensor waives and/or agrees not to assert any right or
168
- authority to forbid You from making technical modifications
169
- necessary to exercise the Licensed Rights, including
170
- technical modifications necessary to circumvent Effective
171
- Technological Measures. For purposes of this Public License,
172
- simply making modifications authorized by this Section 2(a)
173
- (4) never produces Adapted Material.
174
-
175
- 5. Downstream recipients.
176
-
177
- a. Offer from the Licensor -- Licensed Material. Every
178
- recipient of the Licensed Material automatically
179
- receives an offer from the Licensor to exercise the
180
- Licensed Rights under the terms and conditions of this
181
- Public License.
182
-
183
- b. No downstream restrictions. You may not offer or impose
184
- any additional or different terms or conditions on, or
185
- apply any Effective Technological Measures to, the
186
- Licensed Material if doing so restricts exercise of the
187
- Licensed Rights by any recipient of the Licensed
188
- Material.
189
-
190
- 6. No endorsement. Nothing in this Public License constitutes or
191
- may be construed as permission to assert or imply that You
192
- are, or that Your use of the Licensed Material is, connected
193
- with, or sponsored, endorsed, or granted official status by,
194
- the Licensor or others designated to receive attribution as
195
- provided in Section 3(a)(1)(A)(i).
196
-
197
- b. Other rights.
198
-
199
- 1. Moral rights, such as the right of integrity, are not
200
- licensed under this Public License, nor are publicity,
201
- privacy, and/or other similar personality rights; however, to
202
- the extent possible, the Licensor waives and/or agrees not to
203
- assert any such rights held by the Licensor to the limited
204
- extent necessary to allow You to exercise the Licensed
205
- Rights, but not otherwise.
206
-
207
- 2. Patent and trademark rights are not licensed under this
208
- Public License.
209
-
210
- 3. To the extent possible, the Licensor waives any right to
211
- collect royalties from You for the exercise of the Licensed
212
- Rights, whether directly or through a collecting society
213
- under any voluntary or waivable statutory or compulsory
214
- licensing scheme. In all other cases the Licensor expressly
215
- reserves any right to collect such royalties, including when
216
- the Licensed Material is used other than for NonCommercial
217
- purposes.
218
-
219
- Section 3 -- License Conditions.
220
-
221
- Your exercise of the Licensed Rights is expressly made subject to the
222
- following conditions.
223
-
224
- a. Attribution.
225
-
226
- 1. If You Share the Licensed Material (including in modified
227
- form), You must:
228
-
229
- a. retain the following if it is supplied by the Licensor
230
- with the Licensed Material:
231
-
232
- i. identification of the creator(s) of the Licensed
233
- Material and any others designated to receive
234
- attribution, in any reasonable manner requested by
235
- the Licensor (including by pseudonym if
236
- designated);
237
-
238
- ii. a copyright notice;
239
-
240
- iii. a notice that refers to this Public License;
241
-
242
- iv. a notice that refers to the disclaimer of
243
- warranties;
244
-
245
- v. a URI or hyperlink to the Licensed Material to the
246
- extent reasonably practicable;
247
-
248
- b. indicate if You modified the Licensed Material and
249
- retain an indication of any previous modifications; and
250
-
251
- c. indicate the Licensed Material is licensed under this
252
- Public License, and include the text of, or the URI or
253
- hyperlink to, this Public License.
254
-
255
- 2. You may satisfy the conditions in Section 3(a)(1) in any
256
- reasonable manner based on the medium, means, and context in
257
- which You Share the Licensed Material. For example, it may be
258
- reasonable to satisfy the conditions by providing a URI or
259
- hyperlink to a resource that includes the required
260
- information.
261
-
262
- 3. If requested by the Licensor, You must remove any of the
263
- information required by Section 3(a)(1)(A) to the extent
264
- reasonably practicable.
265
-
266
- 4. If You Share Adapted Material You produce, the Adapter's
267
- License You apply must not prevent recipients of the Adapted
268
- Material from complying with this Public License.
269
-
270
- Section 4 -- Sui Generis Database Rights.
271
-
272
- Where the Licensed Rights include Sui Generis Database Rights that
273
- apply to Your use of the Licensed Material:
274
-
275
- a. for the avoidance of doubt, Section 2(a)(1) grants You the right
276
- to extract, reuse, reproduce, and Share all or a substantial
277
- portion of the contents of the database for NonCommercial purposes
278
- only;
279
-
280
- b. if You include all or a substantial portion of the database
281
- contents in a database in which You have Sui Generis Database
282
- Rights, then the database in which You have Sui Generis Database
283
- Rights (but not its individual contents) is Adapted Material; and
284
-
285
- c. You must comply with the conditions in Section 3(a) if You Share
286
- all or a substantial portion of the contents of the database.
287
-
288
- For the avoidance of doubt, this Section 4 supplements and does not
289
- replace Your obligations under this Public License where the Licensed
290
- Rights include other Copyright and Similar Rights.
291
-
292
- Section 5 -- Disclaimer of Warranties and Limitation of Liability.
293
-
294
- a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE
295
- EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS
296
- AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF
297
- ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS,
298
- IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION,
299
- WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR
300
- PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS,
301
- ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT
302
- KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT
303
- ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU.
304
-
305
- b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE
306
- TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION,
307
- NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT,
308
- INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES,
309
- COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR
310
- USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN
311
- ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR
312
- DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR
313
- IN PART, THIS LIMITATION MAY NOT APPLY TO YOU.
314
-
315
- c. The disclaimer of warranties and limitation of liability provided
316
- above shall be interpreted in a manner that, to the extent
317
- possible, most closely approximates an absolute disclaimer and
318
- waiver of all liability.
319
-
320
- Section 6 -- Term and Termination.
321
-
322
- a. This Public License applies for the term of the Copyright and
323
- Similar Rights licensed here. However, if You fail to comply with
324
- this Public License, then Your rights under this Public License
325
- terminate automatically.
326
-
327
- b. Where Your right to use the Licensed Material has terminated under
328
- Section 6(a), it reinstates:
329
-
330
- 1. automatically as of the date the violation is cured, provided
331
- it is cured within 30 days of Your discovery of the
332
- violation; or
333
-
334
- 2. upon express reinstatement by the Licensor.
335
-
336
- For the avoidance of doubt, this Section 6(b) does not affect any
337
- right the Licensor may have to seek remedies for Your violations
338
- of this Public License.
339
-
340
- c. For the avoidance of doubt, the Licensor may also offer the
341
- Licensed Material under separate terms or conditions or stop
342
- distributing the Licensed Material at any time; however, doing so
343
- will not terminate this Public License.
344
-
345
- d. Sections 1, 5, 6, 7, and 8 survive termination of this Public
346
- License.
347
-
348
- Section 7 -- Other Terms and Conditions.
349
-
350
- a. The Licensor shall not be bound by any additional or different
351
- terms or conditions communicated by You unless expressly agreed.
352
-
353
- b. Any arrangements, understandings, or agreements regarding the
354
- Licensed Material not stated herein are separate from and
355
- independent of the terms and conditions of this Public License.
356
-
357
- Section 8 -- Interpretation.
358
-
359
- a. For the avoidance of doubt, this Public License does not, and
360
- shall not be interpreted to, reduce, limit, restrict, or impose
361
- conditions on any use of the Licensed Material that could lawfully
362
- be made without permission under this Public License.
363
-
364
- b. To the extent possible, if any provision of this Public License is
365
- deemed unenforceable, it shall be automatically reformed to the
366
- minimum extent necessary to make it enforceable. If the provision
367
- cannot be reformed, it shall be severed from this Public License
368
- without affecting the enforceability of the remaining terms and
369
- conditions.
370
-
371
- c. No term or condition of this Public License will be waived and no
372
- failure to comply consented to unless expressly agreed to by the
373
- Licensor.
374
-
375
- d. Nothing in this Public License constitutes or may be interpreted
376
- as a limitation upon, or waiver of, any privileges and immunities
377
- that apply to the Licensor or You, including from the legal
378
- processes of any jurisdiction or authority.
379
-
380
- =======================================================================
381
-
382
- Creative Commons is not a party to its public
383
- licenses. Notwithstanding, Creative Commons may elect to apply one of
384
- its public licenses to material it publishes and in those instances
385
- will be considered the “Licensor.” The text of the Creative Commons
386
- public licenses is dedicated to the public domain under the CC0 Public
387
- Domain Dedication. Except for the limited purpose of indicating that
388
- material is shared under a Creative Commons public license or as
389
- otherwise permitted by the Creative Commons policies published at
390
- creativecommons.org/policies, Creative Commons does not authorize the
391
- use of the trademark "Creative Commons" or any other trademark or logo
392
- of Creative Commons without its prior written consent including,
393
- without limitation, in connection with any unauthorized modifications
394
- to any of its public licenses or any other arrangements,
395
- understandings, or agreements concerning use of licensed material. For
396
- the avoidance of doubt, this paragraph does not form part of the
397
- public licenses.
398
-
399
- Creative Commons may be contacted at creativecommons.org.
 
MuCodec/configs/models/transformer2D.json DELETED
@@ -1,25 +0,0 @@
- {
-   "_class_name": "Transformer2DModel",
-   "activation_fn": "gelu-approximate",
-   "attention_bias": true,
-   "attention_head_dim": 72,
-   "attention_type": "default",
-   "cross_attention_dim": null,
-   "double_self_attention": false,
-   "dropout": 0.0,
-   "in_channels": 96,
-   "norm_elementwise_affine": false,
-   "norm_eps": 1e-06,
-   "norm_num_groups": 32,
-   "norm_type": "ada_norm_single",
-   "num_attention_heads": 22,
-   "num_embeds_ada_norm": 1000,
-   "num_layers": 24,
-   "num_vector_embeds": null,
-   "only_cross_attention": false,
-   "out_channels": 32,
-   "patch_size": 2,
-   "sample_size": 384,
-   "upcast_attention": false,
-   "use_linear_projection": false
- }
 
MuCodec/configs/scheduler/stable_diffusion_2.1_largenoise_sample.json DELETED
@@ -1,14 +0,0 @@
- {
-   "_class_name": "DDIMScheduler",
-   "_diffusers_version": "0.8.0",
-   "beta_end": 0.02,
-   "beta_schedule": "scaled_linear",
-   "beta_start": 0.0015,
-   "clip_sample": false,
-   "num_train_timesteps": 1000,
-   "prediction_type": "sample",
-   "set_alpha_to_one": false,
-   "skip_prk_steps": true,
-   "steps_offset": 1,
-   "trained_betas": null
- }
 
MuCodec/generate.py DELETED
@@ -1,248 +0,0 @@
- import json
- import torch
- from tqdm import tqdm
- import sys
- from model import PromptCondAudioDiffusion
- from diffusers import DDIMScheduler, DDPMScheduler
- import torchaudio
- import librosa
- import os
- import math
- import numpy as np
- from tools.get_melvaehifigan48k import build_pretrained_models
- import tools.torch_tools as torch_tools
- from safetensors.torch import load_file
- from cached_path import cached_path
-
- class MuCodec:
-     def __init__(self, \
-                  model_path, \
-                  layer_num, \
-                  load_main_model=True, \
-                  device="cuda:0"):
-
-         self.layer_num = layer_num - 1
-         self.sample_rate = 48000
-         self.device = device
-
-         self.MAX_DURATION = 360
-         if load_main_model:
-             audio_ldm_path = str(cached_path("hf://haoheliu/audioldm_48k/audioldm_48k.pth"))
-             self.vae, self.stft = build_pretrained_models(audio_ldm_path)
-             self.vae, self.stft = self.vae.eval().to(device), self.stft.eval().to(device)
-             main_config = {
-                 "num_channels":32,
-                 "unet_model_name":None,
-                 "unet_model_config_path":os.path.dirname(os.path.abspath(__file__)) + "/configs/models/transformer2D.json",
-                 "snr_gamma":None,
-             }
-             self.model = PromptCondAudioDiffusion(**main_config)
-             if model_path.endswith('.safetensors'):
-                 main_weights = load_file(model_path)
-             else:
-                 main_weights = torch.load(model_path, map_location='cpu')
-             self.model.load_state_dict(main_weights, strict=False)
-             self.model = self.model.to(device)
-             print("Successfully loaded checkpoint from:", model_path)
-         else:
-             main_config = {
-                 "num_channels":32,
-                 "unet_model_name":None,
-                 "unet_model_config_path":None,
-                 "snr_gamma":None,
-             }
-             self.model = PromptCondAudioDiffusion(**main_config).to(device)
-             main_weights = torch.load(model_path, map_location='cpu')
-             self.model.load_state_dict(main_weights, strict=False)
-             self.model = self.model.to(device)
-             print("Successfully loaded checkpoint from:", model_path)
-
-         self.model.eval()
-         self.model.init_device_dtype(torch.device(device), torch.float32)
-         print("scaling factor: ", self.model.normfeat.std)
-
-     def file2code(self, fname):
-         orig_samples, fs = torchaudio.load(fname)
-         if(fs!=self.sample_rate):
-             orig_samples = torchaudio.functional.resample(orig_samples, fs, self.sample_rate)
-             fs = self.sample_rate
-         if orig_samples.shape[0] == 1:
-             orig_samples = torch.cat([orig_samples, orig_samples], 0)
-         return self.sound2code(orig_samples)
-
-     @torch.no_grad()
-     @torch.autocast(device_type="cuda", dtype=torch.float32)
-     def sound2code(self, orig_samples, batch_size=3):
-         if(orig_samples.ndim == 2):
-             audios = orig_samples.unsqueeze(0).to(self.device)
-         elif(orig_samples.ndim == 3):
-             audios = orig_samples.to(self.device)
-         else:
-             assert orig_samples.ndim in (2,3), orig_samples.shape
-         audios = self.preprocess_audio(audios)
-         audios = audios.squeeze(0)
-         orig_length = audios.shape[-1]
-         min_samples = int(40.96 * self.sample_rate)
-         output_len = int(orig_length / float(self.sample_rate) * 25) + 1
-         print("output_len: ", output_len)
-
-         while(audios.shape[-1] < min_samples + 480):
-             audios = torch.cat([audios, audios], -1)
-         int_max_len=audios.shape[-1]//min_samples+1
-         # print("int_max_len: ", int_max_len)
-         audios = torch.cat([audios, audios], -1)
-         # print("audios:",audios.shape)
-         audios=audios[:,:int(int_max_len*(min_samples+480))]
-         codes_list=[]
-
-         audio_input = audios.reshape(2, -1, min_samples+480).permute(1, 0, 2).reshape(-1, 2, min_samples+480)
-
-         for audio_inx in range(0, audio_input.shape[0], batch_size):
-             # import pdb; pdb.set_trace()
-             codes, _, spk_embeds = self.model.fetch_codes_batch((audio_input[audio_inx:audio_inx+batch_size]), additional_feats=[],layer=self.layer_num)
-             codes_list.append(torch.cat(codes, 1))
-             # print("codes_list",codes_list[0].shape)
-
-         codes = torch.cat(codes_list, 0).permute(1,0,2).reshape(1, -1)[None] # B 3 T -> 3 B T
-         codes=codes[:,:,:output_len]
-
-         return codes
-
-     @torch.no_grad()
-     def code2sound(self, codes, prompt=None, duration=40.96, guidance_scale=1.5, num_steps=20, disable_progress=False):
-         codes = codes.to(self.device)
-         first_latent = torch.randn(codes.shape[0], 32, 512, 32).to(self.device)
-         first_latent_length = 0
-         first_latent_codes_length = 0
-         if(isinstance(prompt, torch.Tensor)):
-             prompt = prompt.to(self.device)
-             if(prompt.ndim == 3):
-                 assert prompt.shape[0] == 1, prompt.shape
-                 prompt = prompt[0]
-             elif(prompt.ndim == 1):
-                 prompt = prompt.unsqueeze(0).repeat(2,1)
-             elif(prompt.ndim == 2):
-                 if(prompt.shape[0] == 1):
-                     prompt = prompt.repeat(2,1)
-
-             if(prompt.shape[-1] < int(30.76 * self.sample_rate)):
-                 prompt = prompt[:,:int(10.24*self.sample_rate)] # limit max length to 10.24
-             else:
-                 prompt = prompt[:,int(20.48*self.sample_rate):int(30.72*self.sample_rate)] # limit max length to 10.24
-
-             true_mel, _, _ = torch_tools.wav_to_fbank2(prompt, -1, fn_STFT=self.stft) # maximum 10.24s
-             true_mel = true_mel.unsqueeze(1).to(self.device)
-             true_latent = torch.cat([self.vae.get_first_stage_encoding(self.vae.encode_first_stage(true_mel[[m]])) for m in range(true_mel.shape[0])],0)
-             true_latent = true_latent.reshape(true_latent.shape[0]//2, -1, true_latent.shape[2], true_latent.shape[3]).detach()
-
-             first_latent[:,:,0:true_latent.shape[2],:] = true_latent
-             first_latent_length = true_latent.shape[2]
-             first_latent_codes = self.sound2code(prompt)[:,:,0:first_latent_length*2] # B 4 T
-             first_latent_codes_length = first_latent_codes.shape[-1]
-             codes = torch.cat([first_latent_codes, codes], -1)
-
-         min_samples = 1024
-         hop_samples = min_samples // 4 * 3
-         ovlp_samples = min_samples - hop_samples
-         hop_frames = hop_samples // 2
-         ovlp_frames = ovlp_samples // 2
-
-         codes_len = codes.shape[-1]
-         target_len = int((codes_len - first_latent_codes_length) / 100 * 4 * self.sample_rate)
-
-         if(codes_len < min_samples):
-             while(codes.shape[-1] < min_samples):
-                 codes = torch.cat([codes, codes], -1)
-             codes = codes[:,:,0:min_samples]
-         codes_len = codes.shape[-1]
-         if((codes_len - ovlp_frames) % hop_samples > 0):
-             len_codes=math.ceil((codes_len - ovlp_samples) / float(hop_samples)) * hop_samples + ovlp_samples
-             while(codes.shape[-1] < len_codes):
-                 codes = torch.cat([codes, codes], -1)
-             codes = codes[:,:,0:len_codes]
-         latent_length = 512
-         latent_list = []
-         spk_embeds = torch.zeros([1, 32, 1, 32], device=codes.device)
-         with torch.autocast(device_type="cuda", dtype=torch.float16):
-             for sinx in range(0, codes.shape[-1]-hop_samples, hop_samples):
-                 codes_input=[]
-                 codes_input.append(codes[:,:,sinx:sinx+min_samples])
-                 if(sinx == 0):
-                     incontext_length = first_latent_length
-                     latents = self.model.inference_codes(codes_input, spk_embeds, first_latent, latent_length, incontext_length, additional_feats=[], guidance_scale=1.5, num_steps = num_steps, disable_progress=disable_progress, scenario='other_seg')
-                     latent_list.append(latents)
-                 else:
-                     true_latent = latent_list[-1][:,:,-ovlp_frames:,:]
-                     len_add_to_512 = 512 - true_latent.shape[-2]
-                     incontext_length = true_latent.shape[-2]
-                     true_latent = torch.cat([true_latent, torch.randn(true_latent.shape[0], true_latent.shape[1], len_add_to_512, true_latent.shape[-1]).to(self.device)], -2)
-                     latents = self.model.inference_codes(codes_input, spk_embeds, true_latent, latent_length, incontext_length, additional_feats=[], guidance_scale=1.5, num_steps = num_steps, disable_progress=disable_progress, scenario='other_seg')
-                     latent_list.append(latents)
-
-         latent_list = [l.float() for l in latent_list]
-         latent_list[0] = latent_list[0][:,:,first_latent_length:,:]
-         min_samples = int(duration * self.sample_rate)
-         hop_samples = min_samples // 4 * 3
-         ovlp_samples = min_samples - hop_samples
-         with torch.no_grad():
-             output = None
-             for i in range(len(latent_list)):
-                 latent = latent_list[i]
-                 bsz, ch, t, f = latent.shape
-                 latent = latent.reshape(bsz*2, ch//2, t, f)
-                 mel = self.vae.decode_first_stage(latent)
-                 cur_output = self.vae.decode_to_waveform(mel)
-                 cur_output = torch.from_numpy(cur_output)[:, 0:min_samples]
-
-                 if output is None:
-                     output = cur_output
-                 else:
-                     ov_win = torch.from_numpy(np.linspace(0, 1, ovlp_samples)[None, :])
-                     ov_win = torch.cat([ov_win, 1 - ov_win], -1)
-                     output[:, -ovlp_samples:] = output[:, -ovlp_samples:] * ov_win[:, -ovlp_samples:] + cur_output[:, 0:ovlp_samples] * ov_win[:, 0:ovlp_samples]
-                     output = torch.cat([output, cur_output[:, ovlp_samples:]], -1)
-             output = output[:, 0:target_len]
-         return output
-
-     @torch.no_grad()
-     def preprocess_audio(self, input_audios, threshold=0.8):
-         assert len(input_audios.shape) == 3, input_audios.shape
-         nchan = input_audios.shape[1]
-         input_audios = input_audios.reshape(input_audios.shape[0], -1)
-         norm_value = torch.ones_like(input_audios[:,0])
-         max_volume = input_audios.abs().max(dim=-1)[0]
-         norm_value[max_volume>threshold] = max_volume[max_volume>threshold] / threshold
-         return input_audios.reshape(input_audios.shape[0], nchan, -1)/norm_value.unsqueeze(-1).unsqueeze(-1)
-
-     @torch.no_grad()
-     def sound2sound(self, sound, prompt=None, min_duration=40.96, steps=50, disable_progress=False):
-         codes = self.sound2code(sound)
-         wave = self.code2sound(codes, prompt, duration=min_duration, guidance_scale=1.5, num_steps=steps, disable_progress=disable_progress)
-         return wave
-
- if __name__=="__main__":
-     ckpt_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "ckpt/mucodec.pt")
-     mucodec = MuCodec(model_path=ckpt_path,layer_num=7,load_main_model=True)
-
-     filelist = []
-
-     root_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "test_wav")
-     for f in [os.path.join(root_dir, f) for f in os.listdir(root_dir) if '.flac' in f or '.wav' in f or '.mp3' in f]:
-         a, fs = torchaudio.load(f)
-         if(fs!=48000):
-             a = torchaudio.functional.resample(a, fs, 48000)
-         if(a.shape[0]==1):
-             a = torch.cat([a,a],0)
-         ori_len = a.shape[-1]
-         filelist.append([a, '', [0, a.shape[-1]/48000.], f,ori_len])
-
-     reconstructed_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "reconstructed")
-
-     os.makedirs(reconstructed_dir, exist_ok=True)
-
-     for sample_idx, (orig_samples, lyric, st_et, fname,ori_len) in enumerate(filelist):
-         print(fname, lyric)
-         wave = mucodec.sound2sound(orig_samples,None)
-         wave = wave[:,0:ori_len]
-         torchaudio.save(os.path.join(reconstructed_dir, os.path.basename(fname)),wave.detach().cpu(), 48000)
-
 
MuCodec/libs/rvq/descript_quantize3.py DELETED
@@ -1,298 +0,0 @@
1
- from typing import Union
2
-
3
- import numpy as np
4
- import torch
5
- import torch.nn as nn
6
- import torch.nn.functional as F
7
- from einops import rearrange
8
- from torch.nn.utils import weight_norm
9
-
10
- def WNConv1d(*args, **kwargs):
11
- return weight_norm(nn.Conv1d(*args, **kwargs))
12
-
13
- class VectorQuantize(nn.Module):
14
- """
15
- Implementation of VQ similar to Karpathy's repo:
16
- https://github.com/karpathy/deep-vector-quantization
17
- Additionally uses following tricks from Improved VQGAN
18
- (https://arxiv.org/pdf/2110.04627.pdf):
19
- 1. Factorized codes: Perform nearest neighbor lookup in low-dimensional space
20
- for improved codebook usage
21
- 2. l2-normalized codes: Converts euclidean distance to cosine similarity which
22
- improves training stability
23
- """
24
-
25
- def __init__(self, input_dim: int, codebook_size: int, codebook_dim: int, stale_tolerance: int = 100):
26
- super().__init__()
27
- self.codebook_size = codebook_size
28
- self.codebook_dim = codebook_dim
29
-
30
- self.in_proj = WNConv1d(input_dim, codebook_dim, kernel_size=1)
31
- self.out_proj = WNConv1d(codebook_dim, input_dim, kernel_size=1)
32
- self.codebook = nn.Embedding(codebook_size, codebook_dim)
33
- self.register_buffer("stale_counter", torch.zeros(self.codebook_size,))
34
- self.stale_tolerance = stale_tolerance
35
-
36
- def forward(self, z):
37
- """Quantized the input tensor using a fixed codebook and returns
38
- the corresponding codebook vectors
39
-
40
- Parameters
41
- ----------
42
- z : Tensor[B x D x T]
43
-
44
- Returns
45
- -------
46
- Tensor[B x D x T]
47
- Quantized continuous representation of input
48
- Tensor[1]
49
- Commitment loss to train encoder to predict vectors closer to codebook
50
- entries
51
- Tensor[1]
52
- Codebook loss to update the codebook
53
- Tensor[B x T]
54
- Codebook indices (quantized discrete representation of input)
55
- Tensor[B x D x T]
56
- Projected latents (continuous representation of input before quantization)
57
- """
58
-
59
- # Factorized codes (ViT-VQGAN) Project input into low-dimensional space
60
- z_e = self.in_proj(z) # z_e : (B x D x T)
61
- z_q, indices = self.decode_latents(z_e)
62
-
63
- commitment_loss = F.mse_loss(z_e, z_q.detach(), reduction="none").mean([1, 2])
64
- codebook_loss = F.mse_loss(z_q, z_e.detach(), reduction="none").mean([1, 2])
65
-
66
- z_q = (
67
- z_e + (z_q - z_e).detach()
68
- ) # noop in forward pass, straight-through gradient estimator in backward pass
69
-
70
- z_q = self.out_proj(z_q)
71
-
72
- return z_q, commitment_loss, codebook_loss, indices, z_e
73
-
74
- def embed_code(self, embed_id):
75
- return F.embedding(embed_id, self.codebook.weight)
76
-
77
- def decode_code(self, embed_id):
78
- return self.embed_code(embed_id).transpose(1, 2)
79
-
80
- def decode_latents(self, latents):
81
- encodings = rearrange(latents, "b d t -> (b t) d")
82
- codebook = self.codebook.weight # codebook: (N x D)
83
-
84
- # L2 normalize encodings and codebook (ViT-VQGAN)
85
- encodings = F.normalize(encodings)
86
- codebook = F.normalize(codebook)
87
-
88
- # Compute euclidean distance with codebook
89
- dist = (
90
- encodings.pow(2).sum(1, keepdim=True)
91
- - 2 * encodings @ codebook.t()
92
- + codebook.pow(2).sum(1, keepdim=True).t()
93
- )
94
- indices = rearrange((-dist).max(1)[1], "(b t) -> b t", b=latents.size(0))
95
- z_q = self.decode_code(indices)
96
-
97
- if(self.training):
98
- onehots = torch.nn.functional.one_hot(indices, self.codebook_size).float() # B, T, codebook_size
99
- stale_codes = (onehots.sum(0).sum(0) == 0).float()
100
- self.stale_counter = self.stale_counter * stale_codes + stale_codes
101
-
102
- # random replace codes that haven't been used for a while
103
- replace_code = (self.stale_counter == self.stale_tolerance).float() # codebook_size
104
- if replace_code.sum(-1) > 0:
105
- print("Replace {} codes".format(replace_code.sum(-1)))
106
- random_input_idx = torch.randperm(encodings.shape[0])
107
- random_input = encodings[random_input_idx].view(encodings.shape)
108
- if random_input.shape[0] < self.codebook_size:
109
- random_input = torch.cat([random_input]*(self.codebook_size // random_input.shape[0] + 1), 0)
110
- random_input = random_input[:self.codebook_size,:].contiguous() # codebook_size, dim
111
-
112
- self.codebook.weight.data = self.codebook.weight.data * (1 - replace_code).unsqueeze(-1) + random_input * replace_code.unsqueeze(-1)
113
- self.stale_counter = self.stale_counter * (1 - replace_code)
114
-
115
- return z_q, indices
116
-
117
-
118
- class ResidualVectorQuantize(nn.Module):
119
- """
120
- Introduced in SoundStream: An end2end neural audio codec
121
- https://arxiv.org/abs/2107.03312
122
- """
123
-
124
- def __init__(
125
- self,
126
- input_dim: int = 512,
127
- n_codebooks: int = 9,
128
- codebook_size: int = 1024,
129
- codebook_dim: Union[int, list] = 8,
130
- quantizer_dropout: float = 0.0,
131
- stale_tolerance: int = 100,
132
- ):
133
- super().__init__()
134
- if isinstance(codebook_dim, int):
135
- codebook_dim = [codebook_dim for _ in range(n_codebooks)]
136
-
137
- self.n_codebooks = n_codebooks
138
- self.codebook_dim = codebook_dim
139
- self.codebook_size = codebook_size
140
-
141
- self.quantizers = nn.ModuleList(
142
- [
143
- VectorQuantize(input_dim, codebook_size, codebook_dim[i], stale_tolerance=stale_tolerance)
144
- for i in range(n_codebooks)
145
- ]
146
- )
147
- self.quantizer_dropout = quantizer_dropout
148
-
149
- def forward(self, z, n_quantizers: int = None):
150
- """Quantized the input tensor using a fixed set of `n` codebooks and returns
151
- the corresponding codebook vectors
152
- Parameters
153
- ----------
154
- z : Tensor[B x D x T]
155
- n_quantizers : int, optional
156
- No. of quantizers to use
157
- (n_quantizers < self.n_codebooks ex: for quantizer dropout)
158
- Note: if `self.quantizer_dropout` is True, this argument is ignored
159
- when in training mode, and a random number of quantizers is used.
160
- Returns
161
- -------
162
- dict
163
- A dictionary with the following keys:
164
-
165
- "z" : Tensor[B x D x T]
166
- Quantized continuous representation of input
167
- "codes" : Tensor[B x N x T]
168
- Codebook indices for each codebook
169
- (quantized discrete representation of input)
170
- "latents" : Tensor[B x N*D x T]
171
- Projected latents (continuous representation of input before quantization)
172
- "vq/commitment_loss" : Tensor[1]
173
- Commitment loss to train encoder to predict vectors closer to codebook
174
- entries
175
- "vq/codebook_loss" : Tensor[1]
176
- Codebook loss to update the codebook
177
- """
178
- z_q = 0
179
- residual = z
180
- commitment_loss = 0
181
- codebook_loss = 0
182
-
183
- codebook_indices = []
184
- latents = []
185
-
186
- if n_quantizers is None:
187
- n_quantizers = self.n_codebooks
188
- if self.training:
189
- n_quantizers = torch.ones((z.shape[0],)) * self.n_codebooks + 1
190
- dropout = torch.randint(1, self.n_codebooks + 1, (z.shape[0],))
191
- n_dropout = int(z.shape[0] * self.quantizer_dropout)
192
- n_quantizers[:n_dropout] = dropout[:n_dropout]
193
- n_quantizers = n_quantizers.to(z.device)
194
- else:
195
- n_quantizers = torch.ones((z.shape[0],)) * n_quantizers + 1
196
- n_quantizers = n_quantizers.to(z.device)
197
-
198
- for i, quantizer in enumerate(self.quantizers):
199
- # if self.training is False and i >= n_quantizers:
200
- # break
201
-
202
- z_q_i, commitment_loss_i, codebook_loss_i, indices_i, z_e_i = quantizer(
203
- residual
204
- )
205
-
206
- # Create mask to apply quantizer dropout
207
- mask = (
208
- torch.full((z.shape[0],), fill_value=i, device=z.device) < n_quantizers
209
- )
210
- z_q = z_q + z_q_i * mask[:, None, None]
211
- residual = residual - z_q_i
212
-
213
- # Sum losses
214
- commitment_loss += (commitment_loss_i * mask).mean()
215
- codebook_loss += (codebook_loss_i * mask).mean()
216
-
217
- codebook_indices.append(indices_i)
218
- latents.append(z_e_i)
219
-
220
- codes = torch.stack(codebook_indices, dim=1)
221
- latents = torch.cat(latents, dim=1)
222
-
223
- encodings = F.one_hot(codes, self.codebook_size).float() # B N T 1024
224
- for n in range(encodings.shape[1]):
225
- print("Lyaer {}, Ratio of unused vector : {:.1f}".format(n,
226
- (encodings[:,n,:,:].sum(0).sum(0) < 1.0).sum()/torch.numel(encodings[:,n,:,:].sum(0).sum(0) < 1.0) * 100.
227
- ))
228
-
229
- return z_q, codes, latents, commitment_loss, codebook_loss, n_quantizers.clamp(max=self.n_codebooks).long() - 1
230
-
231
- def from_codes(self, codes: torch.Tensor):
232
- """Given the quantized codes, reconstruct the continuous representation
233
- Parameters
234
- ----------
235
- codes : Tensor[B x N x T]
236
- Quantized discrete representation of input
237
- Returns
238
- -------
239
- Tensor[B x D x T]
240
- Quantized continuous representation of input
241
- """
242
- z_q = 0.0
243
- z_p = []
244
- n_codebooks = codes.shape[1]
245
- for i in range(n_codebooks):
246
- z_p_i = self.quantizers[i].decode_code(codes[:, i, :])
247
- z_p.append(z_p_i)
248
-
249
- z_q_i = self.quantizers[i].out_proj(z_p_i)
250
- z_q = z_q + z_q_i
251
- return z_q, torch.cat(z_p, dim=1), codes
252
-
253
- def from_latents(self, latents: torch.Tensor):
254
- """Given the unquantized latents, reconstruct the
255
- continuous representation after quantization.
256
-
257
- Parameters
258
- ----------
259
- latents : Tensor[B x N x T]
260
- Continuous representation of input after projection
261
-
262
- Returns
263
- -------
264
- Tensor[B x D x T]
265
- Quantized representation of full-projected space
266
- Tensor[B x D x T]
267
- Quantized representation of latent space
268
- """
269
- z_q = 0
270
- z_p = []
271
- codes = []
272
- dims = np.cumsum([0] + [q.codebook_dim for q in self.quantizers])
273
-
274
- n_codebooks = np.where(dims <= latents.shape[1])[0].max(axis=0, keepdims=True)[
275
- 0
276
- ]
277
- for i in range(n_codebooks):
278
- j, k = dims[i], dims[i + 1]
279
- z_p_i, codes_i = self.quantizers[i].decode_latents(latents[:, j:k, :])
280
- z_p.append(z_p_i)
281
- codes.append(codes_i)
282
-
283
- z_q_i = self.quantizers[i].out_proj(z_p_i)
284
- z_q = z_q + z_q_i
285
-
286
- return z_q, torch.cat(z_p, dim=1), torch.stack(codes, dim=1)
287
-
288
-
289
- if __name__ == "__main__":
290
- rvq = ResidualVectorQuantize(input_dim = 1024, n_codebooks = 4, codebook_size = 1024, codebook_dim = 32, quantizer_dropout = 0.0)
291
- x = torch.randn(16, 1024, 80)
292
- quantized_prompt_embeds, codes, _, commitment_loss, codebook_loss, rvq_usage = rvq(x)
293
- print(quantized_prompt_embeds.shape)
294
- print(codes.shape)
295
- # w/o reconstruction
296
- loss = commitment_loss * 0.25 + codebook_loss * 1.0
297
- # w/ reconstruction
298
- loss = commitment_loss * 0.25 + codebook_loss * 1.0 + (x - quantized_prompt_embeds).abs().mean()
 
MuCodec/model.py DELETED
@@ -1,367 +0,0 @@
1
- import yaml
2
- import random
3
- import inspect
4
- import numpy as np
5
- from tqdm import tqdm
6
- import typing as tp
7
- from abc import ABC
8
-
9
- import torch
10
- import torch.nn as nn
11
- import torch.nn.functional as F
12
- import torchaudio
13
-
14
- from einops import repeat
15
- from tools.torch_tools import wav_to_fbank
16
- import os
17
- import diffusers
18
- from diffusers.utils.torch_utils import randn_tensor
19
- from diffusers import DDPMScheduler
20
- from models.transformer_2d_flow import Transformer2DModel
21
- from libs.rvq.descript_quantize3 import ResidualVectorQuantize
22
- from torch.cuda.amp import autocast
23
- from muq_dev.test import load_model
24
-
25
-
26
-
27
-
28
- class SampleProcessor(torch.nn.Module):
29
- def project_sample(self, x: torch.Tensor):
30
- """Project the original sample to the 'space' where the diffusion will happen."""
31
- return x
32
-
33
- def return_sample(self, z: torch.Tensor):
34
- """Project back from diffusion space to the actual sample space."""
35
- return z
36
-
37
- class Feature2DProcessor(SampleProcessor):
38
- def __init__(self, dim: int = 8, power_std: tp.Union[float, tp.List[float], torch.Tensor] = 1., \
39
- num_samples: int = 100_000):
40
- super().__init__()
41
- self.num_samples = num_samples
42
- self.dim = dim
43
- self.power_std = power_std
44
- self.register_buffer('counts', torch.zeros(1))
45
- self.register_buffer('sum_x', torch.zeros(dim, 32))
46
- self.register_buffer('sum_x2', torch.zeros(dim, 32))
47
- self.register_buffer('sum_target_x2', torch.zeros(dim, 32))
48
- self.counts: torch.Tensor
49
- self.sum_x: torch.Tensor
50
- self.sum_x2: torch.Tensor
51
-
52
- @property
53
- def mean(self):
54
- mean = self.sum_x / self.counts
55
- return mean
56
-
57
- @property
58
- def std(self):
59
- std = (self.sum_x2 / self.counts - self.mean**2).clamp(min=0).sqrt()
60
- return std
61
-
62
- @property
63
- def target_std(self):
64
- return 1
65
-
66
- def project_sample(self, x: torch.Tensor):
67
- assert x.dim() == 4
68
- if self.counts.item() < self.num_samples:
69
- self.counts += len(x)
70
- self.sum_x += x.mean(dim=(2,)).sum(dim=0)
71
- self.sum_x2 += x.pow(2).mean(dim=(2,)).sum(dim=0)
72
- rescale = (self.target_std / self.std.clamp(min=1e-12)) ** self.power_std # same output size
73
- x = (x - self.mean.view(1, -1, 1, 32).contiguous()) * rescale.view(1, -1, 1, 32).contiguous()
74
- return x
75
-
76
- def return_sample(self, x: torch.Tensor):
77
- assert x.dim() == 4
78
- rescale = (self.std / self.target_std) ** self.power_std
79
- x = x * rescale.view(1, -1, 1, 32).contiguous() + self.mean.view(1, -1, 1, 32).contiguous()
80
- return x
81
-
82
-
83
- class BASECFM(torch.nn.Module, ABC):
84
- def __init__(
85
- self,
86
- estimator,
87
- ):
88
- super().__init__()
89
- self.sigma_min = 1e-4
90
-
91
- self.estimator = estimator
92
-
93
- @torch.inference_mode()
94
- def forward(self, mu, n_timesteps, temperature=1.0):
95
- """Forward diffusion
96
-
97
- Args:
98
- mu (torch.Tensor): output of encoder
99
- shape: (batch_size, n_channels, mel_timesteps, n_feats)
100
- n_timesteps (int): number of diffusion steps
101
- temperature (float, optional): temperature for scaling noise. Defaults to 1.0.
102
-
103
- Returns:
104
- sample: generated mel-spectrogram
105
- shape: (batch_size, n_channels, mel_timesteps, n_feats)
106
- """
107
- z = torch.randn_like(mu) * temperature
108
- t_span = torch.linspace(0, 1, n_timesteps + 1, device=mu.device)
109
- return self.solve_euler(z, t_span=t_span)
110
-
111
- def solve_euler(self, x, incontext_x, incontext_length, t_span, mu, added_cond_kwargs, guidance_scale):
112
- """
113
- Fixed euler solver for ODEs.
114
- Args:
115
- x (torch.Tensor): random noise
116
- t_span (torch.Tensor): n_timesteps interpolated
117
- shape: (n_timesteps + 1,)
118
- mu (torch.Tensor): output of encoder
119
- shape: (batch_size, n_channels, mel_timesteps, n_feats)
120
- """
121
- t, _, dt = t_span[0], t_span[-1], t_span[1] - t_span[0]
122
- noise = x.clone()
123
-
124
- # I am storing this because I can later plot it by putting a debugger here and saving it to a file
125
- # Or in future might add like a return_all_steps flag
126
- sol = []
127
-
128
- for step in tqdm(range(1, len(t_span))):
129
- x[:,:,0:incontext_length,:] = (1 - (1 - self.sigma_min) * t) * noise[:,:,0:incontext_length,:] + t * incontext_x[:,:,0:incontext_length,:]
130
- if(guidance_scale > 1.0):
131
- dphi_dt = self.estimator( \
132
- torch.cat([ \
133
- torch.cat([x, x], 0), \
134
- torch.cat([incontext_x, incontext_x], 0), \
135
- torch.cat([torch.zeros_like(mu), mu], 0), \
136
- ], 1), \
137
- timestep = t.unsqueeze(-1).repeat(2), \
138
- added_cond_kwargs={k:torch.cat([v,v],0) for k,v in added_cond_kwargs.items()}).sample
139
- dphi_dt_uncond, dhpi_dt_cond = dphi_dt.chunk(2,0)
140
- dphi_dt = dphi_dt_uncond + guidance_scale * (dhpi_dt_cond - dphi_dt_uncond)
141
- else:
142
- dphi_dt = self.estimator(torch.cat([x, incontext_x, mu], 1), \
143
- timestep = t.unsqueeze(-1),
144
- added_cond_kwargs=added_cond_kwargs).sample
145
-
146
- x = x + dt * dphi_dt
147
- t = t + dt
148
- sol.append(x)
149
- if step < len(t_span) - 1:
150
- dt = t_span[step + 1] - t
151
-
152
- return sol[-1]
153
-
154
-
155
- class PromptCondAudioDiffusion(nn.Module):
156
- def __init__(
157
- self,
158
- num_channels,
159
- unet_model_name=None,
160
- unet_model_config_path=None,
161
- snr_gamma=None,
162
- uncondition=True,
163
- out_paint=False,
164
- ):
165
- super().__init__()
166
-
167
- assert unet_model_name is not None or unet_model_config_path is not None, "Either UNet pretrain model name or a config file path is required"
168
-
169
- self.unet_model_name = unet_model_name
170
- self.unet_model_config_path = unet_model_config_path
171
- self.snr_gamma = snr_gamma
172
- self.uncondition = uncondition
173
- self.num_channels = num_channels
174
-
175
- # https://huggingface.co/docs/diffusers/v0.14.0/en/api/schedulers/overview
176
- self.normfeat = Feature2DProcessor(dim=num_channels)
177
-
178
- self.sample_rate = 48000
179
- self.rsp48toclap = torchaudio.transforms.Resample(48000, 24000)
180
- self.rsq48towav2vec = torchaudio.transforms.Resample(48000, 16000)
181
- muencoder_dir = "muq_dev/muq_fairseq"
182
- muencoder_ckpt = "muq_dev/muq.pt"
183
-
184
- self.muencoder = load_model(
185
- model_dir=os.path.abspath(muencoder_dir),
186
- checkpoint_dir=os.path.abspath(muencoder_ckpt),
187
- )
188
- self.rsq48tomuencoder = torchaudio.transforms.Resample(48000, 24000)
189
- for v in self.muencoder.parameters():v.requires_grad = False
190
- self.rvq_muencoder_emb = ResidualVectorQuantize(input_dim = 1024, n_codebooks = 1, codebook_size = 16_384, codebook_dim = 32, quantizer_dropout = 0.0, stale_tolerance=200)
191
- self.cond_muencoder_emb = nn.Linear(1024, 16*32)
192
- self.zero_cond_embedding1 = nn.Parameter(torch.randn(32*32,))
193
-
194
- unet = Transformer2DModel.from_config(
195
- unet_model_config_path,
196
- )
197
- self.set_from = "random"
198
- self.cfm_wrapper = BASECFM(unet)
199
- print("Transformer initialized from pretrain.")
200
-
201
-
202
- def compute_snr(self, timesteps):
203
- """
204
- Computes SNR as per https://github.com/TiankaiHang/Min-SNR-Diffusion-Training/blob/521b624bd70c67cee4bdf49225915f5945a872e3/guided_diffusion/gaussian_diffusion.py#L847-L849
205
- """
206
- alphas_cumprod = self.noise_scheduler.alphas_cumprod
207
- sqrt_alphas_cumprod = alphas_cumprod**0.5
208
- sqrt_one_minus_alphas_cumprod = (1.0 - alphas_cumprod) ** 0.5
209
-
210
- # Expand the tensors.
211
- # Adapted from https://github.com/TiankaiHang/Min-SNR-Diffusion-Training/blob/521b624bd70c67cee4bdf49225915f5945a872e3/guided_diffusion/gaussian_diffusion.py#L1026
212
- sqrt_alphas_cumprod = sqrt_alphas_cumprod.to(device=timesteps.device)[timesteps].float()
213
- while len(sqrt_alphas_cumprod.shape) < len(timesteps.shape):
214
- sqrt_alphas_cumprod = sqrt_alphas_cumprod[..., None]
215
- alpha = sqrt_alphas_cumprod.expand(timesteps.shape)
216
-
217
- sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod.to(device=timesteps.device)[timesteps].float()
218
- while len(sqrt_one_minus_alphas_cumprod.shape) < len(timesteps.shape):
219
- sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod[..., None]
220
- sigma = sqrt_one_minus_alphas_cumprod.expand(timesteps.shape)
221
-
222
- # Compute SNR.
223
- snr = (alpha / sigma) ** 2
224
- return snr
225
-
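
With alpha_bar_t = alphas_cumprod[t], the quantity computed above reduces to SNR(t) = alpha_bar_t / (1 - alpha_bar_t); the reshaping only aligns tensor shapes for broadcasting. A compact equivalent, shown with an illustrative linear-beta schedule rather than this model's scheduler:

import torch

def snr_from_alphas_cumprod(alphas_cumprod, timesteps):
    # SNR(t) = alpha_bar_t / (1 - alpha_bar_t), gathered at each sample's timestep.
    a = alphas_cumprod.to(timesteps.device)[timesteps].float()
    return a / (1.0 - a)

betas = torch.linspace(1e-4, 0.02, 1000)              # illustrative DDPM-style schedule
alphas_cumprod = torch.cumprod(1.0 - betas, dim=0)
print(snr_from_alphas_cumprod(alphas_cumprod, torch.tensor([0, 500, 999])))
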
226
- def preprocess_audio(self, input_audios, threshold=0.9):
227
- assert len(input_audios.shape) == 2, input_audios.shape
228
- norm_value = torch.ones_like(input_audios[:,0])
229
- max_volume = input_audios.abs().max(dim=-1)[0]
230
- norm_value[max_volume>threshold] = max_volume[max_volume>threshold] / threshold
231
- return input_audios/norm_value.unsqueeze(-1)
232
-
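
The preprocessing above only rescales clips whose absolute peak exceeds the 0.9 threshold; quieter clips pass through unchanged. A standalone illustration of the same rule:

import torch

def peak_limit(wav: torch.Tensor, threshold: float = 0.9) -> torch.Tensor:
    # wav: (batch, samples). Rescale only the clips that peak above the threshold.
    peak = wav.abs().max(dim=-1, keepdim=True).values
    scale = torch.where(peak > threshold, peak / threshold, torch.ones_like(peak))
    return wav / scale

print(peak_limit(torch.full((1, 4), 1.2)).abs().max())   # -> 0.9
print(peak_limit(torch.full((1, 4), 0.5)).abs().max())   # -> 0.5 (unchanged)
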
233
-
234
-
235
-
236
- def extract_muencoder_embeds(self, input_audio_0,input_audio_1,layer):
237
- input_wav_mean = (input_audio_0 + input_audio_1) / 2.0
238
- input_wav_mean = self.muencoder(self.rsq48tomuencoder(input_wav_mean), features_only = True)
239
- layer_results = input_wav_mean['layer_results']
240
- muencoder_emb = layer_results[layer]
241
- muencoder_emb = muencoder_emb.permute(0,2,1).contiguous()
242
- return muencoder_emb
243
-
244
-
245
-
246
-
247
- def init_device_dtype(self, device, dtype):
248
- self.device = device
249
- self.dtype = dtype
250
-
251
- @torch.no_grad()
252
- def fetch_codes(self, input_audios, additional_feats,layer):
253
- input_audio_0 = input_audios[[0],:]
254
- input_audio_1 = input_audios[[1],:]
255
- input_audio_0 = self.preprocess_audio(input_audio_0)
256
- input_audio_1 = self.preprocess_audio(input_audio_1)
257
-
258
- self.muencoder.eval()
259
-
260
-
261
- muencoder_emb = self.extract_muencoder_embeds(input_audio_0,input_audio_1,layer)
262
- muencoder_emb = muencoder_emb.detach()
263
-
264
- self.rvq_muencoder_emb.eval()
265
- quantized_muencoder_emb, codes_muencoder_emb, *_ = self.rvq_muencoder_emb(muencoder_emb)
266
-
267
-
268
- spk_embeds = None
269
-
270
-
271
- return [codes_muencoder_emb], [muencoder_emb], spk_embeds
272
- @torch.no_grad()
273
- def fetch_codes_batch(self, input_audios, additional_feats,layer):
274
- input_audio_0 = input_audios[:,0,:]
275
- input_audio_1 = input_audios[:,1,:]
276
- input_audio_0 = self.preprocess_audio(input_audio_0)
277
- input_audio_1 = self.preprocess_audio(input_audio_1)
278
-
279
- self.muencoder.eval()
280
-
281
-
282
- muencoder_emb = self.extract_muencoder_embeds(input_audio_0,input_audio_1,layer)
283
- muencoder_emb = muencoder_emb.detach()
284
-
285
- self.rvq_muencoder_emb.eval()
286
- quantized_muencoder_emb, codes_muencoder_emb, *_ = self.rvq_muencoder_emb(muencoder_emb) # b,d,t
287
-
288
- spk_embeds = None
289
-
290
- return [codes_muencoder_emb], [muencoder_emb], spk_embeds
291
- @torch.no_grad()
292
- def inference_codes(self, codes, spk_embeds, true_latents, latent_length,incontext_length, additional_feats,
293
- guidance_scale=2, num_steps=20,
294
- disable_progress=True, scenario='start_seg'):
295
- classifier_free_guidance = guidance_scale > 1.0
296
- device = self.device
297
- dtype = self.dtype
298
- codes_muencoder_emb = codes[0]
299
-
300
-
301
- batch_size = codes_muencoder_emb.shape[0]
302
-
303
-
304
- quantized_muencoder_emb,_,_=self.rvq_muencoder_emb.from_codes(codes_muencoder_emb)
305
-
306
- quantized_muencoder_emb = self.cond_muencoder_emb(quantized_muencoder_emb.permute(0,2,1)) # b t 16*32
307
- quantized_muencoder_emb = quantized_muencoder_emb.reshape(quantized_muencoder_emb.shape[0], quantized_muencoder_emb.shape[1]//2, 2, 16, 32).reshape(quantized_muencoder_emb.shape[0], quantized_muencoder_emb.shape[1]//2, 2*16, 32).permute(0,2,1,3).contiguous() # b 32 t f
308
-
309
-
310
- num_frames = quantized_muencoder_emb.shape[-2]
311
-
312
- num_channels_latents = self.num_channels
313
- latents = self.prepare_latents(batch_size, num_frames, num_channels_latents, dtype, device)
314
-
315
- bsz, _, height, width = latents.shape
316
- resolution = torch.tensor([height, width]).repeat(bsz, 1)
317
- aspect_ratio = torch.tensor([float(height / width)]).repeat(bsz, 1)
318
- resolution = resolution.to(dtype=quantized_muencoder_emb.dtype, device=device)
319
- aspect_ratio = aspect_ratio.to(dtype=quantized_muencoder_emb.dtype, device=device)
320
- if classifier_free_guidance:
321
- resolution = torch.cat([resolution, resolution], 0)
322
- aspect_ratio = torch.cat([aspect_ratio, aspect_ratio], 0)
323
- added_cond_kwargs = {"resolution": resolution, "aspect_ratio": aspect_ratio}
324
-
325
- latent_masks = torch.zeros(latents.shape[0], latents.shape[2], dtype=torch.int64, device=latents.device)
326
- latent_masks[:,0:latent_length] = 2
327
- if(scenario=='other_seg'):
328
- latent_masks[:,0:incontext_length] = 1
329
-
330
-
331
-
332
- quantized_muencoder_emb = (latent_masks > 0.5).unsqueeze(1).unsqueeze(-1) * quantized_muencoder_emb \
333
- + (latent_masks < 0.5).unsqueeze(1).unsqueeze(-1) * self.zero_cond_embedding1.reshape(1,32,1,32)
334
- true_latents = self.normfeat.project_sample(true_latents)
335
- incontext_latents = true_latents * ((latent_masks > 0.5) * (latent_masks < 1.5)).unsqueeze(1).unsqueeze(-1).float()
336
- incontext_length = ((latent_masks > 0.5) * (latent_masks < 1.5)).sum(-1)[0]
337
-
338
- additional_model_input = torch.cat([quantized_muencoder_emb],1)
339
-
340
- temperature = 1.0
341
- t_span = torch.linspace(0, 1, num_steps + 1, device=quantized_muencoder_emb.device)
342
- latents = self.cfm_wrapper.solve_euler(latents * temperature, incontext_latents, incontext_length, t_span, additional_model_input, added_cond_kwargs, guidance_scale)
343
-
344
- latents[:,:,0:incontext_length,:] = incontext_latents[:,:,0:incontext_length,:]
345
- latents = self.normfeat.return_sample(latents)
346
- return latents
347
-
348
- @torch.no_grad()
349
- def inference(self, input_audios, lyric, true_latents, latent_length, additional_feats, guidance_scale=2, num_steps=20,
350
- disable_progress=True,layer=5,scenario='start_seg'):
351
- codes, embeds, spk_embeds = self.fetch_codes(input_audios, additional_feats,layer)
352
-
353
- latents = self.inference_codes(codes, spk_embeds, true_latents, latent_length, additional_feats, \
354
- guidance_scale=guidance_scale, num_steps=num_steps, \
355
- disable_progress=disable_progress,scenario=scenario)
356
- return latents
357
-
358
- def prepare_latents(self, batch_size, num_frames, num_channels_latents, dtype, device):
359
- divisor = 4
360
- shape = (batch_size, num_channels_latents, num_frames, 32)
361
- if(num_frames%divisor>0):
362
- num_frames = round(num_frames/float(divisor))*divisor
363
- shape = (batch_size, num_channels_latents, num_frames, 32)
364
- latents = randn_tensor(shape, generator=None, device=device, dtype=dtype)
365
- return latents
366
-
367
-
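
Taken together, inference in this deleted module chains fetch_codes (MuEncoder features quantized by the RVQ) with inference_codes, which builds the conditioning, keeps in-context frames from the true latents, runs the Euler flow-matching sampler, and finally undoes the feature normalization with normfeat.return_sample. The only subtlety in prepare_latents is that the frame count is rounded to a multiple of 4; a small self-contained sketch of that rounding (illustrative only):

def rounded_latent_shape(batch_size: int, num_frames: int, num_channels: int, divisor: int = 4):
    # Mirrors prepare_latents above: the time axis is rounded to the nearest multiple of `divisor`.
    if num_frames % divisor:
        num_frames = round(num_frames / float(divisor)) * divisor
    return (batch_size, num_channels, num_frames, 32)

print(rounded_latent_shape(1, 150, 32))   # -> (1, 32, 152, 32)
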
 
MuCodec/models/attention.py DELETED
@@ -1,682 +0,0 @@
1
- # Copyright 2023 The HuggingFace Team. All rights reserved.
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
- from typing import Any, Dict, Optional
15
-
16
- import torch
17
- import torch.nn.functional as F
18
- from torch import nn
19
-
20
- from diffusers.utils import USE_PEFT_BACKEND
21
- from diffusers.utils.torch_utils import maybe_allow_in_graph
22
- from diffusers.models.activations import GEGLU, GELU, ApproximateGELU
23
- from diffusers.models.attention_processor import Attention
24
- from diffusers.models.embeddings import SinusoidalPositionalEmbedding
25
- from diffusers.models.lora import LoRACompatibleLinear
26
- from diffusers.models.normalization import AdaLayerNorm, AdaLayerNormContinuous, AdaLayerNormZero, RMSNorm
27
-
28
-
29
- def _chunked_feed_forward(
30
- ff: nn.Module, hidden_states: torch.Tensor, chunk_dim: int, chunk_size: int, lora_scale: Optional[float] = None
31
- ):
32
- # "feed_forward_chunk_size" can be used to save memory
33
- if hidden_states.shape[chunk_dim] % chunk_size != 0:
34
- raise ValueError(
35
- f"`hidden_states` dimension to be chunked: {hidden_states.shape[chunk_dim]} has to be divisible by chunk size: {chunk_size}. Make sure to set an appropriate `chunk_size` when calling `unet.enable_forward_chunking`."
36
- )
37
-
38
- num_chunks = hidden_states.shape[chunk_dim] // chunk_size
39
- if lora_scale is None:
40
- ff_output = torch.cat(
41
- [ff(hid_slice) for hid_slice in hidden_states.chunk(num_chunks, dim=chunk_dim)],
42
- dim=chunk_dim,
43
- )
44
- else:
45
- # TODO(Patrick): LoRA scale can be removed once PEFT refactor is complete
46
- ff_output = torch.cat(
47
- [ff(hid_slice, scale=lora_scale) for hid_slice in hidden_states.chunk(num_chunks, dim=chunk_dim)],
48
- dim=chunk_dim,
49
- )
50
-
51
- return ff_output
52
-
53
-
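
The helper above trades peak activation memory for extra kernel launches by slicing the sequence before the feed-forward expansion. A standalone illustration with a generic feed-forward module (illustrative names, LoRA scaling omitted):

import torch
from torch import nn

def chunked_ff(ff: nn.Module, x: torch.Tensor, chunk_size: int, dim: int = 1) -> torch.Tensor:
    # Only `chunk_size` tokens are expanded to the feed-forward inner width at any one time.
    assert x.shape[dim] % chunk_size == 0, "sequence length must be divisible by chunk_size"
    return torch.cat([ff(part) for part in x.chunk(x.shape[dim] // chunk_size, dim=dim)], dim=dim)

ff = nn.Sequential(nn.Linear(64, 256), nn.GELU(), nn.Linear(256, 64))
out = chunked_ff(ff, torch.randn(2, 1024, 64), chunk_size=256)    # (2, 1024, 64)
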
54
- @maybe_allow_in_graph
55
- class GatedSelfAttentionDense(nn.Module):
56
- r"""
57
- A gated self-attention dense layer that combines visual features and object features.
58
-
59
- Parameters:
60
- query_dim (`int`): The number of channels in the query.
61
- context_dim (`int`): The number of channels in the context.
62
- n_heads (`int`): The number of heads to use for attention.
63
- d_head (`int`): The number of channels in each head.
64
- """
65
-
66
- def __init__(self, query_dim: int, context_dim: int, n_heads: int, d_head: int):
67
- super().__init__()
68
-
69
- # we need a linear projection since we need cat visual feature and obj feature
70
- self.linear = nn.Linear(context_dim, query_dim)
71
-
72
- self.attn = Attention(query_dim=query_dim, heads=n_heads, dim_head=d_head)
73
- self.ff = FeedForward(query_dim, activation_fn="geglu")
74
-
75
- self.norm1 = nn.LayerNorm(query_dim)
76
- self.norm2 = nn.LayerNorm(query_dim)
77
-
78
- self.register_parameter("alpha_attn", nn.Parameter(torch.tensor(0.0)))
79
- self.register_parameter("alpha_dense", nn.Parameter(torch.tensor(0.0)))
80
-
81
- self.enabled = True
82
-
83
- def forward(self, x: torch.Tensor, objs: torch.Tensor) -> torch.Tensor:
84
- if not self.enabled:
85
- return x
86
-
87
- n_visual = x.shape[1]
88
- objs = self.linear(objs)
89
-
90
- x = x + self.alpha_attn.tanh() * self.attn(self.norm1(torch.cat([x, objs], dim=1)))[:, :n_visual, :]
91
- x = x + self.alpha_dense.tanh() * self.ff(self.norm2(x))
92
-
93
- return x
94
-
95
-
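
The zero-initialized alpha_attn and alpha_dense gates above make each injected branch start as an identity mapping and fade in as tanh(alpha) moves away from zero during training. The pattern in isolation (illustrative):

import torch
from torch import nn

class ZeroGatedBranch(nn.Module):
    def __init__(self, branch: nn.Module):
        super().__init__()
        self.branch = branch
        self.alpha = nn.Parameter(torch.tensor(0.0))   # tanh(0) = 0, so the branch is off at init

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return x + self.alpha.tanh() * self.branch(x)

block = ZeroGatedBranch(nn.Linear(8, 8))
x = torch.randn(2, 8)
assert torch.allclose(block(x), x)   # identity at initialization
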
96
- @maybe_allow_in_graph
97
- class BasicTransformerBlock(nn.Module):
98
- r"""
99
- A basic Transformer block.
100
-
101
- Parameters:
102
- dim (`int`): The number of channels in the input and output.
103
- num_attention_heads (`int`): The number of heads to use for multi-head attention.
104
- attention_head_dim (`int`): The number of channels in each head.
105
- dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
106
- cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention.
107
- activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
108
- num_embeds_ada_norm (:
109
- obj: `int`, *optional*): The number of diffusion steps used during training. See `Transformer2DModel`.
110
- attention_bias (:
111
- obj: `bool`, *optional*, defaults to `False`): Configure if the attentions should contain a bias parameter.
112
- only_cross_attention (`bool`, *optional*):
113
- Whether to use only cross-attention layers. In this case two cross attention layers are used.
114
- double_self_attention (`bool`, *optional*):
115
- Whether to use two self-attention layers. In this case no cross attention layers are used.
116
- upcast_attention (`bool`, *optional*):
117
- Whether to upcast the attention computation to float32. This is useful for mixed precision training.
118
- norm_elementwise_affine (`bool`, *optional*, defaults to `True`):
119
- Whether to use learnable elementwise affine parameters for normalization.
120
- norm_type (`str`, *optional*, defaults to `"layer_norm"`):
121
- The normalization layer to use. Can be `"layer_norm"`, `"ada_norm"` or `"ada_norm_zero"`.
122
- final_dropout (`bool` *optional*, defaults to False):
123
- Whether to apply a final dropout after the last feed-forward layer.
124
- attention_type (`str`, *optional*, defaults to `"default"`):
125
- The type of attention to use. Can be `"default"` or `"gated"` or `"gated-text-image"`.
126
- positional_embeddings (`str`, *optional*, defaults to `None`):
127
- The type of positional embeddings to apply to.
128
- num_positional_embeddings (`int`, *optional*, defaults to `None`):
129
- The maximum number of positional embeddings to apply.
130
- """
131
-
132
- def __init__(
133
- self,
134
- dim: int,
135
- num_attention_heads: int,
136
- attention_head_dim: int,
137
- dropout=0.0,
138
- cross_attention_dim: Optional[int] = None,
139
- activation_fn: str = "geglu",
140
- num_embeds_ada_norm: Optional[int] = None,
141
- attention_bias: bool = False,
142
- only_cross_attention: bool = False,
143
- double_self_attention: bool = False,
144
- upcast_attention: bool = False,
145
- norm_elementwise_affine: bool = True,
146
- norm_type: str = "layer_norm", # 'layer_norm', 'ada_norm', 'ada_norm_zero', 'ada_norm_single'
147
- norm_eps: float = 1e-5,
148
- final_dropout: bool = False,
149
- attention_type: str = "default",
150
- positional_embeddings: Optional[str] = None,
151
- num_positional_embeddings: Optional[int] = None,
152
- ada_norm_continous_conditioning_embedding_dim: Optional[int] = None,
153
- ada_norm_bias: Optional[int] = None,
154
- ff_inner_dim: Optional[int] = None,
155
- ff_bias: bool = True,
156
- attention_out_bias: bool = True,
157
- ):
158
- super().__init__()
159
- self.only_cross_attention = only_cross_attention
160
-
161
- self.use_ada_layer_norm_zero = (num_embeds_ada_norm is not None) and norm_type == "ada_norm_zero"
162
- self.use_ada_layer_norm = (num_embeds_ada_norm is not None) and norm_type == "ada_norm"
163
- self.use_ada_layer_norm_single = norm_type == "ada_norm_single"
164
- self.use_layer_norm = norm_type == "layer_norm"
165
- self.use_ada_layer_norm_continuous = norm_type == "ada_norm_continuous"
166
-
167
- if norm_type in ("ada_norm", "ada_norm_zero") and num_embeds_ada_norm is None:
168
- raise ValueError(
169
- f"`norm_type` is set to {norm_type}, but `num_embeds_ada_norm` is not defined. Please make sure to"
170
- f" define `num_embeds_ada_norm` if setting `norm_type` to {norm_type}."
171
- )
172
-
173
- if positional_embeddings and (num_positional_embeddings is None):
174
- raise ValueError(
175
- "If `positional_embeddings` type is defined, `num_positional_embeddings` must also be defined."
176
- )
177
-
178
- if positional_embeddings == "sinusoidal":
179
- self.pos_embed = SinusoidalPositionalEmbedding(dim, max_seq_length=num_positional_embeddings)
180
- else:
181
- self.pos_embed = None
182
-
183
- # Define 3 blocks. Each block has its own normalization layer.
184
- # 1. Self-Attn
185
- if self.use_ada_layer_norm:
186
- self.norm1 = AdaLayerNorm(dim, num_embeds_ada_norm)
187
- elif self.use_ada_layer_norm_zero:
188
- self.norm1 = AdaLayerNormZero(dim, num_embeds_ada_norm)
189
- elif self.use_ada_layer_norm_continuous:
190
- self.norm1 = AdaLayerNormContinuous(
191
- dim,
192
- ada_norm_continous_conditioning_embedding_dim,
193
- norm_elementwise_affine,
194
- norm_eps,
195
- ada_norm_bias,
196
- "rms_norm",
197
- )
198
- else:
199
- self.norm1 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps)
200
-
201
- self.attn1 = Attention(
202
- query_dim=dim,
203
- heads=num_attention_heads,
204
- dim_head=attention_head_dim,
205
- dropout=dropout,
206
- bias=attention_bias,
207
- cross_attention_dim=cross_attention_dim if only_cross_attention else None,
208
- upcast_attention=upcast_attention,
209
- out_bias=attention_out_bias,
210
- )
211
-
212
- # 2. Cross-Attn
213
- if cross_attention_dim is not None or double_self_attention:
214
- # We currently only use AdaLayerNormZero for self attention where there will only be one attention block.
215
- # I.e. the number of returned modulation chunks from AdaLayerZero would not make sense if returned during
216
- # the second cross attention block.
217
- if self.use_ada_layer_norm:
218
- self.norm2 = AdaLayerNorm(dim, num_embeds_ada_norm)
219
- elif self.use_ada_layer_norm_continuous:
220
- self.norm2 = AdaLayerNormContinuous(
221
- dim,
222
- ada_norm_continous_conditioning_embedding_dim,
223
- norm_elementwise_affine,
224
- norm_eps,
225
- ada_norm_bias,
226
- "rms_norm",
227
- )
228
- else:
229
- self.norm2 = nn.LayerNorm(dim, norm_eps, norm_elementwise_affine)
230
-
231
- self.attn2 = Attention(
232
- query_dim=dim,
233
- cross_attention_dim=cross_attention_dim if not double_self_attention else None,
234
- heads=num_attention_heads,
235
- dim_head=attention_head_dim,
236
- dropout=dropout,
237
- bias=attention_bias,
238
- upcast_attention=upcast_attention,
239
- out_bias=attention_out_bias,
240
- ) # is self-attn if encoder_hidden_states is none
241
- else:
242
- self.norm2 = nn.LayerNorm(dim, norm_eps, norm_elementwise_affine)
243
- self.attn2 = None
244
-
245
- # 3. Feed-forward
246
- if self.use_ada_layer_norm_continuous:
247
- self.norm3 = AdaLayerNormContinuous(
248
- dim,
249
- ada_norm_continous_conditioning_embedding_dim,
250
- norm_elementwise_affine,
251
- norm_eps,
252
- ada_norm_bias,
253
- "layer_norm",
254
- )
255
- elif not self.use_ada_layer_norm_single:
256
- self.norm3 = nn.LayerNorm(dim, norm_eps, norm_elementwise_affine)
257
-
258
- self.ff = FeedForward(
259
- dim,
260
- dropout=dropout,
261
- activation_fn=activation_fn,
262
- final_dropout=final_dropout,
263
- inner_dim=ff_inner_dim,
264
- bias=ff_bias,
265
- )
266
-
267
- # 4. Fuser
268
- if attention_type == "gated" or attention_type == "gated-text-image":
269
- self.fuser = GatedSelfAttentionDense(dim, cross_attention_dim, num_attention_heads, attention_head_dim)
270
-
271
- # 5. Scale-shift for PixArt-Alpha.
272
- if self.use_ada_layer_norm_single:
273
- self.scale_shift_table = nn.Parameter(torch.randn(6, dim) / dim**0.5)
274
-
275
- # let chunk size default to None
276
- self._chunk_size = None
277
- self._chunk_dim = 0
278
-
279
- def set_chunk_feed_forward(self, chunk_size: Optional[int], dim: int = 0):
280
- # Sets chunk feed-forward
281
- self._chunk_size = chunk_size
282
- self._chunk_dim = dim
283
-
284
- def forward(
285
- self,
286
- hidden_states: torch.FloatTensor,
287
- attention_mask: Optional[torch.FloatTensor] = None,
288
- encoder_hidden_states: Optional[torch.FloatTensor] = None,
289
- encoder_attention_mask: Optional[torch.FloatTensor] = None,
290
- timestep: Optional[torch.LongTensor] = None,
291
- cross_attention_kwargs: Dict[str, Any] = None,
292
- class_labels: Optional[torch.LongTensor] = None,
293
- added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
294
- ) -> torch.FloatTensor:
295
- # Notice that normalization is always applied before the real computation in the following blocks.
296
- # 0. Self-Attention
297
- batch_size = hidden_states.shape[0]
298
-
299
- if self.use_ada_layer_norm:
300
- norm_hidden_states = self.norm1(hidden_states, timestep)
301
- elif self.use_ada_layer_norm_zero:
302
- norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(
303
- hidden_states, timestep, class_labels, hidden_dtype=hidden_states.dtype
304
- )
305
- elif self.use_layer_norm:
306
- norm_hidden_states = self.norm1(hidden_states)
307
- elif self.use_ada_layer_norm_continuous:
308
- norm_hidden_states = self.norm1(hidden_states, added_cond_kwargs["pooled_text_emb"])
309
- elif self.use_ada_layer_norm_single:
310
- # print("Using PixArt-Alpha norm")
311
- # print("time step: ", timestep.shape)
312
- # print("self.scale_shift_table: ", self.scale_shift_table.shape)
313
- shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (
314
- self.scale_shift_table[None] + timestep.reshape(batch_size, 6, -1)
315
- ).chunk(6, dim=1)
316
- norm_hidden_states = self.norm1(hidden_states)
317
- # print("scale_msa: ", scale_msa.shape)
318
- # print("shift_msa: ", shift_msa.shape)
319
- #scale_msa: torch.Size([5, 1, 1152])
320
- #shift_msa: torch.Size([5, 1, 1152])
321
- # exit()
322
- # print("before: ", norm_hidden_states.shape)
323
- #before: torch.Size([5, 3584, 1152])
324
- norm_hidden_states = norm_hidden_states * (1 + scale_msa) + shift_msa
325
- # print("after: ", norm_hidden_states.shape)
326
- #before: torch.Size([5, 3584, 1152])
327
- # exit()
328
- norm_hidden_states = norm_hidden_states.squeeze(1)
329
- else:
330
- raise ValueError("Incorrect norm used")
331
-
332
- if self.pos_embed is not None:
333
- norm_hidden_states = self.pos_embed(norm_hidden_states)
334
-
335
-
336
- # 1. Retrieve lora scale.
337
- lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0
338
-
339
- # 2. Prepare GLIGEN inputs
340
- cross_attention_kwargs = cross_attention_kwargs.copy() if cross_attention_kwargs is not None else {}
341
- gligen_kwargs = cross_attention_kwargs.pop("gligen", None)
342
-
343
- attn_output = self.attn1(
344
- norm_hidden_states,
345
- encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None,
346
- attention_mask=attention_mask,
347
- **cross_attention_kwargs,
348
- )
349
- if self.use_ada_layer_norm_zero:
350
- attn_output = gate_msa.unsqueeze(1) * attn_output
351
- elif self.use_ada_layer_norm_single:
352
- attn_output = gate_msa * attn_output
353
-
354
- hidden_states = attn_output + hidden_states
355
- if hidden_states.ndim == 4:
356
- hidden_states = hidden_states.squeeze(1)
357
-
358
- # 2.5 GLIGEN Control
359
- if gligen_kwargs is not None:
360
- hidden_states = self.fuser(hidden_states, gligen_kwargs["objs"])
361
-
362
- # 3. Cross-Attention
363
- if self.attn2 is not None:
364
- if self.use_ada_layer_norm:
365
- norm_hidden_states = self.norm2(hidden_states, timestep)
366
- elif self.use_ada_layer_norm_zero or self.use_layer_norm:
367
- norm_hidden_states = self.norm2(hidden_states)
368
- elif self.use_ada_layer_norm_single:
369
- # For PixArt norm2 isn't applied here:
370
- # https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L70C1-L76C103
371
- norm_hidden_states = hidden_states
372
- elif self.use_ada_layer_norm_continuous:
373
- norm_hidden_states = self.norm2(hidden_states, added_cond_kwargs["pooled_text_emb"])
374
- else:
375
- raise ValueError("Incorrect norm")
376
-
377
- if self.pos_embed is not None and self.use_ada_layer_norm_single is False:
378
- norm_hidden_states = self.pos_embed(norm_hidden_states)
379
-
380
- attn_output = self.attn2(
381
- norm_hidden_states,
382
- encoder_hidden_states=encoder_hidden_states,
383
- attention_mask=encoder_attention_mask,
384
- **cross_attention_kwargs,
385
- )
386
- hidden_states = attn_output + hidden_states
387
-
388
- # 4. Feed-forward
389
- if self.use_ada_layer_norm_continuous:
390
- norm_hidden_states = self.norm3(hidden_states, added_cond_kwargs["pooled_text_emb"])
391
- elif not self.use_ada_layer_norm_single:
392
- norm_hidden_states = self.norm3(hidden_states)
393
-
394
- if self.use_ada_layer_norm_zero:
395
- norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
396
-
397
- if self.use_ada_layer_norm_single:
398
- norm_hidden_states = self.norm2(hidden_states)
399
- norm_hidden_states = norm_hidden_states * (1 + scale_mlp) + shift_mlp
400
-
401
- if self._chunk_size is not None:
402
- # "feed_forward_chunk_size" can be used to save memory
403
- ff_output = _chunked_feed_forward(
404
- self.ff, norm_hidden_states, self._chunk_dim, self._chunk_size, lora_scale=lora_scale
405
- )
406
- else:
407
- ff_output = self.ff(norm_hidden_states, scale=lora_scale)
408
-
409
- if self.use_ada_layer_norm_zero:
410
- ff_output = gate_mlp.unsqueeze(1) * ff_output
411
- elif self.use_ada_layer_norm_single:
412
- ff_output = gate_mlp * ff_output
413
-
414
- hidden_states = ff_output + hidden_states
415
- if hidden_states.ndim == 4:
416
- hidden_states = hidden_states.squeeze(1)
417
-
418
- return hidden_states
419
-
420
-
421
- @maybe_allow_in_graph
422
- class TemporalBasicTransformerBlock(nn.Module):
423
- r"""
424
- A basic Transformer block for video like data.
425
-
426
- Parameters:
427
- dim (`int`): The number of channels in the input and output.
428
- time_mix_inner_dim (`int`): The number of channels for temporal attention.
429
- num_attention_heads (`int`): The number of heads to use for multi-head attention.
430
- attention_head_dim (`int`): The number of channels in each head.
431
- cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention.
432
- """
433
-
434
- def __init__(
435
- self,
436
- dim: int,
437
- time_mix_inner_dim: int,
438
- num_attention_heads: int,
439
- attention_head_dim: int,
440
- cross_attention_dim: Optional[int] = None,
441
- ):
442
- super().__init__()
443
- self.is_res = dim == time_mix_inner_dim
444
-
445
- self.norm_in = nn.LayerNorm(dim)
446
-
447
- # Define 3 blocks. Each block has its own normalization layer.
448
- # 1. Self-Attn
449
- self.norm_in = nn.LayerNorm(dim)
450
- self.ff_in = FeedForward(
451
- dim,
452
- dim_out=time_mix_inner_dim,
453
- activation_fn="geglu",
454
- )
455
-
456
- self.norm1 = nn.LayerNorm(time_mix_inner_dim)
457
- self.attn1 = Attention(
458
- query_dim=time_mix_inner_dim,
459
- heads=num_attention_heads,
460
- dim_head=attention_head_dim,
461
- cross_attention_dim=None,
462
- )
463
-
464
- # 2. Cross-Attn
465
- if cross_attention_dim is not None:
466
- # We currently only use AdaLayerNormZero for self attention where there will only be one attention block.
467
- # I.e. the number of returned modulation chunks from AdaLayerZero would not make sense if returned during
468
- # the second cross attention block.
469
- self.norm2 = nn.LayerNorm(time_mix_inner_dim)
470
- self.attn2 = Attention(
471
- query_dim=time_mix_inner_dim,
472
- cross_attention_dim=cross_attention_dim,
473
- heads=num_attention_heads,
474
- dim_head=attention_head_dim,
475
- ) # is self-attn if encoder_hidden_states is none
476
- else:
477
- self.norm2 = None
478
- self.attn2 = None
479
-
480
- # 3. Feed-forward
481
- self.norm3 = nn.LayerNorm(time_mix_inner_dim)
482
- self.ff = FeedForward(time_mix_inner_dim, activation_fn="geglu")
483
-
484
- # let chunk size default to None
485
- self._chunk_size = None
486
- self._chunk_dim = None
487
-
488
- def set_chunk_feed_forward(self, chunk_size: Optional[int], **kwargs):
489
- # Sets chunk feed-forward
490
- self._chunk_size = chunk_size
491
- # chunk dim should be hardcoded to 1 to have better speed vs. memory trade-off
492
- self._chunk_dim = 1
493
-
494
- def forward(
495
- self,
496
- hidden_states: torch.FloatTensor,
497
- num_frames: int,
498
- encoder_hidden_states: Optional[torch.FloatTensor] = None,
499
- ) -> torch.FloatTensor:
500
- # Notice that normalization is always applied before the real computation in the following blocks.
501
- # 0. Self-Attention
502
- batch_size = hidden_states.shape[0]
503
-
504
- batch_frames, seq_length, channels = hidden_states.shape
505
- batch_size = batch_frames // num_frames
506
-
507
- hidden_states = hidden_states[None, :].reshape(batch_size, num_frames, seq_length, channels)
508
- hidden_states = hidden_states.permute(0, 2, 1, 3)
509
- hidden_states = hidden_states.reshape(batch_size * seq_length, num_frames, channels)
510
-
511
- residual = hidden_states
512
- hidden_states = self.norm_in(hidden_states)
513
-
514
- if self._chunk_size is not None:
515
- hidden_states = _chunked_feed_forward(self.ff_in, hidden_states, self._chunk_dim, self._chunk_size)
516
- else:
517
- hidden_states = self.ff_in(hidden_states)
518
-
519
- if self.is_res:
520
- hidden_states = hidden_states + residual
521
-
522
- norm_hidden_states = self.norm1(hidden_states)
523
- attn_output = self.attn1(norm_hidden_states, encoder_hidden_states=None)
524
- hidden_states = attn_output + hidden_states
525
-
526
- # 3. Cross-Attention
527
- if self.attn2 is not None:
528
- norm_hidden_states = self.norm2(hidden_states)
529
- attn_output = self.attn2(norm_hidden_states, encoder_hidden_states=encoder_hidden_states)
530
- hidden_states = attn_output + hidden_states
531
-
532
- # 4. Feed-forward
533
- norm_hidden_states = self.norm3(hidden_states)
534
-
535
- if self._chunk_size is not None:
536
- ff_output = _chunked_feed_forward(self.ff, norm_hidden_states, self._chunk_dim, self._chunk_size)
537
- else:
538
- ff_output = self.ff(norm_hidden_states)
539
-
540
- if self.is_res:
541
- hidden_states = ff_output + hidden_states
542
- else:
543
- hidden_states = ff_output
544
-
545
- hidden_states = hidden_states[None, :].reshape(batch_size, seq_length, num_frames, channels)
546
- hidden_states = hidden_states.permute(0, 2, 1, 3)
547
- hidden_states = hidden_states.reshape(batch_size * num_frames, seq_length, channels)
548
-
549
- return hidden_states
550
-
551
-
552
- class SkipFFTransformerBlock(nn.Module):
553
- def __init__(
554
- self,
555
- dim: int,
556
- num_attention_heads: int,
557
- attention_head_dim: int,
558
- kv_input_dim: int,
559
- kv_input_dim_proj_use_bias: bool,
560
- dropout=0.0,
561
- cross_attention_dim: Optional[int] = None,
562
- attention_bias: bool = False,
563
- attention_out_bias: bool = True,
564
- ):
565
- super().__init__()
566
- if kv_input_dim != dim:
567
- self.kv_mapper = nn.Linear(kv_input_dim, dim, kv_input_dim_proj_use_bias)
568
- else:
569
- self.kv_mapper = None
570
-
571
- self.norm1 = RMSNorm(dim, 1e-06)
572
-
573
- self.attn1 = Attention(
574
- query_dim=dim,
575
- heads=num_attention_heads,
576
- dim_head=attention_head_dim,
577
- dropout=dropout,
578
- bias=attention_bias,
579
- cross_attention_dim=cross_attention_dim,
580
- out_bias=attention_out_bias,
581
- )
582
-
583
- self.norm2 = RMSNorm(dim, 1e-06)
584
-
585
- self.attn2 = Attention(
586
- query_dim=dim,
587
- cross_attention_dim=cross_attention_dim,
588
- heads=num_attention_heads,
589
- dim_head=attention_head_dim,
590
- dropout=dropout,
591
- bias=attention_bias,
592
- out_bias=attention_out_bias,
593
- )
594
-
595
- def forward(self, hidden_states, encoder_hidden_states, cross_attention_kwargs):
596
- cross_attention_kwargs = cross_attention_kwargs.copy() if cross_attention_kwargs is not None else {}
597
-
598
- if self.kv_mapper is not None:
599
- encoder_hidden_states = self.kv_mapper(F.silu(encoder_hidden_states))
600
-
601
- norm_hidden_states = self.norm1(hidden_states)
602
-
603
- attn_output = self.attn1(
604
- norm_hidden_states,
605
- encoder_hidden_states=encoder_hidden_states,
606
- **cross_attention_kwargs,
607
- )
608
-
609
- hidden_states = attn_output + hidden_states
610
-
611
- norm_hidden_states = self.norm2(hidden_states)
612
-
613
- attn_output = self.attn2(
614
- norm_hidden_states,
615
- encoder_hidden_states=encoder_hidden_states,
616
- **cross_attention_kwargs,
617
- )
618
-
619
- hidden_states = attn_output + hidden_states
620
-
621
- return hidden_states
622
-
623
-
624
- class FeedForward(nn.Module):
625
- r"""
626
- A feed-forward layer.
627
-
628
- Parameters:
629
- dim (`int`): The number of channels in the input.
630
- dim_out (`int`, *optional*): The number of channels in the output. If not given, defaults to `dim`.
631
- mult (`int`, *optional*, defaults to 4): The multiplier to use for the hidden dimension.
632
- dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
633
- activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
634
- final_dropout (`bool` *optional*, defaults to False): Apply a final dropout.
635
- bias (`bool`, defaults to True): Whether to use a bias in the linear layer.
636
- """
637
-
638
- def __init__(
639
- self,
640
- dim: int,
641
- dim_out: Optional[int] = None,
642
- mult: int = 4,
643
- dropout: float = 0.0,
644
- activation_fn: str = "geglu",
645
- final_dropout: bool = False,
646
- inner_dim=None,
647
- bias: bool = True,
648
- ):
649
- super().__init__()
650
- if inner_dim is None:
651
- inner_dim = int(dim * mult)
652
- dim_out = dim_out if dim_out is not None else dim
653
- linear_cls = LoRACompatibleLinear if not USE_PEFT_BACKEND else nn.Linear
654
-
655
- if activation_fn == "gelu":
656
- act_fn = GELU(dim, inner_dim, bias=bias)
657
- if activation_fn == "gelu-approximate":
658
- act_fn = GELU(dim, inner_dim, approximate="tanh", bias=bias)
659
- elif activation_fn == "geglu":
660
- act_fn = GEGLU(dim, inner_dim, bias=bias)
661
- elif activation_fn == "geglu-approximate":
662
- act_fn = ApproximateGELU(dim, inner_dim, bias=bias)
663
-
664
- self.net = nn.ModuleList([])
665
- # project in
666
- self.net.append(act_fn)
667
- # project dropout
668
- self.net.append(nn.Dropout(dropout))
669
- # project out
670
- self.net.append(linear_cls(inner_dim, dim_out, bias=bias))
671
- # FF as used in Vision Transformer, MLP-Mixer, etc. have a final dropout
672
- if final_dropout:
673
- self.net.append(nn.Dropout(dropout))
674
-
675
- def forward(self, hidden_states: torch.Tensor, scale: float = 1.0) -> torch.Tensor:
676
- compatible_cls = (GEGLU,) if USE_PEFT_BACKEND else (GEGLU, LoRACompatibleLinear)
677
- for module in self.net:
678
- if isinstance(module, compatible_cls):
679
- hidden_states = module(hidden_states, scale)
680
- else:
681
- hidden_states = module(hidden_states)
682
- return hidden_states
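
For orientation, the default "geglu" activation used throughout these blocks gates the hidden projection with a GELU branch before the output projection. A minimal equivalent of the resulting feed-forward (a sketch, not the diffusers implementation):

import torch
from torch import nn

class TinyGEGLUFF(nn.Module):
    def __init__(self, dim: int, mult: int = 4):
        super().__init__()
        self.proj = nn.Linear(dim, 2 * dim * mult)   # produces a value half and a gate half
        self.out = nn.Linear(dim * mult, dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        value, gate = self.proj(x).chunk(2, dim=-1)
        return self.out(value * nn.functional.gelu(gate))

y = TinyGEGLUFF(64)(torch.randn(2, 10, 64))   # (2, 10, 64)
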
 
MuCodec/models/transformer_2d_flow.py DELETED
@@ -1,545 +0,0 @@
1
- # Copyright 2023 The HuggingFace Team. All rights reserved.
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
- from dataclasses import dataclass
15
- import math
16
- from typing import Any, Dict, Optional, Tuple
17
-
18
- import torch
19
- import torch.nn.functional as F
20
- from torch import nn
21
-
22
- from diffusers.configuration_utils import ConfigMixin, register_to_config
23
- from diffusers.models.embeddings import ImagePositionalEmbeddings
24
- from diffusers.utils import USE_PEFT_BACKEND, BaseOutput, deprecate, is_torch_version
25
- from models.attention import BasicTransformerBlock
26
- from diffusers.models.embeddings import PatchEmbed, PixArtAlphaTextProjection
27
- from diffusers.models.lora import LoRACompatibleConv, LoRACompatibleLinear
28
- from diffusers.models.modeling_utils import ModelMixin
29
- from diffusers.models.embeddings import TimestepEmbedding, Timesteps
30
-
31
- class PixArtAlphaCombinedFlowEmbeddings(nn.Module):
32
- """
33
- For PixArt-Alpha.
34
-
35
- Reference:
36
- https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L164C9-L168C29
37
- """
38
-
39
- def __init__(self, embedding_dim, size_emb_dim, use_additional_conditions: bool = False):
40
- super().__init__()
41
-
42
- self.flow_t_size = 512
43
- self.outdim = size_emb_dim
44
- self.timestep_embedder = TimestepEmbedding(in_channels=self.flow_t_size, time_embed_dim=embedding_dim)
45
-
46
- self.use_additional_conditions = use_additional_conditions
47
- if use_additional_conditions:
48
- self.additional_condition_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
49
- self.resolution_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=size_emb_dim)
50
- self.aspect_ratio_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=size_emb_dim)
51
-
52
- # https://github.com/atong01/conditional-flow-matching/blob/main/torchcfm/models/unet/nn.py#L87
53
- def timestep_embedding(self, timesteps, max_period=10000, scale=1000):
54
- """Create sinusoidal timestep embeddings.
55
-
56
- :param timesteps: a 1-D Tensor of N indices, one per batch element. These may be fractional.
57
- :param dim: the dimension of the output.
58
- :param max_period: controls the minimum frequency of the embeddings.
59
- :return: an [N x dim] Tensor of positional embeddings.
60
- """
61
- half = self.flow_t_size // 2
62
- freqs = torch.exp(-math.log(max_period) * torch.arange(start=0, end=half, device=timesteps.device) / half).type(timesteps.type())
63
- args = timesteps[:, None] * freqs[None] * scale
64
- embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
65
- if self.flow_t_size % 2:
66
- embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
67
- return embedding
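
Because flow-matching timesteps live in [0, 1], the method above multiplies them by scale=1000 before the usual sinusoidal embedding so the resulting frequencies span a useful range. A standalone sketch of the same embedding (illustrative defaults, even dim assumed):

import math
import torch

def sinusoidal_embedding(t: torch.Tensor, dim: int = 512, max_period: float = 10000.0, scale: float = 1000.0):
    # t: (N,) values in [0, 1]; returns an (N, dim) tensor of cos/sin features.
    half = dim // 2
    freqs = torch.exp(-math.log(max_period) * torch.arange(half, device=t.device) / half)
    args = t[:, None].float() * freqs[None] * scale
    return torch.cat([torch.cos(args), torch.sin(args)], dim=-1)

emb = sinusoidal_embedding(torch.tensor([0.0, 0.5, 1.0]))   # (3, 512)
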
68
-
69
- def forward(self, timestep, resolution, aspect_ratio, batch_size, hidden_dtype):
70
- timesteps_proj = self.timestep_embedding(timestep)
71
- timesteps_emb = self.timestep_embedder(timesteps_proj.to(dtype=hidden_dtype)) # (N, D)
72
-
73
- if self.use_additional_conditions:
74
- resolution_emb = self.additional_condition_proj(resolution.flatten()).to(hidden_dtype)
75
- resolution_emb = self.resolution_embedder(resolution_emb).reshape(batch_size, -1)
76
- aspect_ratio_emb = self.additional_condition_proj(aspect_ratio.flatten()).to(hidden_dtype)
77
- aspect_ratio_emb = self.aspect_ratio_embedder(aspect_ratio_emb).reshape(batch_size, -1)
78
- conditioning = timesteps_emb + torch.cat([resolution_emb, aspect_ratio_emb], dim=1)
79
- else:
80
- conditioning = timesteps_emb
81
-
82
- return conditioning
83
-
84
- class AdaLayerNormSingleFlow(nn.Module):
85
- r"""
86
- Norm layer adaptive layer norm single (adaLN-single).
87
-
88
- As proposed in PixArt-Alpha (see: https://arxiv.org/abs/2310.00426; Section 2.3).
89
-
90
- Parameters:
91
- embedding_dim (`int`): The size of each embedding vector.
92
- use_additional_conditions (`bool`): To use additional conditions for normalization or not.
93
- """
94
-
95
- def __init__(self, embedding_dim: int, use_additional_conditions: bool = False):
96
- super().__init__()
97
-
98
- self.emb = PixArtAlphaCombinedFlowEmbeddings(
99
- embedding_dim, size_emb_dim=embedding_dim // 3, use_additional_conditions=use_additional_conditions
100
- )
101
-
102
- self.silu = nn.SiLU()
103
- self.linear = nn.Linear(embedding_dim, 6 * embedding_dim, bias=True)
104
-
105
- def forward(
106
- self,
107
- timestep: torch.Tensor,
108
- added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
109
- batch_size: Optional[int] = None,
110
- hidden_dtype: Optional[torch.dtype] = None,
111
- ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
112
- # No modulation happening here.
113
- embedded_timestep = self.emb(timestep, **added_cond_kwargs, batch_size=batch_size, hidden_dtype=hidden_dtype)
114
- return self.linear(self.silu(embedded_timestep)), embedded_timestep
115
-
116
-
117
- @dataclass
118
- class Transformer2DModelOutput(BaseOutput):
119
- """
120
- The output of [`Transformer2DModel`].
121
-
122
- Args:
123
- sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` or `(batch size, num_vector_embeds - 1, num_latent_pixels)` if [`Transformer2DModel`] is discrete):
124
- The hidden states output conditioned on the `encoder_hidden_states` input. If discrete, returns probability
125
- distributions for the unnoised latent pixels.
126
- """
127
-
128
- sample: torch.FloatTensor
129
-
130
-
131
- class Transformer2DModel(ModelMixin, ConfigMixin):
132
- """
133
- A 2D Transformer model for image-like data.
134
-
135
- Parameters:
136
- num_attention_heads (`int`, *optional*, defaults to 16): The number of heads to use for multi-head attention.
137
- attention_head_dim (`int`, *optional*, defaults to 88): The number of channels in each head.
138
- in_channels (`int`, *optional*):
139
- The number of channels in the input and output (specify if the input is **continuous**).
140
- num_layers (`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use.
141
- dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
142
- cross_attention_dim (`int`, *optional*): The number of `encoder_hidden_states` dimensions to use.
143
- sample_size (`int`, *optional*): The width of the latent images (specify if the input is **discrete**).
144
- This is fixed during training since it is used to learn a number of position embeddings.
145
- num_vector_embeds (`int`, *optional*):
146
- The number of classes of the vector embeddings of the latent pixels (specify if the input is **discrete**).
147
- Includes the class for the masked latent pixel.
148
- activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to use in feed-forward.
149
- num_embeds_ada_norm ( `int`, *optional*):
150
- The number of diffusion steps used during training. Pass if at least one of the norm_layers is
151
- `AdaLayerNorm`. This is fixed during training since it is used to learn a number of embeddings that are
152
- added to the hidden states.
153
-
154
- During inference, you can denoise for up to but not more steps than `num_embeds_ada_norm`.
155
- attention_bias (`bool`, *optional*):
156
- Configure if the `TransformerBlocks` attention should contain a bias parameter.
157
- """
158
-
159
- _supports_gradient_checkpointing = True
160
-
161
- @register_to_config
162
- def __init__(
163
- self,
164
- num_attention_heads: int = 16,
165
- attention_head_dim: int = 88,
166
- in_channels: Optional[int] = None,
167
- out_channels: Optional[int] = None,
168
- num_layers: int = 1,
169
- dropout: float = 0.0,
170
- norm_num_groups: int = 32,
171
- cross_attention_dim: Optional[int] = None,
172
- attention_bias: bool = False,
173
- sample_size: Optional[int] = None,
174
- num_vector_embeds: Optional[int] = None,
175
- patch_size: Optional[int] = None,
176
- activation_fn: str = "geglu",
177
- num_embeds_ada_norm: Optional[int] = None,
178
- use_linear_projection: bool = False,
179
- only_cross_attention: bool = False,
180
- double_self_attention: bool = False,
181
- upcast_attention: bool = False,
182
- norm_type: str = "layer_norm",
183
- norm_elementwise_affine: bool = True,
184
- norm_eps: float = 1e-5,
185
- attention_type: str = "default",
186
- caption_channels: int = None,
187
- ):
188
- super().__init__()
189
- self.use_linear_projection = use_linear_projection
190
- self.num_attention_heads = num_attention_heads
191
- self.attention_head_dim = attention_head_dim
192
- inner_dim = num_attention_heads * attention_head_dim
193
-
194
- conv_cls = nn.Conv2d if USE_PEFT_BACKEND else LoRACompatibleConv
195
- linear_cls = nn.Linear if USE_PEFT_BACKEND else LoRACompatibleLinear
196
-
197
- # 1. Transformer2DModel can process both standard continuous images of shape `(batch_size, num_channels, width, height)` as well as quantized image embeddings of shape `(batch_size, num_image_vectors)`
198
- # Define whether input is continuous or discrete depending on configuration
199
- self.is_input_continuous = (in_channels is not None) and (patch_size is None)
200
- self.is_input_vectorized = num_vector_embeds is not None
201
- self.is_input_patches = in_channels is not None and patch_size is not None
202
-
203
- if norm_type == "layer_norm" and num_embeds_ada_norm is not None:
204
- deprecation_message = (
205
- f"The configuration file of this model: {self.__class__} is outdated. `norm_type` is either not set or"
206
- " incorrectly set to `'layer_norm'`.Make sure to set `norm_type` to `'ada_norm'` in the config."
207
- " Please make sure to update the config accordingly as leaving `norm_type` might led to incorrect"
208
- " results in future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it"
209
- " would be very nice if you could open a Pull request for the `transformer/config.json` file"
210
- )
211
- deprecate("norm_type!=num_embeds_ada_norm", "1.0.0", deprecation_message, standard_warn=False)
212
- norm_type = "ada_norm"
213
-
214
- if self.is_input_continuous and self.is_input_vectorized:
215
- raise ValueError(
216
- f"Cannot define both `in_channels`: {in_channels} and `num_vector_embeds`: {num_vector_embeds}. Make"
217
- " sure that either `in_channels` or `num_vector_embeds` is None."
218
- )
219
- elif self.is_input_vectorized and self.is_input_patches:
220
- raise ValueError(
221
- f"Cannot define both `num_vector_embeds`: {num_vector_embeds} and `patch_size`: {patch_size}. Make"
222
- " sure that either `num_vector_embeds` or `num_patches` is None."
223
- )
224
- elif not self.is_input_continuous and not self.is_input_vectorized and not self.is_input_patches:
225
- raise ValueError(
226
- f"Has to define `in_channels`: {in_channels}, `num_vector_embeds`: {num_vector_embeds}, or patch_size:"
227
- f" {patch_size}. Make sure that `in_channels`, `num_vector_embeds` or `num_patches` is not None."
228
- )
229
-
230
- # 2. Define input layers
231
- if self.is_input_continuous:
232
- self.in_channels = in_channels
233
-
234
- self.norm = torch.nn.GroupNorm(num_groups=norm_num_groups, num_channels=in_channels, eps=1e-6, affine=True)
235
- if use_linear_projection:
236
- self.proj_in = linear_cls(in_channels, inner_dim)
237
- else:
238
- self.proj_in = conv_cls(in_channels, inner_dim, kernel_size=1, stride=1, padding=0)
239
- elif self.is_input_vectorized:
240
- assert sample_size is not None, "Transformer2DModel over discrete input must provide sample_size"
241
- assert num_vector_embeds is not None, "Transformer2DModel over discrete input must provide num_embed"
242
-
243
- self.height = sample_size
244
- self.width = sample_size
245
- self.num_vector_embeds = num_vector_embeds
246
- self.num_latent_pixels = self.height * self.width
247
-
248
- self.latent_image_embedding = ImagePositionalEmbeddings(
249
- num_embed=num_vector_embeds, embed_dim=inner_dim, height=self.height, width=self.width
250
- )
251
- elif self.is_input_patches:
252
- assert sample_size is not None, "Transformer2DModel over patched input must provide sample_size"
253
-
254
- self.height = sample_size
255
- self.width = sample_size
256
-
257
- self.patch_size = patch_size
258
- interpolation_scale = self.config.sample_size // 64 # => 64 (= 512 pixart) has interpolation scale 1
259
- interpolation_scale = max(interpolation_scale, 1)
260
- self.pos_embed = PatchEmbed(
261
- height=sample_size,
262
- width=sample_size,
263
- patch_size=patch_size,
264
- in_channels=in_channels,
265
- embed_dim=inner_dim,
266
- interpolation_scale=interpolation_scale,
267
- )
268
-
269
- # 3. Define transformers blocks
270
- self.transformer_blocks = nn.ModuleList(
271
- [
272
- BasicTransformerBlock(
273
- inner_dim,
274
- num_attention_heads,
275
- attention_head_dim,
276
- dropout=dropout,
277
- cross_attention_dim=cross_attention_dim,
278
- activation_fn=activation_fn,
279
- num_embeds_ada_norm=num_embeds_ada_norm,
280
- attention_bias=attention_bias,
281
- only_cross_attention=only_cross_attention,
282
- double_self_attention=double_self_attention,
283
- upcast_attention=upcast_attention,
284
- norm_type=norm_type,
285
- norm_elementwise_affine=norm_elementwise_affine,
286
- norm_eps=norm_eps,
287
- attention_type=attention_type,
288
- )
289
- for d in range(num_layers)
290
- ]
291
- )
292
-
293
- # 4. Define output layers
294
- self.out_channels = in_channels if out_channels is None else out_channels
295
- if self.is_input_continuous:
296
- # TODO: should use out_channels for continuous projections
297
- if use_linear_projection:
298
- self.proj_out = linear_cls(inner_dim, in_channels)
299
- else:
300
- self.proj_out = conv_cls(inner_dim, in_channels, kernel_size=1, stride=1, padding=0)
301
- elif self.is_input_vectorized:
302
- self.norm_out = nn.LayerNorm(inner_dim)
303
- self.out = nn.Linear(inner_dim, self.num_vector_embeds - 1)
304
- elif self.is_input_patches and norm_type != "ada_norm_single":
305
- self.norm_out = nn.LayerNorm(inner_dim, elementwise_affine=False, eps=1e-6)
306
- self.proj_out_1 = nn.Linear(inner_dim, 2 * inner_dim)
307
- self.proj_out_2 = nn.Linear(inner_dim, patch_size * patch_size * self.out_channels)
308
- elif self.is_input_patches and norm_type == "ada_norm_single":
309
- self.norm_out = nn.LayerNorm(inner_dim, elementwise_affine=False, eps=1e-6)
310
- self.scale_shift_table = nn.Parameter(torch.randn(2, inner_dim) / inner_dim**0.5)
311
- self.proj_out = nn.Linear(inner_dim, patch_size * patch_size * self.out_channels)
312
-
313
- # 5. PixArt-Alpha blocks.
314
- self.adaln_single = None
315
- self.use_additional_conditions = False
316
- if norm_type == "ada_norm_single":
317
- self.use_additional_conditions = self.config.sample_size == 128
318
- # TODO(Sayak, PVP) clean this, for now we use sample size to determine whether to use
319
- # additional conditions until we find better name
320
- self.adaln_single = AdaLayerNormSingleFlow(inner_dim, use_additional_conditions=self.use_additional_conditions)
321
-
322
- self.caption_projection = None
323
- if caption_channels is not None:
324
- self.caption_projection = PixArtAlphaTextProjection(in_features=caption_channels, hidden_size=inner_dim)
325
-
326
- self.gradient_checkpointing = False
327
-
328
- def _set_gradient_checkpointing(self, module, value=False):
329
- if hasattr(module, "gradient_checkpointing"):
330
- module.gradient_checkpointing = value
331
-
332
- def forward(
333
- self,
334
- hidden_states: torch.Tensor,
335
- encoder_hidden_states: Optional[torch.Tensor] = None,
336
- timestep: Optional[torch.LongTensor] = None,
337
- added_cond_kwargs: Dict[str, torch.Tensor] = None,
338
- class_labels: Optional[torch.LongTensor] = None,
339
- cross_attention_kwargs: Dict[str, Any] = None,
340
- attention_mask: Optional[torch.Tensor] = None,
341
- encoder_attention_mask: Optional[torch.Tensor] = None,
342
- return_dict: bool = True,
343
- ):
344
- """
345
- The [`Transformer2DModel`] forward method.
346
-
347
- Args:
348
- hidden_states (`torch.LongTensor` of shape `(batch size, num latent pixels)` if discrete, `torch.FloatTensor` of shape `(batch size, channel, height, width)` if continuous):
349
- Input `hidden_states`.
350
- encoder_hidden_states ( `torch.FloatTensor` of shape `(batch size, sequence len, embed dims)`, *optional*):
351
- Conditional embeddings for cross attention layer. If not given, cross-attention defaults to
352
- self-attention.
353
- timestep ( `torch.LongTensor`, *optional*):
354
- Used to indicate denoising step. Optional timestep to be applied as an embedding in `AdaLayerNorm`.
355
- class_labels ( `torch.LongTensor` of shape `(batch size, num classes)`, *optional*):
356
- Used to indicate class labels conditioning. Optional class labels to be applied as an embedding in
357
- `AdaLayerZeroNorm`.
358
- cross_attention_kwargs ( `Dict[str, Any]`, *optional*):
359
- A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
360
- `self.processor` in
361
- [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
362
- attention_mask ( `torch.Tensor`, *optional*):
363
- An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask
364
- is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large
365
- negative values to the attention scores corresponding to "discard" tokens.
366
- encoder_attention_mask ( `torch.Tensor`, *optional*):
367
- Cross-attention mask applied to `encoder_hidden_states`. Two formats supported:
368
-
369
- * Mask `(batch, sequence_length)` True = keep, False = discard.
370
- * Bias `(batch, 1, sequence_length)` 0 = keep, -10000 = discard.
371
-
372
- If `ndim == 2`: will be interpreted as a mask, then converted into a bias consistent with the format
373
- above. This bias will be added to the cross-attention scores.
374
- return_dict (`bool`, *optional*, defaults to `True`):
375
- Whether or not to return a [`~models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain
376
- tuple.
377
-
378
- Returns:
379
- If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
380
- `tuple` where the first element is the sample tensor.
381
- """
382
- # ensure attention_mask is a bias, and give it a singleton query_tokens dimension.
383
- # we may have done this conversion already, e.g. if we came here via UNet2DConditionModel#forward.
384
- # we can tell by counting dims; if ndim == 2: it's a mask rather than a bias.
385
- # expects mask of shape:
386
- # [batch, key_tokens]
387
- # adds singleton query_tokens dimension:
388
- # [batch, 1, key_tokens]
389
- # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes:
390
- # [batch, heads, query_tokens, key_tokens] (e.g. torch sdp attn)
391
- # [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn)
392
- if attention_mask is not None and attention_mask.ndim == 2:
393
- # assume that mask is expressed as:
394
- # (1 = keep, 0 = discard)
395
- # convert mask into a bias that can be added to attention scores:
396
- # (keep = +0, discard = -10000.0)
397
- attention_mask = (1 - attention_mask.to(hidden_states.dtype)) * -10000.0
398
- attention_mask = attention_mask.unsqueeze(1)
399
-
400
- # convert encoder_attention_mask to a bias the same way we do for attention_mask
401
- if encoder_attention_mask is not None and encoder_attention_mask.ndim == 2:
402
- encoder_attention_mask = (1 - encoder_attention_mask.to(hidden_states.dtype)) * -10000.0
403
- encoder_attention_mask = encoder_attention_mask.unsqueeze(1)
404
-
405
- # Retrieve lora scale.
406
- lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0
407
-
408
- # 1. Input
409
- if self.is_input_continuous:
410
- batch, _, height, width = hidden_states.shape
411
- residual = hidden_states
412
-
413
- hidden_states = self.norm(hidden_states)
414
- if not self.use_linear_projection:
415
- hidden_states = (
416
- self.proj_in(hidden_states, scale=lora_scale)
417
- if not USE_PEFT_BACKEND
418
- else self.proj_in(hidden_states)
419
- )
420
- inner_dim = hidden_states.shape[1]
421
- hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * width, inner_dim)
422
- else:
423
- inner_dim = hidden_states.shape[1]
424
- hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * width, inner_dim)
425
- hidden_states = (
426
- self.proj_in(hidden_states, scale=lora_scale)
427
- if not USE_PEFT_BACKEND
428
- else self.proj_in(hidden_states)
429
- )
430
-
431
- elif self.is_input_vectorized:
432
- hidden_states = self.latent_image_embedding(hidden_states)
433
- elif self.is_input_patches:
434
- height, width = hidden_states.shape[-2] // self.patch_size, hidden_states.shape[-1] // self.patch_size
435
- hidden_states = self.pos_embed(hidden_states)
436
-
437
- if self.adaln_single is not None:
438
- if self.use_additional_conditions and added_cond_kwargs is None:
439
- raise ValueError(
440
- "`added_cond_kwargs` cannot be None when using additional conditions for `adaln_single`."
441
- )
442
- batch_size = hidden_states.shape[0]
443
- timestep, embedded_timestep = self.adaln_single(
444
- timestep, added_cond_kwargs, batch_size=batch_size, hidden_dtype=hidden_states.dtype
445
- )
446
-
447
- # 2. Blocks
448
- if self.caption_projection is not None:
449
- batch_size = hidden_states.shape[0]
450
- encoder_hidden_states = self.caption_projection(encoder_hidden_states)
451
- encoder_hidden_states = encoder_hidden_states.view(batch_size, -1, hidden_states.shape[-1])
452
-
453
- for block in self.transformer_blocks:
454
- if self.training and self.gradient_checkpointing:
455
-
456
- def create_custom_forward(module, return_dict=None):
457
- def custom_forward(*inputs):
458
- if return_dict is not None:
459
- return module(*inputs, return_dict=return_dict)
460
- else:
461
- return module(*inputs)
462
-
463
- return custom_forward
464
-
465
- ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
466
- hidden_states = torch.utils.checkpoint.checkpoint(
467
- create_custom_forward(block),
468
- hidden_states,
469
- attention_mask,
470
- encoder_hidden_states,
471
- encoder_attention_mask,
472
- timestep,
473
- cross_attention_kwargs,
474
- class_labels,
475
- **ckpt_kwargs,
476
- )
477
- else:
478
- hidden_states = block(
479
- hidden_states,
480
- attention_mask=attention_mask,
481
- encoder_hidden_states=encoder_hidden_states,
482
- encoder_attention_mask=encoder_attention_mask,
483
- timestep=timestep,
484
- cross_attention_kwargs=cross_attention_kwargs,
485
- class_labels=class_labels,
486
- )
487
-
488
- # 3. Output
489
- if self.is_input_continuous:
490
- if not self.use_linear_projection:
491
- hidden_states = hidden_states.reshape(batch, height, width, inner_dim).permute(0, 3, 1, 2).contiguous()
492
- hidden_states = (
493
- self.proj_out(hidden_states, scale=lora_scale)
494
- if not USE_PEFT_BACKEND
495
- else self.proj_out(hidden_states)
496
- )
497
- else:
498
- hidden_states = (
499
- self.proj_out(hidden_states, scale=lora_scale)
500
- if not USE_PEFT_BACKEND
501
- else self.proj_out(hidden_states)
502
- )
503
- hidden_states = hidden_states.reshape(batch, height, width, inner_dim).permute(0, 3, 1, 2).contiguous()
504
-
505
- output = hidden_states + residual
506
- elif self.is_input_vectorized:
507
- hidden_states = self.norm_out(hidden_states)
508
- logits = self.out(hidden_states)
509
- # (batch, self.num_vector_embeds - 1, self.num_latent_pixels)
510
- logits = logits.permute(0, 2, 1)
511
-
512
- # log(p(x_0))
513
- output = F.log_softmax(logits.double(), dim=1).float()
514
-
515
- if self.is_input_patches:
516
- if self.config.norm_type != "ada_norm_single":
517
- conditioning = self.transformer_blocks[0].norm1.emb(
518
- timestep, class_labels, hidden_dtype=hidden_states.dtype
519
- )
520
- shift, scale = self.proj_out_1(F.silu(conditioning)).chunk(2, dim=1)
521
- hidden_states = self.norm_out(hidden_states) * (1 + scale[:, None]) + shift[:, None]
522
- hidden_states = self.proj_out_2(hidden_states)
523
- elif self.config.norm_type == "ada_norm_single":
524
- shift, scale = (self.scale_shift_table[None] + embedded_timestep[:, None]).chunk(2, dim=1)
525
- hidden_states = self.norm_out(hidden_states)
526
- # Modulation
527
- hidden_states = hidden_states * (1 + scale) + shift
528
- hidden_states = self.proj_out(hidden_states)
529
- hidden_states = hidden_states.squeeze(1)
530
-
531
- # unpatchify
532
- if self.adaln_single is None:
533
- height = width = int(hidden_states.shape[1] ** 0.5)
534
- hidden_states = hidden_states.reshape(
535
- shape=(-1, height, width, self.patch_size, self.patch_size, self.out_channels)
536
- )
537
- hidden_states = torch.einsum("nhwpqc->nchpwq", hidden_states)
538
- output = hidden_states.reshape(
539
- shape=(-1, self.out_channels, height * self.patch_size, width * self.patch_size)
540
- )
541
-
542
- if not return_dict:
543
- return (output,)
544
-
545
- return Transformer2DModelOutput(sample=output)
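
The mask handling in the forward pass above converts a 2-D keep/discard mask into an additive bias so it can broadcast over the attention scores. A minimal standalone sketch of that conversion (plain PyTorch; the function name and example tensor are illustrative, not part of the deleted module):

import torch

def mask_to_bias(mask, dtype=torch.float32):
    # mask: [batch, key_tokens] with 1 = keep, 0 = discard
    # returns a bias of shape [batch, 1, key_tokens]: 0 where kept, -10000 where discarded,
    # ready to broadcast over attention scores of shape [batch, heads, query_tokens, key_tokens]
    bias = (1 - mask.to(dtype)) * -10000.0
    return bias.unsqueeze(1)  # singleton query_tokens dimension for broadcasting

example = torch.tensor([[1, 1, 0]])
print(mask_to_bias(example))  # bias is 0 for kept tokens, -10000 for the discarded one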
 
MuCodec/muq_dev/muq_fairseq/data/__init__.py DELETED
@@ -1 +0,0 @@
1
- from .mert_dataset import MERTDataset
 
 
MuCodec/muq_dev/muq_fairseq/data/ark_dataset.py DELETED
@@ -1,71 +0,0 @@
1
- import logging
2
- import torch
3
- import torch.nn.functional as F
4
- from fairseq.data.audio.raw_audio_dataset import RawAudioDataset
5
- from typing import Tuple
6
- try:
7
- import kaldiio
8
- except:
9
- kaldiio = None
10
- import warnings
11
-
12
- logger = logging.getLogger(__name__)
13
-
14
-
15
- class ArkDataset(RawAudioDataset):
16
- def __init__(
17
- self,
18
- wav_scp,
19
- dur_scp,
20
- sr = 24000,
21
- max_dur = 20,
22
- num_buckets=0,
23
- normalize=False,
24
- ):
25
- super().__init__(
26
- sample_rate=sr,
27
- max_sample_size=max_dur*sr,
28
- min_sample_size=1200,
29
- shuffle=True,
30
- pad=True,
31
- normalize=normalize,
32
- compute_mask=False,
33
- )
34
- self.sr = sr
35
- self.max_dur = max_dur
36
- self.normalize = normalize
37
-
38
- logger.info("Loading Kaldi scp files from {}".format(wav_scp))
39
-
40
- self.wav_data = kaldiio.load_scp(wav_scp)
41
- self.keys = list(self.wav_data.keys())
42
- dur_data = {}
43
- keys_set = set(self.keys)
44
-
45
- with open(dur_scp, 'r') as f:
46
- for line in f:
47
- line = line.strip().split()
48
- if line[0] in keys_set:
49
- dur_data[line[0]] = float(line[-1])
50
- self.sizes = [int(dur_data[k]*self.sr/100) for k in self.keys]
51
-
52
- logger.info("Loading Kaldi scp files done")
53
-
54
- self.dataset_len = len(self.keys)
55
- self.set_bucket_info(num_buckets)
56
-
57
- def __len__(self):
58
- return self.dataset_len
59
-
60
- def __getitem__(self, idx):
61
- pass
62
-
63
- def size(self, idx):
64
- pass
65
-
66
- def postprocess(self, wav):
67
- pass
68
-
69
- def collater(self, samples):
70
- pass
71
-
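
The size bookkeeping in ArkDataset above is worth restating on its own: the duration scp is read line by line and its last column is scaled by sr/100, i.e. the durations are assumed to be expressed in 10 ms frames. A minimal sketch under that assumption (the `dur.scp` path is hypothetical):

sr = 24000
dur_data = {}
with open("dur.scp") as f:  # hypothetical file, lines like "<utt-id> ... <duration in 10 ms frames>"
    for line in f:
        parts = line.strip().split()
        dur_data[parts[0]] = float(parts[-1])
# number of audio samples per utterance, matching the self.sizes computation above
sizes = {utt: int(dur * sr / 100) for utt, dur in dur_data.items()}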
 
MuCodec/muq_dev/muq_fairseq/data/mert_dataset.py DELETED
@@ -1,295 +0,0 @@
1
- # Copyright (c) Facebook, Inc. and its affiliates.
2
- #
3
- # This source code is licensed under the MIT license found in the
4
- # LICENSE file in the root directory of this source tree.
5
-
6
- import itertools
7
- import logging
8
- import os
9
- import sys
10
- from typing import Any, List, Optional, Union
11
-
12
- import numpy as np
13
- from typing import Tuple
14
- import torch
15
- import torch.nn.functional as F
16
- from fairseq.data import data_utils
17
- from fairseq.data.fairseq_dataset import FairseqDataset
18
- from fairseq.data.audio.audio_utils import (
19
- parse_path,
20
- read_from_stored_zip,
21
- )
22
-
23
- import math
24
- import io
25
- import torchaudio
26
- # this is in the user_dir
27
- from nnAudio import features as nnAudioFeatures
28
-
29
- # from tqdm import tqdm
30
- import tqdm
31
- import json
32
- import random
33
- import traceback
34
- from einops import rearrange
35
- # from scripts.prepare_codecs_from_manifest import *
36
-
37
- logger = logging.getLogger(__name__)
38
-
39
- class model_cqt_pred(torch.nn.Module):
40
- def __init__(self, n_bins=84, sr=16000, freq=50):
41
- super().__init__()
42
- self.epsilon=1e-10
43
- # Getting Mel Spectrogram on the fly
44
- self.spec_layer = nnAudioFeatures.cqt.CQT(sr=sr, hop_length=sr//freq, fmin=32.7,
45
- fmax=None, n_bins=n_bins, bins_per_octave=n_bins//7,
46
- filter_scale=1, norm=1, window='hann', center=True,
47
- pad_mode='constant', trainable=False,
48
- output_format='Magnitude', verbose=True)
49
-
50
- # self.fc = nn.Linear(input_dim, n_bins)
51
-
52
- # self.criterion = nn.MSELoss()
53
- self.forward_dict = {
54
- # 'masked_transformer_output': self.plain_forward
55
- 'compute_cqt': self.compute_cqt
56
- }
57
- def compute_cqt(self, x):
58
- '''
59
- convert waveform to CQT -> [batch, bins, len] -> transpose
60
- '''
61
- # align with the padding of HuBERT model,
62
- # the truncation is calculated by bruteforce search since the nnAudio padding strategy and fairseq models are different
63
- # x = x[..., :-560]
64
- return torch.transpose(self.spec_layer(x), -1, -2)
65
-
66
- def forward(self, x, forward_type='masked_transformer_output'):
67
- '''
68
- take input from transformer hidden states: [batch, len_seq, channel]
69
- output: [batch, len_seq, n_bins]
70
- '''
71
-
72
- return self.forward_dict[forward_type](x)
73
-
74
- def load_audio_by_json(json_path, max_keep, min_keep, tgt_sample_rate, clip_secs=5):
75
- # read json file
76
- print(json_path)
77
- datas = []
78
- inds = []
79
- sizes = []
80
- with open(json_path) as fp:
81
- for ind,line in enumerate(fp):
82
- data = json.loads(line)
83
- if 'duration' in data and min_keep is not None and tgt_sample_rate*data['duration'] < min_keep:
84
- continue
85
- datas.append(data)
86
- inds.append(ind)
87
- # sz = int(data['duration'] * data['sample_rate'])
88
- if clip_secs > 0:
89
- sz = int(tgt_sample_rate * clip_secs)
90
- else:
91
- sz = int(tgt_sample_rate * data['duration'])
92
- sizes.append(sz)
93
- tot = ind + 1
94
- return datas,inds,tot,sizes
95
- def load_audio(manifest_path, max_keep, min_keep):
96
- pass
97
-
98
-
99
- def load_label(label_path, inds, tot):
100
- pass
101
-
102
- def load_numpy_label(label_path, inds, tot):
103
- labels = np.load(label_path, mmap_mode='r')
104
- assert (labels.shape[0] == tot), f"number of labels does not match ({labels.shape[0]} != {tot})"
105
- return labels
106
-
107
- def verify_label_lengths(
108
- audio_sizes,
109
- audio_rate,
110
- label_path,
111
- label_rate,
112
- inds,
113
- tot,
114
- tol=0.1, # tolerance in seconds
115
- ):
116
- pass
117
-
118
- class Read_and_PadCrop_Normalized_T(torch.nn.Module):
119
- def __init__(self, n_samples: int, sample_rate: int, randomize: bool = True):
120
-
121
- super().__init__()
122
-
123
- self.n_samples = n_samples
124
- self.sample_rate = sample_rate
125
- self.randomize = randomize
126
-
127
-
128
- def __call__(self, filename: str, duration: float, cur_sample_rate: int, fixed_offset_duration=None) -> Tuple[torch.Tensor, float, float, int, int]:
129
- pass
130
-
131
-
132
- class MERTDataset(FairseqDataset):
133
- def __init__(
134
- self,
135
- manifest_path: str,
136
- sample_rate: float,
137
- label_paths: List[str],
138
- label_rates: Union[List[float], float], # -1 for sequence labels
139
- pad_list: List[str],
140
- eos_list: List[str],
141
- label_scp_path: Optional[str] = None,
142
- label_scp_clip_duration: float = -1,
143
- label_processors: Optional[List[Any]] = None,
144
- max_keep_sample_size: Optional[int] = None,
145
- min_keep_sample_size: Optional[int] = None,
146
- max_sample_size: Optional[int] = None,
147
- shuffle: bool = True,
148
- pad_audio: bool = False,
149
- normalize: bool = False,
150
- store_labels: bool = True,
151
- npmemmap: bool = False,
152
- random_crop: bool = False,
153
- single_target: bool = False,
154
- augmentation_effects: List[str] = [],
155
- augmentation_probs: List[float] = [],
156
- inbatch_noise_augment_len_range: List[int] = [8000, 24000],
157
- inbatch_noise_augment_number_range: List[int] = [1, 3],
158
- inbatch_noise_augment_volume: float = 1.0,
159
- cqt_prediction_bin: int = -1,
160
- dataset_len:int = 128*3000,
161
- clip_secs = 5,
162
- ):
163
- self.sample_rate = sample_rate
164
- self.shuffle = shuffle
165
- self.random_crop = random_crop
166
- self.datas,inds,tot,self.sizes = load_audio_by_json(manifest_path,max_keep_sample_size,min_keep_sample_size, self.sample_rate, clip_secs)
167
- self.inds = inds
168
-
169
- self.num_labels = len(label_paths)
170
- self.pad_list = pad_list
171
- self.eos_list = eos_list
172
- self.label_processors = label_processors
173
- self.single_target = single_target
174
- self.label_rates = (
175
- [label_rates for _ in range(len(label_paths))]
176
- if isinstance(label_rates, float)
177
- else label_rates
178
- )
179
- self.store_labels = store_labels
180
- self.npmemmap = npmemmap
181
- self.label_scp_path = label_scp_path
182
- self.label_scp_clip_duration = label_scp_clip_duration
183
-
184
-
185
- if self.label_scp_path is not None:
186
- from kaldiio import load_scp
187
- self.label_scp = load_scp(self.label_scp_path)
188
-
189
- # self.dataset_len = dataset_len
190
- self.dataset_len = len(self.datas)
191
- logger.info('preparing labels')
192
- logger.info('========dataset len: {}=========='.format(self.dataset_len))
193
- if store_labels:
194
- if self.npmemmap:
195
- self.label_list = [load_numpy_label(p+'.npy', inds, tot) for p in label_paths]
196
- else:
197
- self.label_list = [load_label(p, inds, tot) for p in label_paths]
198
- else:
199
- self.label_paths = label_paths
200
- # self.label_offsets_list = [
201
- # load_label_offset(p, inds, tot) for p in label_paths
202
- # ]
203
- assert label_processors is None or len(label_processors) == self.num_labels
204
-
205
-
206
- self.max_sample_size = (
207
- max_sample_size if max_sample_size is not None else sys.maxsize
208
- )
209
- self.pad_audio = pad_audio
210
- self.normalize = normalize
211
- logger.info(
212
- f"pad_audio={pad_audio}, random_crop={random_crop}, "
213
- f"normalize={normalize}, max_sample_size={self.max_sample_size}"
214
- )
215
-
216
- self.augmentation_effects = augmentation_effects
217
- self.augmentation_probs = augmentation_probs
218
-
219
-
220
- self.inbatch_noise_augment_len_range = inbatch_noise_augment_len_range
221
- self.inbatch_noise_augment_number_range = inbatch_noise_augment_number_range
222
- self.inbatch_noise_augment_volume = inbatch_noise_augment_volume
223
-
224
-
225
- self.cqt_prediction_bin = cqt_prediction_bin
226
- if self.cqt_prediction_bin > 0:
227
- self.encoder_cqt_model = model_cqt_pred(n_bins=self.cqt_prediction_bin)
228
- logger.info('preparing cqt loss objective in dataloader with cpu')
229
-
230
- self.epoch = -1
231
-
232
- self.reader = Read_and_PadCrop_Normalized_T(n_samples=clip_secs*sample_rate if clip_secs>0 else None, sample_rate = self.sample_rate)
233
-
234
-
235
-
236
- @property
237
- def can_reuse_epoch_itr_across_epochs(self):
238
- pass
239
- def set_epoch(self, epoch):
240
- pass
241
-
242
- def inbatch_noise_augment(self,
243
- target_audio: torch.Tensor, target_audio_idx: int ,
244
- batch_audios: torch.Tensor, # [bsz, audio_lengths]
245
- noise_len_min: int, noise_len_max: int,
246
- n_noise_min: int, n_noise_max: int,
247
- noise_vol: float = 1.0):
248
- pass
249
-
250
- def get_audio_by_slice(self,index):
251
- pass
252
- def get_audio(self, index):
253
- pass
254
-
255
- def get_label(self, index, label_idx):
256
- pass
257
-
258
- def get_labels(self, index):
259
- pass
260
-
261
- def __getitem__(self, i):
262
- pass
263
-
264
- def __len__(self):
265
- return self.dataset_len
266
-
267
- def crop_to_max_size(self, wav, target_size):
268
- pass
269
-
270
- def collater(self, samples):
271
- pass
272
-
273
- def collater_audio(self, audios, audio_size):
274
- pass
275
-
276
- def collater_frm_label(self, targets, audio_size, audio_starts, label_rate, pad):
277
- pass
278
-
279
- def collater_seq_label(self, targets, pad):
280
- pass
281
-
282
- def collater_label(self, targets_by_label, audio_size, audio_starts):
283
- pass
284
-
285
- def num_tokens(self, index):
286
- pass
287
-
288
- def size(self, index):
289
- pass
290
-
291
- def ordered_indices(self):
292
- pass
293
-
294
- def postprocess(self, wav, cur_sample_rate):
295
- pass
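
The manifest handling at the top of this file (load_audio_by_json) expects a JSONL file with one clip per line and at least a 'duration' field; clips shorter than min_keep samples are dropped, and sizes are pinned to clip_secs seconds when a clip length is set. A small self-contained sketch of that filtering, assuming only the 'duration' field shown above (any other field names would be assumptions):

import json

def load_manifest(json_path, sr, min_keep=None, clip_secs=5):
    datas, sizes = [], []
    with open(json_path) as fp:
        for line in fp:
            item = json.loads(line)
            if min_keep is not None and 'duration' in item and sr * item['duration'] < min_keep:
                continue  # drop clips shorter than min_keep samples
            datas.append(item)
            sizes.append(int(sr * (clip_secs if clip_secs > 0 else item['duration'])))
    return datas, sizes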
 
MuCodec/muq_dev/muq_fairseq/data/utils/data_utils.py DELETED
@@ -1,535 +0,0 @@
1
- # Copyright (c) Facebook, Inc. and its affiliates.
2
- #
3
- # This source code is licensed under the MIT license found in the
4
- # LICENSE file in the root directory of this source tree.
5
-
6
- import logging
7
- import math
8
- import numpy as np
9
- import torch
10
-
11
- from typing import Optional, Tuple
12
-
13
-
14
-
15
- logger = logging.getLogger(__name__)
16
-
17
-
18
-
19
- def compute_mask_indices(
20
- shape: Tuple[int, int],
21
- padding_mask: Optional[torch.Tensor],
22
- mask_prob: float,
23
- mask_length: int,
24
- mask_type: str = "static",
25
- mask_other: float = 0.0,
26
- min_masks: int = 0,
27
- no_overlap: bool = False,
28
- min_space: int = 0,
29
- require_same_masks: bool = True,
30
- mask_dropout: float = 0.0,
31
- add_masks: bool = False,
32
- seed: Optional[int] = None,
33
- epoch: Optional[int] = None,
34
- indices: Optional[torch.Tensor] = None,
35
- idc_select_ver: int = 1, # 2 to reproduce mask_tokens_dataset
36
- num_mask_ver: int = 2, # 2 to reproduce mask_tokens_dataset
37
- ) -> np.ndarray:
38
- """
39
- Computes random mask spans for a given shape
40
-
41
- Args:
42
- shape: the shape for which to compute masks.
43
- should be of size 2 where first element is batch size and 2nd is timesteps
44
- padding_mask: optional padding mask of the same size as shape, which will prevent masking padded elements
45
- mask_prob: probability for each token to be chosen as start of the span to be masked. this will be multiplied by
46
- number of timesteps divided by length of mask span to mask approximately this percentage of all elements.
47
- however due to overlaps, the actual number will be smaller (unless no_overlap is True)
48
- mask_type: how to compute mask lengths
49
- static = fixed size
50
- uniform = sample from uniform distribution [mask_other, mask_length*2]
51
- normal = sample from normal distribution with mean mask_length and stdev mask_other. mask is min 1 element
52
- poisson = sample from poisson distribution with lambda = mask_length
53
- min_masks: minimum number of masked spans
54
- no_overlap: if true, will switch to an alternative recursive algorithm that prevents spans from overlapping
55
- min_space: only used if no_overlap is True, this is how many elements to keep unmasked between spans
56
- require_same_masks: if true, will randomly drop out masks until same amount of masks remains in each sample
57
- mask_dropout: randomly dropout this percentage of masks in each example
58
- """
59
-
60
- bsz, all_sz = shape
61
- mask = np.full((bsz, all_sz), False)
62
-
63
- if num_mask_ver == 1:
64
- all_num_mask = int(
65
- # add a random number for probabilistic rounding
66
- mask_prob * all_sz / float(mask_length)
67
- + np.random.rand()
68
- )
69
- all_num_mask = max(min_masks, all_num_mask)
70
-
71
- mask_idcs = []
72
- for i in range(bsz):
73
- if seed is not None and epoch is not None and indices is not None:
74
- seed_i = int(hash((seed, epoch, indices[i].item())) % 1e6)
75
- else:
76
- seed_i = None
77
-
78
- rng = np.random.default_rng(seed_i)
79
-
80
- if padding_mask is not None:
81
- sz = all_sz - padding_mask[i].long().sum().item()
82
- assert sz >= 0, sz
83
- else:
84
- sz = all_sz
85
-
86
- if num_mask_ver == 1:
87
- if padding_mask is not None:
88
- num_mask = int(
89
- # add a random number for probabilistic rounding
90
- mask_prob * sz / float(mask_length)
91
- + np.random.rand()
92
- )
93
- num_mask = max(min_masks, num_mask)
94
- else:
95
- num_mask = all_num_mask
96
- elif num_mask_ver == 2:
97
- num_mask = int(
98
- # add a random number for probabilistic rounding
99
- mask_prob * sz / float(mask_length)
100
- + rng.random()
101
- )
102
- num_mask = max(min_masks, num_mask)
103
- else:
104
- raise ValueError()
105
-
106
- if mask_type == "static":
107
- lengths = np.full(num_mask, mask_length)
108
- elif mask_type == "uniform":
109
- lengths = rng.randint(mask_other, mask_length * 2 + 1, size=num_mask)
110
- elif mask_type == "normal":
111
- lengths = rng.normal(mask_length, mask_other, size=num_mask)
112
- lengths = [max(1, int(round(x))) for x in lengths]
113
- elif mask_type == "poisson":
114
- lengths = rng.poisson(mask_length, size=num_mask)
115
- lengths = [int(round(x)) for x in lengths]
116
- else:
117
- raise Exception("unknown mask selection " + mask_type)
118
-
119
- if sum(lengths) == 0:
120
- if mask_type == "static":
121
- raise ValueError("this should never happen")
122
- else:
123
- lengths = [min(mask_length, sz - 1)]
124
-
125
- if no_overlap:
126
- mask_idc = []
127
-
128
- def arrange(s, e, length, keep_length):
129
- span_start = rng.randint(s, e - length)
130
- mask_idc.extend(span_start + i for i in range(length))
131
-
132
- new_parts = []
133
- if span_start - s - min_space >= keep_length:
134
- new_parts.append((s, span_start - min_space + 1))
135
- if e - span_start - length - min_space > keep_length:
136
- new_parts.append((span_start + length + min_space, e))
137
- return new_parts
138
-
139
- parts = [(0, sz)]
140
- min_length = min(lengths)
141
- for length in sorted(lengths, reverse=True):
142
- lens = np.fromiter(
143
- (e - s if e - s >= length + min_space else 0 for s, e in parts),
144
- np.int,
145
- )
146
- l_sum = np.sum(lens)
147
- if l_sum == 0:
148
- break
149
- probs = lens / np.sum(lens)
150
- c = rng.choice(len(parts), p=probs)
151
- s, e = parts.pop(c)
152
- parts.extend(arrange(s, e, length, min_length))
153
- mask_idc = np.asarray(mask_idc)
154
- else:
155
- if idc_select_ver == 1:
156
- min_len = min(lengths)
157
- if sz - min_len <= num_mask:
158
- min_len = sz - num_mask - 1
159
- mask_idc = rng.choice(sz - min_len, num_mask, replace=False)
160
- elif idc_select_ver == 2:
161
- mask_idc = rng.choice(sz, num_mask, replace=False)
162
- else:
163
- raise ValueError()
164
-
165
- mask_idc = np.asarray(
166
- [
167
- mask_idc[j] + offset
168
- for j in range(len(mask_idc))
169
- for offset in range(lengths[j])
170
- ]
171
- )
172
-
173
- mask_idc = np.unique(mask_idc[mask_idc < sz])
174
- if len(mask_idc) >= sz:
175
- raise ValueError(
176
- (
177
- f"the entire sequence is masked. "
178
- f"sz={sz}; mask_idc[mask_idc]; "
179
- f"index={indices[i] if indices is not None else None}"
180
- )
181
- )
182
- mask_idcs.append(mask_idc)
183
-
184
- target_len = None
185
- if require_same_masks:
186
- if add_masks:
187
- target_len = max([len(m) for m in mask_idcs])
188
- else:
189
- target_len = min([len(m) for m in mask_idcs])
190
-
191
- for i, mask_idc in enumerate(mask_idcs):
192
- if target_len is not None and len(mask_idc) > target_len:
193
- mask_idc = rng.choice(mask_idc, target_len, replace=False)
194
-
195
- mask[i, mask_idc] = True
196
-
197
- if target_len is not None and len(mask_idc) < target_len:
198
- unmasked = np.flatnonzero(~mask[i])
199
- to_mask = rng.choice(unmasked, target_len - len(mask_idc), replace=False)
200
- mask[i, to_mask] = True
201
-
202
- if mask_dropout > 0:
203
- masked = np.flatnonzero(mask[i])
204
- num_holes = np.rint(len(masked) * mask_dropout).astype(int)
205
- to_drop = rng.choice(masked, num_holes, replace=False)
206
- mask[i, to_drop] = False
207
-
208
- return mask
209
-
210
-
211
- def compute_block_mask_2d(
212
- shape: Tuple[int, int],
213
- mask_prob: float,
214
- mask_length: int,
215
- mask_prob_adjust: float = 0,
216
- inverse_mask: bool = False,
217
- require_same_masks: bool = True,
218
- expand_adjcent: bool = False,
219
- mask_dropout: float = 0,
220
- non_overlapping: bool = False,
221
- img_shape: tuple = None, # For the situation when d[0] != d[1], especially for audio spectrogram shapes
222
- flexible_mask: bool = False,
223
- ) -> torch.Tensor:
224
-
225
- assert mask_length > 1
226
-
227
- B, L = shape
228
-
229
- d = (int(L**0.5),int(L**0.5))
230
-
231
- if img_shape:
232
- d = (img_shape[0],img_shape[1])
233
-
234
- if flexible_mask:
235
- index = np.random.randint(0,3)
236
- block_size_options = np.array([(6, 4), (5, 5), (8, 3)])
237
- block_size = block_size_options[index]
238
-
239
- if inverse_mask:
240
- mask_prob = 1 - mask_prob
241
-
242
- if flexible_mask:
243
- mask = torch.zeros((B, d[0], d[1]))
244
- mask_inds = torch.randint(
245
- 0,
246
- L,
247
- size=(
248
- B,
249
- int(
250
- L
251
- * ((mask_prob + mask_prob_adjust) / (block_size[0]*block_size[1]))
252
- * (1 + mask_dropout)
253
- ),
254
- ),
255
- )
256
- mask.view(B, -1).scatter_(1, mask_inds, 1)
257
- centers = mask.nonzero(as_tuple=True)
258
-
259
- inds = ([], [], [])
260
-
261
- offset = mask_length // 2
262
- for i in range(block_size[0]):
263
- for j in range(block_size[1]):
264
- k1 = i - offset
265
- k2 = j - offset
266
- inds[0].append(centers[0])
267
- inds[1].append(centers[1] + k1)
268
- inds[2].append(centers[2] + k2)
269
-
270
- i0 = torch.cat(inds[0])
271
- i1 = torch.cat(inds[1]).clamp_(min=0, max=d[0] - 1)
272
- i2 = torch.cat(inds[2]).clamp_(min=0, max=d[1] - 1)
273
-
274
- mask[(i0, i1, i2)] = 1
275
-
276
- elif non_overlapping:
277
- sz = math.ceil(d[0] / mask_length)
278
- inp_len = sz * sz
279
-
280
- inp = torch.zeros((B, 1, sz, sz))
281
- w = torch.ones((1, 1, mask_length, mask_length))
282
-
283
- mask_inds = torch.multinomial(
284
- 1 - inp.view(B, -1),
285
- int(inp_len * (mask_prob + mask_prob_adjust) * (1 + mask_dropout)),
286
- replacement=False,
287
- )
288
- inp.view(B, -1).scatter_(1, mask_inds, 1)
289
-
290
- mask = torch.nn.functional.conv_transpose2d(inp, w, stride=mask_length).squeeze(
291
- 1
292
- )
293
- if mask.size(-1) > d[0]:
294
- mask = mask[..., :d, :d]
295
- else:
296
- mask = torch.zeros((B, d[0], d[1]))
297
- mask_inds = torch.randint(
298
- 0,
299
- L,
300
- size=(
301
- B,
302
- int(
303
- L
304
- * ((mask_prob + mask_prob_adjust) / mask_length**2)
305
- * (1 + mask_dropout)
306
- ),
307
- ),
308
- )
309
- mask.view(B, -1).scatter_(1, mask_inds, 1)
310
- centers = mask.nonzero(as_tuple=True)
311
-
312
- inds = ([], [], [])
313
-
314
- offset = mask_length // 2
315
- for i in range(mask_length):
316
- for j in range(mask_length):
317
- k1 = i - offset
318
- k2 = j - offset
319
- inds[0].append(centers[0])
320
- inds[1].append(centers[1] + k1)
321
- inds[2].append(centers[2] + k2)
322
-
323
- i0 = torch.cat(inds[0])
324
- i1 = torch.cat(inds[1]).clamp_(min=0, max=d[0] - 1)
325
- i2 = torch.cat(inds[2]).clamp_(min=0, max=d[1] - 1)
326
-
327
- mask[(i0, i1, i2)] = 1
328
-
329
- def get_nbs(b, m, w):
330
- all_nbs = torch.nn.functional.conv2d(m.unsqueeze(1), w, padding="same")
331
- all_nbs = all_nbs.clamp_max_(1).view(b, -1)
332
- return all_nbs
333
-
334
- if require_same_masks and expand_adjcent:
335
- w = torch.zeros((1, 1, 3, 3))
336
- w[..., 0, 1] = 1
337
- w[..., 2, 1] = 1
338
- w[..., 1, 0] = 1
339
- w[..., 1, 2] = 1
340
-
341
- all_nbs = get_nbs(B, mask, w)
342
-
343
- mask = mask.reshape(B, -1)
344
-
345
- if require_same_masks:
346
- n_masks = mask.sum(dim=-1)
347
- final_target_len = int(L * (mask_prob))
348
- target_len = int(final_target_len * (1 + mask_dropout))
349
-
350
- for i in range(len(mask)):
351
- n = n_masks[i]
352
- m = mask[i]
353
- r = 0
354
- while expand_adjcent and n < target_len:
355
- if r == 0:
356
- nbs = all_nbs[i]
357
- else:
358
- nbs = get_nbs(1, m.view(1, d[0], d[1]), w).flatten()
359
-
360
- cands = (1 - m + nbs) > 1
361
- cand_sz = int(cands.sum().item())
362
-
363
- assert cand_sz > 0, f"{nbs} {cand_sz}"
364
-
365
- to_mask = torch.multinomial(
366
- cands.float(), min(cand_sz, int(target_len - n)), replacement=False
367
- )
368
- m[to_mask] = 1
369
- assert to_mask.numel() > 0
370
- n += to_mask.numel()
371
- r += 1
372
-
373
- if n > final_target_len:
374
- to_unmask = torch.multinomial(
375
- m, int(n - final_target_len), replacement=False
376
- )
377
- m[to_unmask] = 0
378
- elif n < final_target_len:
379
- to_mask = torch.multinomial(
380
- (1 - m), int(final_target_len - n), replacement=False
381
- )
382
- m[to_mask] = 1
383
-
384
- if inverse_mask:
385
- mask = 1 - mask
386
-
387
- return mask
388
-
389
-
390
- def compute_block_mask_1d(
391
- shape: Tuple[int, int],
392
- mask_prob: float,
393
- mask_length: int,
394
- mask_prob_adjust: float = 0,
395
- inverse_mask: bool = False,
396
- require_same_masks: bool = True,
397
- expand_adjcent: bool = False,
398
- mask_dropout: float = 0,
399
- non_overlapping: bool = False,
400
- ) -> torch.Tensor:
401
-
402
- B, L = shape
403
-
404
- if inverse_mask:
405
- mask_prob = 1 - mask_prob
406
-
407
- if non_overlapping:
408
- sz = math.ceil(L / mask_length)
409
-
410
- inp = torch.zeros((B, 1, sz))
411
- w = torch.ones((1, 1, mask_length))
412
-
413
- mask_inds = torch.multinomial(
414
- 1 - inp.view(B, -1),
415
- int(sz * (mask_prob + mask_prob_adjust) * (1 + mask_dropout)),
416
- replacement=False,
417
- )
418
- inp.view(B, -1).scatter_(1, mask_inds, 1)
419
-
420
- mask = torch.nn.functional.conv_transpose1d(inp, w, stride=mask_length).squeeze(
421
- 1
422
- )
423
- if mask.size(-1) > L:
424
- mask = mask[..., :L]
425
-
426
- else:
427
- mask = torch.zeros((B, L))
428
- mask_inds = torch.randint(
429
- 0,
430
- L,
431
- size=(
432
- B,
433
- int(
434
- L
435
- * ((mask_prob + mask_prob_adjust) / mask_length)
436
- * (1 + mask_dropout)
437
- ),
438
- ),
439
- )
440
-
441
- mask.view(B, -1).scatter_(1, mask_inds, 1)
442
- centers = mask.nonzero(as_tuple=True)
443
-
444
- inds = ([], [])
445
-
446
- offset = mask_length // 2
447
- for i in range(mask_length):
448
- k1 = i - offset
449
- inds[0].append(centers[0])
450
- inds[1].append(centers[1] + k1)
451
-
452
- i0 = torch.cat(inds[0])
453
- i1 = torch.cat(inds[1]).clamp_(min=0, max=L - 1)
454
-
455
- mask[(i0, i1)] = 1
456
-
457
- def get_nbs(b, m, w):
458
- all_nbs = torch.nn.functional.conv1d(m.unsqueeze(1), w, padding="same")
459
- all_nbs = all_nbs.clamp_max_(1).view(b, -1)
460
- return all_nbs
461
-
462
- if require_same_masks and expand_adjcent:
463
- w = torch.ones((1, 1, 3))
464
- w[..., 1] = 0
465
- all_nbs = get_nbs(B, mask, w)
466
-
467
- mask = mask.view(B, -1)
468
-
469
- if require_same_masks:
470
- n_masks = mask.sum(dim=-1)
471
- final_target_len = int(L * (mask_prob))
472
- target_len = int(final_target_len * (1 + mask_dropout))
473
-
474
- for i in range(len(mask)):
475
- n = n_masks[i]
476
- m = mask[i]
477
- r = 0
478
- while expand_adjcent and n < target_len:
479
- if r == 0:
480
- nbs = all_nbs[i]
481
- else:
482
- nbs = get_nbs(1, m.unsqueeze(0), w).squeeze(0)
483
-
484
- cands = (1 - m + nbs) > 1
485
- cand_sz = int(cands.sum().item())
486
-
487
- assert cand_sz > 0, f"{nbs} {cand_sz}"
488
-
489
- to_mask = torch.multinomial(
490
- cands.float(), min(cand_sz, int(target_len - n)), replacement=False
491
- )
492
- m[to_mask] = 1
493
- assert to_mask.numel() > 0
494
- n += to_mask.numel()
495
- r += 1
496
-
497
- if n > final_target_len:
498
- to_unmask = torch.multinomial(
499
- m, int(n - final_target_len), replacement=False
500
- )
501
- m[to_unmask] = 0
502
- elif n < final_target_len:
503
- to_mask = torch.multinomial(
504
- (1 - m), int(final_target_len - n), replacement=False
505
- )
506
- m[to_mask] = 1
507
-
508
- if inverse_mask:
509
- mask = 1 - mask
510
-
511
- return mask
512
-
513
-
514
- def get_buckets(sizes, num_buckets):
515
- buckets = np.unique(
516
- np.percentile(
517
- sizes,
518
- np.linspace(0, 100, num_buckets + 1),
519
- interpolation="lower",
520
- )[1:]
521
- )
522
- return buckets
523
-
524
-
525
- def get_bucketed_sizes(orig_sizes, buckets):
526
- sizes = np.copy(orig_sizes)
527
- assert np.min(sizes) >= 0
528
- start_val = -1
529
- for end_val in buckets:
530
- mask = (sizes > start_val) & (sizes <= end_val)
531
- sizes[mask] = end_val
532
- start_val = end_val
533
- return sizes
534
-
535
-
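
compute_mask_indices above is the piece most callers interact with. An illustrative call, assuming the function is importable from this module (the numbers are arbitrary):

# mask roughly 65% of a (2, 100) batch with 10-step spans, at least 2 spans per sample
mask = compute_mask_indices(
    shape=(2, 100),
    padding_mask=None,
    mask_prob=0.65,
    mask_length=10,
    mask_type="static",
    min_masks=2,
)
# boolean np.ndarray of shape (2, 100); per-sample counts are equal because
# require_same_masks defaults to True
print(mask.shape, mask.sum(axis=-1))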
 
MuCodec/muq_dev/muq_fairseq/models/muq/__init__.py DELETED
@@ -1 +0,0 @@
1
- from .muq_model import *
 
 
MuCodec/muq_dev/muq_fairseq/models/muq/model/__init__.py DELETED
@@ -1,2 +0,0 @@
1
-
2
-
 
 
 
MuCodec/muq_dev/muq_fairseq/models/muq/model/muq.py DELETED
@@ -1,520 +0,0 @@
1
- import json
2
- import random
3
- import torch
4
- from torch import nn
5
- from einops import rearrange
6
- import os
7
- from fairseq.data.data_utils import compute_mask_indices
8
- from fairseq.models.wav2vec.wav2vec2 import ConvFeatureExtractionModel
9
- from fairseq.modules import LayerNorm
10
-
11
- try:
12
- from ..modules.random_quantizer import RandomProjectionQuantizer
13
- from ..modules.features import MelSTFT
14
- from ..modules.conv import Conv2dSubsampling
15
- except:
16
- import sys, os
17
- sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
18
- from modules.random_quantizer import RandomProjectionQuantizer
19
- from modules.features import MelSTFT
20
- from modules.conv import Conv2dSubsampling
21
-
22
-
23
- class MuQ(nn.Module):
24
- """
25
- MuQ
26
-
27
- Input: 128-band mel spectrogram
28
- Frontend: 2-layer Residual convolution
29
- Backend: 12-layer Conformer
30
- Quantizer: a codebook for mel spectrogram
31
- """
32
-
33
- def __init__(
34
- self,
35
- num_codebooks=1,
36
- codebook_dim=16,
37
- codebook_size=4096,
38
- features=["melspec_2048"],
39
- hop_length=240,
40
- n_mels=128,
41
- conv_dim=512,
42
- encoder_dim=1024,
43
- encoder_depth=12,
44
- mask_hop=0.4,
45
- mask_prob=0.6,
46
- is_flash=False,
47
- stat_path=None, #"./data/fma_stats.json",
48
- model_path=None, #"./data/pretrained_fma.pt",
49
- w2v2_config_path=None, #"facebook/wav2vec2-conformer-rope-large-960h-ft",
50
- use_rvq_target=False,
51
- use_vq_target=False,
52
- rvq_ckpt_path=None,
53
- recon_loss_ratio=None,
54
- label_rate=25,
55
- use_hubert_masking_strategy=False,
56
- use_hubert_featurizer=False,
57
- hubert_conv_feature_layers="[(512,10,5)] + [(512,3,2)] * 3 + [(512,3,3)] + [(512,2,2)] * 2",
58
- use_hubert_nce_loss=False,
59
- hubert_final_dim=256,
60
- rvq_n_codebooks=8,
61
- rvq_multi_layer_num=1,
62
- use_encodec_target=False,
63
- ):
64
- super(MuQ, self).__init__()
65
-
66
- # global variables
67
- self.hop_length = hop_length
68
- self.mask_hop = mask_hop
69
- self.mask_prob = mask_prob
70
- self.num_codebooks = num_codebooks
71
- self.codebook_size = codebook_size
72
- self.features = features
73
- self.recon_loss_ratio = recon_loss_ratio
74
- self.n_fold = int(100//label_rate)
75
- self.label_rate = label_rate
76
- self.use_hubert_masking_strategy = use_hubert_masking_strategy
77
- self.use_hubert_featurizer = use_hubert_featurizer
78
- self.use_hubert_nce_loss = use_hubert_nce_loss
79
-
80
- # load feature mean / std stats
81
- import os
82
- if stat_path is not None and os.path.exists(stat_path):
83
- with open(stat_path, "r") as f:
84
- self.stat = json.load(f)
85
- else:
86
- # print("No stats file found at `{}`, use default from msd.".format(stat_path))
87
- self.stat = {"spec_256_cnt": 14394344256, "spec_256_mean": -23.34296658431829, "spec_256_std": 26.189295587132637, "spec_512_cnt": 28677104448, "spec_512_mean": -21.31267396860235, "spec_512_std": 26.52644536245769, "spec_1024_cnt": 57242624832, "spec_1024_mean": -18.852271129208273, "spec_1024_std": 26.443154583585663, "spec_2048_cnt": 114373665600, "spec_2048_mean": -15.638743433896792, "spec_2048_std": 26.115825961611545, "spec_4096_cnt": 228635747136, "spec_4096_mean": -11.715532502794836, "spec_4096_std": 25.763972210234062, "melspec_256_cnt": 14282760192, "melspec_256_mean": -26.962600400166156, "melspec_256_std": 36.13614100912126, "melspec_512_cnt": 14282760192, "melspec_512_mean": -9.108344167718862, "melspec_512_std": 24.71910937988429, "melspec_1024_cnt": 14282760192, "melspec_1024_mean": 0.37302579246531126, "melspec_1024_std": 18.684082325919388, "melspec_2048_cnt": 14282760192, "melspec_2048_mean": 6.768444971712967, "melspec_2048_std": 18.417922652295623, "melspec_4096_cnt": 14282760192, "melspec_4096_mean": 13.617164614990036, "melspec_4096_std": 18.08552130124525, "cqt_cnt": 9373061376, "cqt_mean": 0.46341379757927165, "cqt_std": 0.9543998080910191, "mfcc_256_cnt": 1339008768, "mfcc_256_mean": -11.681755459447485, "mfcc_256_std": 29.183186444668316, "mfcc_512_cnt": 1339008768, "mfcc_512_mean": -2.540581461792183, "mfcc_512_std": 31.93752185832081, "mfcc_1024_cnt": 1339008768, "mfcc_1024_mean": 6.606636263169779, "mfcc_1024_std": 34.151644801729624, "mfcc_2048_cnt": 1339008768, "mfcc_2048_mean": 5.281600844245184, "mfcc_2048_std": 33.12784541220003, "mfcc_4096_cnt": 1339008768, "mfcc_4096_mean": 4.7616569480166095, "mfcc_4096_std": 32.61458906894133, "chromagram_256_cnt": 1339008768, "chromagram_256_mean": 55.15596556703181, "chromagram_256_std": 73.91858278719991, "chromagram_512_cnt": 1339008768, "chromagram_512_mean": 175.73092252759895, "chromagram_512_std": 248.48485148525953, "chromagram_1024_cnt": 1339008768, "chromagram_1024_mean": 589.2947481634608, "chromagram_1024_std": 913.857929063196, "chromagram_2048_cnt": 1339008768, "chromagram_2048_mean": 2062.286388327397, "chromagram_2048_std": 3458.92657915397, "chromagram_4096_cnt": 1339008768, "chromagram_4096_mean": 7673.039107997085, "chromagram_4096_std": 13009.883158267234}
88
-
89
- # feature extractor
90
- self.preprocessor_melspec_2048 = MelSTFT(
91
- n_fft=2048, hop_length=hop_length, is_db=True
92
- )
93
-
94
- # random quantizer
95
- self.use_rvq_target = use_rvq_target
96
- self.use_vq_target = use_vq_target
97
- self.use_encodec_target = use_encodec_target
98
-
99
- seed = 142
100
- if self.use_rvq_like_target:
101
- if use_rvq_target:
102
- try:
103
- from .rvq_muq import ResidualVectorQuantize
104
- except:
105
- import sys, os
106
- sys.path.append(os.path.dirname(os.path.abspath(__file__)))
107
- from rvq_muq import ResidualVectorQuantize
108
-
109
- inp_dim = 128*self.n_fold
110
- self.rvq = ResidualVectorQuantize(
111
- input_dim = inp_dim,
112
- n_codebooks = rvq_n_codebooks,
113
- codebook_size = 1024,
114
- codebook_dim = 16,
115
- quantizer_dropout = 0.0,
116
- use_multi_layer_num = rvq_multi_layer_num,
117
- )
118
- elif use_vq_target:
119
- try:
120
- from .rvq_muq import VectorQuantize
121
- except:
122
- import sys, os
123
- sys.path.append(os.path.dirname(os.path.abspath(__file__)))
124
- from rvq_muq import VectorQuantize
125
-
126
- self.rvq = VectorQuantize(
127
- input_dim = 128*self.n_fold,
128
- codebook_size = 1024,
129
- codebook_dim = 8,
130
- stale_tolerance = 1000,
131
- mfcc_clustering = False
132
- )
133
- elif use_encodec_target:
134
- from encodec import EncodecModel
135
- self.rvq = EncodecModel.encodec_model_24khz()
136
- self.rvq.set_target_bandwidth(6.0)
137
- for param in self.rvq.parameters():
138
- param.requires_grad = False
139
-
140
- import os
141
- if rvq_ckpt_path is not None and os.path.exists(rvq_ckpt_path):
142
- state_dict = torch.load(rvq_ckpt_path, map_location="cpu")
143
- self.rvq.load_state_dict(state_dict)
144
- else:
145
- print(f'Checkpoint for rvq `{rvq_ckpt_path}` not found. Using random initialization.')
146
- else:
147
- for feature in self.features:
148
- for i in range(num_codebooks):
149
- setattr(
150
- self,
151
- f"quantizer_{feature}", # _{i}
152
- RandomProjectionQuantizer(
153
- n_mels * self.n_fold, codebook_dim, codebook_size, seed=seed + i
154
- ),
155
- )
156
-
157
- if use_hubert_masking_strategy:
158
- self.mask_emb = nn.Parameter(
159
- torch.FloatTensor(encoder_dim).uniform_()
160
- )
161
-
162
- if use_hubert_featurizer:
163
- feature_enc_layers = eval(hubert_conv_feature_layers) # noqa
164
- hubert_feat_embed = feature_enc_layers[-1][0]
165
- self.hubert_feature_extractor = ConvFeatureExtractionModel(
166
- conv_layers=feature_enc_layers,
167
- dropout=0.0,
168
- mode='default', #cfg.extractor_mode,
169
- conv_bias=False, #cfg.conv_bias,
170
- )
171
- self.post_extract_proj = (
172
- nn.Linear(hubert_feat_embed, encoder_dim)
173
- if hubert_feat_embed != encoder_dim
174
- else None
175
- )
176
- self.layer_norm = LayerNorm(hubert_feat_embed)
177
- else:
178
- # two residual convolution layers + one projection layer
179
- strides_factory = {
180
- 4: [2, 2],
181
- 2: [2, 1]
182
- }
183
- self.conv = Conv2dSubsampling(
184
- 1, conv_dim, encoder_dim, strides=strides_factory.get(self.n_fold), n_bands=n_mels
185
- )
186
-
187
- # Conformer
188
- if is_flash:
189
- from modules.flash_conformer import (
190
- Wav2Vec2ConformerEncoder,
191
- Wav2Vec2ConformerConfig,
192
- )
193
- else:
194
- from transformers.models.wav2vec2_conformer.modeling_wav2vec2_conformer import (
195
- Wav2Vec2ConformerEncoder,
196
- Wav2Vec2ConformerConfig,
197
- )
198
- import os
199
- if w2v2_config_path is None or not os.path.exists(w2v2_config_path):
200
- w2v2_config_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "w2v2_config.json")
201
- print("load w2v2 config from:", w2v2_config_path)
202
- config = Wav2Vec2ConformerConfig.from_pretrained(
203
- w2v2_config_path
204
- )
205
- config.num_hidden_layers = encoder_depth
206
- config.hidden_size = encoder_dim
207
-
208
- self.conformer = Wav2Vec2ConformerEncoder(config)
209
-
210
- if self.use_hubert_nce_loss:
211
- self.label_embs_concat = nn.Parameter(
212
- torch.FloatTensor(codebook_size, hubert_final_dim)
213
- ) # embeddings of codes
214
- nn.init.uniform_(self.label_embs_concat)
215
- self.linear = nn.Linear(encoder_dim, hubert_final_dim) # final_proj
216
- else:
217
- # projection
218
- self.linear = nn.Linear(encoder_dim, codebook_size) # N_SubSpec=8
219
-
220
- # reconstruct melspec
221
- if self.recon_loss_ratio is not None and self.recon_loss_ratio > 0:
222
- self.recon_proj = nn.Linear(encoder_dim, n_mels * self.n_fold)
223
- self.recon_loss = nn.MSELoss()
224
-
225
- # loss function
226
- self.loss = nn.CrossEntropyLoss()
227
-
228
- # cls token (used for sequence classification)
229
- random.seed(seed)
230
- self.cls_token = nn.Parameter(torch.randn(encoder_dim))
231
-
232
- # load model
233
- if model_path:
234
- S = torch.load(model_path)["state_dict"]
235
- SS = {k[6:]: v for k, v in S.items()}
236
- SS['quantizer_melspec_2048.random_projection'] = SS['quantizer_melspec_2048_0.random_projection']
237
- SS['quantizer_melspec_2048.codebook'] = SS['quantizer_melspec_2048_0.codebook']
238
- del SS['quantizer_melspec_2048_0.random_projection']
239
- del SS['quantizer_melspec_2048_0.codebook']
240
- unmatch = self.load_state_dict(SS, strict=False)
241
- if len(unmatch.missing_keys) > 0:
242
- print(f'Missing keys: {unmatch.missing_keys}')
243
-
244
- @property
245
- def use_rvq_like_target(self):
246
- return self.use_rvq_target or self.use_vq_target or self.use_encodec_target
247
-
248
-
249
- def apply_hubert_mask(self, x, padding_mask=None, target_list=None):
250
- B, T, C = x.shape
251
- if self.mask_prob > 0:
252
- mask_length = int(self.mask_hop / (1/self.label_rate))
253
- mask_indices = compute_mask_indices(
254
- (B, T),
255
- padding_mask,
256
- self.mask_prob,
257
- mask_length, # self.mask_length,
258
- "static", #self.mask_selection,
259
- 0, #self.mask_other,
260
- min_masks=2,
261
- no_overlap=False, #self.no_mask_overlap,
262
- min_space=1, #self.mask_min_space,
263
- )
264
- mask_indices = torch.from_numpy(mask_indices).to(x.device)
265
- x[mask_indices] = self.mask_emb
266
- mask_indices = torch.nonzero(mask_indices)
267
- else:
268
- mask_indices = None
269
-
270
- return x, mask_indices
271
-
272
- def masking(self, x, attention_mask=None):
273
- """random masking of 400ms with given probability"""
274
- if self.use_hubert_masking_strategy:
275
- return x, None
276
- mx = x.clone()
277
- b, t = mx.shape
278
- len_masking_raw = int(24000 * self.mask_hop) # 9600 = 24000 * 0.4
279
- len_masking_token = int(24000 / self.hop_length / 2 / 2 * self.mask_hop) # 10 = 25Hz * 0.4
280
-
281
- # get random mask indices
282
- start_indices = torch.rand(b, t // len_masking_raw) < self.mask_prob
283
- time_domain_masked_indices = torch.nonzero(
284
- start_indices.repeat_interleave(len_masking_raw, dim=1)
285
- )
286
- token_domain_masked_indices = torch.nonzero(
287
- start_indices.repeat_interleave(len_masking_token, dim=1)
288
- )
289
-
290
- # mask with random values
291
- masking_noise = (
292
- torch.randn(time_domain_masked_indices.shape[0], dtype=x.dtype) * 0.1
293
- ) # 0 mean 0.1 std
294
- mx[tuple(time_domain_masked_indices.t())] = masking_noise.to(x.device)
295
-
296
- return mx, token_domain_masked_indices
297
-
298
-
299
- @torch.no_grad()
300
- def preprocessing(self, x, features):
301
- """extract classic audio features"""
302
- # check precision
303
- if x.dtype == torch.float16 or x.dtype == torch.bfloat16:
304
- precision = 16
305
- else:
306
- precision = 32
307
-
308
- out = {}
309
- for key in features:
310
- layer = getattr(self, "preprocessor_%s" % key)
311
- layer.to(x.device)
312
- dtype = x.dtype
313
- out[key] = layer.float()(x.float())[..., :-1]
314
- if precision == 16:
315
- out[key] = out[key].half()
316
- if out[key].dtype != dtype:
317
- out[key].to(dtype=dtype)
318
- return out
319
-
320
- def encoder(self, x, *, attention_mask=None, is_features_only=False):
321
- """2-layer conv + w2v-conformer"""
322
- if not self.use_hubert_featurizer:
323
- x = self.conv(x) # [3, 128, 3000] -> [3, 750, 1024]
324
- if self.training and self.use_hubert_masking_strategy and not is_features_only:
325
- x, mask_indices = self.apply_hubert_mask(x)
326
- else:
327
- mask_indices = None
328
- if attention_mask is None:
329
- out = self.conformer(x, output_hidden_states=True)
330
- else:
331
- attention_mask = attention_mask.bool()
332
- skip_n = int(attention_mask.size(-1) / x.size(1))
333
- attention_mask = attention_mask[:, ::skip_n]
334
- attention_mask = attention_mask[:, :x.size(1)]
335
- out = self.conformer(x, attention_mask=attention_mask, output_hidden_states=True)
336
- hidden_emb = out["hidden_states"]
337
- last_emb = out["last_hidden_state"]
338
- logits = self.linear(last_emb)
339
- interval = self.codebook_size
340
- logits = {
341
- key: logits[:, :, i * interval : (i + 1) * interval]
342
- for i, key in enumerate(self.features)
343
- }
344
- return logits, hidden_emb, mask_indices
345
-
346
- @torch.no_grad()
347
- def normalize(self, x):
348
- """normalize the input audio to have zero mean unit variance"""
349
- for key in x.keys():
350
- x[key] = (x[key] - self.stat["%s_mean" % key]) / self.stat["%s_std" % key] # {'melspec_2048_cnt': 14282760192, 'melspec_2048_mean': 6.768444971712967}
351
- return x
352
-
353
- @torch.no_grad()
354
- def rearrange(self, x):
355
- """rearrange the batch to flatten every 4 steps"""
356
- for key in x.keys():
357
- if key == "chromagram":
358
- x[key] = rearrange(x[key], "b f t -> b t f")
359
- else:
360
- x[key] = rearrange(x[key], "b f (t s) -> b t (s f)", s=self.n_fold)
361
- return x
362
-
363
- def get_rvq_codes(self, inp, raw_wav):
364
- if self.use_rvq_target:
365
- quantized_prompt_embeds, codes, _, commitment_loss, codebook_loss, rvq_usage = self.rvq(inp)
366
- return codes
367
- if self.use_vq_target:
368
- quantized_prompt_embeds, commitment_loss, codebook_loss, codes, _ = self.rvq(inp)
369
- return codes.unsqueeze(1)
370
- if self.use_encodec_target:
371
- encoded_frames = self.rvq.encode(raw_wav.unsqueeze(1)) #list, B,[ 8,T ]
372
- codes = torch.cat([encoded[0].detach() for encoded in encoded_frames], dim=-1)
373
- if self.label_rate == 25:
374
- codes = codes[:, :, ::3]
375
- return codes
376
-
377
- @torch.no_grad()
378
- def tokenize(self, x, raw_wav):
379
- out = {}
380
- for key in x.keys():
381
- if self.use_rvq_like_target:
382
- self.rvq.eval()
383
- inp = x[key].permute((0, 2, 1))
384
- codes = self.get_rvq_codes(inp, raw_wav)
385
- out[key] = torch.cat([codes[:, idx, ...] for idx in range(int(self.codebook_size//1024))], dim=-1) # (when use freq mask)->[Batch, N_SubSpec, SeqLen=8*750]
386
- else:
387
- layer = getattr(self, "quantizer_%s" % key)
388
- out[key] = layer(x[key])
389
- return out
390
-
391
- def to_spec_wise_quad(self, x):
392
- Batch, QuadSpec, Time = x.shape
393
- SubSpec, N_SubSpec = 16, 8
394
- assert 4 * SubSpec * N_SubSpec == QuadSpec == 4*128
395
- x = rearrange(x, "b (q n s) t -> b (q s) (n t)", q=4, n=N_SubSpec, s=SubSpec)
396
- return x # [Batch, SubSpec=16, N_SubSpec*Time=8*100Hz]
397
-
398
- def get_targets(self, x, label=None):
399
- if self.use_encodec_target:
400
- raw_x = x.clone()
401
- else:
402
- raw_x = None
403
- x = self.preprocessing(x, features=self.features) # -> {'melspec_2048': Tensor{Size([3, 128, 3000]) cuda:0 f32}}
404
- x = self.normalize(x)
405
- x = self.rearrange(x) # -> {'melspec_2048': Tensor{Size([3, 750, 512]) cuda:0 f32}}
406
- melspec = x['melspec_2048']
407
- if label is None:
408
- target_tokens = self.tokenize(x, raw_x) # -> {'melspec_2048': Tensor{Size([3, 750]) cuda:0 i64}}
409
- else:
410
- # print("use_target from label")
411
- target_tokens = {'melspec_2048': rearrange(label, "b n s -> b (n s)").long()}
412
- return target_tokens, melspec
413
-
414
- def get_predictions(self, x, *, mask=None, attention_mask=None, return_new_mask=False, is_features_only=False):
415
- # preprocessing
416
- if not self.use_hubert_featurizer:
417
- x = self.preprocessing(x, features=["melspec_2048"])
418
- x = self.normalize(x) # -> {'melspec_2048': Tensor{Size([3, 128, 3000]) cuda:0 f32}}
419
- else:
420
- features = self.hubert_feature_extractor(x)
421
- features = self.layer_norm(features.transpose(1, 2))
422
- if self.post_extract_proj is not None:
423
- features = self.post_extract_proj(features)
424
- x = {"melspec_2048": features}
425
-
426
- # encoding
427
- logits, hidden_emb, new_mask = self.encoder(x["melspec_2048"], attention_mask=attention_mask, is_features_only=is_features_only)
428
-
429
- if return_new_mask:
430
- return logits, hidden_emb, mask if new_mask is None else new_mask
431
- else:
432
- return logits, hidden_emb
433
-
434
- def get_latent(self, x, layer_ix=12):
435
- _, hidden_states = self.get_predictions(x)
436
- emb = hidden_states[layer_ix]
437
- return emb
438
-
439
- def compute_nce(self, x, pos, negs):
440
- neg_is_pos = (pos == negs).all(-1)
441
- pos = pos.unsqueeze(0)
442
- targets = torch.cat([pos, negs], dim=0)
443
-
444
- logits = torch.cosine_similarity(x.float(), targets.float(), dim=-1).type_as(x)
445
- logits /= 0.1
446
- if neg_is_pos.any():
447
- logits[1:][neg_is_pos] = float("-inf")
448
- logits = logits.transpose(0, 1) # (num_x, num_cls+1)
449
- return logits
450
-
451
- def compute_hubert_nce_loss(self, proj_xs, targets):
452
-
453
- label_embs_list = self.label_embs_concat.split(self.codebook_size, 0) # (self.num_classes, 0)
454
-
455
- def compute_pred(proj_x, target, label_embs):
456
- # compute logits for the i-th label set
457
- y = torch.index_select(label_embs, 0, target.long())
458
- negs = label_embs.unsqueeze(1).expand(-1, proj_x.size(0), -1)
459
- return self.compute_nce(proj_x, y, negs)
460
-
461
- logit_list = [
462
- compute_pred(proj_x, t, label_embs_list[i])
463
- for i, (proj_x, t) in enumerate(zip(proj_xs, targets))
464
- ]
465
-
466
- return sum(logit_list)
467
-
468
-
469
- def get_loss(self, logits, target_tokens, masked_indices):
470
- losses = {}
471
- accuracies = {}
472
- for key in logits.keys():
473
- if not self.use_rvq_like_target:
474
- masked_logits = logits[key][tuple(masked_indices.t())]
475
- masked_tokens = target_tokens[key][tuple(masked_indices.t())]
476
- else:
477
- Batch, SeqLen, N_Codebook_x_CodebookSize = logits[key].shape # CodebookSize=4096
478
- Batch, N_Codebook_x_SeqLen = target_tokens[key].shape # N_Codebook*SeqLen=4*750
479
- N_Codebook = int(N_Codebook_x_SeqLen // SeqLen)
480
- # print("not use_virtual, n codebook = ", N_Codebook)
481
- target_tokens[key] = rearrange(target_tokens[key], "b (n s) -> b s n", n=N_Codebook) # Batch, SeqLen=750, N_Codebook=4
482
- masked_logits = logits[key][tuple(masked_indices.t())]
483
- masked_tokens = target_tokens[key][tuple(masked_indices.t())]
484
- masked_logits = rearrange(masked_logits, "b (n c) -> (b n) c", n=N_Codebook)
485
- masked_tokens = rearrange(masked_tokens, "b n -> (b n)", n=N_Codebook)
486
-
487
- if self.use_hubert_nce_loss:
488
- losses[key] = self.compute_hubert_nce_loss(masked_logits, masked_tokens)
489
- else:
490
- losses[key] = self.loss(masked_logits, masked_tokens)
491
- accuracies[key] = (
492
- torch.sum(masked_logits.argmax(-1) == masked_tokens)
493
- / masked_tokens.numel()
494
- )
495
- return losses, accuracies
496
-
497
- def get_recon_loss(self, last_hidden_emb, melspec, masked_indices):
498
- pred_melspec = self.recon_proj(last_hidden_emb[tuple(masked_indices.t())])
499
- target_melspec = melspec[tuple(masked_indices.t())]
500
- recon_loss = self.recon_loss(pred_melspec, target_melspec)
501
- return recon_loss
502
-
503
- def forward(self, x, attention_mask=None, label=None):
504
- dtype = x.dtype
505
- # get target feature tokens
506
- target_tokens, melspec = self.get_targets(x, label=label)
507
-
508
- # masking
509
- x, masked_indices = self.masking(x, attention_mask=attention_mask)
510
-
511
- # forward
512
- logits, hidden_emb, masked_indices = self.get_predictions(x, mask=masked_indices, attention_mask=attention_mask, return_new_mask=True)
513
-
514
- # get loss
515
- losses, accuracies = self.get_loss(logits, target_tokens, masked_indices)
516
-
517
- if self.recon_loss_ratio:
518
- losses["recon_loss"] = self.get_recon_loss(hidden_emb[-1], melspec, masked_indices) * self.recon_loss_ratio
519
-
520
- return logits, hidden_emb, losses, accuracies
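
One detail of MuQ.rearrange above that is easy to miss: with label_rate=25 and 100 Hz mel frames, n_fold = 100 // 25 = 4, so every 4 consecutive 128-band frames are flattened into a single 512-dim vector at 25 Hz before quantization. A minimal sketch of that folding with einops (shapes follow the comments in the deleted code):

import torch
from einops import rearrange

n_fold = 4
mel = torch.randn(3, 128, 3000)          # [batch, n_mels, frames at 100 Hz]
folded = rearrange(mel, "b f (t s) -> b t (s f)", s=n_fold)
print(folded.shape)                      # torch.Size([3, 750, 512]), i.e. 25 Hz frames of 4*128 bands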
 
MuCodec/muq_dev/muq_fairseq/models/muq/model/pred_ark_target_with_model.py DELETED
@@ -1,151 +0,0 @@
1
- import sys
2
- import torch.nn as nn
3
- import torch
4
- import sys, os
5
- sys.path.append(os.path.dirname(os.path.abspath(__file__)))
6
- from rvq_musicfm import PreprocessorWithModel, ResidualVectorQuantize
7
-
8
- class RVQ(nn.Module):
9
- def __init__(self,
10
- model_config,
11
- rvq_ckpt_path,
12
- preprocess,
13
- ):
14
- super().__init__()
15
- self.rvq = ResidualVectorQuantize(**model_config)
16
- if rvq_ckpt_path is not None:
17
- self.rvq.load_state_dict(torch.load(rvq_ckpt_path, map_location='cpu'))
18
- self.preprocess = preprocess
19
-
20
- def get_targets(self, x):
21
- self.rvq.eval()
22
- x = self.preprocess(x)
23
- quantized_prompt_embeds, codes, _, commitment_loss, codebook_loss, rvq_usage = self.rvq(x)
24
- return codes.permute(1,0,2)
25
-
26
- @torch.no_grad()
27
- def encode_wavs(self, wavs):
28
- wavs = wavs[..., :int((wavs.shape[-1]//320)*320)]
29
- return self.get_targets(wavs)
30
-
31
- def This_Music_ModelTarget_Config():
32
- config = dict(
33
- model = dict(
34
- input_dim = 1024,
35
- n_codebooks = 8,
36
- codebook_size = 1024,
37
- codebook_dim = 16,
38
- quantizer_dropout = 0.0,
39
- ),
40
- train = dict(
41
- batch_size = 32,
42
- num_workers = 6,
43
- valid_interval = 10,
44
- save_interval = 100,
45
- max_updates = 500000,
46
- lr = 1e-4,
47
- # device = 'cuda:1',
48
- loss = 'commitment_loss * 0.25 + codebook_loss * 1.0 + (x - quantized_prompt_embeds).abs().mean()',
49
- preprocess = PreprocessorWithModel(
50
- model_dir= 'path/to/muq_fairseq',
51
- checkpoint_dir='path/to/muq_m4a_75K.pt',
52
- use_layer_idx=9,
53
- )
54
- ),
55
- pred = dict(
56
- rvq_ckpt_path='path/to/runs/Aug07_18-09-24_ts-828fa13e58384d0bba4144fda78ecc92-launcher/ckpt/RVQ_8100.pth',
57
- sr=24000,
58
- data_jsonl_path='path/to/data/music4all/train.json',
59
- save_target_dir= 'path/to/data/music4all_ark/reiter_musicssl_m4a',
60
- ),
61
- )
62
- return config
63
-
64
-
65
- CLEN = 30
66
- N_GPU_PER = 8
67
- N_NODE = 4
68
-
69
- def parse_lr(wave_length, sr):
70
- n_step = int( wave_length // (sr*CLEN) )
71
- if n_step == 0:
72
- n_step = 1
73
- print('wave_length: ', wave_length, 'sr: ', sr, 'n_step: ', n_step)
74
- starts = torch.arange(n_step) * CLEN * sr
75
- left_rights = torch.stack((starts, starts+CLEN*sr)).T
76
- return left_rights[:10, ...]
77
-
78
- @torch.no_grad()
79
- def main(index, rank):
80
- device = f'cuda:{rank}'
81
- config = This_Music_ModelTarget_Config()
82
- preprocess = config['train']['preprocess']
83
- model = RVQ(
84
- model_config = config['model'],
85
- rvq_ckpt_path = config['pred']['rvq_ckpt_path'],
86
- preprocess = preprocess
87
- ).to(device)
88
- model.eval()
89
- sr = config['pred']['sr']
90
-
91
- fname_nobase = os.path.basename(config['pred']['data_jsonl_path']).split('.')[0]
92
- scp_dir = os.path.join(config['pred']['save_target_dir'], 'scp')
93
- ark_dir = os.path.join(config['pred']['save_target_dir'], 'ark')
94
- os.makedirs(scp_dir, exist_ok=True)
95
- os.makedirs(ark_dir, exist_ok=True)
96
-
97
- scp_path = os.path.join(scp_dir, f'{fname_nobase}.{index}_{rank}.scp')
98
- ark_path = os.path.join(ark_dir, f'{fname_nobase}.{index}_{rank}.ark')
99
-
100
- from kaldiio import WriteHelper
101
-
102
- with open(config['pred']['data_jsonl_path']) as f:
103
- lines = f.readlines()
104
-
105
- print("Total:", len(lines))
106
-
107
- from tqdm import tqdm
108
- import json
109
- import librosa
110
- import time
111
- from einops import rearrange
112
- import numpy as np
113
-
114
- # lines = lines[(index*N_GPU_PER+rank)::(N_GPU_PER*N_NODE)]
115
-
116
- with WriteHelper(f'ark,scp:{ark_path},{scp_path}') as writer:
117
- for idx, line in tqdm(enumerate(lines)):
118
- try:
119
- if idx % (N_GPU_PER*N_NODE) != (index*N_GPU_PER+rank):
120
- continue
121
- item = json.loads(line)
122
- path = item['path']
123
- wave, _ = librosa.load(path, sr=sr)
124
- wave = torch.from_numpy(wave)
125
- wave_length = wave.shape[-1]
126
- if wave_length < sr*CLEN:
127
- continue
128
- left_rights = parse_lr(wave_length, sr)
129
- lr = left_rights.tolist()
130
- wavs = torch.stack(
131
- [wave[l:r] for l,r in lr]
132
- ).to(device)
133
- targets = model.encode_wavs(wavs) # [Codebook=8, N_Steps, Feature]
134
-
135
- final_target = rearrange(targets, "c n f -> n (c f)").cpu().numpy().astype(np.int32)
136
- for j in range(final_target.shape[0]):
137
- writer(f'{idx}:{j}', final_target[j])
138
- except Exception as e:
139
- print(e)
140
-
141
-
142
- if __name__ == '__main__':
143
- import sys
144
- index = int(sys.argv[1])
145
- import multiprocessing
146
- pool = multiprocessing.Pool(processes=N_GPU_PER)
147
- for rank in range(8):
148
- pool.apply_async(main, (index, rank))
149
- pool.close()
150
- pool.join()
151
- print("Done.")
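
The export script deleted above shards a JSONL manifest so that each of the N_GPU_PER*N_NODE workers keeps only the lines where idx % (N_GPU_PER*N_NODE) equals index*N_GPU_PER + rank, cuts each waveform into non-overlapping 30-second windows (at most ten per file), and writes the encoded code matrices to Kaldi ark/scp via kaldiio. The two bits of bookkeeping are easy to check in isolation; this sketch reuses the script's constants but needs no audio or model:

```python
import torch

CLEN, N_GPU_PER, N_NODE = 30, 8, 4        # seconds per clip, GPUs per node, nodes

def belongs_to(idx, node_index, rank):
    # keep manifest line idx on exactly one of the N_GPU_PER * N_NODE workers
    return idx % (N_GPU_PER * N_NODE) == node_index * N_GPU_PER + rank

def clip_boundaries(wave_length, sr, max_clips=10):
    # non-overlapping CLEN-second windows, capped at max_clips, at least one window
    n_step = max(int(wave_length // (sr * CLEN)), 1)
    starts = torch.arange(n_step) * CLEN * sr
    return torch.stack((starts, starts + CLEN * sr)).T[:max_clips]

print(clip_boundaries(wave_length=24000 * 95, sr=24000))
# three rows: [0, 720000], [720000, 1440000], [1440000, 2160000]
print([i for i in range(64) if belongs_to(i, node_index=1, rank=2)])
# [10, 42]: this worker only sees every 32nd line, offset by 8*1 + 2
```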
 
MuCodec/muq_dev/muq_fairseq/models/muq/model/rvq.py DELETED
@@ -1,459 +0,0 @@
1
-
2
- from typing import Union
3
-
4
- import numpy as np
5
- import torch
6
- import torch.nn as nn
7
- import torch.nn.functional as F
8
- from einops import rearrange
9
- from torch.nn.utils import weight_norm
10
-
11
- def WNConv1d(*args, **kwargs):
12
- return weight_norm(nn.Conv1d(*args, **kwargs))
13
-
14
-
15
- class VectorQuantize(nn.Module):
16
- """
17
- Implementation of VQ similar to Karpathy's repo:
18
- https://github.com/karpathy/deep-vector-quantization
19
- Additionally uses following tricks from Improved VQGAN
20
- (https://arxiv.org/pdf/2110.04627.pdf):
21
- 1. Factorized codes: Perform nearest neighbor lookup in low-dimensional space
22
- for improved codebook usage
23
- 2. l2-normalized codes: Converts euclidean distance to cosine similarity which
24
- improves training stability
25
- """
26
-
27
- def __init__(self, input_dim: int, codebook_size: int, codebook_dim: int, stale_tolerance: int = 1000, mfcc_clustering=False, n_layer=1):
28
- super().__init__()
29
- self.codebook_size = codebook_size
30
- self.codebook_dim = codebook_dim
31
- self.mfcc_clustering = mfcc_clustering
32
-
33
- ProjClass = nn.Identity if mfcc_clustering else WNConv1d
34
- if n_layer==1:
35
- self.in_proj = ProjClass(input_dim, codebook_dim, kernel_size=1)
36
- self.out_proj = ProjClass(codebook_dim, input_dim, kernel_size=1)
37
- elif n_layer >= 2:
38
- ndim_hidden = 128
39
- self.in_proj = nn.Sequential(
40
- ProjClass(input_dim, ndim_hidden, kernel_size=1),
41
- *[nn.Sequential(nn.ReLU(), ProjClass(ndim_hidden, ndim_hidden, kernel_size=1),) for _ in range(n_layer-2)],
42
- nn.ReLU(),
43
- ProjClass(ndim_hidden, codebook_dim, kernel_size=1)
44
- )
45
- self.out_proj = nn.Sequential(
46
- ProjClass(codebook_dim, ndim_hidden, kernel_size=1),
47
- nn.ReLU(),
48
- *[nn.Sequential(ProjClass(ndim_hidden, ndim_hidden, kernel_size=1), nn.ReLU()) for _ in range(n_layer-2)],
49
- ProjClass(ndim_hidden, input_dim, kernel_size=1),
50
- )
51
- self.codebook = nn.Embedding(codebook_size, codebook_dim)
52
- self.register_buffer("stale_counter", torch.zeros(self.codebook_size,))
53
- self.stale_tolerance = stale_tolerance
54
-
55
- def forward(self, z):
56
- """Quantize the input tensor using a fixed codebook and return
57
- the corresponding codebook vectors
58
-
59
- Parameters
60
- ----------
61
- z : Tensor[B x D x T]
62
-
63
- Returns
64
- -------
65
- Tensor[B x D x T]
66
- Quantized continuous representation of input
67
- Tensor[1]
68
- Commitment loss to train encoder to predict vectors closer to codebook
69
- entries
70
- Tensor[1]
71
- Codebook loss to update the codebook
72
- Tensor[B x T]
73
- Codebook indices (quantized discrete representation of input)
74
- Tensor[B x D x T]
75
- Projected latents (continuous representation of input before quantization)
76
- """
77
-
78
- # Factorized codes (ViT-VQGAN) Project input into low-dimensional space
79
-
80
- z_e = self.in_proj(z) # z_e : (B x D x T)
81
- z_q, indices = self.decode_latents(z_e)
82
-
83
- commitment_loss = F.mse_loss(z_e, z_q.detach(), reduction="none").mean([1, 2])
84
- codebook_loss = F.mse_loss(z_q, z_e.detach(), reduction="none").mean([1, 2])
85
-
86
- z_q = (
87
- z_e + (z_q - z_e).detach()
88
- ) # noop in forward pass, straight-through gradient estimator in backward pass
89
-
90
- z_q = self.out_proj(z_q)
91
-
92
- return z_q, commitment_loss, codebook_loss, indices, z_e
93
-
94
- def embed_code(self, embed_id):
95
- return F.embedding(embed_id, self.codebook.weight)
96
-
97
- def decode_code(self, embed_id):
98
- return self.embed_code(embed_id).transpose(1, 2)
99
-
100
- def decode_latents(self, latents):
101
- encodings = rearrange(latents, "b d t -> (b t) d")
102
- codebook = self.codebook.weight # codebook: (N x D)
103
-
104
- # L2 normalize encodings and codebook (ViT-VQGAN)
105
- encodings = F.normalize(encodings)
106
- codebook = F.normalize(codebook)
107
-
108
- # Compute euclidean distance with codebook
109
- dist = (
110
- encodings.pow(2).sum(1, keepdim=True)
111
- - 2 * encodings @ codebook.t()
112
- + codebook.pow(2).sum(1, keepdim=True).t()
113
- )
114
- indices = rearrange((-dist).max(1)[1], "(b t) -> b t", b=latents.size(0))
115
- z_q = self.decode_code(indices)
116
-
117
- if(self.training):
118
- onehots = torch.nn.functional.one_hot(indices, self.codebook_size).float() # B, T, codebook_size
119
- stale_codes = (onehots.sum(0).sum(0) == 0).float()
120
- self.stale_counter = self.stale_counter * stale_codes + stale_codes
121
-
122
- # random replace codes that haven't been used for a while
123
- replace_code = (self.stale_counter == self.stale_tolerance).float() # codebook_size
124
- if replace_code.sum(-1) > 0:
125
- print("Replace {} codes".format(replace_code.sum(-1)))
126
- random_input_idx = torch.randperm(encodings.shape[0])
127
- random_input = encodings[random_input_idx].view(encodings.shape)
128
- if random_input.shape[0] < self.codebook_size:
129
- random_input = torch.cat([random_input]*(self.codebook_size // random_input.shape[0] + 1), 0)
130
- random_input = random_input[:self.codebook_size,:].contiguous() # codebook_size, dim
131
-
132
- self.codebook.weight.data = self.codebook.weight.data * (1 - replace_code).unsqueeze(-1) + random_input * replace_code.unsqueeze(-1)
133
- self.stale_counter = self.stale_counter * (1 - replace_code)
134
-
135
- return z_q, indices
136
-
137
-
138
- class ResidualVectorQuantize(nn.Module):
139
- """
140
- Introduced in SoundStream: An end2end neural audio codec
141
- https://arxiv.org/abs/2107.03312
142
- """
143
-
144
- def __init__(
145
- self,
146
- input_dim: int = 512,
147
- n_codebooks: int = 9,
148
- codebook_size: int = 1024,
149
- codebook_dim: Union[int, list] = 8,
150
- quantizer_dropout: float = 0.0,
151
- stale_tolerance: int = 100,
152
- use_multi_layer_num:int = 1,
153
- ):
154
- super().__init__()
155
- if isinstance(codebook_dim, int):
156
- codebook_dim = [codebook_dim for _ in range(n_codebooks)]
157
-
158
- self.n_codebooks = n_codebooks
159
- self.codebook_dim = codebook_dim
160
- self.codebook_size = codebook_size
161
-
162
- self.quantizers = nn.ModuleList(
163
- [
164
- VectorQuantize(input_dim, codebook_size, codebook_dim[i], stale_tolerance=stale_tolerance, n_layer=use_multi_layer_num)
165
- for i in range(n_codebooks)
166
- ]
167
- )
168
- self.quantizer_dropout = quantizer_dropout
169
-
170
- def forward(self, z, n_quantizers: int = None):
171
- """Quantize the input tensor using a fixed set of `n` codebooks and return
172
- the corresponding codebook vectors
173
- Parameters
174
- ----------
175
- z : Tensor[B x D x T]
176
- n_quantizers : int, optional
177
- No. of quantizers to use
178
- (n_quantizers < self.n_codebooks ex: for quantizer dropout)
179
- Note: if `self.quantizer_dropout` is True, this argument is ignored
180
- when in training mode, and a random number of quantizers is used.
181
- Returns
182
- -------
183
- dict
184
- A dictionary with the following keys:
185
-
186
- "z" : Tensor[B x D x T]
187
- Quantized continuous representation of input
188
- "codes" : Tensor[B x N x T]
189
- Codebook indices for each codebook
190
- (quantized discrete representation of input)
191
- "latents" : Tensor[B x N*D x T]
192
- Projected latents (continuous representation of input before quantization)
193
- "vq/commitment_loss" : Tensor[1]
194
- Commitment loss to train encoder to predict vectors closer to codebook
195
- entries
196
- "vq/codebook_loss" : Tensor[1]
197
- Codebook loss to update the codebook
198
- """
199
- z_q = 0
200
- residual = z
201
- commitment_loss = 0
202
- codebook_loss = 0
203
-
204
- codebook_indices = []
205
- latents = []
206
-
207
- if n_quantizers is None:
208
- n_quantizers = self.n_codebooks
209
- if self.training:
210
- n_quantizers = torch.ones((z.shape[0],)) * self.n_codebooks + 1
211
- dropout = torch.randint(1, self.n_codebooks + 1, (z.shape[0],))
212
- n_dropout = int(z.shape[0] * self.quantizer_dropout)
213
- n_quantizers[:n_dropout] = dropout[:n_dropout]
214
- n_quantizers = n_quantizers.to(z.device)
215
- else:
216
- n_quantizers = torch.ones((z.shape[0],)) * n_quantizers + 1
217
- n_quantizers = n_quantizers.to(z.device)
218
-
219
- for i, quantizer in enumerate(self.quantizers):
220
- # if self.training is False and i >= n_quantizers:
221
- # break
222
-
223
- z_q_i, commitment_loss_i, codebook_loss_i, indices_i, z_e_i = quantizer(
224
- residual
225
- )
226
-
227
- # Create mask to apply quantizer dropout
228
- mask = (
229
- torch.full((z.shape[0],), fill_value=i, device=z.device) < n_quantizers
230
- )
231
- z_q = z_q + z_q_i * mask[:, None, None]
232
- residual = residual - z_q_i
233
-
234
- # Sum losses
235
- commitment_loss += (commitment_loss_i * mask).mean()
236
- codebook_loss += (codebook_loss_i * mask).mean()
237
-
238
- codebook_indices.append(indices_i)
239
- latents.append(z_e_i)
240
-
241
- codes = torch.stack(codebook_indices, dim=1)
242
- latents = torch.cat(latents, dim=1)
243
-
244
- encodings = F.one_hot(codes, self.codebook_size).float() # B N T 1024
245
-
246
- return z_q, codes, latents, commitment_loss, codebook_loss, n_quantizers.clamp(max=self.n_codebooks).long() - 1
247
-
248
- def from_codes(self, codes: torch.Tensor):
249
- """Given the quantized codes, reconstruct the continuous representation
250
- Parameters
251
- ----------
252
- codes : Tensor[B x N x T]
253
- Quantized discrete representation of input
254
- Returns
255
- -------
256
- Tensor[B x D x T]
257
- Quantized continuous representation of input
258
- """
259
- z_q = 0.0
260
- z_p = []
261
- n_codebooks = codes.shape[1]
262
- for i in range(n_codebooks):
263
- z_p_i = self.quantizers[i].decode_code(codes[:, i, :])
264
- z_p.append(z_p_i)
265
-
266
- z_q_i = self.quantizers[i].out_proj(z_p_i)
267
- z_q = z_q + z_q_i
268
- return z_q, torch.cat(z_p, dim=1), codes
269
-
270
- def from_latents(self, latents: torch.Tensor):
271
- """Given the unquantized latents, reconstruct the
272
- continuous representation after quantization.
273
-
274
- Parameters
275
- ----------
276
- latents : Tensor[B x N x T]
277
- Continuous representation of input after projection
278
-
279
- Returns
280
- -------
281
- Tensor[B x D x T]
282
- Quantized representation of full-projected space
283
- Tensor[B x D x T]
284
- Quantized representation of latent space
285
- """
286
- z_q = 0
287
- z_p = []
288
- codes = []
289
- dims = np.cumsum([0] + [q.codebook_dim for q in self.quantizers])
290
-
291
- n_codebooks = np.where(dims <= latents.shape[1])[0].max(axis=0, keepdims=True)[
292
- 0
293
- ]
294
- for i in range(n_codebooks):
295
- j, k = dims[i], dims[i + 1]
296
- z_p_i, codes_i = self.quantizers[i].decode_latents(latents[:, j:k, :])
297
- z_p.append(z_p_i)
298
- codes.append(codes_i)
299
-
300
- z_q_i = self.quantizers[i].out_proj(z_p_i)
301
- z_q = z_q + z_q_i
302
-
303
- return z_q, torch.cat(z_p, dim=1), torch.stack(codes, dim=1)
304
-
305
- from torch.utils.data import Dataset, DataLoader
306
- import json, traceback
307
- import torchaudio
308
- import math
309
-
310
- from typing import List, Tuple, Dict, Any
311
-
312
- CLIPSECS = 5
313
- def load_audio_by_json(json_path, max_keep, min_keep, tgt_sample_rate):
314
- # read json file
315
- print(json_path)
316
- datas = []
317
- inds = []
318
- sizes = []
319
- with open(json_path) as fp:
320
- for ind,line in enumerate(fp):
321
- data = json.loads(line)
322
- datas.append(data)
323
- inds.append(ind)
324
- # sz = int(data['duration'] * data['sample_rate'])
325
- sz = int(tgt_sample_rate * CLIPSECS)
326
- sizes.append(sz)
327
- tot = ind + 1
328
- return datas,inds,tot,sizes
329
-
330
- class Read_and_PadCrop_Normalized_T(torch.nn.Module):
331
- def __init__(self, n_samples: int, sample_rate: int, randomize: bool = True):
332
-
333
- super().__init__()
334
-
335
- self.n_samples = n_samples
336
- self.sample_rate = sample_rate
337
- self.randomize = randomize
338
-
339
-
340
- def __call__(self, filename: str, duration: float, cur_sample_rate: int) -> Tuple[torch.Tensor, float, float, int, int]:
341
- if(duration<(float(self.n_samples)/self.sample_rate+1)):
342
- # print(duration,(float(self.n_samples)/self.sample_rate+1))
343
- chunk, _ = torchaudio.load(filename, frame_offset=0, num_frames=-1)
344
- t_start = 0.
345
- t_end = min(1.0, float(self.n_samples) / float(self.sample_rate) / duration)
346
- offset = 0
347
- # print('c1:',chunk.shape)
348
- else:
349
- offset = np.random.randint(0,int(duration*cur_sample_rate)-int(float(self.n_samples)/self.sample_rate*cur_sample_rate))
350
- t_start = offset / float(cur_sample_rate) / duration
351
- t_end = t_start + float(self.n_samples) / float(self.sample_rate) / duration
352
- chunk, _ = torchaudio.load(filename, frame_offset=offset, num_frames=int(float(self.n_samples)/self.sample_rate*cur_sample_rate))
353
- # print('offset:',offset)
354
- # print('c0:',chunk.shape)
355
- # Pad with silence if necessary.
356
- if(chunk.shape[0]>1):
357
- chunk = chunk[torch.randint(chunk.shape[0], size=(1,)),:].float()
358
- else:
359
- chunk = chunk[[0],:].float()
360
- if(cur_sample_rate!=self.sample_rate):
361
- # print('a:',cur_sample_rate,chunk.shape)
362
- chunk = torchaudio.functional.resample(chunk, cur_sample_rate, self.sample_rate)
363
- # print('b:',self.sample_rate,chunk.shape)
364
- if chunk.shape[-1] < self.n_samples:
365
- chunk = torch.cat([chunk, torch.zeros((1, self.n_samples - chunk.shape[-1],))],-1)
366
- else:
367
- chunk = chunk[:,0:self.n_samples]
368
- seconds_start = math.floor(offset / cur_sample_rate)
369
- seconds_total = math.floor(duration)
370
-
371
- return (
372
- chunk,
373
- t_start,
374
- t_end,
375
- seconds_start,
376
- seconds_total
377
- )
378
-
379
- class RVQDataset(Dataset):
380
- def __init__(
381
- self,
382
- manifest_path: str,
383
- sample_rate: float,
384
- normalize: bool = False,
385
- ):
386
- self.sample_rate = sample_rate
387
- self.datas,inds,tot,self.sizes = load_audio_by_json(manifest_path, None, None, self.sample_rate)
388
- self.dataset_len = len(self.datas)
389
-
390
- self.reader = Read_and_PadCrop_Normalized_T(n_samples=CLIPSECS*sample_rate,sample_rate = self.sample_rate)
391
- self.normalize = normalize
392
-
393
-
394
- def __getitem__(self, i):
395
- # WORLD_SIZE = int(torch.distributed.get_world_size())
396
- # WORLD_RANK = int(torch.distributed.get_rank())
397
- # np.random.seed(1337 + self.epoch * WORLD_SIZE + WORLD_RANK + i)
398
- # index = random.randint(0,len(self.sizes) - 1)
399
- index = i
400
- item = None
401
- while item is None:
402
- try:
403
- wav = self.get_audio_by_slice(index)
404
- # labels = self.get_labels(index)
405
- # labels = None
406
- # item = {"id": index, "source": wav, "label_list": labels}
407
- item = {"id": index, "source": wav}
408
- except Exception as e:
409
- # print(e)
410
- traceback.print_exc()
411
- print(f'skip damaged data {index}')
412
- index = np.random.randint(0,len(self.sizes)-1)
413
- return item
414
-
415
- def __len__(self):
416
- return self.dataset_len
417
-
418
- def get_audio_by_slice(self,index):
419
-
420
- wav_path = self.datas[index]['path']
421
- # print(wav_path)
422
- audio_info = torchaudio.info(wav_path)
423
- origin_sample_rate = audio_info.sample_rate
424
- origin_duration = audio_info.num_frames / origin_sample_rate
425
-
426
- wav, *ignored = self.reader(wav_path, origin_duration,origin_sample_rate)
427
- wav = wav.float()
428
-
429
- # _path, slice_ptr = parse_path(wav_path)
430
- # original way
431
- # if len(slice_ptr) == 0:
432
- # wav, cur_sample_rate = sf.read(_path)
433
- # else:
434
- # assert _path.endswith(".zip")
435
- # data = read_from_stored_zip(_path, slice_ptr[0], slice_ptr[1])
436
- # f = io.BytesIO(data)
437
- # wav, cur_sample_rate = sf.read(f)
438
- # wav = torch.from_numpy(wav).float()
439
- # print(wav.shape)
440
- wav = wav.permute(1,0)
441
- wav = self.postprocess(wav, self.sample_rate)
442
- # print(wav.shape)
443
-
444
- # wav = wav.squeeze(0)
445
- return wav
446
-
447
- def postprocess(self, wav, cur_sample_rate):
448
- if wav.dim() == 2:
449
- wav = wav.mean(-1)
450
- assert wav.dim() == 1, wav.dim()
451
-
452
- if cur_sample_rate != self.sample_rate:
453
- raise Exception(f"sr {cur_sample_rate} != {self.sample_rate}")
454
-
455
- if self.normalize:
456
- with torch.no_grad():
457
- wav = F.layer_norm(wav, wav.shape)
458
- return wav
459
-
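
The `ResidualVectorQuantize` deleted above is the SoundStream-style stack: quantizer i only sees the residual that quantizers 0..i-1 failed to explain, and gradients flow through every hard codebook lookup via the straight-through trick z_q = z_e + (z_q - z_e).detach(). A stripped-down residual loop with a plain nearest-neighbour lookup, leaving out the factorized projections, L2 normalization, stale-code replacement and quantizer dropout of the real module:

```python
import torch

def nearest_code(z_e, codebook):
    # z_e: (B, T, D), codebook: (K, D) -> indices (B, T) and quantized vectors (B, T, D)
    dist = torch.cdist(z_e, codebook.unsqueeze(0).expand(z_e.size(0), -1, -1))
    idx = dist.argmin(-1)
    return idx, codebook[idx]

def residual_vq(z, codebooks):
    residual, z_q, codes = z, torch.zeros_like(z), []
    for cb in codebooks:
        idx, q = nearest_code(residual, cb)
        q = residual + (q - residual).detach()   # straight-through estimator
        z_q = z_q + q                            # each stage adds its reconstruction
        residual = residual - q                  # next stage models what this one missed
        codes.append(idx)
    return z_q, torch.stack(codes, dim=1)        # codes: (B, n_codebooks, T)

# toy check with the sizes used above: 8 codebooks of 1024 entries, 16-dim codes
codebooks = [torch.randn(1024, 16) for _ in range(8)]
z_q, codes = residual_vq(torch.randn(2, 50, 16), codebooks)
print(z_q.shape, codes.shape)   # torch.Size([2, 50, 16]) torch.Size([2, 8, 50])
```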
 
MuCodec/muq_dev/muq_fairseq/models/muq/model/rvq_muq.py DELETED
@@ -1,394 +0,0 @@
1
- try:
2
- from .rvq import *
3
- except:
4
- import sys, os
5
- sys.path.append(os.path.dirname(os.path.abspath(__file__)))
6
- from rvq import *
7
-
8
- try:
9
- from ..modules.random_quantizer import RandomProjectionQuantizer
10
- from ..modules.features import MelSTFT
11
- from ..modules.conv import Conv2dSubsampling
12
- except:
13
- import sys, os
14
- sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
15
- from modules.random_quantizer import RandomProjectionQuantizer
16
- from modules.features import MelSTFT
17
- from modules.conv import Conv2dSubsampling
18
-
19
- import fairseq
20
-
21
- CLIPSECS = 5 # 5 for rvq, 30 for model
22
-
23
- class RVQDataset(Dataset):
24
- def __init__(
25
- self,
26
- manifest_path: str,
27
- sample_rate: float,
28
- normalize: bool = False,
29
- ):
30
- self.sample_rate = sample_rate
31
- self.datas,inds,tot,self.sizes = load_audio_by_json(manifest_path, None, None, self.sample_rate)
32
- self.dataset_len = len(self.datas)
33
-
34
- self.reader = Read_and_PadCrop_Normalized_T(n_samples=CLIPSECS*sample_rate,sample_rate = self.sample_rate)
35
- self.normalize = normalize
36
-
37
-
38
- def __getitem__(self, i):
39
- # WORLD_SIZE = int(torch.distributed.get_world_size())
40
- # WORLD_RANK = int(torch.distributed.get_rank())
41
- # np.random.seed(1337 + self.epoch * WORLD_SIZE + WORLD_RANK + i)
42
- # index = random.randint(0,len(self.sizes) - 1)
43
- index = i
44
- item = None
45
- while item is None:
46
- try:
47
- wav = self.get_audio_by_slice(index)
48
- item = {"id": index, "source": wav}
49
- except Exception as e:
50
- # print(e)
51
- traceback.print_exc()
52
- print(f'skip damaged data {index}')
53
- index = np.random.randint(0,len(self.sizes)-1)
54
- return item
55
-
56
- def __len__(self):
57
- return self.dataset_len
58
-
59
- def get_audio_by_slice(self,index):
60
-
61
- wav_path = self.datas[index]['path']
62
- audio_info = torchaudio.info(wav_path)
63
- origin_sample_rate = audio_info.sample_rate
64
- origin_duration = audio_info.num_frames / origin_sample_rate
65
-
66
- wav, *ignored = self.reader(wav_path, origin_duration,origin_sample_rate)
67
- wav = wav.float()
68
-
69
- # _path, slice_ptr = parse_path(wav_path)
70
- # original way
71
- # if len(slice_ptr) == 0:
72
- # wav, cur_sample_rate = sf.read(_path)
73
- # else:
74
- # assert _path.endswith(".zip")
75
- # data = read_from_stored_zip(_path, slice_ptr[0], slice_ptr[1])
76
- # f = io.BytesIO(data)
77
- # wav, cur_sample_rate = sf.read(f)
78
- # wav = torch.from_numpy(wav).float()
79
- # print(wav.shape)
80
- wav = wav.permute(1,0)
81
- wav = self.postprocess(wav, self.sample_rate)
82
- # print(wav.shape)
83
-
84
- # wav = wav.squeeze(0)
85
- return wav
86
-
87
- def postprocess(self, wav, cur_sample_rate):
88
- if wav.dim() == 2:
89
- wav = wav.mean(-1)
90
- assert wav.dim() == 1, wav.dim()
91
-
92
- if cur_sample_rate != self.sample_rate:
93
- raise Exception(f"sr {cur_sample_rate} != {self.sample_rate}")
94
-
95
- if self.normalize:
96
- with torch.no_grad():
97
- wav = F.layer_norm(wav, wav.shape)
98
- return wav
99
-
100
- class Preprocessor(nn.Module):
101
- def __init__(self,
102
- codebook_dim=16,
103
- codebook_size=4096,
104
- hop_length=240,
105
- n_mels=128,
106
- stat_path=None,
107
- is_spec_wise=False,
108
- s=4,
109
- ) -> None:
110
- super().__init__()
111
-
112
- self.features=["melspec_2048"]
113
- self.s = s
114
-
115
- # load feature mean / std stats
116
- import os
117
- if stat_path is not None and os.path.exists(stat_path):
118
- with open(stat_path, "r") as f:
119
- self.stat = json.load(f)
120
- else:
121
- # print("No stats file found at `{}`, use default from msd.".format(stat_path))
122
- self.stat = {"spec_256_cnt": 14394344256, "spec_256_mean": -23.34296658431829, "spec_256_std": 26.189295587132637, "spec_512_cnt": 28677104448, "spec_512_mean": -21.31267396860235, "spec_512_std": 26.52644536245769, "spec_1024_cnt": 57242624832, "spec_1024_mean": -18.852271129208273, "spec_1024_std": 26.443154583585663, "spec_2048_cnt": 114373665600, "spec_2048_mean": -15.638743433896792, "spec_2048_std": 26.115825961611545, "spec_4096_cnt": 228635747136, "spec_4096_mean": -11.715532502794836, "spec_4096_std": 25.763972210234062, "melspec_256_cnt": 14282760192, "melspec_256_mean": -26.962600400166156, "melspec_256_std": 36.13614100912126, "melspec_512_cnt": 14282760192, "melspec_512_mean": -9.108344167718862, "melspec_512_std": 24.71910937988429, "melspec_1024_cnt": 14282760192, "melspec_1024_mean": 0.37302579246531126, "melspec_1024_std": 18.684082325919388, "melspec_2048_cnt": 14282760192, "melspec_2048_mean": 6.768444971712967, "melspec_2048_std": 18.417922652295623, "melspec_4096_cnt": 14282760192, "melspec_4096_mean": 13.617164614990036, "melspec_4096_std": 18.08552130124525, "cqt_cnt": 9373061376, "cqt_mean": 0.46341379757927165, "cqt_std": 0.9543998080910191, "mfcc_256_cnt": 1339008768, "mfcc_256_mean": -11.681755459447485, "mfcc_256_std": 29.183186444668316, "mfcc_512_cnt": 1339008768, "mfcc_512_mean": -2.540581461792183, "mfcc_512_std": 31.93752185832081, "mfcc_1024_cnt": 1339008768, "mfcc_1024_mean": 6.606636263169779, "mfcc_1024_std": 34.151644801729624, "mfcc_2048_cnt": 1339008768, "mfcc_2048_mean": 5.281600844245184, "mfcc_2048_std": 33.12784541220003, "mfcc_4096_cnt": 1339008768, "mfcc_4096_mean": 4.7616569480166095, "mfcc_4096_std": 32.61458906894133, "chromagram_256_cnt": 1339008768, "chromagram_256_mean": 55.15596556703181, "chromagram_256_std": 73.91858278719991, "chromagram_512_cnt": 1339008768, "chromagram_512_mean": 175.73092252759895, "chromagram_512_std": 248.48485148525953, "chromagram_1024_cnt": 1339008768, "chromagram_1024_mean": 589.2947481634608, "chromagram_1024_std": 913.857929063196, "chromagram_2048_cnt": 1339008768, "chromagram_2048_mean": 2062.286388327397, "chromagram_2048_std": 3458.92657915397, "chromagram_4096_cnt": 1339008768, "chromagram_4096_mean": 7673.039107997085, "chromagram_4096_std": 13009.883158267234}
123
-
124
- # feature extractor
125
- self.preprocessor_melspec_2048 = MelSTFT(
126
- n_fft=2048, hop_length=hop_length, is_db=True
127
- )
128
-
129
- self.is_spec_wise = is_spec_wise
130
-
131
-
132
- @torch.no_grad()
133
- def normalize(self, x):
134
- """normalize the input audio to have zero mean unit variance"""
135
- for key in x.keys():
136
- x[key] = (x[key] - self.stat["%s_mean" % key]) / self.stat["%s_std" % key] # {'melspec_2048_cnt': 14282760192, 'melspec_2048_mean': 6.768444971712967}
137
- return x
138
-
139
- @torch.no_grad()
140
- def rearrange(self, x):
141
- """rearrange the batch to flatten every 4 steps"""
142
- for key in x.keys():
143
- if key == "chromagram":
144
- x[key] = rearrange(x[key], "b f t -> b t f")
145
- else:
146
- x[key] = rearrange(x[key], "b f (t s) -> b t (s f)", s=self.s)
147
- return x
148
-
149
- @torch.no_grad()
150
- def preprocessing(self, x, features):
151
- """extract classic audio features"""
152
- # check precision
153
- if x.dtype == torch.float16:
154
- precision = 16
155
- else:
156
- precision = 32
157
-
158
- out = {}
159
- for key in features:
160
- layer = getattr(self, "preprocessor_%s" % key)
161
- out[key] = layer.float()(x.float())[..., :-1]
162
- if precision == 16:
163
- out[key] = out[key].half()
164
- return out
165
-
166
- @torch.no_grad()
167
- def tokenize(self, x):
168
- out = {}
169
- for key in x.keys():
170
- layer = getattr(self, "quantizer_%s" % key)
171
- out[key] = layer(x[key])
172
- return out
173
-
174
- def to_spec_wise(self, x):
175
- Batch, Spec, Time = x.shape
176
- SubSpec, N_SubSpec = 16, 8
177
- assert SubSpec * N_SubSpec == Spec == 128
178
- x = rearrange(x, "b (n s) t -> b s (n t)", n=N_SubSpec, s=SubSpec)
179
- return x # [Batch, SubSpec=16, N_SubSpec*Time=8*100Hz]
180
-
181
- @torch.no_grad()
182
- def __call__(self, x):
183
- x = self.preprocessing(x, features=self.features) # -> {'melspec_2048': Tensor{Size([3, 128, 3000]) cuda:0 f32}}
184
- x = self.normalize(x)
185
- if self.is_spec_wise:
186
- x = {k:self.to_spec_wise(v) for k,v in x.items()}
187
- x = self.rearrange(x) # -> {'melspec_2048': Tensor{Size([3, 750, 512]) cuda:0 f32}}
188
- return x['melspec_2048'].permute((0, 2, 1))
189
-
190
-
191
- class CQTPreprocessor(nn.Module):
192
- def __init__(self,
193
- sr=24000,
194
- hop=960,
195
- nb=84,
196
- to_db = True,
197
- ) -> None:
198
- super().__init__()
199
-
200
- from nnAudio.features.cqt import CQT
201
- import torchaudio
202
- self.cqt_fn = CQT(
203
- sr=sr,
204
- hop_length=hop,
205
- n_bins=nb,
206
- fmin=32.7 if nb == 84 else 27.5, # 84 or 88
207
- bins_per_octave=12,
208
- filter_scale=1,
209
- norm=1,
210
- window='hann',
211
- center=True,
212
- pad_mode='constant',
213
- trainable=False,
214
- output_format='Magnitude',
215
- verbose=True,
216
- )
217
- if to_db:
218
- self.amplitude_to_db = torchaudio.transforms.AmplitudeToDB()
219
- else:
220
- self.amplitude_to_db = lambda x:x
221
-
222
- @torch.no_grad()
223
- def __call__(self, x):
224
- return self.amplitude_to_db(self.cqt_fn(x))
225
-
226
-
227
- from dataclasses import dataclass
228
-
229
- @dataclass
230
- class UserDirModule:
231
- user_dir: str
232
-
233
- def load_model(model_dir, checkpoint_dir):
234
- '''Load Fairseq SSL model'''
235
-
236
- if model_dir is not None:
237
- model_path = UserDirModule(model_dir)
238
- fairseq.utils.import_user_module(model_path)
239
-
240
- model, cfg, task = fairseq.checkpoint_utils.load_model_ensemble_and_task([checkpoint_dir], strict=False)
241
- model = model[0]
242
-
243
- return model
244
-
245
-
246
-
247
- class PreprocessorWithModel(nn.Module):
248
- def __init__(self, model_dir, checkpoint_dir, use_layer_idx=9) -> None:
249
- super().__init__()
250
- self.model = load_model(model_dir=model_dir, checkpoint_dir=checkpoint_dir)
251
- self.model.eval()
252
- self.use_layer_idx = use_layer_idx
253
-
254
- def forward(self, x):
255
- with torch.no_grad():
256
- self.model.eval()
257
- res = self.model(x, features_only = True)
258
- layer_results = res['layer_results']
259
- return layer_results[self.use_layer_idx].permute(0,2,1)
260
-
261
-
262
-
263
- def Music_Mel_Target_Config():
264
- config = dict(
265
- train_dataset = dict(
266
- manifest_path = 'path/to/data/music4all/train.json',
267
- sample_rate = 24000,
268
- normalize = False,
269
- ),
270
- valid_dataset = dict(
271
- manifest_path = 'path/to/data/music4all/valid.json',
272
- sample_rate = 24000,
273
- normalize = False,
274
- ),
275
- model = dict(
276
- input_dim = 128*4,
277
- n_codebooks = 8,
278
- codebook_size = 1024,
279
- codebook_dim = 16,
280
- quantizer_dropout = 0.0,
281
- ),
282
- train = dict(
283
- batch_size = 32,
284
- num_workers = 6,
285
- valid_interval = 10,
286
- save_interval = 100,
287
- max_updates = 500000,
288
- lr = 1e-4,
289
- device = 'cuda:0',
290
- loss = 'commitment_loss * 0.25 + codebook_loss * 1.0 + (x - quantized_prompt_embeds).abs().mean()',
291
- preprocess = Preprocessor()
292
- )
293
- )
294
- return config
295
-
296
-
297
- def main(config):
298
- train_dataset = RVQDataset(**config['train_dataset'])
299
- if config['valid_dataset']['manifest_path'] is None:
300
- # split train and valid dataset
301
- from torch.utils.data import random_split
302
- train_dataset, valid_dataset = random_split(
303
- train_dataset, lengths=[len(train_dataset) - 500, 500]
304
- )
305
- else:
306
- valid_dataset = RVQDataset(**config['valid_dataset'])
307
- train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=config['train']['batch_size'], drop_last=True, num_workers=config['train']['num_workers'])
308
- valid_dataloader = DataLoader(valid_dataset, shuffle=False, batch_size=config['train']['batch_size'], drop_last=True, num_workers=config['train']['num_workers'])
309
- model = ResidualVectorQuantize(**config['model'])
310
-
311
- device = config['train']['device']
312
- preprocess = config['train']['preprocess'].to(device)
313
- model = model.to(device)
314
-
315
- optimizer = torch.optim.Adam(model.parameters(), lr=config['train']['lr'])
316
- cur_updates = 0
317
- is_running = True
318
- result = {}
319
- from tqdm import tqdm
320
- from tensorboardX import SummaryWriter
321
- writer = SummaryWriter()
322
- from collections import defaultdict
323
- import os
324
- from logging import getLogger
325
- logger = getLogger()
326
-
327
- while is_running:
328
- results = defaultdict(lambda:0)
329
- for item in tqdm(train_dataloader, desc='train'):
330
- wavs = item['source']
331
- optimizer.zero_grad()
332
- wavs = wavs.to(device)
333
- x = preprocess(wavs)
334
- model.train()
335
- quantized_prompt_embeds, codes, _, commitment_loss, codebook_loss, rvq_usage = model(x)
336
- loss = eval(config['train']['loss'])
337
- loss.backward()
338
- optimizer.step()
339
-
340
- results['loss/train'] += loss.item()
341
- results['commitment_loss/train'] += commitment_loss.item()
342
- results['codebook_loss/train'] += codebook_loss.item()
343
- results['rvq_usage/train'] += rvq_usage.float().mean().item()
344
-
345
- if cur_updates % config['train']['valid_interval'] == 0:
346
- model.eval()
347
- with torch.no_grad():
348
- for item in tqdm(valid_dataloader, desc='valid'):
349
- wavs = item['source']
350
- wavs = wavs.to(device)
351
- x = preprocess(wavs)
352
- quantized_prompt_embeds, codes, _, commitment_loss, codebook_loss, rvq_usage = model(x)
353
- valid_loss = eval(config['train']['loss'])
354
-
355
- results['loss/valid'] += valid_loss.item()
356
- results['commitment_loss/valid'] += commitment_loss.item()
357
- results['codebook_loss/valid'] += codebook_loss.item()
358
- results['rvq_usage/valid'] += rvq_usage.float().mean().item()
359
-
360
- results['cur_updates'] = cur_updates
361
- results['loss/train'] /= config['train']['valid_interval']
362
- results['commitment_loss/train'] /= config['train']['valid_interval']
363
- results['codebook_loss/train'] /= config['train']['valid_interval']
364
- results['rvq_usage/train'] /= config['train']['valid_interval']
365
-
366
- results['loss/valid'] /= len(valid_dataloader)
367
- results['commitment_loss/valid'] /= len(valid_dataloader)
368
- results['codebook_loss/valid'] /= len(valid_dataloader)
369
- results['rvq_usage/valid'] /= len(valid_dataloader)
370
-
371
- print('')
372
- logger.info(str(results))
373
- for k,v in results.items():
374
- writer.add_scalar(k, v, cur_updates)
375
-
376
- results.clear()
377
-
378
- if cur_updates % config['train']['save_interval'] == 0:
379
- os.makedirs(f'{writer.logdir}/ckpt/', exist_ok=True)
380
- logger.info(f'saving checkpoint to {writer.logdir}/ckpt/RVQ_{cur_updates}.pth')
381
- torch.save(model.state_dict(), f'{writer.logdir}/ckpt/RVQ_{cur_updates}.pth')
382
-
383
-
384
- if cur_updates < config['train']['max_updates']:
385
- cur_updates += 1
386
- else:
387
- is_running = False
388
- break
389
-
390
-
391
-
392
- if __name__ == '__main__':
393
- config = Music_Mel_Target_Config()
394
- main(config)
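
The `Preprocessor` deleted above converts 24 kHz audio into dB mel spectrograms with hop 240 (about 100 frames per second), normalizes them with the stored melspec_2048 mean/std, and then flattens every s=4 consecutive 128-bin frames into one 512-dimensional vector, giving a 25 Hz sequence whose width matches the RVQ's input_dim = 128*4. The reshape is a single einops rearrange; a minimal shape check with random stand-in features:

```python
import torch
from einops import rearrange

n_mels, s = 128, 4
mel = torch.randn(2, n_mels, 500)     # (batch, mel bins, ~5 s at 100 frames per second)

mel = mel[..., : mel.shape[-1] // s * s]                # keep the time axis a multiple of s
tokens = rearrange(mel, "b f (t s) -> b t (s f)", s=s)
print(tokens.shape)                   # torch.Size([2, 125, 512]): 25 Hz frames of width 128*4
```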
 
MuCodec/muq_dev/muq_fairseq/models/muq/model/w2v2_config.json DELETED
@@ -1,113 +0,0 @@
1
- {
2
- "activation_dropout": 0.1,
3
- "adapter_kernel_size": 3,
4
- "adapter_stride": 2,
5
- "add_adapter": false,
6
- "apply_spec_augment": true,
7
- "architectures": [
8
- "Wav2Vec2ConformerForCTC"
9
- ],
10
- "attention_dropout": 0.1,
11
- "bos_token_id": 1,
12
- "classifier_proj_size": 256,
13
- "codevector_dim": 768,
14
- "conformer_conv_dropout": 0.1,
15
- "contrastive_logits_temperature": 0.1,
16
- "conv_bias": true,
17
- "conv_depthwise_kernel_size": 31,
18
- "conv_dim": [
19
- 512,
20
- 512,
21
- 512,
22
- 512,
23
- 512,
24
- 512,
25
- 512
26
- ],
27
- "conv_kernel": [
28
- 10,
29
- 3,
30
- 3,
31
- 3,
32
- 3,
33
- 2,
34
- 2
35
- ],
36
- "conv_stride": [
37
- 5,
38
- 2,
39
- 2,
40
- 2,
41
- 2,
42
- 2,
43
- 2
44
- ],
45
- "ctc_loss_reduction": "sum",
46
- "ctc_zero_infinity": false,
47
- "diversity_loss_weight": 0.1,
48
- "do_stable_layer_norm": true,
49
- "eos_token_id": 2,
50
- "feat_extract_activation": "gelu",
51
- "feat_extract_dropout": 0.0,
52
- "feat_extract_norm": "layer",
53
- "feat_proj_dropout": 0.1,
54
- "feat_quantizer_dropout": 0.0,
55
- "final_dropout": 0.1,
56
- "gradient_checkpointing": false,
57
- "hidden_act": "swish",
58
- "hidden_dropout": 0.1,
59
- "hidden_dropout_prob": 0.1,
60
- "hidden_size": 1024,
61
- "initializer_range": 0.02,
62
- "intermediate_size": 4096,
63
- "layer_norm_eps": 1e-05,
64
- "layerdrop": 0.0,
65
- "mask_feature_length": 10,
66
- "mask_feature_min_masks": 0,
67
- "mask_feature_prob": 0.0,
68
- "mask_time_length": 10,
69
- "mask_time_min_masks": 2,
70
- "mask_time_prob": 0.05,
71
- "max_source_positions": 5000,
72
- "model_type": "wav2vec2-conformer",
73
- "num_adapter_layers": 3,
74
- "num_attention_heads": 16,
75
- "num_codevector_groups": 2,
76
- "num_codevectors_per_group": 320,
77
- "num_conv_pos_embedding_groups": 16,
78
- "num_conv_pos_embeddings": 128,
79
- "num_feat_extract_layers": 7,
80
- "num_hidden_layers": 24,
81
- "num_negatives": 100,
82
- "output_hidden_size": 1024,
83
- "pad_token_id": 0,
84
- "position_embeddings_type": "rotary",
85
- "proj_codevector_dim": 768,
86
- "rotary_embedding_base": 10000,
87
- "tdnn_dilation": [
88
- 1,
89
- 2,
90
- 3,
91
- 1,
92
- 1
93
- ],
94
- "tdnn_dim": [
95
- 512,
96
- 512,
97
- 512,
98
- 512,
99
- 1500
100
- ],
101
- "tdnn_kernel": [
102
- 5,
103
- 3,
104
- 3,
105
- 1,
106
- 1
107
- ],
108
- "torch_dtype": "float32",
109
- "transformers_version": "4.19.0.dev0",
110
- "use_weighted_layer_sum": false,
111
- "vocab_size": 32,
112
- "xvector_output_dim": 512
113
- }
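
The JSON deleted above is an ordinary Hugging Face `Wav2Vec2ConformerConfig`: a 7-layer convolutional front end whose strides multiply to 5*2*2*2*2*2*2 = 320, so the encoder emits one frame per 320 input samples, which lines up with `encode_wavs` in the export script trimming waveforms to a multiple of 320 samples. A quick sketch of reading it back with transformers; the file path is a placeholder:

```python
from functools import reduce
from transformers import Wav2Vec2ConformerConfig

cfg = Wav2Vec2ConformerConfig.from_json_file("w2v2_config.json")  # placeholder path

total_stride = reduce(lambda a, b: a * b, cfg.conv_stride)
print(total_stride)                             # 320 input samples per encoder frame
print(cfg.hidden_size, cfg.num_hidden_layers)   # 1024-dim Conformer with 24 layers
```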
 
MuCodec/muq_dev/muq_fairseq/models/muq/modules/__init__.py DELETED
@@ -1,2 +0,0 @@
1
-
2
-
 
MuCodec/muq_dev/muq_fairseq/models/muq/modules/conv.py DELETED
@@ -1,77 +0,0 @@
1
- from torch import nn
2
- from einops import rearrange
3
-
4
-
5
- class Res2dModule(nn.Module):
6
- def __init__(self, idim, odim, stride=(2, 2)):
7
- super(Res2dModule, self).__init__()
8
- self.conv1 = nn.Conv2d(idim, odim, 3, padding=1, stride=stride)
9
- self.bn1 = nn.BatchNorm2d(odim)
10
- self.conv2 = nn.Conv2d(odim, odim, 3, padding=1)
11
- self.bn2 = nn.BatchNorm2d(odim)
12
- self.relu = nn.ReLU()
13
-
14
- # residual
15
- self.diff = False
16
- if (idim != odim) or (stride[0] > 1):
17
- self.conv3 = nn.Conv2d(idim, odim, 3, padding=1, stride=stride)
18
- self.bn3 = nn.BatchNorm2d(odim)
19
- self.diff = True
20
-
21
- def forward(self, x):
22
- out = self.bn2(self.conv2(self.relu(self.bn1(self.conv1(x)))))
23
- if self.diff:
24
- x = self.bn3(self.conv3(x))
25
- out = x + out
26
- out = self.relu(out)
27
- return out
28
-
29
-
30
- class Conv2dSubsampling(nn.Module):
31
- """Convolutional 2D subsampling (to 1/4 length).
32
-
33
- Args:
34
- idim (int): Input dimension.
35
- hdim (int): Hidden dimension.
36
- odim (int): Output dimension.
37
- strides (list): Sizes of strides.
38
- n_bands (int): Number of frequency bands.
39
- """
40
-
41
- def __init__(self, idim, hdim, odim, strides=[2, 2], n_bands=64):
42
- """Construct an Conv2dSubsampling object."""
43
- super(Conv2dSubsampling, self).__init__()
44
-
45
- self.conv = nn.Sequential(
46
- Res2dModule(idim, hdim, (2, strides[0])),
47
- Res2dModule(hdim, hdim, (2, strides[1])),
48
- )
49
- self.linear = nn.Linear(hdim * n_bands // 2 // 2, odim)
50
-
51
- def forward(self, x):
52
- """Subsample x.
53
-
54
- Args:
55
- x (torch.Tensor): Input tensor (#batch, idim, time).
56
-
57
- Returns:
58
- torch.Tensor: Subsampled tensor (#batch, time', odim),
59
- where time' = time // 4.
60
- """
61
-
62
- if x.dim() == 3:
63
- x = x.unsqueeze(1) # (b, c, f, t)
64
- x = self.conv(x)
65
- x = rearrange(x, "b c f t -> b t (c f)")
66
- x = self.linear(x)
67
- return x
68
-
69
- if __name__ == '__main__':
70
- import torch
71
- conv_dim, encoder_dim = 512, 1024
72
- conv = Conv2dSubsampling(
73
- 1, conv_dim, encoder_dim, strides=[2, 1], n_bands=128
74
- )
75
- inp = torch.randn((1, 128, 3000))
76
- out = conv(inp)
77
- print(out.shape)
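
`Conv2dSubsampling` deleted above applies two residual 2-D blocks with strides (2, strides[i]) over (frequency, time): frequency is always reduced 4x while time is reduced by strides[0]*strides[1], and the channel and frequency axes are flattened together before the final linear layer, hence its input size hdim * n_bands // 2 // 2. A back-of-the-envelope check of the shapes in the module's own __main__ test, using plain arithmetic rather than the model itself:

```python
hdim, n_bands, t_in = 512, 128, 3000
strides = [2, 1]

f_out = n_bands // 2 // 2                  # each Res2dModule halves the frequency axis
t_out = t_in // (strides[0] * strides[1])  # time is only reduced by the per-block time strides
linear_in = hdim * f_out                   # channels and remaining bands are flattened together

print(f_out, t_out, linear_in)             # 32 1500 16384 -> forward returns (1, 1500, odim)
```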
 
MuCodec/muq_dev/muq_fairseq/models/muq/modules/features.py DELETED
@@ -1,67 +0,0 @@
1
- import torchaudio
2
- from torch import nn
3
- import torch
4
-
5
-
6
- class MelSTFT(nn.Module):
7
- def __init__(
8
- self,
9
- sample_rate=24000,
10
- n_fft=2048,
11
- hop_length=240,
12
- n_mels=128,
13
- is_db=False,
14
- ):
15
- super(MelSTFT, self).__init__()
16
-
17
- # spectrogram
18
- self.mel_stft = torchaudio.transforms.MelSpectrogram(
19
- sample_rate=sample_rate, n_fft=n_fft, hop_length=hop_length, n_mels=n_mels
20
- )
21
-
22
- # amplitude to decibel
23
- self.is_db = is_db
24
- if is_db:
25
- self.amplitude_to_db = torchaudio.transforms.AmplitudeToDB()
26
-
27
- def forward(self, waveform):
28
- if self.is_db:
29
- return self.amplitude_to_db(self.mel_stft(waveform))
30
- else:
31
- return self.mel_stft(waveform)
32
-
33
-
34
- class CQTPreprocessor(nn.Module):
35
- def __init__(self,
36
- sr=24000,
37
- hop=960,
38
- nb=84,
39
- to_db = True,
40
- ) -> None:
41
- super().__init__()
42
-
43
- from nnAudio.features.cqt import CQT
44
- import torchaudio
45
- self.cqt_fn = CQT(
46
- sr=sr,
47
- hop_length=hop,
48
- n_bins=nb,
49
- fmin=32.7 if nb == 84 else 27.5, # 84 or 88
50
- bins_per_octave=12,
51
- filter_scale=1,
52
- norm=1,
53
- window='hann',
54
- center=True,
55
- pad_mode='constant',
56
- trainable=False,
57
- output_format='Magnitude',
58
- verbose=True,
59
- )
60
- if to_db:
61
- self.amplitude_to_db = torchaudio.transforms.AmplitudeToDB()
62
- else:
63
- self.amplitude_to_db = lambda x:x
64
-
65
- @torch.no_grad()
66
- def __call__(self, x):
67
- return self.amplitude_to_db(self.cqt_fn(x))
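
`MelSTFT` deleted above is a thin wrapper around torchaudio's MelSpectrogram plus an optional AmplitudeToDB; with the defaults (24 kHz, n_fft 2048, hop 240, 128 mels) it produces roughly 100 dB-scaled frames per second, and the preprocessors above drop the last frame ([..., :-1]) so that 5 s of audio maps to exactly 500 frames. A minimal usage sketch with random audio in place of a real clip:

```python
import torch
import torchaudio

mel = torchaudio.transforms.MelSpectrogram(
    sample_rate=24000, n_fft=2048, hop_length=240, n_mels=128
)
to_db = torchaudio.transforms.AmplitudeToDB()

wav = torch.randn(1, 24000 * 5)       # 5 s of stand-in audio at 24 kHz
spec = to_db(mel(wav))[..., :-1]      # drop the extra centre-padded frame, as the Preprocessor does
print(spec.shape)                     # torch.Size([1, 128, 500])
```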
 
MuCodec/muq_dev/muq_fairseq/models/muq/modules/flash_conformer.py DELETED
@@ -1,2114 +0,0 @@
1
- # coding=utf-8
2
- # Copyright 2022 The Fairseq Authors and the HuggingFace Inc. team. All rights reserved.
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
- """ PyTorch Wav2Vec2-Conformer model."""
16
-
17
- import math
18
- from dataclasses import dataclass
19
- from typing import Optional, Tuple, Union
20
-
21
- import numpy as np
22
- import torch
23
- import torch.utils.checkpoint
24
- from torch import nn
25
- from torch.nn import CrossEntropyLoss
26
- from torch.nn import functional as F
27
-
28
- from transformers.activations import ACT2FN
29
- from transformers.deepspeed import is_deepspeed_zero3_enabled
30
- from transformers.modeling_outputs import (
31
- BaseModelOutput,
32
- CausalLMOutput,
33
- SequenceClassifierOutput,
34
- TokenClassifierOutput,
35
- Wav2Vec2BaseModelOutput,
36
- XVectorOutput,
37
- )
38
- from transformers.modeling_utils import PreTrainedModel
39
- from transformers.utils import (
40
- ModelOutput,
41
- add_code_sample_docstrings,
42
- add_start_docstrings,
43
- add_start_docstrings_to_model_forward,
44
- logging,
45
- replace_return_docstrings,
46
- )
47
- from transformers.models.wav2vec2_conformer.configuration_wav2vec2_conformer import Wav2Vec2ConformerConfig
48
-
49
-
50
- logger = logging.get_logger(__name__)
51
-
52
-
53
- _HIDDEN_STATES_START_POSITION = 2
54
-
55
- # General docstring
56
- _CONFIG_FOR_DOC = "Wav2Vec2ConformerConfig"
57
-
58
- # Base docstring
59
- _CHECKPOINT_FOR_DOC = "facebook/wav2vec2-conformer-rope-large-960h-ft"
60
- _EXPECTED_OUTPUT_SHAPE = [1, 292, 1024]
61
-
62
- # CTC docstring
63
- _CTC_EXPECTED_OUTPUT = "'MISTER QUILTER IS THE APOSTLE OF THE MIDDLE CLASSES AND WE ARE GLAD TO WELCOME HIS GOSPEL'"
64
- _CTC_EXPECTED_LOSS = 64.21
65
-
66
-
67
- WAV2VEC2_CONFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [
68
- "facebook/wav2vec2-conformer-rel-pos-large",
69
- # See all Wav2Vec2Conformer models at https://huggingface.co/models?filter=wav2vec2-conformer
70
- ]
71
-
72
-
73
- @dataclass
74
- # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForPreTrainingOutput with Wav2Vec2->Wav2Vec2Conformer
75
- class Wav2Vec2ConformerForPreTrainingOutput(ModelOutput):
76
- """
77
- Output type of [`Wav2Vec2ConformerForPreTraining`], with potential hidden states and attentions.
78
-
79
- Args:
80
- loss (*optional*, returned when `sample_negative_indices` are passed, `torch.FloatTensor` of shape `(1,)`):
81
- Total loss as the sum of the contrastive loss (L_m) and the diversity loss (L_d) as stated in the [official
82
- paper](https://arxiv.org/pdf/2006.11477.pdf) . (classification) loss.
83
- projected_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`):
84
- Hidden-states of the model projected to *config.proj_codevector_dim* that can be used to predict the masked
85
- projected quantized states.
86
- projected_quantized_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`):
87
- Quantized extracted feature vectors projected to *config.proj_codevector_dim* representing the positive
88
- target vectors for contrastive loss.
89
- hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
90
- Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
91
- shape `(batch_size, sequence_length, hidden_size)`.
92
-
93
- Hidden-states of the model at the output of each layer plus the initial embedding outputs.
94
- attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
95
- Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
96
- sequence_length)`.
97
-
98
- Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
99
- heads.
100
- contrastive_loss (*optional*, returned when `sample_negative_indices` are passed, `torch.FloatTensor` of shape `(1,)`):
101
- The contrastive loss (L_m) as stated in the [official paper](https://arxiv.org/pdf/2006.11477.pdf) .
102
- diversity_loss (*optional*, returned when `sample_negative_indices` are passed, `torch.FloatTensor` of shape `(1,)`):
103
- The diversity loss (L_d) as stated in the [official paper](https://arxiv.org/pdf/2006.11477.pdf) .
104
- """
105
-
106
- loss: Optional[torch.FloatTensor] = None
107
- projected_states: torch.FloatTensor = None
108
- projected_quantized_states: torch.FloatTensor = None
109
- codevector_perplexity: torch.FloatTensor = None
110
- hidden_states: Optional[Tuple[torch.FloatTensor]] = None
111
- attentions: Optional[Tuple[torch.FloatTensor]] = None
112
- contrastive_loss: Optional[torch.FloatTensor] = None
113
- diversity_loss: Optional[torch.FloatTensor] = None
114
-
115
-
116
- # Copied from transformers.models.wav2vec2.modeling_wav2vec2._compute_mask_indices
117
- def _compute_mask_indices(
118
- shape: Tuple[int, int],
119
- mask_prob: float,
120
- mask_length: int,
121
- attention_mask: Optional[torch.LongTensor] = None,
122
- min_masks: int = 0,
123
- ) -> np.ndarray:
124
- """
125
- Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for
126
- ASR](https://arxiv.org/abs/1904.08779). Note that this method is not optimized to run on TPU and should be run on
127
- CPU as part of the preprocessing during training.
128
-
129
- Args:
130
- shape: The shape for which to compute masks. This should be of a tuple of size 2 where
131
- the first element is the batch size and the second element is the length of the axis to span.
132
- mask_prob: The percentage of the whole axis (between 0 and 1) which will be masked. The number of
133
- independently generated mask spans of length `mask_length` is computed by
134
- `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
135
- actual percentage will be smaller.
136
- mask_length: size of the mask
137
- min_masks: minimum number of masked spans
138
- attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
139
- each batch dimension.
140
- """
141
- batch_size, sequence_length = shape
142
-
143
- if mask_length < 1:
144
- raise ValueError("`mask_length` has to be bigger than 0.")
145
-
146
- if mask_length > sequence_length:
147
- raise ValueError(
148
- f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}"
149
- f" and `sequence_length`: {sequence_length}`"
150
- )
151
-
152
- # epsilon is used for probabilistic rounding
153
- epsilon = np.random.rand(1).item()
154
-
155
- def compute_num_masked_span(input_length):
156
- """Given input length, compute how many spans should be masked"""
157
- num_masked_span = int(mask_prob * input_length / mask_length + epsilon)
158
- num_masked_span = max(num_masked_span, min_masks)
159
-
160
- # make sure num masked span <= sequence_length
161
- if num_masked_span * mask_length > sequence_length:
162
- num_masked_span = sequence_length // mask_length
163
-
164
- # make sure num_masked span is also <= input_length - (mask_length - 1)
165
- if input_length - (mask_length - 1) < num_masked_span:
166
- num_masked_span = max(input_length - (mask_length - 1), 0)
167
-
168
- return num_masked_span
169
-
170
- # compute number of masked spans in batch
171
- input_lengths = (
172
- attention_mask.sum(-1).detach().tolist()
173
- if attention_mask is not None
174
- else [sequence_length for _ in range(batch_size)]
175
- )
176
-
177
- # SpecAugment mask to fill
178
- spec_aug_mask = np.zeros((batch_size, sequence_length), dtype=bool)
179
- spec_aug_mask_idxs = []
180
-
181
- max_num_masked_span = compute_num_masked_span(sequence_length)
182
-
183
- if max_num_masked_span == 0:
184
- return spec_aug_mask
185
-
186
- for input_length in input_lengths:
187
- # compute num of masked spans for this input
188
- num_masked_span = compute_num_masked_span(input_length)
189
-
190
- # get random indices to mask
191
- spec_aug_mask_idx = np.random.choice(
192
- np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False
193
- )
194
-
195
- # pick first sampled index that will serve as a dummy index to pad vector
196
- # to ensure same dimension for all batches due to probabilistic rounding
197
- # Picking first sample just pads those vectors twice.
198
- if len(spec_aug_mask_idx) == 0:
199
- # this case can only happen if `input_length` is strictly smaller then
200
- # `sequence_length` in which case the last token has to be a padding
201
- # token which we can use as a dummy mask id
202
- dummy_mask_idx = sequence_length - 1
203
- else:
204
- dummy_mask_idx = spec_aug_mask_idx[0]
205
-
206
- spec_aug_mask_idx = np.concatenate(
207
- [spec_aug_mask_idx, np.ones(max_num_masked_span - num_masked_span, dtype=np.int32) * dummy_mask_idx]
208
- )
209
- spec_aug_mask_idxs.append(spec_aug_mask_idx)
210
-
211
- spec_aug_mask_idxs = np.array(spec_aug_mask_idxs)
212
-
213
- # expand masked indices to masked spans
214
- spec_aug_mask_idxs = np.broadcast_to(
215
- spec_aug_mask_idxs[:, :, None], (batch_size, max_num_masked_span, mask_length)
216
- )
217
- spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)
218
-
219
- # add offset to the starting indexes so that indexes now create a span
220
- offsets = np.arange(mask_length)[None, None, :]
221
- offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
222
- batch_size, max_num_masked_span * mask_length
223
- )
224
- spec_aug_mask_idxs = spec_aug_mask_idxs + offsets
225
-
226
- # ensure that we cannot have indices larger than sequence_length
227
- if spec_aug_mask_idxs.max() > sequence_length - 1:
228
- spec_aug_mask_idxs[spec_aug_mask_idxs > sequence_length - 1] = sequence_length - 1
229
-
230
- # scatter indices to mask
231
- np.put_along_axis(spec_aug_mask, spec_aug_mask_idxs, 1, -1)
232
-
233
- return spec_aug_mask
234
-
235
-
236
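For orientation, a minimal usage sketch of the `_compute_mask_indices` helper above, assuming the helper is in scope; the batch size, sequence length and masking parameters are illustrative only:

```python
# Illustrative call: mask ~20% of frames in spans of length 10 for a batch of 2
# sequences of 100 frames each (no attention_mask, so every frame is a candidate).
mask = _compute_mask_indices(shape=(2, 100), mask_prob=0.2, mask_length=10)

print(mask.dtype, mask.shape)   # bool (2, 100)
# Each row has roughly mask_prob * sequence_length masked frames; spans may overlap,
# so the exact count can be smaller, and the probabilistic rounding via `epsilon`
# above makes it vary slightly between calls.
print(mask.sum(axis=-1))
```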
- # Copied from transformers.models.wav2vec2.modeling_wav2vec2._sample_negative_indices
237
- def _sample_negative_indices(
238
- features_shape: Tuple, num_negatives: int, mask_time_indices: Optional[np.ndarray] = None
239
- ):
240
- """
241
- Sample `num_negatives` vectors from feature vectors.
242
- """
243
- batch_size, sequence_length = features_shape
244
-
245
- # generate indices of the positive vectors themselves, repeat them `num_negatives` times
246
- sequence_length_range = np.arange(sequence_length)
247
-
248
- # get `num_negatives` random vector indices from the same utterance
249
- sampled_negative_indices = np.zeros(shape=(batch_size, sequence_length, num_negatives), dtype=np.int32)
250
-
251
- mask_time_indices = (
252
- mask_time_indices.astype(bool) if mask_time_indices is not None else np.ones(features_shape, dtype=bool)
253
- )
254
-
255
- for batch_idx in range(batch_size):
256
- high = mask_time_indices[batch_idx].sum() - 1
257
- mapped_masked_indices = sequence_length_range[mask_time_indices[batch_idx]]
258
-
259
- feature_indices = np.broadcast_to(np.arange(high + 1)[:, None], (high + 1, num_negatives))
260
- sampled_indices = np.random.randint(0, high, size=(high + 1, num_negatives))
261
- # avoid sampling the same positive vector, but keep the distribution uniform
262
- sampled_indices[sampled_indices >= feature_indices] += 1
263
-
264
- # remap to actual indices
265
- sampled_negative_indices[batch_idx][mask_time_indices[batch_idx]] = mapped_masked_indices[sampled_indices]
266
-
267
- # correct for batch size
268
- sampled_negative_indices[batch_idx] += batch_idx * sequence_length
269
-
270
- return sampled_negative_indices
271
-
272
-
273
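The negative-sampling helper above is normally driven by exactly such a mask: for every masked time step it draws `num_negatives` distractor positions from the same utterance, shifting indices at or beyond the positive so the positive itself is never drawn. A short sketch, again with illustrative shapes and assuming both helpers are in scope:

```python
batch_size, sequence_length, num_negatives = 2, 100, 5

mask_time_indices = _compute_mask_indices(
    shape=(batch_size, sequence_length), mask_prob=0.2, mask_length=2
)
sampled_negative_indices = _sample_negative_indices(
    features_shape=(batch_size, sequence_length),
    num_negatives=num_negatives,
    mask_time_indices=mask_time_indices,
)

# (batch_size, sequence_length, num_negatives); entries are flattened (batch * time)
# indices because each row is offset by batch_idx * sequence_length at the end.
print(sampled_negative_indices.shape)
```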
- # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2NoLayerNormConvLayer with Wav2Vec2->Wav2Vec2Conformer
274
- class Wav2Vec2ConformerNoLayerNormConvLayer(nn.Module):
275
- def __init__(self, config, layer_id=0):
276
- super().__init__()
277
- self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
278
- self.out_conv_dim = config.conv_dim[layer_id]
279
-
280
- self.conv = nn.Conv1d(
281
- self.in_conv_dim,
282
- self.out_conv_dim,
283
- kernel_size=config.conv_kernel[layer_id],
284
- stride=config.conv_stride[layer_id],
285
- bias=config.conv_bias,
286
- )
287
- self.activation = ACT2FN[config.feat_extract_activation]
288
-
289
- def forward(self, hidden_states):
290
- hidden_states = self.conv(hidden_states)
291
- hidden_states = self.activation(hidden_states)
292
- return hidden_states
293
-
294
-
295
- # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2LayerNormConvLayer with Wav2Vec2->Wav2Vec2Conformer
296
- class Wav2Vec2ConformerLayerNormConvLayer(nn.Module):
297
- def __init__(self, config, layer_id=0):
298
- super().__init__()
299
- self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
300
- self.out_conv_dim = config.conv_dim[layer_id]
301
-
302
- self.conv = nn.Conv1d(
303
- self.in_conv_dim,
304
- self.out_conv_dim,
305
- kernel_size=config.conv_kernel[layer_id],
306
- stride=config.conv_stride[layer_id],
307
- bias=config.conv_bias,
308
- )
309
- self.layer_norm = nn.LayerNorm(self.out_conv_dim, elementwise_affine=True)
310
- self.activation = ACT2FN[config.feat_extract_activation]
311
-
312
- def forward(self, hidden_states):
313
- hidden_states = self.conv(hidden_states)
314
-
315
- hidden_states = hidden_states.transpose(-2, -1)
316
- hidden_states = self.layer_norm(hidden_states)
317
- hidden_states = hidden_states.transpose(-2, -1)
318
-
319
- hidden_states = self.activation(hidden_states)
320
- return hidden_states
321
-
322
-
323
- # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2GroupNormConvLayer with Wav2Vec2->Wav2Vec2Conformer
324
- class Wav2Vec2ConformerGroupNormConvLayer(nn.Module):
325
- def __init__(self, config, layer_id=0):
326
- super().__init__()
327
- self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
328
- self.out_conv_dim = config.conv_dim[layer_id]
329
-
330
- self.conv = nn.Conv1d(
331
- self.in_conv_dim,
332
- self.out_conv_dim,
333
- kernel_size=config.conv_kernel[layer_id],
334
- stride=config.conv_stride[layer_id],
335
- bias=config.conv_bias,
336
- )
337
- self.activation = ACT2FN[config.feat_extract_activation]
338
-
339
- self.layer_norm = nn.GroupNorm(num_groups=self.out_conv_dim, num_channels=self.out_conv_dim, affine=True)
340
-
341
- def forward(self, hidden_states):
342
- hidden_states = self.conv(hidden_states)
343
- hidden_states = self.layer_norm(hidden_states)
344
- hidden_states = self.activation(hidden_states)
345
- return hidden_states
346
-
347
-
348
- # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2PositionalConvEmbedding with Wav2Vec2->Wav2Vec2Conformer
349
- class Wav2Vec2ConformerPositionalConvEmbedding(nn.Module):
350
- def __init__(self, config):
351
- super().__init__()
352
- self.conv = nn.Conv1d(
353
- config.hidden_size,
354
- config.hidden_size,
355
- kernel_size=config.num_conv_pos_embeddings,
356
- padding=config.num_conv_pos_embeddings // 2,
357
- groups=config.num_conv_pos_embedding_groups,
358
- )
359
-
360
- if is_deepspeed_zero3_enabled():
361
- import deepspeed
362
-
363
- with deepspeed.zero.GatheredParameters(self.conv.weight, modifier_rank=0):
364
- self.conv = nn.utils.weight_norm(self.conv, name="weight", dim=2)
365
- deepspeed.zero.register_external_parameter(self, self.conv.weight_v)
366
- deepspeed.zero.register_external_parameter(self, self.conv.weight_g)
367
- else:
368
- self.conv = nn.utils.weight_norm(self.conv, name="weight", dim=2)
369
-
370
- self.padding = Wav2Vec2ConformerSamePadLayer(config.num_conv_pos_embeddings)
371
- self.activation = ACT2FN[config.feat_extract_activation]
372
-
373
- def forward(self, hidden_states):
374
- hidden_states = hidden_states.transpose(1, 2)
375
-
376
- hidden_states = self.conv(hidden_states)
377
- hidden_states = self.padding(hidden_states)
378
- hidden_states = self.activation(hidden_states)
379
-
380
- hidden_states = hidden_states.transpose(1, 2)
381
- return hidden_states
382
-
383
-
384
- class Wav2Vec2ConformerRotaryPositionalEmbedding(nn.Module):
385
- """Rotary positional embedding
386
- Reference : https://blog.eleuther.ai/rotary-embeddings/ Paper: https://arxiv.org/pdf/2104.09864.pdf
387
- """
388
-
389
- def __init__(self, config):
390
- super().__init__()
391
- dim = config.hidden_size // config.num_attention_heads
392
- base = config.rotary_embedding_base
393
-
394
- inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
395
- self.register_buffer("inv_freq", inv_freq)
396
- self.cached_sequence_length = None
397
- self.cached_rotary_positional_embedding = None
398
-
399
- def forward(self, hidden_states):
400
- sequence_length = hidden_states.shape[1]
401
-
402
- if sequence_length == self.cached_sequence_length and self.cached_rotary_positional_embedding is not None:
403
- return self.cached_rotary_positional_embedding
404
-
405
- self.cached_sequence_length = sequence_length
406
- time_stamps = torch.arange(sequence_length).type_as(self.inv_freq)
407
- freqs = torch.einsum("i,j->ij", time_stamps, self.inv_freq)
408
- embeddings = torch.cat((freqs, freqs), dim=-1)
409
-
410
- cos_embeddings = embeddings.cos()[:, None, None, :]
411
- sin_embeddings = embeddings.sin()[:, None, None, :]
412
- self.cached_rotary_positional_embedding = torch.stack([cos_embeddings, sin_embeddings])
413
- return self.cached_rotary_positional_embedding
414
-
415
-
416
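The cached cos/sin tables produced above are consumed by `_apply_rotary_embedding` in the self-attention module further below. A self-contained sketch of the underlying rotation, with toy dimensions:

```python
import torch

def rotate_half(x: torch.Tensor) -> torch.Tensor:
    # Split the head dimension in half and swap with a sign flip: (x1, x2) -> (-x2, x1).
    half = x.shape[-1] // 2
    return torch.cat((-x[..., half:], x[..., :half]), dim=-1)

seq_len, head_size, base = 8, 16, 10000
inv_freq = 1.0 / (base ** (torch.arange(0, head_size, 2).float() / head_size))
freqs = torch.einsum("i,j->ij", torch.arange(seq_len).float(), inv_freq)
emb = torch.cat((freqs, freqs), dim=-1)          # (seq_len, head_size)
cos, sin = emb.cos(), emb.sin()

x = torch.randn(seq_len, head_size)              # e.g. queries/keys for one head
x_rotary = x * cos + rotate_half(x) * sin        # position-dependent rotation
print(x_rotary.shape)                            # torch.Size([8, 16])
```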
- class Wav2Vec2ConformerRelPositionalEmbedding(nn.Module):
417
- """Relative positional encoding module."""
418
-
419
- def __init__(self, config):
420
- super().__init__()
421
- self.max_len = config.max_source_positions
422
- self.d_model = config.hidden_size
423
- self.pe = None
424
- self.extend_pe(torch.tensor(0.0).expand(1, self.max_len))
425
-
426
- def extend_pe(self, x):
427
- # Reset the positional encodings
428
- if self.pe is not None:
429
- # self.pe contains both positive and negative parts
430
- # the length of self.pe is 2 * input_len - 1
431
- if self.pe.size(1) >= x.size(1) * 2 - 1:
432
- if self.pe.dtype != x.dtype or self.pe.device != x.device:
433
- self.pe = self.pe.to(dtype=x.dtype, device=x.device)
434
- return
435
- # Suppose `i` is the position of query vector and `j` is the
436
- # position of key vector. We use positive relative positions when keys
437
- # are to the left (i>j) and negative relative positions otherwise (i<j).
438
- pe_positive = torch.zeros(x.size(1), self.d_model)
439
- pe_negative = torch.zeros(x.size(1), self.d_model)
440
- position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1)
441
- div_term = torch.exp(
442
- torch.arange(0, self.d_model, 2, dtype=torch.float32) * -(math.log(10000.0) / self.d_model)
443
- )
444
- pe_positive[:, 0::2] = torch.sin(position * div_term)
445
- pe_positive[:, 1::2] = torch.cos(position * div_term)
446
- pe_negative[:, 0::2] = torch.sin(-1 * position * div_term)
447
- pe_negative[:, 1::2] = torch.cos(-1 * position * div_term)
448
-
449
- # Reverse the order of positive indices and concat both positive and
450
- # negative indices. This is used to support the shifting trick
451
- # as in https://arxiv.org/abs/1901.02860
452
- pe_positive = torch.flip(pe_positive, [0]).unsqueeze(0)
453
- pe_negative = pe_negative[1:].unsqueeze(0)
454
- pe = torch.cat([pe_positive, pe_negative], dim=1)
455
- self.pe = pe.to(device=x.device, dtype=x.dtype)
456
-
457
- def forward(self, hidden_states: torch.Tensor):
458
- self.extend_pe(hidden_states)
459
- start_idx = self.pe.size(1) // 2 - hidden_states.size(1) + 1
460
- end_idx = self.pe.size(1) // 2 + hidden_states.size(1)
461
- relative_position_embeddings = self.pe[:, start_idx:end_idx]
462
-
463
- return relative_position_embeddings
464
-
465
-
466
- # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2SamePadLayer with Wav2Vec2->Wav2Vec2Conformer
467
- class Wav2Vec2ConformerSamePadLayer(nn.Module):
468
- def __init__(self, num_conv_pos_embeddings):
469
- super().__init__()
470
- self.num_pad_remove = 1 if num_conv_pos_embeddings % 2 == 0 else 0
471
-
472
- def forward(self, hidden_states):
473
- if self.num_pad_remove > 0:
474
- hidden_states = hidden_states[:, :, : -self.num_pad_remove]
475
- return hidden_states
476
-
477
-
478
- # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureEncoder with Wav2Vec2->Wav2Vec2Conformer
479
- class Wav2Vec2ConformerFeatureEncoder(nn.Module):
480
- """Construct the features from raw audio waveform"""
481
-
482
- def __init__(self, config):
483
- super().__init__()
484
-
485
- if config.feat_extract_norm == "group":
486
- conv_layers = [Wav2Vec2ConformerGroupNormConvLayer(config, layer_id=0)] + [
487
- Wav2Vec2ConformerNoLayerNormConvLayer(config, layer_id=i + 1)
488
- for i in range(config.num_feat_extract_layers - 1)
489
- ]
490
- elif config.feat_extract_norm == "layer":
491
- conv_layers = [
492
- Wav2Vec2ConformerLayerNormConvLayer(config, layer_id=i) for i in range(config.num_feat_extract_layers)
493
- ]
494
- else:
495
- raise ValueError(
496
- f"`config.feat_extract_norm` is {config.feat_extract_norm}, but has to be one of ['group', 'layer']"
497
- )
498
- self.conv_layers = nn.ModuleList(conv_layers)
499
- self.gradient_checkpointing = False
500
- self._requires_grad = True
501
-
502
- def _freeze_parameters(self):
503
- for param in self.parameters():
504
- param.requires_grad = False
505
- self._requires_grad = False
506
-
507
- def forward(self, input_values):
508
- hidden_states = input_values[:, None]
509
-
510
- # make sure hidden_states require grad for gradient_checkpointing
511
- if self._requires_grad and self.training:
512
- hidden_states.requires_grad = True
513
-
514
- for conv_layer in self.conv_layers:
515
- if self._requires_grad and self.gradient_checkpointing and self.training:
516
-
517
- def create_custom_forward(module):
518
- def custom_forward(*inputs):
519
- return module(*inputs)
520
-
521
- return custom_forward
522
-
523
- hidden_states = torch.utils.checkpoint.checkpoint(
524
- create_custom_forward(conv_layer),
525
- hidden_states,
526
- )
527
- else:
528
- hidden_states = conv_layer(hidden_states)
529
-
530
- return hidden_states
531
-
532
-
533
- # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureProjection with Wav2Vec2->Wav2Vec2Conformer
534
- class Wav2Vec2ConformerFeatureProjection(nn.Module):
535
- def __init__(self, config):
536
- super().__init__()
537
- self.layer_norm = nn.LayerNorm(config.conv_dim[-1], eps=config.layer_norm_eps)
538
- self.projection = nn.Linear(config.conv_dim[-1], config.hidden_size)
539
- self.dropout = nn.Dropout(config.feat_proj_dropout)
540
-
541
- def forward(self, hidden_states):
542
- # non-projected hidden states are needed for quantization
543
- norm_hidden_states = self.layer_norm(hidden_states)
544
- hidden_states = self.projection(norm_hidden_states)
545
- hidden_states = self.dropout(hidden_states)
546
- return hidden_states, norm_hidden_states
547
-
548
-
549
- # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeedForward with Wav2Vec2->Wav2Vec2Conformer
550
- class Wav2Vec2ConformerFeedForward(nn.Module):
551
- def __init__(self, config):
552
- super().__init__()
553
- self.intermediate_dropout = nn.Dropout(config.activation_dropout)
554
-
555
- self.intermediate_dense = nn.Linear(config.hidden_size, config.intermediate_size)
556
- if isinstance(config.hidden_act, str):
557
- self.intermediate_act_fn = ACT2FN[config.hidden_act]
558
- else:
559
- self.intermediate_act_fn = config.hidden_act
560
-
561
- self.output_dense = nn.Linear(config.intermediate_size, config.hidden_size)
562
- self.output_dropout = nn.Dropout(config.hidden_dropout)
563
-
564
- def forward(self, hidden_states):
565
- hidden_states = self.intermediate_dense(hidden_states)
566
- hidden_states = self.intermediate_act_fn(hidden_states)
567
- hidden_states = self.intermediate_dropout(hidden_states)
568
-
569
- hidden_states = self.output_dense(hidden_states)
570
- hidden_states = self.output_dropout(hidden_states)
571
- return hidden_states
572
-
573
-
574
- class Wav2Vec2ConformerConvolutionModule(nn.Module):
575
- """Convolution block used in the conformer block"""
576
-
577
- def __init__(self, config):
578
- super().__init__()
579
- if (config.conv_depthwise_kernel_size - 1) % 2 == 1:
580
- raise ValueError("`config.conv_depthwise_kernel_size` should be an odd number for 'SAME' padding")
581
- self.layer_norm = nn.LayerNorm(config.hidden_size)
582
- self.pointwise_conv1 = torch.nn.Conv1d(
583
- config.hidden_size,
584
- 2 * config.hidden_size,
585
- kernel_size=1,
586
- stride=1,
587
- padding=0,
588
- bias=False,
589
- )
590
- self.glu = torch.nn.GLU(dim=1)
591
- self.depthwise_conv = torch.nn.Conv1d(
592
- config.hidden_size,
593
- config.hidden_size,
594
- config.conv_depthwise_kernel_size,
595
- stride=1,
596
- padding=(config.conv_depthwise_kernel_size - 1) // 2,
597
- groups=config.hidden_size,
598
- bias=False,
599
- )
600
- self.batch_norm = torch.nn.BatchNorm1d(config.hidden_size)
601
- self.activation = ACT2FN[config.hidden_act]
602
- self.pointwise_conv2 = torch.nn.Conv1d(
603
- config.hidden_size,
604
- config.hidden_size,
605
- kernel_size=1,
606
- stride=1,
607
- padding=0,
608
- bias=False,
609
- )
610
- self.dropout = torch.nn.Dropout(config.conformer_conv_dropout)
611
-
612
- def forward(self, hidden_states):
613
- hidden_states = self.layer_norm(hidden_states)
614
- # exchange the temporal dimension and the feature dimension
615
- hidden_states = hidden_states.transpose(1, 2)
616
-
617
- # GLU mechanism
618
- # => (batch, 2*channel, dim)
619
- hidden_states = self.pointwise_conv1(hidden_states)
620
- # => (batch, channel, dim)
621
- hidden_states = self.glu(hidden_states)
622
-
623
- # 1D Depthwise Conv
624
- hidden_states = self.depthwise_conv(hidden_states)
625
- hidden_states = self.batch_norm(hidden_states)
626
- hidden_states = self.activation(hidden_states)
627
-
628
- hidden_states = self.pointwise_conv2(hidden_states)
629
- hidden_states = self.dropout(hidden_states)
630
- hidden_states = hidden_states.transpose(1, 2)
631
- return hidden_states
632
-
633
-
634
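As a quick sanity check of the shape bookkeeping in the convolution module above, the following illustrative trace reproduces the channel flow (layer norm, batch norm, activation and dropout are omitted; sizes are arbitrary):

```python
import torch
import torch.nn as nn

batch, time, hidden, kernel = 2, 50, 256, 31   # kernel must be odd for 'SAME' padding

x = torch.randn(batch, time, hidden)
x = x.transpose(1, 2)                                             # (batch, hidden, time)
x = nn.Conv1d(hidden, 2 * hidden, kernel_size=1)(x)               # pointwise conv 1
x = nn.GLU(dim=1)(x)                                              # GLU halves the channels
x = nn.Conv1d(hidden, hidden, kernel,                             # depthwise conv keeps time
              padding=(kernel - 1) // 2, groups=hidden)(x)
x = nn.Conv1d(hidden, hidden, kernel_size=1)(x)                   # pointwise conv 2
x = x.transpose(1, 2)                                             # back to (batch, time, hidden)
print(x.shape)                                                    # torch.Size([2, 50, 256])
```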
- class Wav2Vec2ConformerSelfAttention(nn.Module):
635
- """Construct an Wav2Vec2ConformerSelfAttention object.
636
- Can be enhanced with rotary or relative position embeddings.
637
- """
638
-
639
- def __init__(self, config):
640
- super().__init__()
641
-
642
- self.head_size = config.hidden_size // config.num_attention_heads
643
- self.num_heads = config.num_attention_heads
644
- self.position_embeddings_type = config.position_embeddings_type
645
-
646
- self.linear_q = nn.Linear(config.hidden_size, config.hidden_size)
647
- self.linear_k = nn.Linear(config.hidden_size, config.hidden_size)
648
- self.linear_v = nn.Linear(config.hidden_size, config.hidden_size)
649
- self.linear_out = nn.Linear(config.hidden_size, config.hidden_size)
650
-
651
- self.dropout = nn.Dropout(p=config.attention_dropout)
652
- self.dropout_p = config.attention_dropout
653
-
654
- self.is_causal = config.is_causal
655
-
656
- if self.position_embeddings_type == "relative":
657
- # linear transformation for positional encoding
658
- self.linear_pos = nn.Linear(config.hidden_size, config.hidden_size, bias=False)
659
- # these two learnable bias are used in matrix c and matrix d
660
- # as described in https://arxiv.org/abs/1901.02860 Section 3.3
661
- self.pos_bias_u = nn.Parameter(torch.zeros(self.num_heads, self.head_size))
662
- self.pos_bias_v = nn.Parameter(torch.zeros(self.num_heads, self.head_size))
663
-
664
- def forward(
665
- self,
666
- hidden_states: torch.Tensor,
667
- attention_mask: Optional[torch.Tensor] = None,
668
- relative_position_embeddings: Optional[torch.Tensor] = None,
669
- output_attentions: bool = False,
670
- ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
671
- # self-attention mechanism
672
- batch_size, sequence_length, hidden_size = hidden_states.size()
673
-
674
- # make sure query/key states can be != value states
675
- query_key_states = hidden_states
676
- value_states = hidden_states
677
-
678
- if self.position_embeddings_type == "rotary":
679
- if relative_position_embeddings is None:
680
- raise ValueError(
681
- "`relative_position_embeddings` has to be defined when `self.position_embeddings_type == 'rotary'"
682
- )
683
- query_key_states = self._apply_rotary_embedding(query_key_states, relative_position_embeddings)
684
-
685
- # project query_key_states and value_states
686
- query = self.linear_q(query_key_states).view(batch_size, -1, self.num_heads, self.head_size)
687
- key = self.linear_k(query_key_states).view(batch_size, -1, self.num_heads, self.head_size)
688
- value = self.linear_v(value_states).view(batch_size, -1, self.num_heads, self.head_size)
689
-
690
- # => (batch, head, time1, d_k)
691
- query = query.transpose(1, 2)
692
- key = key.transpose(1, 2)
693
- value = value.transpose(1, 2)
694
-
695
- with torch.backends.cuda.sdp_kernel(enable_math=False, enable_flash=True, enable_mem_efficient=False):
696
- hidden_states = F.scaled_dot_product_attention(query, key, value, attn_mask=attention_mask, dropout_p=self.dropout_p, is_causal=self.is_causal)
697
- probs = None
698
-
699
- # # apply attention_mask if necessary
700
- # if attention_mask is not None:
701
- # scores = scores + attention_mask
702
-
703
- # # => (batch, head, time1, time2)
704
- # probs = torch.softmax(scores, dim=-1)
705
- # probs = self.dropout(probs)
706
-
707
- # # => (batch, head, time1, d_k)
708
- # hidden_states = torch.matmul(probs, value)
709
-
710
- # => (batch, time1, hidden_size)
711
- hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, self.num_heads * self.head_size)
712
- hidden_states = self.linear_out(hidden_states)
713
-
714
- return hidden_states, probs
715
-
716
- def _apply_rotary_embedding(self, hidden_states, relative_position_embeddings):
717
- batch_size, sequence_length, hidden_size = hidden_states.size()
718
- hidden_states = hidden_states.view(batch_size, sequence_length, self.num_heads, self.head_size)
719
-
720
- cos = relative_position_embeddings[0, :sequence_length, ...]
721
- sin = relative_position_embeddings[1, :sequence_length, ...]
722
-
723
- # rotate hidden_states with rotary embeddings
724
- hidden_states = hidden_states.transpose(0, 1)
725
- rotated_states_begin = hidden_states[..., : self.head_size // 2]
726
- rotated_states_end = hidden_states[..., self.head_size // 2 :]
727
- rotated_states = torch.cat((-rotated_states_end, rotated_states_begin), dim=rotated_states_begin.ndim - 1)
728
- hidden_states = (hidden_states * cos) + (rotated_states * sin)
729
- hidden_states = hidden_states.transpose(0, 1)
730
-
731
- hidden_states = hidden_states.view(batch_size, sequence_length, self.num_heads * self.head_size)
732
-
733
- return hidden_states
734
-
735
- def _apply_relative_embeddings(self, query, key, relative_position_embeddings):
736
- # 1. project positional embeddings
737
- # => (batch, head, 2*time1-1, d_k)
738
- proj_relative_position_embeddings = self.linear_pos(relative_position_embeddings)
739
- proj_relative_position_embeddings = proj_relative_position_embeddings.view(
740
- relative_position_embeddings.size(0), -1, self.num_heads, self.head_size
741
- )
742
- proj_relative_position_embeddings = proj_relative_position_embeddings.transpose(1, 2)
743
- proj_relative_position_embeddings = proj_relative_position_embeddings.transpose(2, 3)
744
-
745
- # 2. Add bias to query
746
- # => (batch, head, time1, d_k)
747
- query = query.transpose(1, 2)
748
- q_with_bias_u = (query + self.pos_bias_u).transpose(1, 2)
749
- q_with_bias_v = (query + self.pos_bias_v).transpose(1, 2)
750
-
751
- # 3. attention score: first compute matrix a and matrix c
752
- # as described in https://arxiv.org/abs/1901.02860 Section 3.3
753
- # => (batch, head, time1, time2)
754
- scores_ac = torch.matmul(q_with_bias_u, key.transpose(-2, -1))
755
-
756
- # 4. then compute matrix b and matrix d
757
- # => (batch, head, time1, 2*time1-1)
758
- scores_bd = torch.matmul(q_with_bias_v, proj_relative_position_embeddings)
759
-
760
- # 5. shift matrix b and matrix d
761
- zero_pad = torch.zeros((*scores_bd.size()[:3], 1), device=scores_bd.device, dtype=scores_bd.dtype)
762
- scores_bd_padded = torch.cat([zero_pad, scores_bd], dim=-1)
763
- scores_bd_padded_shape = scores_bd.size()[:2] + (scores_bd.shape[3] + 1, scores_bd.shape[2])
764
- scores_bd_padded = scores_bd_padded.view(*scores_bd_padded_shape)
765
- scores_bd = scores_bd_padded[:, :, 1:].view_as(scores_bd)
766
- scores_bd = scores_bd[:, :, :, : scores_bd.size(-1) // 2 + 1]
767
-
768
- # 6. sum matrices
769
- # => (batch, head, time1, time2)
770
- scores = (scores_ac + scores_bd) / math.sqrt(self.head_size)
771
-
772
- return scores
773
-
774
-
775
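The fused `scaled_dot_product_attention` call in `forward` above stands in for the commented-out softmax attention; with no mask, no dropout and non-causal attention, the two paths agree up to floating-point error, which a toy check makes concrete (shapes are illustrative):

```python
import torch
import torch.nn.functional as F

batch, heads, time, d_k = 2, 4, 10, 16
q, k, v = (torch.randn(batch, heads, time, d_k) for _ in range(3))

fused = F.scaled_dot_product_attention(q, k, v)                   # fused kernel path

scores = torch.matmul(q, k.transpose(-2, -1)) / (d_k ** 0.5)      # manual reference path
manual = torch.matmul(torch.softmax(scores, dim=-1), v)

print(torch.allclose(fused, manual, atol=1e-5))                   # True up to numerics
```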
- class Wav2Vec2ConformerEncoderLayer(nn.Module):
776
- """Conformer block based on https://arxiv.org/abs/2005.08100."""
777
-
778
- def __init__(self, config):
779
- super().__init__()
780
- embed_dim = config.hidden_size
781
- dropout = config.attention_dropout
782
-
783
- # Feed-forward 1
784
- self.ffn1_layer_norm = nn.LayerNorm(embed_dim)
785
- self.ffn1 = Wav2Vec2ConformerFeedForward(config)
786
-
787
- # Self-Attention
788
- self.self_attn_layer_norm = nn.LayerNorm(embed_dim)
789
- self.self_attn_dropout = torch.nn.Dropout(dropout)
790
- self.self_attn = Wav2Vec2ConformerSelfAttention(config)
791
-
792
- # Conformer Convolution
793
- self.conv_module = Wav2Vec2ConformerConvolutionModule(config)
794
-
795
- # Feed-forward 2
796
- self.ffn2_layer_norm = nn.LayerNorm(embed_dim)
797
- self.ffn2 = Wav2Vec2ConformerFeedForward(config)
798
- self.final_layer_norm = nn.LayerNorm(embed_dim)
799
-
800
- def forward(
801
- self,
802
- hidden_states,
803
- attention_mask: Optional[torch.Tensor] = None,
804
- relative_position_embeddings: Optional[torch.Tensor] = None,
805
- output_attentions: bool = False,
806
- ):
807
- hidden_states = hidden_states
808
-
809
- # 1. Feed-Forward 1 layer
810
- residual = hidden_states
811
- hidden_states = self.ffn1_layer_norm(hidden_states)
812
- hidden_states = self.ffn1(hidden_states)
813
- hidden_states = hidden_states * 0.5 + residual
814
- residual = hidden_states
815
-
816
- # 2. Self-Attention layer
817
- hidden_states = self.self_attn_layer_norm(hidden_states)
818
- hidden_states, attn_weights = self.self_attn(
819
- hidden_states=hidden_states,
820
- attention_mask=attention_mask,
821
- relative_position_embeddings=relative_position_embeddings,
822
- output_attentions=output_attentions,
823
- )
824
- hidden_states = self.self_attn_dropout(hidden_states)
825
- hidden_states = hidden_states + residual
826
-
827
- # 3. Convolutional Layer
828
- residual = hidden_states
829
- hidden_states = self.conv_module(hidden_states)
830
- hidden_states = residual + hidden_states
831
-
832
- # 4. Feed-Forward 2 Layer
833
- residual = hidden_states
834
- hidden_states = self.ffn2_layer_norm(hidden_states)
835
- hidden_states = self.ffn2(hidden_states)
836
- hidden_states = hidden_states * 0.5 + residual
837
- hidden_states = self.final_layer_norm(hidden_states)
838
-
839
- return hidden_states, attn_weights
840
-
841
-
842
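The residual wiring of the encoder layer above is the Conformer paper's macaron layout: two half-step feed-forward modules around self-attention and convolution, closed by a final layer norm. A compact, runnable restatement with the sub-modules abstracted away (identity modules stand in for the real ones; dropout is omitted):

```python
import torch
import torch.nn as nn

def conformer_block(x, ffn1, attn, conv, ffn2, ln_ffn1, ln_attn, ln_ffn2, ln_final):
    x = x + 0.5 * ffn1(ln_ffn1(x))             # 1. half-step feed-forward
    x = x + attn(ln_attn(x))                   # 2. self-attention
    x = x + conv(x)                            # 3. conv module (it layer-norms internally)
    x = ln_final(x + 0.5 * ffn2(ln_ffn2(x)))   # 4. half-step feed-forward + final norm
    return x

i = nn.Identity()
out = conformer_block(torch.randn(2, 5, 8), i, i, i, i, i, i, i, i)
print(out.shape)   # torch.Size([2, 5, 8])
```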
- class Wav2Vec2ConformerEncoder(nn.Module):
843
- def __init__(self, config, is_causal=False):
844
- super().__init__()
845
- config.is_causal = is_causal
846
- self.config = config
847
-
848
- if config.position_embeddings_type == "relative":
849
- self.embed_positions = Wav2Vec2ConformerRelPositionalEmbedding(config)
850
- elif config.position_embeddings_type == "rotary":
851
- self.embed_positions = Wav2Vec2ConformerRotaryPositionalEmbedding(config)
852
- else:
853
- self.embed_positions = None
854
-
855
- self.pos_conv_embed = Wav2Vec2ConformerPositionalConvEmbedding(config)
856
- self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
857
- self.dropout = nn.Dropout(config.hidden_dropout)
858
- self.layers = nn.ModuleList([Wav2Vec2ConformerEncoderLayer(config) for _ in range(config.num_hidden_layers)])
859
- self.gradient_checkpointing = False
860
-
861
- def forward(
862
- self,
863
- hidden_states,
864
- attention_mask=None,
865
- output_attentions=False,
866
- output_hidden_states=False,
867
- return_dict=True,
868
- ):
869
- all_hidden_states = () if output_hidden_states else None
870
- all_self_attentions = () if output_attentions else None
871
-
872
- if attention_mask is not None:
873
- # make sure padded tokens output 0
874
- hidden_states[~attention_mask] = 0.0
875
-
876
- # extend attention_mask
877
- attention_mask = 1.0 - attention_mask[:, None, None, :].to(dtype=hidden_states.dtype)
878
- attention_mask = attention_mask * torch.finfo(hidden_states.dtype).min
879
- attention_mask = attention_mask.expand(
880
- attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1]
881
- )
882
-
883
- hidden_states = self.dropout(hidden_states)
884
-
885
- if self.embed_positions is not None:
886
- relative_position_embeddings = self.embed_positions(hidden_states)
887
- else:
888
- relative_position_embeddings = None
889
-
890
- deepspeed_zero3_is_enabled = is_deepspeed_zero3_enabled()
891
-
892
- for i, layer in enumerate(self.layers):
893
- if output_hidden_states:
894
- all_hidden_states = all_hidden_states + (hidden_states,)
895
-
896
- # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
897
- dropout_probability = np.random.uniform(0, 1)
898
-
899
- skip_the_layer = True if self.training and (dropout_probability < self.config.layerdrop) else False
900
- if not skip_the_layer or deepspeed_zero3_is_enabled:
901
- # under deepspeed zero3 all gpus must run in sync
902
- if self.gradient_checkpointing and self.training:
903
- # create gradient checkpointing function
904
- def create_custom_forward(module):
905
- def custom_forward(*inputs):
906
- return module(*inputs, output_attentions)
907
-
908
- return custom_forward
909
-
910
- layer_outputs = torch.utils.checkpoint.checkpoint(
911
- create_custom_forward(layer),
912
- hidden_states,
913
- attention_mask,
914
- relative_position_embeddings,
915
- )
916
- else:
917
- layer_outputs = layer(
918
- hidden_states,
919
- attention_mask=attention_mask,
920
- relative_position_embeddings=relative_position_embeddings,
921
- output_attentions=output_attentions,
922
- )
923
- hidden_states = layer_outputs[0]
924
-
925
- if skip_the_layer:
926
- layer_outputs = (None, None)
927
-
928
- if output_attentions:
929
- all_self_attentions = all_self_attentions + (layer_outputs[1],)
930
-
931
- hidden_states = self.layer_norm(hidden_states)
932
- if output_hidden_states:
933
- all_hidden_states = all_hidden_states + (hidden_states,)
934
-
935
- if not return_dict:
936
- return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
937
- return BaseModelOutput(
938
- last_hidden_state=hidden_states,
939
- hidden_states=all_hidden_states,
940
- attentions=all_self_attentions,
941
- )
942
-
943
-
944
- # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2GumbelVectorQuantizer with Wav2Vec2->Wav2Vec2Conformer
945
- class Wav2Vec2ConformerGumbelVectorQuantizer(nn.Module):
946
- """
947
- Vector quantization using gumbel softmax. See `[CATEGORICAL REPARAMETERIZATION WITH
948
- GUMBEL-SOFTMAX](https://arxiv.org/pdf/1611.01144.pdf)` for more information.
949
- """
950
-
951
- def __init__(self, config):
952
- super().__init__()
953
- self.num_groups = config.num_codevector_groups
954
- self.num_vars = config.num_codevectors_per_group
955
-
956
- if config.codevector_dim % self.num_groups != 0:
957
- raise ValueError(
958
- f"`config.codevector_dim {config.codevector_dim} must be divisible "
959
- f"by `config.num_codevector_groups` {self.num_groups} for concatenation"
960
- )
961
-
962
- # storage for codebook variables (codewords)
963
- self.codevectors = nn.Parameter(
964
- torch.FloatTensor(1, self.num_groups * self.num_vars, config.codevector_dim // self.num_groups)
965
- )
966
- self.weight_proj = nn.Linear(config.conv_dim[-1], self.num_groups * self.num_vars)
967
-
968
- # can be decayed for training
969
- self.temperature = 2
970
-
971
- @staticmethod
972
- def _compute_perplexity(probs, mask=None):
973
- if mask is not None:
974
- mask_extended = mask.flatten()[:, None, None].expand(probs.shape)
975
- probs = torch.where(mask_extended, probs, torch.zeros_like(probs))
976
- marginal_probs = probs.sum(dim=0) / mask.sum()
977
- else:
978
- marginal_probs = probs.mean(dim=0)
979
-
980
- perplexity = torch.exp(-torch.sum(marginal_probs * torch.log(marginal_probs + 1e-7), dim=-1)).sum()
981
- return perplexity
982
-
983
- def forward(self, hidden_states, mask_time_indices=None):
984
- batch_size, sequence_length, hidden_size = hidden_states.shape
985
-
986
- # project to codevector dim
987
- hidden_states = self.weight_proj(hidden_states)
988
- hidden_states = hidden_states.view(batch_size * sequence_length * self.num_groups, -1)
989
-
990
- if self.training:
991
- # sample code vector probs via gumbel in a differentiable way
992
- codevector_probs = nn.functional.gumbel_softmax(
993
- hidden_states.float(), tau=self.temperature, hard=True
994
- ).type_as(hidden_states)
995
-
996
- # compute perplexity
997
- codevector_soft_dist = torch.softmax(
998
- hidden_states.view(batch_size * sequence_length, self.num_groups, -1).float(), dim=-1
999
- )
1000
- perplexity = self._compute_perplexity(codevector_soft_dist, mask_time_indices)
1001
- else:
1002
- # take argmax in non-differentiable way
1003
- # compute hard codevector distribution (one hot)
1004
- codevector_idx = hidden_states.argmax(dim=-1)
1005
- codevector_probs = hidden_states.new_zeros(hidden_states.shape).scatter_(
1006
- -1, codevector_idx.view(-1, 1), 1.0
1007
- )
1008
- codevector_probs = codevector_probs.view(batch_size * sequence_length, self.num_groups, -1)
1009
-
1010
- perplexity = self._compute_perplexity(codevector_probs, mask_time_indices)
1011
-
1012
- codevector_probs = codevector_probs.view(batch_size * sequence_length, -1)
1013
- # use probs to retrieve codevectors
1014
- codevectors_per_group = codevector_probs.unsqueeze(-1) * self.codevectors
1015
- codevectors = codevectors_per_group.view(batch_size * sequence_length, self.num_groups, self.num_vars, -1)
1016
- codevectors = codevectors.sum(-2).view(batch_size, sequence_length, -1)
1017
-
1018
- return codevectors, perplexity
1019
-
1020
-
1021
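A hedged, standalone sketch of the mechanism inside the quantizer above: Gumbel-softmax yields (nearly) one-hot codevector selections per group while keeping gradients flowing, and the perplexity of the marginal code usage tracks how evenly the codebook is exercised. Group count, codebook size and batch size below are illustrative:

```python
import torch
import torch.nn.functional as F

num_groups, num_vars, batch_times_seq = 2, 8, 6

# Per-group logits over codebook entries, as produced by `weight_proj` above.
logits = torch.randn(batch_times_seq * num_groups, num_vars)

# Differentiable, straight-through one-hot selection (training path).
probs = F.gumbel_softmax(logits, tau=2.0, hard=True)

# Perplexity of the marginal usage: num_vars per group when uniform, 1 when collapsed.
marginal = probs.view(batch_times_seq, num_groups, num_vars).mean(dim=0)
perplexity = torch.exp(-(marginal * torch.log(marginal + 1e-7)).sum(-1)).sum()
print(perplexity)   # bounded above by num_groups * num_vars
```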
- # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2Adapter with Wav2Vec2->Wav2Vec2Conformer
1022
- class Wav2Vec2ConformerAdapter(nn.Module):
1023
- def __init__(self, config):
1024
- super().__init__()
1025
-
1026
- # feature dim might need to be down-projected
1027
- if config.output_hidden_size != config.hidden_size:
1028
- self.proj = nn.Linear(config.hidden_size, config.output_hidden_size)
1029
- self.proj_layer_norm = nn.LayerNorm(config.output_hidden_size)
1030
- else:
1031
- self.proj = self.proj_layer_norm = None
1032
-
1033
- self.layers = nn.ModuleList(Wav2Vec2ConformerAdapterLayer(config) for _ in range(config.num_adapter_layers))
1034
- self.layerdrop = config.layerdrop
1035
-
1036
- def forward(self, hidden_states):
1037
- # down project hidden_states if necessary
1038
- if self.proj is not None and self.proj_layer_norm is not None:
1039
- hidden_states = self.proj(hidden_states)
1040
- hidden_states = self.proj_layer_norm(hidden_states)
1041
-
1042
- hidden_states = hidden_states.transpose(1, 2)
1043
-
1044
- for layer in self.layers:
1045
- layerdrop_prob = np.random.random()
1046
- if not self.training or (layerdrop_prob > self.layerdrop):
1047
- hidden_states = layer(hidden_states)
1048
-
1049
- hidden_states = hidden_states.transpose(1, 2)
1050
- return hidden_states
1051
-
1052
-
1053
- # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2AdapterLayer with Wav2Vec2->Wav2Vec2Conformer
1054
- class Wav2Vec2ConformerAdapterLayer(nn.Module):
1055
- def __init__(self, config):
1056
- super().__init__()
1057
- self.conv = nn.Conv1d(
1058
- config.output_hidden_size,
1059
- 2 * config.output_hidden_size,
1060
- config.adapter_kernel_size,
1061
- stride=config.adapter_stride,
1062
- padding=1,
1063
- )
1064
-
1065
- def forward(self, hidden_states):
1066
- hidden_states = self.conv(hidden_states)
1067
- hidden_states = nn.functional.glu(hidden_states, dim=1)
1068
-
1069
- return hidden_states
1070
-
1071
-
1072
- class Wav2Vec2ConformerPreTrainedModel(PreTrainedModel):
1073
- """
1074
- An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
1075
- models.
1076
- """
1077
-
1078
- config_class = Wav2Vec2ConformerConfig
1079
- base_model_prefix = "wav2vec2_conformer"
1080
- main_input_name = "input_values"
1081
- _keys_to_ignore_on_load_missing = [r"position_ids"]
1082
- supports_gradient_checkpointing = True
1083
-
1084
- def _init_weights(self, module):
1085
- """Initialize the weights"""
1086
- # Wav2Vec2ConformerForPreTraining last 2 linear layers need standard Linear init.
1087
- if isinstance(module, Wav2Vec2ConformerForPreTraining):
1088
- module.project_hid.reset_parameters()
1089
- module.project_q.reset_parameters()
1090
- module.project_hid._is_hf_initialized = True
1091
- module.project_q._is_hf_initialized = True
1092
- # gumbel softmax requires special init
1093
- elif isinstance(module, Wav2Vec2ConformerGumbelVectorQuantizer):
1094
- module.weight_proj.weight.data.normal_(mean=0.0, std=1)
1095
- module.weight_proj.bias.data.zero_()
1096
- nn.init.uniform_(module.codevectors)
1097
- elif isinstance(module, Wav2Vec2ConformerSelfAttention):
1098
- if hasattr(module, "pos_bias_u"):
1099
- nn.init.xavier_uniform_(module.pos_bias_u)
1100
- if hasattr(module, "pos_bias_v"):
1101
- nn.init.xavier_uniform_(module.pos_bias_v)
1102
- elif isinstance(module, Wav2Vec2ConformerPositionalConvEmbedding):
1103
- nn.init.normal_(
1104
- module.conv.weight,
1105
- mean=0,
1106
- std=2 * math.sqrt(1 / (module.conv.kernel_size[0] * module.conv.in_channels)),
1107
- )
1108
- nn.init.constant_(module.conv.bias, 0)
1109
- elif isinstance(module, Wav2Vec2ConformerFeatureProjection):
1110
- k = math.sqrt(1 / module.projection.in_features)
1111
- nn.init.uniform_(module.projection.weight, a=-k, b=k)
1112
- nn.init.uniform_(module.projection.bias, a=-k, b=k)
1113
- elif isinstance(module, nn.Linear):
1114
- module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
1115
-
1116
- if module.bias is not None:
1117
- module.bias.data.zero_()
1118
- elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)):
1119
- module.bias.data.zero_()
1120
- module.weight.data.fill_(1.0)
1121
- elif isinstance(module, nn.Conv1d):
1122
- nn.init.kaiming_normal_(module.weight)
1123
-
1124
- if module.bias is not None:
1125
- k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0]))
1126
- nn.init.uniform_(module.bias, a=-k, b=k)
1127
-
1128
- def _get_feat_extract_output_lengths(
1129
- self, input_lengths: Union[torch.LongTensor, int], add_adapter: Optional[bool] = None
1130
- ):
1131
- """
1132
- Computes the output length of the convolutional layers
1133
- """
1134
-
1135
- add_adapter = self.config.add_adapter if add_adapter is None else add_adapter
1136
-
1137
- def _conv_out_length(input_length, kernel_size, stride):
1138
- # 1D convolutional layer output length formula taken
1139
- # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
1140
- return torch.div(input_length - kernel_size, stride, rounding_mode="floor") + 1
1141
-
1142
- for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride):
1143
- input_lengths = _conv_out_length(input_lengths, kernel_size, stride)
1144
-
1145
- if add_adapter:
1146
- for _ in range(self.config.num_adapter_layers):
1147
- input_lengths = _conv_out_length(input_lengths, 1, self.config.adapter_stride)
1148
-
1149
- return input_lengths
1150
-
1151
- def _get_feature_vector_attention_mask(
1152
- self, feature_vector_length: int, attention_mask: torch.LongTensor, add_adapter=None
1153
- ):
1154
- # Effectively attention_mask.sum(-1), but not inplace to be able to run
1155
- # on inference mode.
1156
- non_padded_lengths = attention_mask.cumsum(dim=-1)[:, -1]
1157
-
1158
- output_lengths = self._get_feat_extract_output_lengths(non_padded_lengths, add_adapter=add_adapter)
1159
- output_lengths = output_lengths.to(torch.long)
1160
-
1161
- batch_size = attention_mask.shape[0]
1162
-
1163
- attention_mask = torch.zeros(
1164
- (batch_size, feature_vector_length), dtype=attention_mask.dtype, device=attention_mask.device
1165
- )
1166
- # these two operations make sure that all values before the output length indices are attended to
1167
- attention_mask[(torch.arange(attention_mask.shape[0], device=attention_mask.device), output_lengths - 1)] = 1
1168
- attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).bool()
1169
- return attention_mask
1170
-
1171
- def _set_gradient_checkpointing(self, module, value=False):
1172
- if isinstance(module, (Wav2Vec2ConformerEncoder, Wav2Vec2ConformerFeatureEncoder)):
1173
- module.gradient_checkpointing = value
1174
-
1175
-
1176
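For intuition, the `_conv_out_length` recurrence used by `_get_feat_extract_output_lengths` above can be traced by hand. The kernel and stride values below are the usual wav2vec2 feature-extractor defaults and are assumed here purely for the sake of the example:

```python
def conv_out_length(input_length: int, kernel_size: int, stride: int) -> int:
    # Same formula as the helper above (no padding, dilation 1).
    return (input_length - kernel_size) // stride + 1

conv_kernel = (10, 3, 3, 3, 3, 2, 2)   # assumed defaults, 7 conv layers
conv_stride = (5, 2, 2, 2, 2, 2, 2)

length = 16000                          # one second of 16 kHz audio
for k, s in zip(conv_kernel, conv_stride):
    length = conv_out_length(length, k, s)
print(length)                           # 49 frames, i.e. roughly one frame per 20 ms
```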
- WAV2VEC2_CONFORMER_START_DOCSTRING = r"""
1177
- Wav2Vec2Conformer was proposed in [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech
1178
- Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael
1179
- Auli.
1180
-
1181
- This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
1182
- library implements for all its models (such as downloading or saving, etc.).
1183
-
1184
- This model is a PyTorch [nn.Module](https://pytorch.org/docs/stable/nn.html#nn.Module) sub-class. Use it as a
1185
- regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage and behavior.
1186
-
1187
- Parameters:
1188
- config ([`Wav2Vec2ConformerConfig`]): Model configuration class with all the parameters of the model.
1189
- Initializing with a config file does not load the weights associated with the model, only the
1190
- configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
1191
- """
1192
-
1193
-
1194
- WAV2VEC2_CONFORMER_INPUTS_DOCSTRING = r"""
1195
- Args:
1196
- input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
1197
- Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
1198
- into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
1199
- soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
1200
- conversion into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2Processor.__call__`] for details.
1201
- attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1202
- Mask to avoid performing convolution and attention on padding token indices. Mask values selected in `[0,
1203
- 1]`:
1204
-
1205
- - 1 for tokens that are **not masked**,
1206
- - 0 for tokens that are **masked**.
1207
-
1208
- [What are attention masks?](../glossary#attention-mask)
1209
-
1210
- <Tip warning={true}>
1211
-
1212
- `attention_mask` should only be passed if the corresponding processor has `config.return_attention_mask ==
1213
- True`. For all models whose processor has `config.return_attention_mask == False`, such as
1214
- [wav2vec2-conformer-rel-pos-large](https://huggingface.co/facebook/wav2vec2-conformer-rel-pos-large),
1215
- `attention_mask` should **not** be passed to avoid degraded performance when doing batched inference. For
1216
- such models `input_values` should simply be padded with 0 and passed without `attention_mask`. Be aware
1217
- that these models also yield slightly different results depending on whether `input_values` is padded or
1218
- not.
1219
-
1220
- </Tip>
1221
-
1222
- output_attentions (`bool`, *optional*):
1223
- Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
1224
- tensors for more detail.
1225
- output_hidden_states (`bool`, *optional*):
1226
- Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
1227
- more detail.
1228
- return_dict (`bool`, *optional*):
1229
- Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
1230
- """
1231
-
1232
-
1233
- @add_start_docstrings(
1234
- "The bare Wav2Vec2Conformer Model transformer outputting raw hidden-states without any specific head on top.",
1235
- WAV2VEC2_CONFORMER_START_DOCSTRING,
1236
- )
1237
- class Wav2Vec2ConformerModel(Wav2Vec2ConformerPreTrainedModel):
1238
- def __init__(self, config: Wav2Vec2ConformerConfig):
1239
- super().__init__(config)
1240
- self.config = config
1241
- self.feature_extractor = Wav2Vec2ConformerFeatureEncoder(config)
1242
- self.feature_projection = Wav2Vec2ConformerFeatureProjection(config)
1243
-
1244
- # model only needs masking vector if mask prob is > 0.0
1245
- if config.mask_time_prob > 0.0 or config.mask_feature_prob > 0.0:
1246
- self.masked_spec_embed = nn.Parameter(torch.FloatTensor(config.hidden_size).uniform_())
1247
-
1248
- self.encoder = Wav2Vec2ConformerEncoder(config)
1249
-
1250
- self.adapter = Wav2Vec2ConformerAdapter(config) if config.add_adapter else None
1251
-
1252
- # Initialize weights and apply final processing
1253
- self.post_init()
1254
-
1255
- # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2Model.freeze_feature_encoder
1256
- def freeze_feature_encoder(self):
1257
- """
1258
- Calling this function will disable the gradient computation for the feature encoder so that its parameter will
1259
- not be updated during training.
1260
- """
1261
- self.feature_extractor._freeze_parameters()
1262
-
1263
- # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2Model._mask_hidden_states
1264
- def _mask_hidden_states(
1265
- self,
1266
- hidden_states: torch.FloatTensor,
1267
- mask_time_indices: Optional[torch.FloatTensor] = None,
1268
- attention_mask: Optional[torch.LongTensor] = None,
1269
- ):
1270
- """
1271
- Masks extracted features along time axis and/or along feature axis according to
1272
- [SpecAugment](https://arxiv.org/abs/1904.08779).
1273
- """
1274
-
1275
- # `config.apply_spec_augment` can set masking to False
1276
- if not getattr(self.config, "apply_spec_augment", True):
1277
- return hidden_states
1278
-
1279
- # generate indices & apply SpecAugment along time axis
1280
- batch_size, sequence_length, hidden_size = hidden_states.size()
1281
-
1282
- if mask_time_indices is not None:
1283
- # apply SpecAugment along time axis with given mask_time_indices
1284
- hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
1285
- elif self.config.mask_time_prob > 0 and self.training:
1286
- mask_time_indices = _compute_mask_indices(
1287
- (batch_size, sequence_length),
1288
- mask_prob=self.config.mask_time_prob,
1289
- mask_length=self.config.mask_time_length,
1290
- attention_mask=attention_mask,
1291
- min_masks=self.config.mask_time_min_masks,
1292
- )
1293
- mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool)
1294
- hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
1295
-
1296
- if self.config.mask_feature_prob > 0 and self.training:
1297
- # generate indices & apply SpecAugment along feature axis
1298
- mask_feature_indices = _compute_mask_indices(
1299
- (batch_size, hidden_size),
1300
- mask_prob=self.config.mask_feature_prob,
1301
- mask_length=self.config.mask_feature_length,
1302
- min_masks=self.config.mask_feature_min_masks,
1303
- )
1304
- mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool)
1305
- mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1)
1306
- hidden_states[mask_feature_indices] = 0
1307
-
1308
- return hidden_states
1309
-
1310
- @add_start_docstrings_to_model_forward(WAV2VEC2_CONFORMER_INPUTS_DOCSTRING)
1311
- @add_code_sample_docstrings(
1312
- checkpoint=_CHECKPOINT_FOR_DOC,
1313
- output_type=Wav2Vec2BaseModelOutput,
1314
- config_class=_CONFIG_FOR_DOC,
1315
- modality="audio",
1316
- expected_output=_EXPECTED_OUTPUT_SHAPE,
1317
- )
1318
- # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2Model.forward with wav2vec2->wav2vec2_conformer
1319
- def forward(
1320
- self,
1321
- input_values: Optional[torch.Tensor],
1322
- attention_mask: Optional[torch.Tensor] = None,
1323
- mask_time_indices: Optional[torch.FloatTensor] = None,
1324
- output_attentions: Optional[bool] = None,
1325
- output_hidden_states: Optional[bool] = None,
1326
- return_dict: Optional[bool] = None,
1327
- ) -> Union[Tuple, Wav2Vec2BaseModelOutput]:
1328
- output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1329
- output_hidden_states = (
1330
- output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1331
- )
1332
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1333
-
1334
- extract_features = self.feature_extractor(input_values)
1335
- extract_features = extract_features.transpose(1, 2)
1336
-
1337
- if attention_mask is not None:
1338
- # compute reduced attention_mask corresponding to feature vectors
1339
- attention_mask = self._get_feature_vector_attention_mask(
1340
- extract_features.shape[1], attention_mask, add_adapter=False
1341
- )
1342
-
1343
- hidden_states, extract_features = self.feature_projection(extract_features)
1344
- hidden_states = self._mask_hidden_states(
1345
- hidden_states, mask_time_indices=mask_time_indices, attention_mask=attention_mask
1346
- )
1347
-
1348
- encoder_outputs = self.encoder(
1349
- hidden_states,
1350
- attention_mask=attention_mask,
1351
- output_attentions=output_attentions,
1352
- output_hidden_states=output_hidden_states,
1353
- return_dict=return_dict,
1354
- )
1355
-
1356
- hidden_states = encoder_outputs[0]
1357
-
1358
- if self.adapter is not None:
1359
- hidden_states = self.adapter(hidden_states)
1360
-
1361
- if not return_dict:
1362
- return (hidden_states, extract_features) + encoder_outputs[1:]
1363
-
1364
- return Wav2Vec2BaseModelOutput(
1365
- last_hidden_state=hidden_states,
1366
- extract_features=extract_features,
1367
- hidden_states=encoder_outputs.hidden_states,
1368
- attentions=encoder_outputs.attentions,
1369
- )
1370
-
1371
-
1372
- @add_start_docstrings(
1373
- """Wav2Vec2Conformer Model with a quantizer and `VQ` head on top.""", WAV2VEC2_CONFORMER_START_DOCSTRING
1374
- )
1375
- class Wav2Vec2ConformerForPreTraining(Wav2Vec2ConformerPreTrainedModel):
1376
- # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForPreTraining.__init__ with Wav2Vec2->Wav2Vec2Conformer,wav2vec2->wav2vec2_conformer
1377
- def __init__(self, config: Wav2Vec2ConformerConfig):
1378
- super().__init__(config)
1379
- self.wav2vec2_conformer = Wav2Vec2ConformerModel(config)
1380
- self.dropout_features = nn.Dropout(config.feat_quantizer_dropout)
1381
-
1382
- self.quantizer = Wav2Vec2ConformerGumbelVectorQuantizer(config)
1383
-
1384
- self.project_hid = nn.Linear(config.hidden_size, config.proj_codevector_dim)
1385
- self.project_q = nn.Linear(config.codevector_dim, config.proj_codevector_dim)
1386
-
1387
- # Initialize weights and apply final processing
1388
- self.post_init()
1389
-
1390
- # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForPreTraining.set_gumbel_temperature
1391
- def set_gumbel_temperature(self, temperature: int):
1392
- """
1393
- Set the Gumbel softmax temperature to a given value. Only necessary for training
1394
- """
1395
- self.quantizer.temperature = temperature
1396
-
1397
- # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForPreTraining.freeze_feature_encoder with wav2vec2->wav2vec2_conformer
1398
- def freeze_feature_encoder(self):
1399
- """
1400
- Calling this function will disable the gradient computation for the feature encoder so that its parameter will
1401
- not be updated during training.
1402
- """
1403
- self.wav2vec2_conformer.feature_extractor._freeze_parameters()
1404
-
1405
- @staticmethod
1406
- # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForPreTraining.compute_contrastive_logits
1407
- def compute_contrastive_logits(
1408
- target_features: torch.FloatTensor,
1409
- negative_features: torch.FloatTensor,
1410
- predicted_features: torch.FloatTensor,
1411
- temperature: int = 0.1,
1412
- ):
1413
- """
1414
- Compute logits for the contrastive loss using cosine similarity as the distance measure between
1415
- `[positive_feature, negative_features]` and `[predicted_features]`. Additionally, temperature can be applied.
1416
- """
1417
- target_features = torch.cat([target_features, negative_features], dim=0)
1418
-
1419
- logits = torch.cosine_similarity(predicted_features.float(), target_features.float(), dim=-1).type_as(
1420
- target_features
1421
- )
1422
-
1423
- # apply temperature
1424
- logits = logits / temperature
1425
- return logits
1426
-
1427
- @add_start_docstrings_to_model_forward(WAV2VEC2_CONFORMER_INPUTS_DOCSTRING)
1428
- @replace_return_docstrings(output_type=Wav2Vec2ConformerForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
1429
- # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForPreTraining.forward with Wav2Vec2->Wav2Vec2Conformer,wav2vec2->wav2vec2_conformer,wav2vec2_conformer-base->wav2vec2-conformer-rel-pos-large
1430
- def forward(
1431
- self,
1432
- input_values: Optional[torch.Tensor],
1433
- attention_mask: Optional[torch.Tensor] = None,
1434
- mask_time_indices: Optional[torch.BoolTensor] = None,
1435
- sampled_negative_indices: Optional[torch.BoolTensor] = None,
1436
- output_attentions: Optional[bool] = None,
1437
- output_hidden_states: Optional[bool] = None,
1438
- return_dict: Optional[bool] = None,
1439
- ) -> Union[Tuple, Wav2Vec2ConformerForPreTrainingOutput]:
1440
- r"""
1441
- mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
1442
- Indices to mask extracted features for contrastive loss. When in training mode, the model learns to predict
1443
- masked extracted features in *config.proj_codevector_dim* space.
1444
- sampled_negative_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_negatives)`, *optional*):
1445
- Indices indicating which quantized target vectors are used as negative sampled vectors in contrastive loss.
1446
- Required input for pre-training.
1447
-
1448
- Returns:
1449
-
1450
- Example:
1451
-
1452
- ```python
1453
- >>> import torch
1454
- >>> from transformers import AutoFeatureExtractor, Wav2Vec2ConformerForPreTraining
1455
- >>> from transformers.models.wav2vec2_conformer.modeling_wav2vec2_conformer import (
1456
- ... _compute_mask_indices,
1457
- ... _sample_negative_indices,
1458
- ... )
1459
- >>> from datasets import load_dataset
1460
-
1461
- >>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-conformer-rel-pos-large")
1462
- >>> model = Wav2Vec2ConformerForPreTraining.from_pretrained("facebook/wav2vec2-conformer-rel-pos-large")
1463
-
1464
- >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
1465
- >>> input_values = feature_extractor(ds[0]["audio"]["array"], return_tensors="pt").input_values # Batch size 1
1466
-
1467
- >>> # compute masked indices
1468
- >>> batch_size, raw_sequence_length = input_values.shape
1469
- >>> sequence_length = model._get_feat_extract_output_lengths(raw_sequence_length).item()
1470
- >>> mask_time_indices = _compute_mask_indices(
1471
- ... shape=(batch_size, sequence_length), mask_prob=0.2, mask_length=2
1472
- ... )
1473
- >>> sampled_negative_indices = _sample_negative_indices(
1474
- ... features_shape=(batch_size, sequence_length),
1475
- ... num_negatives=model.config.num_negatives,
1476
- ... mask_time_indices=mask_time_indices,
1477
- ... )
1478
- >>> mask_time_indices = torch.tensor(data=mask_time_indices, device=input_values.device, dtype=torch.long)
1479
- >>> sampled_negative_indices = torch.tensor(
1480
- ... data=sampled_negative_indices, device=input_values.device, dtype=torch.long
1481
- ... )
1482
-
1483
- >>> with torch.no_grad():
1484
- ... outputs = model(input_values, mask_time_indices=mask_time_indices)
1485
-
1486
- >>> # compute cosine similarity between predicted (=projected_states) and target (=projected_quantized_states)
1487
- >>> cosine_sim = torch.cosine_similarity(outputs.projected_states, outputs.projected_quantized_states, dim=-1)
1488
-
1489
- >>> # show that cosine similarity is much higher than random
1490
- >>> cosine_sim[mask_time_indices.to(torch.bool)].mean() > 0.5
1491
- tensor(True)
1492
-
1493
- >>> # for contrastive loss training model should be put into train mode
1494
- >>> model = model.train()
1495
- >>> loss = model(
1496
- ... input_values, mask_time_indices=mask_time_indices, sampled_negative_indices=sampled_negative_indices
1497
- ... ).loss
1498
- ```"""
1499
-
1500
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1501
-
1502
- if mask_time_indices is not None:
1503
- mask_time_indices = mask_time_indices.to(torch.bool)
1504
-
1505
- outputs = self.wav2vec2_conformer(
1506
- input_values,
1507
- attention_mask=attention_mask,
1508
- output_attentions=output_attentions,
1509
- output_hidden_states=output_hidden_states,
1510
- mask_time_indices=mask_time_indices,
1511
- return_dict=return_dict,
1512
- )
1513
-
1514
- # 1. project all transformed features (including masked) to final vq dim
1515
- transformer_features = self.project_hid(outputs[0])
1516
-
1517
- # 2. quantize all (unmasked) extracted features and project to final vq dim
1518
- extract_features = self.dropout_features(outputs[1])
1519
-
1520
- if attention_mask is not None:
1521
- # compute reduced attention_mask corresponding to feature vectors
1522
- attention_mask = self._get_feature_vector_attention_mask(
1523
- extract_features.shape[1], attention_mask, add_adapter=False
1524
- )
1525
-
1526
- quantized_features, codevector_perplexity = self.quantizer(
1527
- extract_features, mask_time_indices=mask_time_indices
1528
- )
1529
- quantized_features = self.project_q(quantized_features)
1530
-
1531
- loss = contrastive_loss = diversity_loss = None
1532
- if sampled_negative_indices is not None:
1533
- batch_size, sequence_length, hidden_size = quantized_features.shape
1534
-
1535
- # for training, we sample negatives
1536
- # 3. sample K negatives (distractors) quantized states for contrastive loss
1537
- # if attention_mask is passed, make sure that padded feature vectors cannot be sampled
1538
- # sample negative quantized vectors BTC => (BxT)C
1539
- negative_quantized_features = quantized_features.view(-1, hidden_size)[
1540
- sampled_negative_indices.long().view(-1)
1541
- ]
1542
- negative_quantized_features = negative_quantized_features.view(
1543
- batch_size, sequence_length, -1, hidden_size
1544
- ).permute(2, 0, 1, 3)
1545
-
1546
- # 4. compute logits, corresponding to `logs = sim(c_t, [q_t, \sim{q}_t]) / \kappa`
1547
- # of equation (3) in https://arxiv.org/pdf/2006.11477.pdf
1548
- logits = self.compute_contrastive_logits(
1549
- quantized_features[None, :],
1550
- negative_quantized_features,
1551
- transformer_features,
1552
- self.config.contrastive_logits_temperature,
1553
- )
1554
-
1555
- # 5. if a negative vector is identical to the positive (i.e. when codebook utilization is low),
1556
- # its cosine similarity will be masked
1557
- neg_is_pos = (quantized_features == negative_quantized_features).all(-1)
1558
-
1559
- if neg_is_pos.any():
1560
- logits[1:][neg_is_pos] = float("-inf")
1561
-
1562
- # 6. compute contrastive loss \mathbf{L}_m = cross_entropy(logs) =
1563
- # -log(exp(sim(c_t, q_t)/\kappa) / \sum_{\sim{q}} exp(sim(c_t, \sim{q})/\kappa))
1564
- logits = logits.transpose(0, 2).reshape(-1, logits.size(0))
1565
- target = ((1 - mask_time_indices.long()) * -100).transpose(0, 1).flatten()
1566
-
1567
- contrastive_loss = nn.functional.cross_entropy(logits.float(), target, reduction="sum")
1568
- # 7. compute diversity loss: \mathbf{L}_d
1569
- num_codevectors = self.config.num_codevectors_per_group * self.config.num_codevector_groups
1570
- diversity_loss = ((num_codevectors - codevector_perplexity) / num_codevectors) * mask_time_indices.sum()
1571
-
1572
- # 8. \mathbf{L} = \mathbf{L}_m + \alpha * \mathbf{L}_d
1573
- loss = contrastive_loss + self.config.diversity_loss_weight * diversity_loss
1574
-
1575
- if not return_dict:
1576
- if loss is not None:
1577
- return (loss, transformer_features, quantized_features, codevector_perplexity) + outputs[2:]
1578
- return (transformer_features, quantized_features, codevector_perplexity) + outputs[2:]
1579
-
1580
- return Wav2Vec2ConformerForPreTrainingOutput(
1581
- loss=loss,
1582
- projected_states=transformer_features,
1583
- projected_quantized_states=quantized_features,
1584
- codevector_perplexity=codevector_perplexity,
1585
- hidden_states=outputs.hidden_states,
1586
- attentions=outputs.attentions,
1587
- contrastive_loss=contrastive_loss,
1588
- diversity_loss=diversity_loss,
1589
- )
1590
-
1591
-
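Stated compactly, the pre-training objective computed in the forward above is the following; this is only a restatement of the code comments (equation 3 of the wav2vec 2.0 paper), not new behaviour:

```latex
% Contrastive term per masked step t: the true quantized target q_t against K sampled
% distractors, cosine similarity with temperature \kappa, summed over masked steps.
\mathcal{L}_m = -\sum_{t \in \mathrm{masked}} \log
    \frac{\exp\!\big(\mathrm{sim}(c_t, q_t)/\kappa\big)}
         {\sum_{\tilde{q} \in \{q_t\} \cup Q_t^{-}} \exp\!\big(\mathrm{sim}(c_t, \tilde{q})/\kappa\big)}

% Diversity term over G codebooks with V entries each (GV = num_codevectors);
% the code above additionally scales it by the number of masked steps.
\mathcal{L}_d = \frac{GV - \mathrm{perplexity}}{GV}

% Total loss, with \alpha = config.diversity_loss_weight
\mathcal{L} = \mathcal{L}_m + \alpha\,\mathcal{L}_d
```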
1592
- @add_start_docstrings(
1593
- """Wav2Vec2Conformer Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).""",
1594
- WAV2VEC2_CONFORMER_START_DOCSTRING,
1595
- )
1596
- class Wav2Vec2ConformerForCTC(Wav2Vec2ConformerPreTrainedModel):
1597
- # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForCTC.__init__ with Wav2Vec2->Wav2Vec2Conformer,wav2vec2->wav2vec2_conformer
1598
- def __init__(self, config):
1599
- super().__init__(config)
1600
-
1601
- self.wav2vec2_conformer = Wav2Vec2ConformerModel(config)
1602
- self.dropout = nn.Dropout(config.final_dropout)
1603
-
1604
- if config.vocab_size is None:
1605
- raise ValueError(
1606
- f"You are trying to instantiate {self.__class__} with a configuration that "
1607
- "does not define the vocabulary size of the language model head. Please "
1608
- "instantiate the model as follows: `Wav2Vec2ConformerForCTC.from_pretrained(..., vocab_size=vocab_size)`. "
1609
- "or define `vocab_size` of your model's configuration."
1610
- )
1611
- output_hidden_size = (
1612
- config.output_hidden_size if hasattr(config, "add_adapter") and config.add_adapter else config.hidden_size
1613
- )
1614
- self.lm_head = nn.Linear(output_hidden_size, config.vocab_size)
1615
-
1616
- # Initialize weights and apply final processing
1617
- self.post_init()
1618
-
1619
- # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForCTC.freeze_feature_encoder with wav2vec2->wav2vec2_conformer
1620
- def freeze_feature_encoder(self):
1621
- """
1622
- Calling this function will disable the gradient computation for the feature encoder so that its parameter will
1623
- not be updated during training.
1624
- """
1625
- self.wav2vec2_conformer.feature_extractor._freeze_parameters()
1626
-
1627
- @add_start_docstrings_to_model_forward(WAV2VEC2_CONFORMER_INPUTS_DOCSTRING)
1628
- @add_code_sample_docstrings(
1629
- checkpoint=_CHECKPOINT_FOR_DOC,
1630
- output_type=CausalLMOutput,
1631
- config_class=_CONFIG_FOR_DOC,
1632
- expected_output=_CTC_EXPECTED_OUTPUT,
1633
- expected_loss=_CTC_EXPECTED_LOSS,
1634
- )
1635
- # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForCTC.forward with Wav2Vec2->Wav2Vec2Conformer,wav2vec2->wav2vec2_conformer
1636
- def forward(
1637
- self,
1638
- input_values: Optional[torch.Tensor],
1639
- attention_mask: Optional[torch.Tensor] = None,
1640
- output_attentions: Optional[bool] = None,
1641
- output_hidden_states: Optional[bool] = None,
1642
- return_dict: Optional[bool] = None,
1643
- labels: Optional[torch.Tensor] = None,
1644
- ) -> Union[Tuple, CausalLMOutput]:
1645
- r"""
1646
- labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
1647
- Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to
1648
- the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`.
1649
- All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
1650
- config.vocab_size - 1]`.
1651
- """
1652
-
1653
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1654
-
1655
- outputs = self.wav2vec2_conformer(
1656
- input_values,
1657
- attention_mask=attention_mask,
1658
- output_attentions=output_attentions,
1659
- output_hidden_states=output_hidden_states,
1660
- return_dict=return_dict,
1661
- )
1662
-
1663
- hidden_states = outputs[0]
1664
- hidden_states = self.dropout(hidden_states)
1665
-
1666
- logits = self.lm_head(hidden_states)
1667
-
1668
- loss = None
1669
- if labels is not None:
1670
- if labels.max() >= self.config.vocab_size:
1671
- raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}")
1672
-
1673
- # retrieve loss input_lengths from attention_mask
1674
- attention_mask = (
1675
- attention_mask if attention_mask is not None else torch.ones_like(input_values, dtype=torch.long)
1676
- )
1677
- input_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1)).to(torch.long)
1678
-
1679
- # assuming that padded tokens are filled with -100
1680
- # when not being attended to
1681
- labels_mask = labels >= 0
1682
- target_lengths = labels_mask.sum(-1)
1683
- flattened_targets = labels.masked_select(labels_mask)
1684
-
1685
- # ctc_loss doesn't support fp16
1686
- log_probs = nn.functional.log_softmax(logits, dim=-1, dtype=torch.float32).transpose(0, 1)
1687
-
1688
- with torch.backends.cudnn.flags(enabled=False):
1689
- loss = nn.functional.ctc_loss(
1690
- log_probs,
1691
- flattened_targets,
1692
- input_lengths,
1693
- target_lengths,
1694
- blank=self.config.pad_token_id,
1695
- reduction=self.config.ctc_loss_reduction,
1696
- zero_infinity=self.config.ctc_zero_infinity,
1697
- )
1698
-
1699
- if not return_dict:
1700
- output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
1701
- return ((loss,) + output) if loss is not None else output
1702
-
1703
- return CausalLMOutput(
1704
- loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions
1705
- )
1706
-
1707
-
1708
- @add_start_docstrings(
1709
- """
1710
- Wav2Vec2Conformer Model with a sequence classification head on top (a linear layer over the pooled output) for
1711
- tasks like SUPERB Keyword Spotting.
1712
- """,
1713
- WAV2VEC2_CONFORMER_START_DOCSTRING,
1714
- )
1715
- class Wav2Vec2ConformerForSequenceClassification(Wav2Vec2ConformerPreTrainedModel):
1716
- # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForSequenceClassification.__init__ with Wav2Vec2->Wav2Vec2Conformer,wav2vec2->wav2vec2_conformer
1717
- def __init__(self, config):
1718
- super().__init__(config)
1719
-
1720
- if hasattr(config, "add_adapter") and config.add_adapter:
1721
- raise ValueError(
1722
- "Sequence classification does not support the use of Wav2Vec2Conformer adapters (config.add_adapter=True)"
1723
- )
1724
- self.wav2vec2_conformer = Wav2Vec2ConformerModel(config)
1725
- num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings
1726
- if config.use_weighted_layer_sum:
1727
- self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers)
1728
- self.projector = nn.Linear(config.hidden_size, config.classifier_proj_size)
1729
- self.classifier = nn.Linear(config.classifier_proj_size, config.num_labels)
1730
-
1731
- # Initialize weights and apply final processing
1732
- self.post_init()
1733
-
1734
- # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForSequenceClassification.freeze_feature_encoder with wav2vec2->wav2vec2_conformer
1735
- def freeze_feature_encoder(self):
1736
- """
1737
- Calling this function will disable the gradient computation for the feature encoder so that its parameter will
1738
- not be updated during training.
1739
- """
1740
- self.wav2vec2_conformer.feature_extractor._freeze_parameters()
1741
-
1742
- def freeze_base_model(self):
1743
- """
1744
- Calling this function will disable the gradient computation for the base model so that its parameters will not
1745
- be updated during training. Only the classification head will be updated.
1746
- """
1747
- for param in self.wav2vec2_conformer.parameters():
1748
- param.requires_grad = False
1749
-
1750
- @add_start_docstrings_to_model_forward(WAV2VEC2_CONFORMER_INPUTS_DOCSTRING)
1751
- @add_code_sample_docstrings(
1752
- checkpoint=_CHECKPOINT_FOR_DOC,
1753
- output_type=SequenceClassifierOutput,
1754
- config_class=_CONFIG_FOR_DOC,
1755
- modality="audio",
1756
- )
1757
- # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForSequenceClassification.forward with Wav2Vec2->Wav2Vec2Conformer,wav2vec2->wav2vec2_conformer,WAV_2_VEC_2->WAV2VEC2_CONFORMER
1758
- def forward(
1759
- self,
1760
- input_values: Optional[torch.Tensor],
1761
- attention_mask: Optional[torch.Tensor] = None,
1762
- output_attentions: Optional[bool] = None,
1763
- output_hidden_states: Optional[bool] = None,
1764
- return_dict: Optional[bool] = None,
1765
- labels: Optional[torch.Tensor] = None,
1766
- ) -> Union[Tuple, SequenceClassifierOutput]:
1767
- r"""
1768
- labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1769
- Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
1770
- config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
1771
- `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
1772
- """
1773
-
1774
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1775
- output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states
1776
-
1777
- outputs = self.wav2vec2_conformer(
1778
- input_values,
1779
- attention_mask=attention_mask,
1780
- output_attentions=output_attentions,
1781
- output_hidden_states=output_hidden_states,
1782
- return_dict=return_dict,
1783
- )
1784
-
1785
- if self.config.use_weighted_layer_sum:
1786
- hidden_states = outputs[_HIDDEN_STATES_START_POSITION]
1787
- hidden_states = torch.stack(hidden_states, dim=1)
1788
- norm_weights = nn.functional.softmax(self.layer_weights, dim=-1)
1789
- hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1)
1790
- else:
1791
- hidden_states = outputs[0]
1792
-
1793
- hidden_states = self.projector(hidden_states)
1794
- if attention_mask is None:
1795
- pooled_output = hidden_states.mean(dim=1)
1796
- else:
1797
- padding_mask = self._get_feature_vector_attention_mask(hidden_states.shape[1], attention_mask)
1798
- hidden_states[~padding_mask] = 0.0
1799
- pooled_output = hidden_states.sum(dim=1) / padding_mask.sum(dim=1).view(-1, 1)
1800
-
1801
- logits = self.classifier(pooled_output)
1802
-
1803
- loss = None
1804
- if labels is not None:
1805
- loss_fct = CrossEntropyLoss()
1806
- loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))
1807
-
1808
- if not return_dict:
1809
- output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
1810
- return ((loss,) + output) if loss is not None else output
1811
-
1812
- return SequenceClassifierOutput(
1813
- loss=loss,
1814
- logits=logits,
1815
- hidden_states=outputs.hidden_states,
1816
- attentions=outputs.attentions,
1817
- )
1818
-
1819
-
1820
- @add_start_docstrings(
1821
- """
1822
- Wav2Vec2Conformer Model with a frame classification head on top for tasks like Speaker Diarization.
1823
- """,
1824
- WAV2VEC2_CONFORMER_START_DOCSTRING,
1825
- )
1826
- class Wav2Vec2ConformerForAudioFrameClassification(Wav2Vec2ConformerPreTrainedModel):
1827
- # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForAudioFrameClassification.__init__ with Wav2Vec2->Wav2Vec2Conformer,wav2vec2->wav2vec2_conformer,WAV_2_VEC_2->WAV2VEC2_CONFORMER
1828
- def __init__(self, config):
1829
- super().__init__(config)
1830
-
1831
- if hasattr(config, "add_adapter") and config.add_adapter:
1832
- raise ValueError(
1833
- "Audio frame classification does not support the use of Wav2Vec2Conformer adapters (config.add_adapter=True)"
1834
- )
1835
- self.wav2vec2_conformer = Wav2Vec2ConformerModel(config)
1836
- num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings
1837
- if config.use_weighted_layer_sum:
1838
- self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers)
1839
- self.classifier = nn.Linear(config.hidden_size, config.num_labels)
1840
- self.num_labels = config.num_labels
1841
-
1842
- self.init_weights()
1843
-
1844
- # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForAudioFrameClassification.freeze_feature_encoder with wav2vec2->wav2vec2_conformer
1845
- def freeze_feature_encoder(self):
1846
- """
1847
- Calling this function will disable the gradient computation for the feature encoder so that its parameter will
1848
- not be updated during training.
1849
- """
1850
- self.wav2vec2_conformer.feature_extractor._freeze_parameters()
1851
-
1852
- # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForAudioFrameClassification.freeze_base_model with wav2vec2->wav2vec2_conformer
1853
- def freeze_base_model(self):
1854
- """
1855
- Calling this function will disable the gradient computation for the base model so that its parameters will not
1856
- be updated during training. Only the classification head will be updated.
1857
- """
1858
- for param in self.wav2vec2_conformer.parameters():
1859
- param.requires_grad = False
1860
-
1861
- @add_start_docstrings_to_model_forward(WAV2VEC2_CONFORMER_INPUTS_DOCSTRING)
1862
- @add_code_sample_docstrings(
1863
- checkpoint=_CHECKPOINT_FOR_DOC,
1864
- output_type=TokenClassifierOutput,
1865
- config_class=_CONFIG_FOR_DOC,
1866
- modality="audio",
1867
- )
1868
- # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForAudioFrameClassification.forward with wav2vec2->wav2vec2_conformer
1869
- def forward(
1870
- self,
1871
- input_values: Optional[torch.Tensor],
1872
- attention_mask: Optional[torch.Tensor] = None,
1873
- labels: Optional[torch.Tensor] = None,
1874
- output_attentions: Optional[bool] = None,
1875
- output_hidden_states: Optional[bool] = None,
1876
- return_dict: Optional[bool] = None,
1877
- ) -> Union[Tuple, TokenClassifierOutput]:
1878
- r"""
1879
- labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1880
- Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
1881
- config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
1882
- `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
1883
- """
1884
-
1885
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1886
- output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states
1887
-
1888
- outputs = self.wav2vec2_conformer(
1889
- input_values,
1890
- attention_mask=attention_mask,
1891
- output_attentions=output_attentions,
1892
- output_hidden_states=output_hidden_states,
1893
- return_dict=return_dict,
1894
- )
1895
-
1896
- if self.config.use_weighted_layer_sum:
1897
- hidden_states = outputs[_HIDDEN_STATES_START_POSITION]
1898
- hidden_states = torch.stack(hidden_states, dim=1)
1899
- norm_weights = nn.functional.softmax(self.layer_weights, dim=-1)
1900
- hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1)
1901
- else:
1902
- hidden_states = outputs[0]
1903
-
1904
- logits = self.classifier(hidden_states)
1905
-
1906
- loss = None
1907
- if labels is not None:
1908
- loss_fct = CrossEntropyLoss()
1909
- loss = loss_fct(logits.view(-1, self.num_labels), torch.argmax(labels.view(-1, self.num_labels), axis=1))
1910
-
1911
- if not return_dict:
1912
- output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
1913
- return output
1914
-
1915
- return TokenClassifierOutput(
1916
- loss=loss,
1917
- logits=logits,
1918
- hidden_states=outputs.hidden_states,
1919
- attentions=outputs.attentions,
1920
- )
1921
-
1922
-
1923
- # Copied from transformers.models.wav2vec2.modeling_wav2vec2.AMSoftmaxLoss
1924
- class AMSoftmaxLoss(nn.Module):
1925
- def __init__(self, input_dim, num_labels, scale=30.0, margin=0.4):
1926
- super(AMSoftmaxLoss, self).__init__()
1927
- self.scale = scale
1928
- self.margin = margin
1929
- self.num_labels = num_labels
1930
- self.weight = nn.Parameter(torch.randn(input_dim, num_labels), requires_grad=True)
1931
- self.loss = nn.CrossEntropyLoss()
1932
-
1933
- def forward(self, hidden_states, labels):
1934
- labels = labels.flatten()
1935
- weight = nn.functional.normalize(self.weight, dim=0)
1936
- hidden_states = nn.functional.normalize(hidden_states, dim=1)
1937
- cos_theta = torch.mm(hidden_states, weight)
1938
- psi = cos_theta - self.margin
1939
-
1940
- onehot = nn.functional.one_hot(labels, self.num_labels)
1941
- logits = self.scale * torch.where(onehot.bool(), psi, cos_theta)
1942
- loss = self.loss(logits, labels)
1943
-
1944
- return loss
1945
-
1946
-
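For reference, the AM-Softmax objective implemented by the class above can be written as follows, with s the scale (default 30.0), m the margin (default 0.4), and the cosines taken between the L2-normalised embeddings and class weights; the margin is applied only to the target class y before a standard cross-entropy:

```latex
\mathcal{L}_{\mathrm{AMS}} = -\log
    \frac{e^{\,s\,(\cos\theta_{y} - m)}}
         {e^{\,s\,(\cos\theta_{y} - m)} + \sum_{j \neq y} e^{\,s\,\cos\theta_{j}}}
```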
1947
- # Copied from transformers.models.wav2vec2.modeling_wav2vec2.TDNNLayer
1948
- class TDNNLayer(nn.Module):
1949
- def __init__(self, config, layer_id=0):
1950
- super().__init__()
1951
- self.in_conv_dim = config.tdnn_dim[layer_id - 1] if layer_id > 0 else config.tdnn_dim[layer_id]
1952
- self.out_conv_dim = config.tdnn_dim[layer_id]
1953
- self.kernel_size = config.tdnn_kernel[layer_id]
1954
- self.dilation = config.tdnn_dilation[layer_id]
1955
-
1956
- self.kernel = nn.Linear(self.in_conv_dim * self.kernel_size, self.out_conv_dim)
1957
- self.activation = nn.ReLU()
1958
-
1959
- def forward(self, hidden_states):
1960
- hidden_states = hidden_states.unsqueeze(1)
1961
- hidden_states = nn.functional.unfold(
1962
- hidden_states,
1963
- (self.kernel_size, self.in_conv_dim),
1964
- stride=(1, self.in_conv_dim),
1965
- dilation=(self.dilation, 1),
1966
- )
1967
- hidden_states = hidden_states.transpose(1, 2)
1968
- hidden_states = self.kernel(hidden_states)
1969
-
1970
- hidden_states = self.activation(hidden_states)
1971
- return hidden_states
1972
-
1973
-
1974
- @add_start_docstrings(
1975
- """
1976
- Wav2Vec2Conformer Model with an XVector feature extraction head on top for tasks like Speaker Verification.
1977
- """,
1978
- WAV2VEC2_CONFORMER_START_DOCSTRING,
1979
- )
1980
- class Wav2Vec2ConformerForXVector(Wav2Vec2ConformerPreTrainedModel):
1981
- def __init__(self, config):
1982
- super().__init__(config)
1983
-
1984
- self.wav2vec2_conformer = Wav2Vec2ConformerModel(config)
1985
- num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings
1986
- if config.use_weighted_layer_sum:
1987
- self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers)
1988
- self.projector = nn.Linear(config.hidden_size, config.tdnn_dim[0])
1989
-
1990
- tdnn_layers = [TDNNLayer(config, i) for i in range(len(config.tdnn_dim))]
1991
- self.tdnn = nn.ModuleList(tdnn_layers)
1992
-
1993
- self.feature_extractor = nn.Linear(config.tdnn_dim[-1] * 2, config.xvector_output_dim)
1994
- self.classifier = nn.Linear(config.xvector_output_dim, config.xvector_output_dim)
1995
-
1996
- self.objective = AMSoftmaxLoss(config.xvector_output_dim, config.num_labels)
1997
-
1998
- self.init_weights()
1999
-
2000
- # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForXVector.freeze_feature_encoder with wav2vec2->wav2vec2_conformer
2001
- def freeze_feature_encoder(self):
2002
- """
2003
- Calling this function will disable the gradient computation for the feature encoder so that its parameter will
2004
- not be updated during training.
2005
- """
2006
- self.wav2vec2_conformer.feature_extractor._freeze_parameters()
2007
-
2008
- # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForXVector.freeze_base_model with wav2vec2->wav2vec2_conformer
2009
- def freeze_base_model(self):
2010
- """
2011
- Calling this function will disable the gradient computation for the base model so that its parameters will not
2012
- be updated during training. Only the classification head will be updated.
2013
- """
2014
- for param in self.wav2vec2_conformer.parameters():
2015
- param.requires_grad = False
2016
-
2017
- # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForXVector._get_tdnn_output_lengths with wav2vec2->wav2vec2_conformer
2018
- def _get_tdnn_output_lengths(self, input_lengths: Union[torch.LongTensor, int]):
2019
- """
2020
- Computes the output length of the TDNN layers
2021
- """
2022
-
2023
- def _conv_out_length(input_length, kernel_size, stride):
2024
- # 1D convolutional layer output length formula taken
2025
- # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
2026
- return (input_length - kernel_size) // stride + 1
2027
-
2028
- for kernel_size in self.config.tdnn_kernel:
2029
- input_lengths = _conv_out_length(input_lengths, kernel_size, 1)
2030
-
2031
- return input_lengths
2032
-
2033
- @add_start_docstrings_to_model_forward(WAV2VEC2_CONFORMER_INPUTS_DOCSTRING)
2034
- @add_code_sample_docstrings(
2035
- checkpoint=_CHECKPOINT_FOR_DOC,
2036
- output_type=XVectorOutput,
2037
- config_class=_CONFIG_FOR_DOC,
2038
- modality="audio",
2039
- )
2040
- # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForXVector.forward with Wav2Vec2->Wav2Vec2Conformer,wav2vec2->wav2vec2_conformer,WAV_2_VEC_2->WAV2VEC2_CONFORMER
2041
- def forward(
2042
- self,
2043
- input_values: Optional[torch.Tensor],
2044
- attention_mask: Optional[torch.Tensor] = None,
2045
- output_attentions: Optional[bool] = None,
2046
- output_hidden_states: Optional[bool] = None,
2047
- return_dict: Optional[bool] = None,
2048
- labels: Optional[torch.Tensor] = None,
2049
- ) -> Union[Tuple, XVectorOutput]:
2050
- r"""
2051
- labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
2052
- Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
2053
- config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
2054
- `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
2055
- """
2056
-
2057
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
2058
- output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states
2059
-
2060
- outputs = self.wav2vec2_conformer(
2061
- input_values,
2062
- attention_mask=attention_mask,
2063
- output_attentions=output_attentions,
2064
- output_hidden_states=output_hidden_states,
2065
- return_dict=return_dict,
2066
- )
2067
-
2068
- if self.config.use_weighted_layer_sum:
2069
- hidden_states = outputs[_HIDDEN_STATES_START_POSITION]
2070
- hidden_states = torch.stack(hidden_states, dim=1)
2071
- norm_weights = nn.functional.softmax(self.layer_weights, dim=-1)
2072
- hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1)
2073
- else:
2074
- hidden_states = outputs[0]
2075
-
2076
- hidden_states = self.projector(hidden_states)
2077
-
2078
- for tdnn_layer in self.tdnn:
2079
- hidden_states = tdnn_layer(hidden_states)
2080
-
2081
- # Statistic Pooling
2082
- if attention_mask is None:
2083
- mean_features = hidden_states.mean(dim=1)
2084
- std_features = hidden_states.std(dim=1)
2085
- else:
2086
- feat_extract_output_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(dim=1))
2087
- tdnn_output_lengths = self._get_tdnn_output_lengths(feat_extract_output_lengths)
2088
- mean_features = []
2089
- std_features = []
2090
- for i, length in enumerate(tdnn_output_lengths):
2091
- mean_features.append(hidden_states[i, :length].mean(dim=0))
2092
- std_features.append(hidden_states[i, :length].std(dim=0))
2093
- mean_features = torch.stack(mean_features)
2094
- std_features = torch.stack(std_features)
2095
- statistic_pooling = torch.cat([mean_features, std_features], dim=-1)
2096
-
2097
- output_embeddings = self.feature_extractor(statistic_pooling)
2098
- logits = self.classifier(output_embeddings)
2099
-
2100
- loss = None
2101
- if labels is not None:
2102
- loss = self.objective(logits, labels)
2103
-
2104
- if not return_dict:
2105
- output = (logits, output_embeddings) + outputs[_HIDDEN_STATES_START_POSITION:]
2106
- return ((loss,) + output) if loss is not None else output
2107
-
2108
- return XVectorOutput(
2109
- loss=loss,
2110
- logits=logits,
2111
- embeddings=output_embeddings,
2112
- hidden_states=outputs.hidden_states,
2113
- attentions=outputs.attentions,
2114
- )
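The block above closes the deleted vendored copy of Hugging Face's `modeling_wav2vec2_conformer.py`. For anyone auditing this removal, the same heads remain available upstream; a minimal greedy-CTC decoding sketch against the public `transformers` API looks like the following. The checkpoint id is an assumption (one of the publicly fine-tuned conformer checkpoints), not something pinned by this repository:

```python
# Minimal CTC inference sketch using the upstream transformers classes mirrored by
# the deleted file. The checkpoint id below is an assumption.
import torch
from datasets import load_dataset
from transformers import AutoProcessor, Wav2Vec2ConformerForCTC

ckpt = "facebook/wav2vec2-conformer-rope-large-960h-ft"  # assumed public checkpoint
processor = AutoProcessor.from_pretrained(ckpt)
model = Wav2Vec2ConformerForCTC.from_pretrained(ckpt)

ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
inputs = processor(ds[0]["audio"]["array"], sampling_rate=16_000, return_tensors="pt")

with torch.no_grad():
    logits = model(inputs.input_values).logits   # (batch, frames, vocab_size)

pred_ids = torch.argmax(logits, dim=-1)          # greedy CTC path
print(processor.batch_decode(pred_ids))          # collapse repeats/blanks into text
```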
 
MuCodec/muq_dev/muq_fairseq/models/muq/modules/random_quantizer.py DELETED
@@ -1,68 +0,0 @@
- import torch
- from torch import nn, einsum
- from einops import rearrange
-
-
- class RandomProjectionQuantizer(nn.Module):
-     """
-     Random projection and codebook lookup module
-
-     Some code is borrowed from:
-     https://github.com/lucidrains/vector-quantize-pytorch/blob/master/vector_quantize_pytorch/random_projection_quantizer.py
-     But I did normalization using pre-computed global mean & variance instead of using layer norm.
-     """
-
-     def __init__(
-         self,
-         input_dim,
-         codebook_dim,
-         codebook_size,
-         seed=142,
-     ):
-         super().__init__()
-
-         # random seed
-         torch.manual_seed(seed)
-
-         # randomly initialized projection
-         random_projection = torch.empty(input_dim, codebook_dim)
-         nn.init.xavier_normal_(random_projection)
-         self.register_buffer("random_projection", random_projection)
-
-         # randomly initialized codebook
-         codebook = torch.empty(codebook_size, codebook_dim)
-         nn.init.normal_(codebook)
-         self.register_buffer("codebook", codebook)
-
-     def codebook_lookup(self, x):
-         # reshape
-         b = x.shape[0]
-         x = rearrange(x, "b n e -> (b n) e")
-
-         # L2 normalization
-         normalized_x = nn.functional.normalize(x, dim=1, p=2)
-         normalized_codebook = nn.functional.normalize(self.codebook, dim=1, p=2)
-
-         # compute distances
-         distances = torch.cdist(normalized_codebook, normalized_x)
-
-         # get nearest
-         nearest_indices = torch.argmin(distances, dim=0)
-
-         # reshape
-         xq = rearrange(nearest_indices, "(b n) -> b n", b=b)
-
-         return xq
-
-     @torch.no_grad()
-     def forward(self, x):
-         # always eval
-         self.eval()
-
-         # random projection [batch, length, input_dim] -> [batch, length, codebook_dim]
-         x = einsum("b n d, d e -> b n e", x, self.random_projection)
-
-         # codebook lookup
-         xq = self.codebook_lookup(x)
-
-         return xq
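Since the quantizer above is part of this deletion, here is a short self-contained usage sketch of the interface it exposed; the dimensions are illustrative (they follow the defaults in the deleted MuQ config further down):

```python
# Usage sketch for the deleted RandomProjectionQuantizer: frame features in,
# discrete codebook indices out. Dimensions are illustrative only.
import torch

quantizer = RandomProjectionQuantizer(
    input_dim=1024,      # per-frame feature size
    codebook_dim=16,     # projection dimension used for the nearest-neighbour lookup
    codebook_size=4096,  # number of discrete targets
)

features = torch.randn(2, 250, 1024)   # (batch, frames, input_dim)
tokens = quantizer(features)           # (batch, frames) integer codebook indices
print(tokens.shape)                    # torch.Size([2, 250])
```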
 
MuCodec/muq_dev/muq_fairseq/models/muq/muq_model.py DELETED
@@ -1,139 +0,0 @@
1
- try:
2
- from .model.muq import MuQ
3
- except:
4
- import sys, os
5
- sys.path.append(os.path.dirname(os.path.abspath(__file__)))
6
- from model.muq import MuQ
7
- try:
8
- from fairseq.fairseq.dataclass import FairseqDataclass
9
- from fairseq.fairseq.models import BaseFairseqModel, register_model
10
- from fairseq.fairseq.tasks.fairseq_task import FairseqTask
11
- except:
12
- from fairseq.dataclass import FairseqDataclass
13
- from fairseq.models import BaseFairseqModel, register_model
14
- from fairseq.tasks.fairseq_task import FairseqTask
15
-
16
- from dataclasses import dataclass, field
17
- from typing import List, Tuple, Optional
18
- import torch
19
-
20
- from logging import getLogger
21
-
22
- logger = getLogger(__name__)
23
-
24
- @dataclass
25
- class MuQConfig(FairseqDataclass):
26
- label_rate:int = field(default=25)
27
- num_codebooks:int = field(default=1)
28
- codebook_dim:int = field(default=16)
29
- codebook_size:int = field(default=4096)
30
- features:List[str] = field(default_factory=lambda:["melspec_2048"])
31
- hop_length:int = field(default=240)
32
- n_mels:int = field(default=128)
33
- conv_dim:int = field(default=512)
34
- encoder_dim:int = field(default=1024)
35
- encoder_depth:int = field(default=12)
36
- mask_hop:float = field(default=0.4)
37
- mask_prob:float = field(default=0.6)
38
- is_flash:bool = field(default=False)
39
- stat_path:Optional[str] = field(default=None)
40
- model_path:Optional[str] = field(default=None)
41
- w2v2_config_path:Optional[str] = field(default=None)
42
- use_rvq_target:bool = field(default=False)
43
- use_vq_target:bool = field(default=False)
44
- rvq_ckpt_path: Optional[str] = field(default=None)
45
- recon_loss_ratio: Optional[float] = field(default=None)
46
- resume_checkpoint: Optional[str] = None
47
- use_hubert_masking_strategy:bool = field(default=False)
48
- use_hubert_featurizer:bool = field(default=False)
49
- hubert_conv_feature_layers:str = field(default_factory=lambda:"[(512,10,5)] + [(512,3,2)] * 3 + [(512,3,3)] + [(512,2,2)] * 2")
50
- rvq_n_codebooks:int = field(default=8)
51
- rvq_multi_layer_num:int = field(default=1)
52
- use_encodec_target:bool = field(default=False)
53
-
54
- SAMPLE_RATE = 24_000
55
-
56
- @register_model("muq", dataclass=MuQConfig)
57
- class MuQModel(BaseFairseqModel):
58
- def __init__(self, cfg: MuQConfig, task_cfg: FairseqTask):
59
- super().__init__()
60
- self.cfg = cfg
61
- self.model = MuQ(
62
- num_codebooks=cfg.num_codebooks,
63
- codebook_dim=cfg.codebook_dim,
64
- codebook_size=cfg.codebook_size,
65
- features=cfg.features,
66
- n_mels=cfg.n_mels,
67
- conv_dim=cfg.conv_dim,
68
- encoder_dim=cfg.encoder_dim,
69
- encoder_depth=cfg.encoder_depth,
70
- mask_hop=cfg.mask_hop,
71
- mask_prob=cfg.mask_prob,
72
- is_flash=cfg.is_flash,
73
- stat_path=cfg.stat_path,
74
- model_path=cfg.model_path,
75
- w2v2_config_path=cfg.w2v2_config_path,
76
- use_rvq_target=cfg.use_rvq_target,
77
- use_vq_target=cfg.use_vq_target,
78
- rvq_ckpt_path=cfg.rvq_ckpt_path,
79
- recon_loss_ratio=cfg.recon_loss_ratio,
80
- label_rate=cfg.label_rate,
81
- use_hubert_masking_strategy=cfg.use_hubert_masking_strategy,
82
- use_hubert_featurizer=cfg.use_hubert_featurizer,
83
- hubert_conv_feature_layers=cfg.hubert_conv_feature_layers,
84
- rvq_n_codebooks=cfg.rvq_n_codebooks,
85
- rvq_multi_layer_num=cfg.rvq_multi_layer_num,
86
- use_encodec_target=cfg.use_encodec_target,
87
- )
88
-
89
- def forward(
90
- self,
91
- source: torch.Tensor, # B,L
92
- features_only: bool = False,
93
- label = None, # pre-extracted labels, dim is [Batch, N_Codebook, SeqLen]
94
- **kwargs,
95
- ):
96
- source = source[..., :int((source.shape[-1]//(SAMPLE_RATE//self.cfg.label_rate))*(SAMPLE_RATE//self.cfg.label_rate)) ]
97
- if features_only:
98
- if 'attention_mask' in kwargs:
99
- attention_mask = kwargs['attention_mask']
100
- elif 'padding_mask' in kwargs:
101
- attention_mask = ~kwargs['padding_mask'].bool()
102
- else:
103
- attention_mask = None
104
- _, hidden_states = self.model.get_predictions(source, attention_mask=attention_mask, is_features_only=True)
105
- result = {
106
- "layer_results": hidden_states
107
- }
108
- return result
109
- else:
110
- result = {}
111
- logits, hidden_emb, losses, accuracies = self.model(source, label=label)
112
- result["losses"] = losses
113
- result["accuracies"] = accuracies
114
- result["logits"] = logits
115
- result["hidden_emb"] = hidden_emb
116
- for k, v in losses.items():
117
- result[k] = v
118
- return result
119
-
120
- @classmethod
121
- def build_model(cls, cfg: MuQConfig, task: FairseqTask):
122
- """Build a new model instance."""
123
-
124
- model = MuQModel(cfg, task.cfg)
125
- import numpy as np
126
- s = 0
127
- for param in model.parameters():
128
- s += np.product(param.size())
129
- # print('# of parameters: '+str(s/1024.0/1024.0))
130
-
131
- if cfg.get("resume_checkpoint", None):
132
- print("Loading checkpoint from {}".format(cfg.resume_checkpoint))
133
- model.load_state_dict(torch.load(cfg.resume_checkpoint)['model'], strict=False)
134
-
135
- return model
136
-
137
- def get_losses(self, result, batch):
138
- return result['losses']
139
-
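As a sketch of the interface removed here: with `features_only=True`, `MuQModel.forward` returns a dict whose `"layer_results"` entry holds the per-layer hidden states. In the snippet below, `model` is an assumed, already-loaded `MuQModel` instance (for example via the fairseq loader in the deleted `muq_dev/test.py` further down), and the input length is illustrative:

```python
# Feature-extraction sketch against the deleted forward() contract.
import torch

SAMPLE_RATE = 24_000                      # matches the constant in the deleted file
wav = torch.randn(1, 10 * SAMPLE_RATE)    # (batch, samples): 10 s of 24 kHz audio

with torch.no_grad():
    out = model(wav, features_only=True)  # no padding_mask passed -> attention_mask is None

layer_results = out["layer_results"]      # hidden states, one entry per encoder layer
print(len(layer_results))
```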
 
MuCodec/muq_dev/muq_fairseq/tasks/muq_pretraining.py DELETED
@@ -1,354 +0,0 @@
1
- # Copyright (c) 2017-present, Facebook, Inc.
2
- # All rights reserved.
3
- #
4
- # This source code is licensed under the license found in the LICENSE file in
5
- # the root directory of this source tree. An additional grant of patent rights
6
- # can be found in the PATENTS file in the same directory.
7
-
8
- import logging
9
- import os
10
- import sys
11
- from typing import Dict, List, Optional, Tuple
12
-
13
- import numpy as np
14
- import torch
15
-
16
- from dataclasses import dataclass, field
17
- from fairseq.data import Dictionary, HubertDataset
18
- from fairseq.dataclass.configs import FairseqDataclass
19
- from fairseq.tasks import register_task
20
- from fairseq.tasks.fairseq_task import FairseqTask
21
- from omegaconf import MISSING
22
-
23
- from ..data.mert_dataset import MERTDataset
24
- from ..data.ark_dataset import ArkDataset
25
-
26
- logger = logging.getLogger(__name__)
27
-
28
-
29
- class LabelEncoder(object):
30
- def __init__(self, dictionary: Dictionary) -> None:
31
- self.dictionary = dictionary
32
-
33
- def __call__(self, label: str) -> List[str]:
34
- # encode_line return a torch.IntTensor, should be all 1 for vanila HuBERT
35
- return self.dictionary.encode_line(
36
- label,
37
- append_eos=False,
38
- add_if_not_exist=False,
39
- )
40
- class PaddedNumpyLabelEncoder(object):
41
- def __init__(self):
42
- # self.dictionary = dictionary
43
- pass
44
-
45
- def __call__(self, label):
46
- t = torch.IntTensor(np.asarray(label))
47
- t = t[t>=0] # remove padded -1 values at the end
48
- return t
49
-
50
- @dataclass
51
- class MuQPretrainingConfig(FairseqDataclass):
52
- data: str = field(default=MISSING, metadata={"help": "path to data directory"})
53
- sharding_data: int = field(
54
- default=-1,
55
- metadata={
56
- "help": "set this para >1 to use sharding dataset to prevent OOM"
57
- "prepare data tsv and label files by adding postfix for sharding 64 like:"
58
- "train_28_64.tsv and train_28_64.encodec_6"
59
- },
60
- )
61
- load_random_data_shard: bool = field(
62
- default=True,
63
- metadata={
64
- "help": "whether to laod shards randomly or in order when use sharding_data"
65
- },
66
- )
67
- fine_tuning: bool = field(
68
- default=False, metadata={"help": "set to true if fine-tuning Hubert"}
69
- )
70
- labels: List[str] = field(
71
- default_factory=lambda: ["ltr"],
72
- metadata={
73
- "help": (
74
- "extension of the label files to load, frame-level labels for"
75
- " pre-training, and sequence-level label for fine-tuning"
76
- )
77
- },
78
- )
79
- label_dir: Optional[str] = field(
80
- default=None,
81
- metadata={
82
- "help": "if set, looks for labels in this directory instead",
83
- },
84
- )
85
- label_scp_path: Optional[str] = field(
86
- default=None,
87
- metadata={
88
- 'help': 'if set, load label from scp file'
89
- }
90
- )
91
- label_scp_clip_duration: float = field(
92
- default=-1,
93
- metadata={
94
- 'help': 'clip duration for loading scp label. if set to -1, this will not make effect.'
95
- }
96
- )
97
- label_rate: float = field(
98
- default=-1.0,
99
- metadata={"help": "label frame rate. -1.0 for sequence label"},
100
- )
101
- sample_rate: int = field(
102
- default=16_000,
103
- metadata={
104
- "help": "target sample rate. audio files will be up/down "
105
- "sampled to this rate"
106
- },
107
- )
108
- normalize: bool = field(
109
- default=False,
110
- metadata={"help": "if set, normalizes input to have 0 mean and unit variance"},
111
- )
112
- enable_padding: bool = field(
113
- default=False,
114
- metadata={"help": "pad shorter samples instead of cropping"},
115
- )
116
- max_keep_size: Optional[int] = field(
117
- default=None,
118
- metadata={"help": "exclude sample longer than this"},
119
- )
120
- max_sample_size: Optional[int] = field(
121
- default=None,
122
- metadata={"help": "max sample size to crop to for batching"},
123
- )
124
- min_sample_size: Optional[int] = field(
125
- default=None,
126
- metadata={"help": "min sample size to crop to for batching"},
127
- )
128
- single_target: Optional[bool] = field(
129
- default=False,
130
- metadata={
131
- "help": "if set, AddTargetDatasets outputs same keys " "as AddTargetDataset"
132
- },
133
- )
134
- random_crop: Optional[bool] = field(
135
- default=True,
136
- metadata={"help": "always crop from the beginning if false"},
137
- )
138
- pad_audio: Optional[bool] = field(
139
- default=False,
140
- metadata={"help": "pad audio to the longest one in the batch if true"},
141
- )
142
-
143
- store_labels: Optional[bool] = field(
144
- default=False,
145
- metadata={"help": "whether to load all of the label into memory"},
146
- )
147
-
148
- numpy_memmap_label: Optional[bool] = field(
149
- default=False,
150
- metadata={"help": "whether the label file is saved as a numpy file, each line is ended with padding -1"},
151
- )
152
-
153
- augmentation_effects: Optional[str] = field(
154
- default="[]",
155
- metadata={
156
- "help": (
157
- "a list of effects that might apply to the audios"
158
- "example: \"['random_mute', 'random_Gaussian', 'reverse_polarity']\" "
159
- "supported: random_mute,"
160
- "todo: "
161
- )
162
- },
163
- )
164
- augmentation_probs: Optional[str] = field(
165
- default="[]",
166
- metadata={
167
- "help": (
168
- "the corresponding probabilities for the data augmentation effects"
169
- "example: \"[0.1, 0.5, 0.8]\" "
170
- "the sum is not necessarily need to be 1.0, and multiple effects can be applied to the same audio"
171
- )
172
- },
173
- )
174
-
175
- # inbatch_noise_augment_len_range: Optional[List[int]] = field(
176
- # default_factory=lambda: [8000, 24000],
177
- # default = [8000, 24000],
178
- inbatch_noise_augment_len_range: Optional[str] = field(
179
- default = "[8000, 24000]",
180
- metadata={
181
- "help": (
182
- "the range of length of the mix-up noise augmentation, unit in smaples"
183
- )
184
- },
185
- )
186
- # inbatch_noise_augment_number_range: Optional[List[int]] = field(
187
- # default_factory=lambda: [1, 3],
188
- # default = [1, 3],
189
- inbatch_noise_augment_number_range: Optional[str] = field(
190
- default = "[1, 3]",
191
- metadata={
192
- "help": (
193
- "the range of numbers of the mix-up noise augmentation"
194
- )
195
- },
196
- )
197
- inbatch_noise_augment_volume: float = field(
198
- default = 1.0,
199
- metadata={
200
- "help": (
201
- "the coefficient used to modify the volume of the noise audios wavs"
202
- )
203
- },
204
- )
205
- dynamic_crops: Optional[str] = field(
206
- default="[]",
207
- metadata={
208
- "help": (
209
- "used to set the maximum audio length setting, for training"
210
- "example: \"[1, 2, 3, 4, 5, 10]\" "
211
- )
212
- },
213
- )
214
- dynamic_crops_epoches: Optional[str] = field(
215
- default="[]",
216
- metadata={
217
- "help": (
218
- "used to set training epoches of changing the maximum audio length"
219
- "example: \"[1, 10, 20, 40, 80, 160,]\" "
220
- "then len need to be equal to len(dynamic_crops)"
221
- )
222
- },
223
- )
224
-
225
- cqt_loss_bin_dataloader: Optional[int] = field(
226
- default=-1,
227
- metadata={
228
- "help": (
229
- "use this parameter to prepare cqt prediction objective in dataloader"
230
- )
231
- },
232
- )
233
-
234
- clip_secs: int = field(
235
- default=5,
236
- metadata={
237
- "help": "clip secs for each audio"
238
- }
239
- )
240
-
241
- dataset_shuffle: bool = field(
242
- default=True,
243
- metadata={
244
- "help": (
245
- "dataset shuffle when sample a batch"
246
- )
247
- },
248
- )
249
-
250
-
251
- @register_task("muq_pretraining", dataclass=MuQPretrainingConfig)
252
- class MuQPretrainingTask(FairseqTask):
253
-
254
- cfg: MuQPretrainingConfig
255
-
256
- def __init__(
257
- self,
258
- cfg: MuQPretrainingConfig,
259
- ) -> None:
260
- super().__init__(cfg)
261
-
262
- logger.info(f"current directory is {os.getcwd()}")
263
- logger.info(f"MuQPretrainingTask Config {cfg}")
264
-
265
- self.cfg = cfg
266
- self.fine_tuning = cfg.fine_tuning
267
-
268
- if cfg.fine_tuning:
269
- self.state.add_factory("target_dictionary", self.load_dictionaries)
270
- else:
271
- self.state.add_factory("dictionaries", self.load_dictionaries)
272
-
273
- self.blank_symbol = "<s>"
274
-
275
- # use eval() to pass list parameters, skirt the fairseq/torch error: Can't pickle <enum 'Choices'>: attribute lookup Choices on fairseq.dataclass.constants failed
276
- self.augmentation_effects = eval(self.cfg.augmentation_effects)
277
- self.augmentation_probs = eval(self.cfg.augmentation_probs)
278
- if len(self.augmentation_effects) > 0:
279
- assert len(self.augmentation_effects) == len(self.augmentation_probs)
280
- logger.info(f"Applying audio augmentation {self.augmentation_effects}, probabilities: {self.augmentation_probs}")
281
-
282
- self.inbatch_noise_augment_number_range = eval(self.cfg.inbatch_noise_augment_number_range)
283
- self.inbatch_noise_augment_len_range = eval(self.cfg.inbatch_noise_augment_len_range)
284
-
285
- self.max_sample_size = self.cfg.max_sample_size
286
-
287
- self.dynamic_crops = eval(self.cfg.dynamic_crops)
288
- self.dynamic_crops_epoches = eval(self.cfg.dynamic_crops_epoches)
289
- assert len(self.dynamic_crops) == len(self.dynamic_crops_epoches)
290
- if len(self.dynamic_crops) > 0:
291
- assert self.dynamic_crops_epoches[0] == 1
292
-
293
- self.cqt_loss_bin_dataloader = self.cfg.cqt_loss_bin_dataloader
294
-
295
- self.numpy_memmap_label = self.cfg.numpy_memmap_label
296
- self.store_labels = self.cfg.store_labels
297
- if self.numpy_memmap_label:
298
- assert self.store_labels
299
-
300
- @property
301
- def source_dictionary(self) -> Optional[Dictionary]:
302
- return None
303
-
304
- @property
305
- def target_dictionary(self) -> Optional[Dictionary]:
306
- return self.state.target_dictionary
307
-
308
- @property
309
- def dictionaries(self) -> List[Dictionary]:
310
- return self.state.dictionaries
311
-
312
- @classmethod
313
- def setup_task(
314
- cls, cfg: MuQPretrainingConfig, **kwargs
315
- ) -> "MuQPretrainingTask":
316
- return cls(cfg)
317
-
318
- def load_dictionaries(self):
319
- label_dir = self.cfg.data if (self.cfg.label_dir is None or self.cfg.label_dir == '') else self.cfg.label_dir
320
- print(label_dir)
321
- dictionaries = [
322
- Dictionary.load(f"{label_dir}/dict.{label}.txt")
323
- for label in self.cfg.labels
324
- ]
325
- return dictionaries[0] if self.cfg.fine_tuning else dictionaries
326
-
327
- def get_label_dir(self) -> str:
328
- if self.cfg.label_dir is None or self.cfg.label_dir=='':
329
- return self.cfg.data
330
- return self.cfg.label_dir
331
-
332
-
333
- def is_force_load_dataset(self, epoch, training_restore=False):
334
- # find the threshold that holds epoch \in [threshold, next_threshold)
335
- return (epoch in self.dynamic_crops_epoches) or training_restore or (self.cfg.sharding_data > 1)
336
-
337
-
338
- def set_dynamic_crop_max_sample(self, epoch):
339
- pass
340
-
341
- def load_dataset(self, split: str, **kwargs) -> None:
342
- pass
343
-
344
- def load_dataset_ark(self, split, **kwargs):
345
- pass
346
-
347
- def load_dataset_mert(self, split: str, **kwargs) -> None:
348
- pass
349
-
350
- def max_positions(self) -> Tuple[int, int]:
351
- return (sys.maxsize, sys.maxsize)
352
-
353
- def filter_indices_by_size(self, indices: np.array, *args, **kwargs) -> np.array:
354
- return indices
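The list-valued options in the config above (`augmentation_effects`, `augmentation_probs`, `dynamic_crops`, `dynamic_crops_epoches`) are passed as strings and eval()'d by the task. A small sketch of the expected format, using the example values quoted in the field help strings:

```python
# String-encoded list options as the deleted task consumed them (values are the
# examples given in the help strings above).
augmentation_effects = "['random_mute', 'random_Gaussian', 'reverse_polarity']"
augmentation_probs = "[0.1, 0.5, 0.8]"
effects, probs = eval(augmentation_effects), eval(augmentation_probs)
assert len(effects) == len(probs)                    # mirrors the check in __init__

dynamic_crops = "[1, 2, 3, 4, 5, 10]"                # maximum audio length per stage, in seconds
dynamic_crops_epoches = "[1, 10, 20, 40, 80, 160]"   # epoch at which each stage starts
crops, epochs = eval(dynamic_crops), eval(dynamic_crops_epoches)
assert len(crops) == len(epochs) and epochs[0] == 1  # first stage must start at epoch 1
```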
 
MuCodec/muq_dev/test.py DELETED
@@ -1,22 +0,0 @@
- import torch
- from dataclasses import dataclass
- import fairseq
- import os.path as op
-
- root = op.dirname(op.abspath(__file__))
-
-
- @dataclass
- class UserDirModule:
-     user_dir: str
-
- def load_model(model_dir, checkpoint_dir):
-     '''Load Fairseq SSL model'''
-
-     model_path = UserDirModule(model_dir)
-     fairseq.utils.import_user_module(model_path)
-
-     model, cfg, task = fairseq.checkpoint_utils.load_model_ensemble_and_task([checkpoint_dir], strict=False)
-     model = model[0]
-
-     return model
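A usage sketch for the helper above; the directory and checkpoint paths follow the layout described in the deleted readme below and are assumptions, not values pinned by this file:

```python
# Assumed paths: the fairseq user-module directory holding the MuQ model/task
# registrations, and the MuQ checkpoint the readme places under muq_dev/.
model = load_model(
    model_dir="muq_dev/muq_fairseq",
    checkpoint_dir="muq_dev/muq.pt",
)
model.eval()
```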
 
MuCodec/readme.md DELETED
@@ -1,67 +0,0 @@
- # MuCodec: Ultra Low-Bitrate Music Codec
-
- This repository is the official code repository for MuCodec: Ultra Low-Bitrate Music Codec. You can find our paper on [arXiv](https://arxiv.org/pdf/2409.13216). The demo page is available [here](https://xuyaoxun.github.io/MuCodec_demo/).
-
- In this repository, we provide the MuCodec model, inference scripts, and the checkpoint that has been trained on the Million Song Dataset. Specifically, we have released the model and inference code corresponding to the lowest bitrate of 0.35 kbps as mentioned in the paper, to demonstrate the effectiveness of our work.
-
-
- MuCodec supports 48kHz, dual-channel (stereo) audio reconstruction. If the original audio is in a different format, it will first be converted to 48kHz, dual-channel audio.
-
- ## Installation
-
- You can install the necessary dependencies using the `requirements.txt` file with Python 3.8.12:
-
- ```bash
- pip install -r requirements.txt
- ```
-
- Due to storage limitations, we have saved the model checkpoints on Hugging Face at https://huggingface.co/yaoxunxu/mucodec. You can easily download the models from Hugging Face and save them in the following directories:
-
- - Save `audioldm_48k.pth` in the `tools` folder.
- - Save `muq.pt` in the `muq_dev` folder.
- - Save `mucodec.pt` in the `ckpt` folder.
-
- Please note that all three checkpoints must be downloaded completely for the model to load correctly. The final file paths should be:
-
- ```
- tools/audioldm_48k.pth
- muq_dev/muq.pt
- ckpt/mucodec.pt
- ```
-
- The file `audioldm_48k.pth` is sourced from https://huggingface.co/haoheliu/audioldm_48k/blob/main/audioldm_48k.pth.
-
- ## Inference
-
- To run inference, use the following command:
-
- ```bash
- python3 generate.py
- ```
-
- We have provided a sample song `test.wav`, randomly sampled from the Million Song Dataset, in the `test_wav` folder. The default input path is `test_wav/test.wav`, and the output path for the reconstructed audio is `reconstructed/test.wav`.
-
- In the `generate.py` file, we have implemented several functions to facilitate the music compression and reconstruction process. You can easily obtain compressed tokens from audio using the `sound2code` function, and reconstruct the audio from tokens using the `code2sound` function.
-
- ## Note
-
- Please note that the open-sourced model was trained solely on the Million Song Dataset. Considering the quality issues of this dataset, the open-sourced model may not achieve the same performance as demonstrated in the demo. Unfortunately, due to copyright restrictions, we are unable to release the checkpoints trained on additional datasets. However, you can use your own dataset to further train the model and achieve better results.
-
- ## License
-
- The code in this repository is released under the MIT license as found in the [LICENSE](LICENSE) file.
-
- The model weights (muq.pt, mucodec.pt) in this repository are released under the CC-BY-NC 4.0 license, as detailed in the [LICENSE_weights](LICENSE_weights) file.
-
- ## Citation
-
- If you find our work useful, please cite our paper:
-
- ```bibtex
- @article{xu2024mucodec,
-   title={MuCodec: Ultra Low-Bitrate Music Codec},
-   author={Xu, Yaoxun and Chen, Hangting and Yu, Jianwei and Tan, Wei and Gu, Rongzhi and Lei, Shun and Lin, Zhiwei and Wu, Zhiyong},
-   journal={arXiv preprint arXiv:2409.13216},
-   year={2024}
- }
- ```
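The readme above names `sound2code` and `code2sound` in `generate.py` as the compression and reconstruction entry points. Their exact signatures are not visible in this diff, so the following is only an assumed sketch of the described round trip, with tensor shapes and resampling left to the real implementation:

```python
# Assumed round-trip sketch based on the readme description; the real generate.py
# signatures may differ.
import torchaudio

wav, sr = torchaudio.load("test_wav/test.wav")      # default input path from the readme
codes = sound2code(wav)                             # compress: waveform -> discrete tokens (~0.35 kbps)
recon = code2sound(codes)                           # reconstruct: tokens -> 48 kHz stereo waveform
torchaudio.save("reconstructed/test.wav", recon, 48_000)
```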
 
MuCodec/reconstructed/test.wav DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:946e5815c7c3b8cab9f8eb6ca6707e821498fd59233d3ee356f6bb6f2fd2296b
- size 99367376
 
 
 
 
MuCodec/requirements.txt DELETED
@@ -1,335 +0,0 @@
1
- absl-py==2.0.0
2
- accelerate==0.30.1
3
- aeiou==0.0.20
4
- aiobotocore==2.13.1
5
- aiofiles==23.2.1
6
- aiohttp==3.9.3
7
- aioitertools==0.11.0
8
- aiosignal==1.3.1
9
- alias-free-torch==0.0.6
10
- altair==5.3.0
11
- annotated-types==0.6.0
12
- antlr4-python3-runtime==4.8
13
- anyio==4.3.0
14
- appdirs==1.4.4
15
- argbind==0.3.9
16
- asttokens==2.4.1
17
- astunparse==1.6.3
18
- async-timeout==4.0.3
19
- attrs==23.1.0
20
- audioread==3.0.1
21
- auraloss==0.4.0
22
- av==11.0.0
23
- backcall==0.2.0
24
- beartype==0.18.5
25
- bitarray==2.9.2
26
- bleach==6.1.0
27
- blis==0.7.11
28
- bokeh==3.1.1
29
- botocore==1.34.131
30
- braceexpand==0.1.7
31
- cachetools==5.3.2
32
- catalogue==2.0.10
33
- certifi==2023.11.17
34
- cffi==1.16.0
35
- charset-normalizer==3.3.2
36
- clean-fid==0.1.35
37
- click==8.1.7
38
- clip-anytorch==2.6.0
39
- cloudpathlib==0.16.0
40
- cloudpickle==3.0.0
41
- cn2an==0.5.22
42
- colorama==0.4.6
43
- colorcet==3.1.0
44
- colorlog==6.8.2
45
- confection==0.1.4
46
- configparser==7.0.0
47
- contourpy==1.1.1
48
- cycler==0.12.1
49
- cymem==2.0.8
50
- Cython==3.0.10
51
- dataclasses==0.6
52
- datasets
53
- dctorch==0.1.2
54
- decorator==5.1.1
55
- decord==0.6.0
56
- deepspeed==0.14.0
57
- demucs==4.0.1
58
- descript-audio-codec==1.0.0
59
- descript-audiotools==0.7.2
60
- diffusers==0.27.2
61
- dill==0.3.8
62
- Distance==0.1.3
63
- docker-pycreds==0.4.0
64
- docopt==0.6.2
65
- docstring_parser==0.16
66
- dora_search==0.1.12
67
- einops==0.7.0
68
- einops-exts==0.0.4
69
- einx==0.3.0
70
- ema-pytorch==0.2.3
71
- encodec==0.1.1
72
- exceptiongroup==1.2.0
73
- executing==2.0.1
74
- expecttest==0.1.6
75
- fairseq==0.12.2
76
- fastapi==0.110.3
77
- fastcore==1.6.3
78
- ffmpy==0.3.2
79
- filelock==3.13.1
80
- fire==0.6.0
81
- flashy==0.0.2
82
- flatten-dict==0.4.2
83
- fonttools==4.49.0
84
- frozendict==2.4.4
85
- frozenlist==1.4.1
86
- fsspec==2024.6.1
87
- ftfy==6.1.3
88
- future==1.0.0
89
- g2p-en==2.1.0
90
- gin-config==0.5.0
91
- gitdb==4.0.11
92
- GitPython==3.1.43
93
- google-auth==2.23.4
94
- google-auth-oauthlib==1.0.0
95
- gradio==4.26.0
96
- gradio_client==0.15.1
97
- grpcio==1.59.3
98
- h11==0.14.0
99
- h5py==3.11.0
100
- hjson==3.1.0
101
- holoviews==1.17.1
102
- httpcore==1.0.5
103
- httpx==0.27.0
104
- huggingface-hub==0.23.5
105
- hydra-colorlog==1.2.0
106
- hydra-core==1.0.7
107
- hypothesis==6.90.0
108
- idna==3.4
109
- imageio==2.34.2
110
- importlib-metadata==6.8.0
111
- importlib-resources==5.12.0
112
- inflect==7.0.0
113
- ipython==8.12.3
114
- jedi==0.19.1
115
- jieba-fast==0.53
116
- Jinja2==3.1.2
117
- jmespath==1.0.1
118
- joblib==1.3.2
119
- json5==0.9.25
120
- jsonlines==4.0.0
121
- jsonmerge==1.9.2
122
- jsonschema==4.22.0
123
- jsonschema-specifications==2023.12.1
124
- julius==0.2.7
125
- k-diffusion==0.1.1
126
- kaldiio==2.18.0
127
- kiwisolver==1.4.5
128
- kornia==0.7.3
129
- kornia_rs==0.1.5
130
- laion-clap==1.1.4
131
- lameenc==1.7.0
132
- langcodes==3.4.0
133
- language_data==1.2.0
134
- lazy_loader==0.3
135
- librosa==0.9.2
136
- lightning==2.2.1
137
- lightning-utilities==0.10.1
138
- linkify-it-py==2.0.3
139
- lion-pytorch==0.2.2
140
- llvmlite==0.41.1
141
- local-attention==1.8.6
142
- loguru==0.7.2
143
- lxml==5.2.2
144
- marisa-trie==1.1.1
145
- Markdown==3.5.1
146
- markdown-it-py==3.0.0
147
- markdown2==2.5.0
148
- MarkupSafe==2.1.3
149
- matplotlib==3.7.5
150
- matplotlib-inline==0.1.7
151
- mdit-py-plugins==0.4.1
152
- mdurl==0.1.2
153
- mpmath==1.3.0
154
- msgpack==1.0.8
155
- multidict==6.0.5
156
- multiprocess==0.70.16
157
- murmurhash==1.0.10
158
- mypy-extensions==1.0.0
159
- networkx==3.1
160
- ninja==1.11.1.1
161
- nltk==3.8.1
162
- nnAudio==0.3.3
163
- num2words==0.5.13
164
- numba==0.58.1
165
- numpy==1.23.5
166
- nvidia-cublas-cu11==11.11.3.6
167
- nvidia-cuda-cupti-cu11==11.8.87
168
- nvidia-cuda-nvrtc-cu11==11.8.89
169
- nvidia-cuda-runtime-cu11==11.8.89
170
- nvidia-cudnn-cu11==8.7.0.84
171
- nvidia-cufft-cu11==10.9.0.58
172
- nvidia-curand-cu11==10.3.0.86
173
- nvidia-cusolver-cu11==11.4.1.48
174
- nvidia-cusparse-cu11==11.7.5.86
175
- nvidia-nccl-cu11==2.19.3
176
- nvidia-nvtx-cu11==11.8.86
177
- oauthlib==3.2.2
178
- omegaconf
179
- opencv-contrib-python==4.8.1.78
180
- opencv-python==4.8.1.78
181
- openunmix==1.2.1
182
- orjson==3.10.3
183
- packaging==23.2
184
- pandas==2.0.2
185
- panel==1.2.3
186
- param==2.1.1
187
- parso==0.8.4
188
- pathtools==0.1.2
189
- pedalboard==0.7.4
190
- peft==0.10.0
191
- pexpect==4.9.0
192
- pickleshare==0.7.5
193
- Pillow==10.1.0
194
- pkgutil_resolve_name==1.3.10
195
- platformdirs==4.2.0
196
- plotly==5.23.0
197
- pooch==1.8.1
198
- portalocker==2.10.1
199
- prefigure==0.0.9
200
- preshed==3.0.9
201
- proces==0.1.7
202
- prodict==0.8.18
203
- progressbar==2.5
204
- prompt_toolkit==3.0.47
205
- protobuf==3.19.6
206
- psutil==5.9.6
207
- ptyprocess==0.7.0
208
- pure_eval==0.2.3
209
- py-cpuinfo==9.0.0
210
- pyarrow==17.0.0
211
- pyarrow-hotfix==0.6
212
- pyasn1==0.5.1
213
- pyasn1-modules==0.3.0
214
- pybind11==2.11.1
215
- pycparser==2.21
216
- pydantic==2.6.3
217
- pydantic_core==2.16.3
218
- pydub==0.25.1
219
- Pygments==2.18.0
220
- pyloudnorm==0.1.1
221
- pynndescent==0.5.13
222
- pynvml==11.5.0
223
- pyparsing==3.1.2
224
- pypinyin==0.51.0
225
- pyre-extensions==0.0.29
226
- pyreaper==0.0.10
227
- pystoi==0.4.1
228
- python-dateutil==2.8.2
229
- python-multipart==0.0.9
230
- pytorch-lightning==2.1.0
231
- pytz==2023.3.post1
232
- pyviz_comms==3.0.3
233
- PyWavelets==1.4.1
234
- PyYAML==6.0.1
235
- randomname==0.2.1
236
- referencing==0.35.1
237
- regex==2023.10.3
238
- requests==2.32.3
239
- requests-oauthlib==1.3.1
240
- resampy==0.4.3
241
- retrying==1.3.4
242
- rich==13.7.1
243
- rpds-py==0.18.1
244
- rsa==4.9
245
- ruamel.yaml==0.18.5
246
- ruamel.yaml.clib==0.2.8
247
- ruff==0.4.4
248
- s3fs==2024.6.1
249
- s3transfer==0.7.0
250
- sacrebleu==2.4.2
251
- safetensors==0.4.3
252
- scikit-image==0.21.0
253
- scikit-learn==1.3.2
254
- scipy==1.10.1
255
- semantic-version==2.10.0
256
- sentencepiece==0.1.99
257
- sentry-sdk==2.10.0
258
- setproctitle==1.3.3
259
- shellingham==1.5.4
260
- six==1.16.0
261
- smart-open==6.4.0
262
- smmap==5.0.1
263
- sniffio==1.3.1
264
- sortedcontainers==2.4.0
265
- SoundFile==0.10.2
266
- sox==1.4.1
267
- soxr==0.3.7
268
- spacy==3.7.4
269
- spacy-legacy==3.0.12
270
- spacy-loggers==1.0.5
271
- srsly==2.4.8
272
- stack-data==0.6.3
273
- starlette==0.37.2
274
- submitit==1.5.1
275
- sympy==1.12
276
- tabulate==0.9.0
277
- tenacity==9.0.0
278
- tensorboard==2.14.0
279
- tensorboard-data-server==0.7.2
280
- termcolor==2.3.0
281
- thinc==8.2.3
282
- threadpoolctl==3.3.0
283
- tifffile==2023.7.10
284
- timm==0.9.11
285
- tokenizers==0.19.1
286
- tomlkit==0.12.0
287
- toolz==0.12.1
288
- torch==2.2.0+cu118
289
- torch-stoi==0.2.1
290
- torchaudio==2.2.0+cu118
291
- torchdata==0.7.1
292
- torchdiffeq==0.2.4
293
- torchlibrosa==0.1.0
294
- torchmetrics==0.11.4
295
- torchsde==0.2.6
296
- torchtext==0.17.0
297
- torchvision==0.17.0+cu118
298
- tornado==6.4.1
299
- tqdm==4.66.4
300
- traitlets==5.14.3
301
- trampoline==0.1.2
302
- transformers==4.42.4
303
- treetable==0.2.5
304
- triton==2.2.0
305
- typeguard==2.13.0
306
- typer==0.9.4
307
- types-dataclasses==0.6.6
308
- typing-inspect==0.9.0
309
- typing_extensions==4.8.0
310
- tzdata==2023.3
311
- uc-micro-py==1.0.3
312
- umap-learn==0.5.6
313
- Unidecode==1.3.8
314
- urllib3==1.26.18
315
- uvicorn==0.29.0
316
- v-diffusion-pytorch==0.0.2
317
- vector-quantize-pytorch==1.9.14
318
- wandb==0.15.4
319
- wasabi==1.1.2
320
- wcwidth==0.2.12
321
- weasel==0.3.4
322
- webdataset==0.2.48
323
- webencodings==0.5.1
324
- websockets==11.0.3
325
- Werkzeug==3.0.1
326
- wget==3.2
327
- wordsegment==1.3.1
328
- wrapt==1.16.0
329
- x-clip==0.14.4
330
- x-transformers==1.26.6
331
- xformers==0.0.24+cu118
332
- xxhash==3.4.1
333
- xyzservices==2024.6.0
334
- yarl==1.9.4
335
- zipp==3.17.0
 
MuCodec/test_wav/test.wav DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:8cd28fa4fc1e8695be47602407088fcc9c486ac27b0ac6712ad30b7c7bcef4f8
3
- size 22823468
 
MuCodec/tools/get_melvaehifigan48k.py DELETED
@@ -1,1551 +0,0 @@
1
-
2
- import soundfile as sf
3
- import os
4
- from librosa.filters import mel as librosa_mel_fn
5
- import sys
6
- sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
7
- import tools.torch_tools as torch_tools
8
- import torch.nn as nn
9
- import torch
10
- import numpy as np
11
- from einops import rearrange
12
- from scipy.signal import get_window
13
- from librosa.util import pad_center, tiny
14
- import librosa.util as librosa_util
15
-
16
- class AttrDict(dict):
17
- def __init__(self, *args, **kwargs):
18
- super(AttrDict, self).__init__(*args, **kwargs)
19
- self.__dict__ = self
20
-
21
- def init_weights(m, mean=0.0, std=0.01):
22
- classname = m.__class__.__name__
23
- if classname.find("Conv") != -1:
24
- m.weight.data.normal_(mean, std)
25
-
26
-
27
- def get_padding(kernel_size, dilation=1):
28
- return int((kernel_size * dilation - dilation) / 2)
29
-
30
- LRELU_SLOPE = 0.1
31
-
32
- class ResBlock(torch.nn.Module):
33
- def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)):
34
- super(ResBlock, self).__init__()
35
- self.h = h
36
- self.convs1 = nn.ModuleList(
37
- [
38
- torch.nn.utils.weight_norm(
39
- nn.Conv1d(
40
- channels,
41
- channels,
42
- kernel_size,
43
- 1,
44
- dilation=dilation[0],
45
- padding=get_padding(kernel_size, dilation[0]),
46
- )
47
- ),
48
- torch.nn.utils.weight_norm(
49
- nn.Conv1d(
50
- channels,
51
- channels,
52
- kernel_size,
53
- 1,
54
- dilation=dilation[1],
55
- padding=get_padding(kernel_size, dilation[1]),
56
- )
57
- ),
58
- torch.nn.utils.weight_norm(
59
- nn.Conv1d(
60
- channels,
61
- channels,
62
- kernel_size,
63
- 1,
64
- dilation=dilation[2],
65
- padding=get_padding(kernel_size, dilation[2]),
66
- )
67
- ),
68
- ]
69
- )
70
- self.convs1.apply(init_weights)
71
-
72
- self.convs2 = nn.ModuleList(
73
- [
74
- torch.nn.utils.weight_norm(
75
- nn.Conv1d(
76
- channels,
77
- channels,
78
- kernel_size,
79
- 1,
80
- dilation=1,
81
- padding=get_padding(kernel_size, 1),
82
- )
83
- ),
84
- torch.nn.utils.weight_norm(
85
- nn.Conv1d(
86
- channels,
87
- channels,
88
- kernel_size,
89
- 1,
90
- dilation=1,
91
- padding=get_padding(kernel_size, 1),
92
- )
93
- ),
94
- torch.nn.utils.weight_norm(
95
- nn.Conv1d(
96
- channels,
97
- channels,
98
- kernel_size,
99
- 1,
100
- dilation=1,
101
- padding=get_padding(kernel_size, 1),
102
- )
103
- ),
104
- ]
105
- )
106
- self.convs2.apply(init_weights)
107
-
108
- def forward(self, x):
109
- for c1, c2 in zip(self.convs1, self.convs2):
110
- xt = torch.nn.functional.leaky_relu(x, LRELU_SLOPE)
111
- xt = c1(xt)
112
- xt = torch.nn.functional.leaky_relu(xt, LRELU_SLOPE)
113
- xt = c2(xt)
114
- x = xt + x
115
- return x
116
-
117
- def remove_weight_norm(self):
118
- for l in self.convs1:
119
- torch.nn.utils.remove_weight_norm(l)
120
- for l in self.convs2:
121
- torch.nn.utils.remove_weight_norm(l)
122
-
123
-
124
- class Generator_old(torch.nn.Module):
125
- def __init__(self, h):
126
- super(Generator_old, self).__init__()
127
- self.h = h
128
- self.num_kernels = len(h.resblock_kernel_sizes)
129
- self.num_upsamples = len(h.upsample_rates)
130
- self.conv_pre = torch.nn.utils.weight_norm(
131
- nn.Conv1d(h.num_mels, h.upsample_initial_channel, 7, 1, padding=3)
132
- )
133
- resblock = ResBlock
134
-
135
- self.ups = nn.ModuleList()
136
- for i, (u, k) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)):
137
- self.ups.append(
138
- torch.nn.utils.weight_norm(
139
- nn.ConvTranspose1d(
140
- h.upsample_initial_channel // (2**i),
141
- h.upsample_initial_channel // (2 ** (i + 1)),
142
- k,
143
- u,
144
- padding=(k - u) // 2,
145
- )
146
- )
147
- )
148
-
149
- self.resblocks = nn.ModuleList()
150
- for i in range(len(self.ups)):
151
- ch = h.upsample_initial_channel // (2 ** (i + 1))
152
- for j, (k, d) in enumerate(
153
- zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)
154
- ):
155
- self.resblocks.append(resblock(h, ch, k, d))
156
-
157
- self.conv_post = torch.nn.utils.weight_norm(nn.Conv1d(ch, 1, 7, 1, padding=3))
158
- self.ups.apply(init_weights)
159
- self.conv_post.apply(init_weights)
160
-
161
- def forward(self, x):
162
- x = self.conv_pre(x)
163
- for i in range(self.num_upsamples):
164
- x = torch.nn.functional.leaky_relu(x, LRELU_SLOPE)
165
- x = self.ups[i](x)
166
- xs = None
167
- for j in range(self.num_kernels):
168
- if xs is None:
169
- xs = self.resblocks[i * self.num_kernels + j](x)
170
- else:
171
- xs += self.resblocks[i * self.num_kernels + j](x)
172
- x = xs / self.num_kernels
173
- x = torch.nn.functional.leaky_relu(x)
174
- x = self.conv_post(x)
175
- x = torch.tanh(x)
176
-
177
- return x
178
-
179
- def remove_weight_norm(self):
180
- # print("Removing weight norm...")
181
- for l in self.ups:
182
- torch.nn.utils.remove_weight_norm(l)
183
- for l in self.resblocks:
184
- l.remove_weight_norm()
185
- torch.nn.utils.remove_weight_norm(self.conv_pre)
186
- torch.nn.utils.remove_weight_norm(self.conv_post)
187
-
188
-
189
-
190
- def nonlinearity(x):
191
- # swish
192
- return x * torch.sigmoid(x)
193
-
194
-
195
- def Normalize(in_channels, num_groups=32):
196
- return torch.nn.GroupNorm(
197
- num_groups=num_groups, num_channels=in_channels, eps=1e-6, affine=True
198
- )
199
-
200
- class Downsample(nn.Module):
201
- def __init__(self, in_channels, with_conv):
202
- super().__init__()
203
- self.with_conv = with_conv
204
- if self.with_conv:
205
- # Do time downsampling here
206
- # no asymmetric padding in torch conv, must do it ourselves
207
- self.conv = torch.nn.Conv2d(
208
- in_channels, in_channels, kernel_size=3, stride=2, padding=0
209
- )
210
-
211
- def forward(self, x):
212
- if self.with_conv:
213
- pad = (0, 1, 0, 1)
214
- x = torch.nn.functional.pad(x, pad, mode="constant", value=0)
215
- x = self.conv(x)
216
- else:
217
- x = torch.nn.functional.avg_pool2d(x, kernel_size=2, stride=2)
218
- return x
219
-
220
-
221
- class DownsampleTimeStride4(nn.Module):
222
- def __init__(self, in_channels, with_conv):
223
- super().__init__()
224
- self.with_conv = with_conv
225
- if self.with_conv:
226
- # Do time downsampling here
227
- # no asymmetric padding in torch conv, must do it ourselves
228
- self.conv = torch.nn.Conv2d(
229
- in_channels, in_channels, kernel_size=5, stride=(4, 2), padding=1
230
- )
231
-
232
- def forward(self, x):
233
- if self.with_conv:
234
- pad = (0, 1, 0, 1)
235
- x = torch.nn.functional.pad(x, pad, mode="constant", value=0)
236
- x = self.conv(x)
237
- else:
238
- x = torch.nn.functional.avg_pool2d(x, kernel_size=(4, 2), stride=(4, 2))
239
- return x
240
-
241
- class Upsample(nn.Module):
242
- def __init__(self, in_channels, with_conv):
243
- super().__init__()
244
- self.with_conv = with_conv
245
- if self.with_conv:
246
- self.conv = torch.nn.Conv2d(
247
- in_channels, in_channels, kernel_size=3, stride=1, padding=1
248
- )
249
-
250
- def forward(self, x):
251
- x = torch.nn.functional.interpolate(x, scale_factor=2.0, mode="nearest")
252
- if self.with_conv:
253
- x = self.conv(x)
254
- return x
255
-
256
-
257
- class UpsampleTimeStride4(nn.Module):
258
- def __init__(self, in_channels, with_conv):
259
- super().__init__()
260
- self.with_conv = with_conv
261
- if self.with_conv:
262
- self.conv = torch.nn.Conv2d(
263
- in_channels, in_channels, kernel_size=5, stride=1, padding=2
264
- )
265
-
266
- def forward(self, x):
267
- x = torch.nn.functional.interpolate(x, scale_factor=(4.0, 2.0), mode="nearest")
268
- if self.with_conv:
269
- x = self.conv(x)
270
- return x
271
-
272
- class AttnBlock(nn.Module):
273
- def __init__(self, in_channels):
274
- super().__init__()
275
- self.in_channels = in_channels
276
-
277
- self.norm = Normalize(in_channels)
278
- self.q = torch.nn.Conv2d(
279
- in_channels, in_channels, kernel_size=1, stride=1, padding=0
280
- )
281
- self.k = torch.nn.Conv2d(
282
- in_channels, in_channels, kernel_size=1, stride=1, padding=0
283
- )
284
- self.v = torch.nn.Conv2d(
285
- in_channels, in_channels, kernel_size=1, stride=1, padding=0
286
- )
287
- self.proj_out = torch.nn.Conv2d(
288
- in_channels, in_channels, kernel_size=1, stride=1, padding=0
289
- )
290
-
291
- def forward(self, x):
292
- h_ = x
293
- h_ = self.norm(h_)
294
- q = self.q(h_)
295
- k = self.k(h_)
296
- v = self.v(h_)
297
-
298
- # compute attention
299
- b, c, h, w = q.shape
300
- q = q.reshape(b, c, h * w).contiguous()
301
- q = q.permute(0, 2, 1).contiguous() # b,hw,c
302
- k = k.reshape(b, c, h * w).contiguous() # b,c,hw
303
- w_ = torch.bmm(q, k).contiguous() # b,hw,hw w[b,i,j]=sum_c q[b,i,c]k[b,c,j]
304
- w_ = w_ * (int(c) ** (-0.5))
305
- w_ = torch.nn.functional.softmax(w_, dim=2)
306
-
307
- # attend to values
308
- v = v.reshape(b, c, h * w).contiguous()
309
- w_ = w_.permute(0, 2, 1).contiguous() # b,hw,hw (first hw of k, second of q)
310
- h_ = torch.bmm(
311
- v, w_
312
- ).contiguous() # b, c,hw (hw of q) h_[b,c,j] = sum_i v[b,c,i] w_[b,i,j]
313
- h_ = h_.reshape(b, c, h, w).contiguous()
314
-
315
- h_ = self.proj_out(h_)
316
-
317
- return x + h_
318
-
319
-
320
- def make_attn(in_channels, attn_type="vanilla"):
321
- assert attn_type in ["vanilla", "linear", "none"], f"attn_type {attn_type} unknown"
322
- # print(f"making attention of type '{attn_type}' with {in_channels} in_channels")
323
- if attn_type == "vanilla":
324
- return AttnBlock(in_channels)
325
- elif attn_type == "none":
326
- return nn.Identity(in_channels)
327
- else:
328
- raise ValueError(attn_type)
329
-
330
-
331
- class ResnetBlock(nn.Module):
332
- def __init__(
333
- self,
334
- *,
335
- in_channels,
336
- out_channels=None,
337
- conv_shortcut=False,
338
- dropout,
339
- temb_channels=512,
340
- ):
341
- super().__init__()
342
- self.in_channels = in_channels
343
- out_channels = in_channels if out_channels is None else out_channels
344
- self.out_channels = out_channels
345
- self.use_conv_shortcut = conv_shortcut
346
-
347
- self.norm1 = Normalize(in_channels)
348
- self.conv1 = torch.nn.Conv2d(
349
- in_channels, out_channels, kernel_size=3, stride=1, padding=1
350
- )
351
- if temb_channels > 0:
352
- self.temb_proj = torch.nn.Linear(temb_channels, out_channels)
353
- self.norm2 = Normalize(out_channels)
354
- self.dropout = torch.nn.Dropout(dropout)
355
- self.conv2 = torch.nn.Conv2d(
356
- out_channels, out_channels, kernel_size=3, stride=1, padding=1
357
- )
358
- if self.in_channels != self.out_channels:
359
- if self.use_conv_shortcut:
360
- self.conv_shortcut = torch.nn.Conv2d(
361
- in_channels, out_channels, kernel_size=3, stride=1, padding=1
362
- )
363
- else:
364
- self.nin_shortcut = torch.nn.Conv2d(
365
- in_channels, out_channels, kernel_size=1, stride=1, padding=0
366
- )
367
-
368
- def forward(self, x, temb):
369
- h = x
370
- h = self.norm1(h)
371
- h = nonlinearity(h)
372
- h = self.conv1(h)
373
-
374
- if temb is not None:
375
- h = h + self.temb_proj(nonlinearity(temb))[:, :, None, None]
376
-
377
- h = self.norm2(h)
378
- h = nonlinearity(h)
379
- h = self.dropout(h)
380
- h = self.conv2(h)
381
-
382
- if self.in_channels != self.out_channels:
383
- if self.use_conv_shortcut:
384
- x = self.conv_shortcut(x)
385
- else:
386
- x = self.nin_shortcut(x)
387
-
388
- return x + h
389
-
390
-
391
- class Encoder(nn.Module):
392
- def __init__(
393
- self,
394
- *,
395
- ch,
396
- out_ch,
397
- ch_mult=(1, 2, 4, 8),
398
- num_res_blocks,
399
- attn_resolutions,
400
- dropout=0.0,
401
- resamp_with_conv=True,
402
- in_channels,
403
- resolution,
404
- z_channels,
405
- double_z=True,
406
- use_linear_attn=False,
407
- attn_type="vanilla",
408
- downsample_time_stride4_levels=[],
409
- **ignore_kwargs,
410
- ):
411
- super().__init__()
412
- if use_linear_attn:
413
- attn_type = "linear"
414
- self.ch = ch
415
- self.temb_ch = 0
416
- self.num_resolutions = len(ch_mult)
417
- self.num_res_blocks = num_res_blocks
418
- self.resolution = resolution
419
- self.in_channels = in_channels
420
- self.downsample_time_stride4_levels = downsample_time_stride4_levels
421
-
422
- if len(self.downsample_time_stride4_levels) > 0:
423
- assert max(self.downsample_time_stride4_levels) < self.num_resolutions, (
424
- "The level to perform downsample 4 operation need to be smaller than the total resolution number %s"
425
- % str(self.num_resolutions)
426
- )
427
-
428
- # downsampling
429
- self.conv_in = torch.nn.Conv2d(
430
- in_channels, self.ch, kernel_size=3, stride=1, padding=1
431
- )
432
-
433
- curr_res = resolution
434
- in_ch_mult = (1,) + tuple(ch_mult)
435
- self.in_ch_mult = in_ch_mult
436
- self.down = nn.ModuleList()
437
- for i_level in range(self.num_resolutions):
438
- block = nn.ModuleList()
439
- attn = nn.ModuleList()
440
- block_in = ch * in_ch_mult[i_level]
441
- block_out = ch * ch_mult[i_level]
442
- for i_block in range(self.num_res_blocks):
443
- block.append(
444
- ResnetBlock(
445
- in_channels=block_in,
446
- out_channels=block_out,
447
- temb_channels=self.temb_ch,
448
- dropout=dropout,
449
- )
450
- )
451
- block_in = block_out
452
- if curr_res in attn_resolutions:
453
- attn.append(make_attn(block_in, attn_type=attn_type))
454
- down = nn.Module()
455
- down.block = block
456
- down.attn = attn
457
- if i_level != self.num_resolutions - 1:
458
- if i_level in self.downsample_time_stride4_levels:
459
- down.downsample = DownsampleTimeStride4(block_in, resamp_with_conv)
460
- else:
461
- down.downsample = Downsample(block_in, resamp_with_conv)
462
- curr_res = curr_res // 2
463
- self.down.append(down)
464
-
465
- # middle
466
- self.mid = nn.Module()
467
- self.mid.block_1 = ResnetBlock(
468
- in_channels=block_in,
469
- out_channels=block_in,
470
- temb_channels=self.temb_ch,
471
- dropout=dropout,
472
- )
473
- self.mid.attn_1 = make_attn(block_in, attn_type=attn_type)
474
- self.mid.block_2 = ResnetBlock(
475
- in_channels=block_in,
476
- out_channels=block_in,
477
- temb_channels=self.temb_ch,
478
- dropout=dropout,
479
- )
480
-
481
- # end
482
- self.norm_out = Normalize(block_in)
483
- self.conv_out = torch.nn.Conv2d(
484
- block_in,
485
- 2 * z_channels if double_z else z_channels,
486
- kernel_size=3,
487
- stride=1,
488
- padding=1,
489
- )
490
-
491
- def forward(self, x):
492
- # timestep embedding
493
- temb = None
494
- # downsampling
495
- hs = [self.conv_in(x)]
496
- for i_level in range(self.num_resolutions):
497
- for i_block in range(self.num_res_blocks):
498
- h = self.down[i_level].block[i_block](hs[-1], temb)
499
- if len(self.down[i_level].attn) > 0:
500
- h = self.down[i_level].attn[i_block](h)
501
- hs.append(h)
502
- if i_level != self.num_resolutions - 1:
503
- hs.append(self.down[i_level].downsample(hs[-1]))
504
-
505
- # middle
506
- h = hs[-1]
507
- h = self.mid.block_1(h, temb)
508
- h = self.mid.attn_1(h)
509
- h = self.mid.block_2(h, temb)
510
-
511
- # end
512
- h = self.norm_out(h)
513
- h = nonlinearity(h)
514
- h = self.conv_out(h)
515
- return h
516
-
517
-
518
- class Decoder(nn.Module):
519
- def __init__(
520
- self,
521
- *,
522
- ch,
523
- out_ch,
524
- ch_mult=(1, 2, 4, 8),
525
- num_res_blocks,
526
- attn_resolutions,
527
- dropout=0.0,
528
- resamp_with_conv=True,
529
- in_channels,
530
- resolution,
531
- z_channels,
532
- give_pre_end=False,
533
- tanh_out=False,
534
- use_linear_attn=False,
535
- downsample_time_stride4_levels=[],
536
- attn_type="vanilla",
537
- **ignorekwargs,
538
- ):
539
- super().__init__()
540
- if use_linear_attn:
541
- attn_type = "linear"
542
- self.ch = ch
543
- self.temb_ch = 0
544
- self.num_resolutions = len(ch_mult)
545
- self.num_res_blocks = num_res_blocks
546
- self.resolution = resolution
547
- self.in_channels = in_channels
548
- self.give_pre_end = give_pre_end
549
- self.tanh_out = tanh_out
550
- self.downsample_time_stride4_levels = downsample_time_stride4_levels
551
-
552
- if len(self.downsample_time_stride4_levels) > 0:
553
- assert max(self.downsample_time_stride4_levels) < self.num_resolutions, (
554
- "The level to perform downsample 4 operation need to be smaller than the total resolution number %s"
555
- % str(self.num_resolutions)
556
- )
557
-
558
- # compute in_ch_mult, block_in and curr_res at lowest res
559
- (1,) + tuple(ch_mult)
560
- block_in = ch * ch_mult[self.num_resolutions - 1]
561
- curr_res = resolution // 2 ** (self.num_resolutions - 1)
562
- self.z_shape = (1, z_channels, curr_res, curr_res)
563
- # print(
564
- # "Working with z of shape {} = {} dimensions.".format(
565
- # self.z_shape, np.prod(self.z_shape)
566
- # )
567
- # )
568
-
569
- # z to block_in
570
- self.conv_in = torch.nn.Conv2d(
571
- z_channels, block_in, kernel_size=3, stride=1, padding=1
572
- )
573
-
574
- # middle
575
- self.mid = nn.Module()
576
- self.mid.block_1 = ResnetBlock(
577
- in_channels=block_in,
578
- out_channels=block_in,
579
- temb_channels=self.temb_ch,
580
- dropout=dropout,
581
- )
582
- self.mid.attn_1 = make_attn(block_in, attn_type=attn_type)
583
- self.mid.block_2 = ResnetBlock(
584
- in_channels=block_in,
585
- out_channels=block_in,
586
- temb_channels=self.temb_ch,
587
- dropout=dropout,
588
- )
589
-
590
- # upsampling
591
- self.up = nn.ModuleList()
592
- for i_level in reversed(range(self.num_resolutions)):
593
- block = nn.ModuleList()
594
- attn = nn.ModuleList()
595
- block_out = ch * ch_mult[i_level]
596
- for i_block in range(self.num_res_blocks + 1):
597
- block.append(
598
- ResnetBlock(
599
- in_channels=block_in,
600
- out_channels=block_out,
601
- temb_channels=self.temb_ch,
602
- dropout=dropout,
603
- )
604
- )
605
- block_in = block_out
606
- if curr_res in attn_resolutions:
607
- attn.append(make_attn(block_in, attn_type=attn_type))
608
- up = nn.Module()
609
- up.block = block
610
- up.attn = attn
611
- if i_level != 0:
612
- if i_level - 1 in self.downsample_time_stride4_levels:
613
- up.upsample = UpsampleTimeStride4(block_in, resamp_with_conv)
614
- else:
615
- up.upsample = Upsample(block_in, resamp_with_conv)
616
- curr_res = curr_res * 2
617
- self.up.insert(0, up) # prepend to get consistent order
618
-
619
- # end
620
- self.norm_out = Normalize(block_in)
621
- self.conv_out = torch.nn.Conv2d(
622
- block_in, out_ch, kernel_size=3, stride=1, padding=1
623
- )
624
-
625
- def forward(self, z):
626
- # assert z.shape[1:] == self.z_shape[1:]
627
- self.last_z_shape = z.shape
628
-
629
- # timestep embedding
630
- temb = None
631
-
632
- # z to block_in
633
- h = self.conv_in(z)
634
-
635
- # middle
636
- h = self.mid.block_1(h, temb)
637
- h = self.mid.attn_1(h)
638
- h = self.mid.block_2(h, temb)
639
-
640
- # upsampling
641
- for i_level in reversed(range(self.num_resolutions)):
642
- for i_block in range(self.num_res_blocks + 1):
643
- h = self.up[i_level].block[i_block](h, temb)
644
- if len(self.up[i_level].attn) > 0:
645
- h = self.up[i_level].attn[i_block](h)
646
- if i_level != 0:
647
- h = self.up[i_level].upsample(h)
648
-
649
- # end
650
- if self.give_pre_end:
651
- return h
652
-
653
- h = self.norm_out(h)
654
- h = nonlinearity(h)
655
- h = self.conv_out(h)
656
- if self.tanh_out:
657
- h = torch.tanh(h)
658
- return h
659
-
660
-
661
- class DiagonalGaussianDistribution(object):
662
- def __init__(self, parameters, deterministic=False):
663
- self.parameters = parameters
664
- self.mean, self.logvar = torch.chunk(parameters, 2, dim=1)
665
- self.logvar = torch.clamp(self.logvar, -30.0, 20.0)
666
- self.deterministic = deterministic
667
- self.std = torch.exp(0.5 * self.logvar)
668
- self.var = torch.exp(self.logvar)
669
- if self.deterministic:
670
- self.var = self.std = torch.zeros_like(self.mean).to(
671
- device=self.parameters.device
672
- )
673
-
674
- def sample(self):
675
- x = self.mean + self.std * torch.randn(self.mean.shape).to(
676
- device=self.parameters.device
677
- )
678
- return x
679
-
680
- def kl(self, other=None):
681
- if self.deterministic:
682
- return torch.Tensor([0.0])
683
- else:
684
- if other is None:
685
- return 0.5 * torch.mean(
686
- torch.pow(self.mean, 2) + self.var - 1.0 - self.logvar,
687
- dim=[1, 2, 3],
688
- )
689
- else:
690
- return 0.5 * torch.mean(
691
- torch.pow(self.mean - other.mean, 2) / other.var
692
- + self.var / other.var
693
- - 1.0
694
- - self.logvar
695
- + other.logvar,
696
- dim=[1, 2, 3],
697
- )
698
-
699
- def nll(self, sample, dims=[1, 2, 3]):
700
- if self.deterministic:
701
- return torch.Tensor([0.0])
702
- logtwopi = np.log(2.0 * np.pi)
703
- return 0.5 * torch.sum(
704
- logtwopi + self.logvar + torch.pow(sample - self.mean, 2) / self.var,
705
- dim=dims,
706
- )
707
-
708
- def mode(self):
709
- return self.mean
710
-
711
- def get_vocoder_config_48k():
712
- return {
713
- "resblock": "1",
714
- "num_gpus": 8,
715
- "batch_size": 128,
716
- "learning_rate": 0.0001,
717
- "adam_b1": 0.8,
718
- "adam_b2": 0.99,
719
- "lr_decay": 0.999,
720
- "seed": 1234,
721
-
722
- "upsample_rates": [6,5,4,2,2],
723
- "upsample_kernel_sizes": [12,10,8,4,4],
724
- "upsample_initial_channel": 1536,
725
- "resblock_kernel_sizes": [3,7,11,15],
726
- "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5], [1,3,5]],
727
-
728
- "segment_size": 15360,
729
- "num_mels": 256,
730
- "n_fft": 2048,
731
- "hop_size": 480,
732
- "win_size": 2048,
733
-
734
- "sampling_rate": 48000,
735
-
736
- "fmin": 20,
737
- "fmax": 24000,
738
- "fmax_for_loss": None,
739
-
740
- "num_workers": 8,
741
-
742
- "dist_config": {
743
- "dist_backend": "nccl",
744
- "dist_url": "tcp://localhost:18273",
745
- "world_size": 1
746
- }
747
- }
748
-
749
- def get_vocoder(config, device, mel_bins):
750
- name = "HiFi-GAN"
751
- speaker = ""
752
- if name == "MelGAN":
753
- if speaker == "LJSpeech":
754
- vocoder = torch.hub.load(
755
- "descriptinc/melgan-neurips", "load_melgan", "linda_johnson"
756
- )
757
- elif speaker == "universal":
758
- vocoder = torch.hub.load(
759
- "descriptinc/melgan-neurips", "load_melgan", "multi_speaker"
760
- )
761
- vocoder.mel2wav.eval()
762
- vocoder.mel2wav.to(device)
763
- elif name == "HiFi-GAN":
764
- if(mel_bins == 256):
765
- config = get_vocoder_config_48k()
766
- config = AttrDict(config)
767
- vocoder = Generator_old(config)
768
- # print("Load hifigan/g_01080000")
769
- # ckpt = torch.load(os.path.join(ROOT, "hifigan/g_01080000"))
770
- # ckpt = torch.load(os.path.join(ROOT, "hifigan/g_00660000"))
771
- # ckpt = torch_version_orig_mod_remove(ckpt)
772
- # vocoder.load_state_dict(ckpt["generator"])
773
- vocoder.eval()
774
- vocoder.remove_weight_norm()
775
- vocoder.to(device)
776
- else:
777
- raise ValueError(mel_bins)
778
- return vocoder
779
-
780
- def vocoder_infer(mels, vocoder, lengths=None):
781
- with torch.no_grad():
782
- wavs = vocoder(mels).squeeze(1)
783
-
784
- #wavs = (wavs.cpu().numpy() * 32768).astype("int16")
785
- wavs = (wavs.cpu().numpy())
786
-
787
- if lengths is not None:
788
- wavs = wavs[:, :lengths]
789
-
790
- # wavs = [wav for wav in wavs]
791
-
792
- # for i in range(len(mels)):
793
- # if lengths is not None:
794
- # wavs[i] = wavs[i][: lengths[i]]
795
-
796
- return wavs
797
-
798
- @torch.no_grad()
799
- def vocoder_chunk_infer(mels, vocoder, lengths=None):
800
- chunk_size = 256*4
801
- shift_size = 256*1
802
- ov_size = chunk_size-shift_size
803
- # import pdb;pdb.set_trace()
804
-
805
- for cinx in range(0, mels.shape[2], shift_size):
806
- if(cinx==0):
807
- wavs = vocoder(mels[:,:,cinx:cinx+chunk_size]).squeeze(1).cpu()
808
- num_samples = int(wavs.shape[-1]/chunk_size)*chunk_size
809
- wavs = wavs[:,0:num_samples]
810
- ov_sample = int(float(wavs.shape[-1]) * ov_size / chunk_size)
811
- ov_win = torch.from_numpy(np.linspace(0,1,ov_sample)[None,:])
812
- ov_win = torch.cat([ov_win,1-ov_win],-1)
813
- if(cinx+chunk_size>=mels.shape[2]):
814
- break
815
- else:
816
- cur_wav = vocoder(mels[:,:,cinx:cinx+chunk_size]).squeeze(1).cpu()[:,0:num_samples]
817
- wavs[:,-ov_sample:] = wavs[:,-ov_sample:] * ov_win[:,-ov_sample:] + cur_wav[:,0:ov_sample] * ov_win[:,0:ov_sample]
818
- # wavs[:,-ov_sample:] = wavs[:,-ov_sample:] * 1.0 + cur_wav[:,0:ov_sample] * 0.0
819
- wavs = torch.cat([wavs, cur_wav[:,ov_sample:]],-1)
820
- if(cinx+chunk_size>=mels.shape[2]):
821
- break
822
- # print(wavs.shape)
823
-
824
- wavs = (wavs.cpu().numpy())
825
-
826
- if lengths is not None:
827
- wavs = wavs[:, :lengths]
828
- # print(wavs.shape)
829
- return wavs
830
-
831
- def synth_one_sample(mel_input, mel_prediction, labels, vocoder):
832
- if vocoder is not None:
833
-
834
- wav_reconstruction = vocoder_infer(
835
- mel_input.permute(0, 2, 1),
836
- vocoder,
837
- )
838
- wav_prediction = vocoder_infer(
839
- mel_prediction.permute(0, 2, 1),
840
- vocoder,
841
- )
842
- else:
843
- wav_reconstruction = wav_prediction = None
844
-
845
- return wav_reconstruction, wav_prediction
846
-
847
-
848
- class AutoencoderKL(nn.Module):
849
- def __init__(
850
- self,
851
- ddconfig=None,
852
- lossconfig=None,
853
- batchsize=None,
854
- embed_dim=None,
855
- time_shuffle=1,
856
- subband=1,
857
- sampling_rate=16000,
858
- ckpt_path=None,
859
- reload_from_ckpt=None,
860
- ignore_keys=[],
861
- image_key="fbank",
862
- colorize_nlabels=None,
863
- monitor=None,
864
- base_learning_rate=1e-5,
865
- scale_factor=1
866
- ):
867
- super().__init__()
868
- self.automatic_optimization = False
869
- assert (
870
- "mel_bins" in ddconfig.keys()
871
- ), "mel_bins is not specified in the Autoencoder config"
872
- num_mel = ddconfig["mel_bins"]
873
- self.image_key = image_key
874
- self.sampling_rate = sampling_rate
875
- self.encoder = Encoder(**ddconfig)
876
- self.decoder = Decoder(**ddconfig)
877
-
878
- self.loss = None
879
- self.subband = int(subband)
880
-
881
- if self.subband > 1:
882
- print("Use subband decomposition %s" % self.subband)
883
-
884
- assert ddconfig["double_z"]
885
- self.quant_conv = torch.nn.Conv2d(2 * ddconfig["z_channels"], 2 * embed_dim, 1)
886
- self.post_quant_conv = torch.nn.Conv2d(embed_dim, ddconfig["z_channels"], 1)
887
-
888
- if self.image_key == "fbank":
889
- self.vocoder = get_vocoder(None, "cpu", num_mel)
890
- self.embed_dim = embed_dim
891
- if colorize_nlabels is not None:
892
- assert type(colorize_nlabels) == int
893
- self.register_buffer("colorize", torch.randn(3, colorize_nlabels, 1, 1))
894
- if monitor is not None:
895
- self.monitor = monitor
896
- if ckpt_path is not None:
897
- self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys)
898
- self.learning_rate = float(base_learning_rate)
899
- # print("Initial learning rate %s" % self.learning_rate)
900
-
901
- self.time_shuffle = time_shuffle
902
- self.reload_from_ckpt = reload_from_ckpt
903
- self.reloaded = False
904
- self.mean, self.std = None, None
905
-
906
- self.feature_cache = None
907
- self.flag_first_run = True
908
- self.train_step = 0
909
-
910
- self.logger_save_dir = None
911
- self.logger_exp_name = None
912
- self.scale_factor = scale_factor
913
-
914
- print("Num parameters:")
915
- print("Encoder : ", sum(p.numel() for p in self.encoder.parameters()))
916
- print("Decoder : ", sum(p.numel() for p in self.decoder.parameters()))
917
- print("Vocoder : ", sum(p.numel() for p in self.vocoder.parameters()))
918
-
919
- def get_log_dir(self):
920
- if self.logger_save_dir is None and self.logger_exp_name is None:
921
- return os.path.join(self.logger.save_dir, self.logger._project)
922
- else:
923
- return os.path.join(self.logger_save_dir, self.logger_exp_name)
924
-
925
- def set_log_dir(self, save_dir, exp_name):
926
- self.logger_save_dir = save_dir
927
- self.logger_exp_name = exp_name
928
-
929
- def init_from_ckpt(self, path, ignore_keys=list()):
930
- sd = torch.load(path, map_location="cpu")["state_dict"]
931
- keys = list(sd.keys())
932
- for k in keys:
933
- for ik in ignore_keys:
934
- if k.startswith(ik):
935
- print("Deleting key {} from state_dict.".format(k))
936
- del sd[k]
937
- self.load_state_dict(sd, strict=False)
938
- print(f"Restored from {path}")
939
-
940
- def encode(self, x):
941
- # x = self.time_shuffle_operation(x)
942
- # x = self.freq_split_subband(x)
943
- h = self.encoder(x)
944
- moments = self.quant_conv(h)
945
- posterior = DiagonalGaussianDistribution(moments)
946
- return posterior
947
-
948
- def decode(self, z):
949
- z = self.post_quant_conv(z)
950
- dec = self.decoder(z)
951
- # bs, ch, shuffled_timesteps, fbins = dec.size()
952
- # dec = self.time_unshuffle_operation(dec, bs, int(ch*shuffled_timesteps), fbins)
953
- # dec = self.freq_merge_subband(dec)
954
- return dec
955
-
956
- def decode_to_waveform(self, dec):
957
-
958
- if self.image_key == "fbank":
959
- dec = dec.squeeze(1).permute(0, 2, 1)
960
- wav_reconstruction = vocoder_chunk_infer(dec, self.vocoder)
961
- elif self.image_key == "stft":
962
- dec = dec.squeeze(1).permute(0, 2, 1)
963
- wav_reconstruction = self.wave_decoder(dec)
964
- return wav_reconstruction
965
-
966
- def mel_spectrogram_to_waveform(
967
- self, mel, savepath=".", bs=None, name="outwav", save=True
968
- ):
969
- # Mel: [bs, 1, t-steps, fbins]
970
- if len(mel.size()) == 4:
971
- mel = mel.squeeze(1)
972
- mel = mel.permute(0, 2, 1)
973
- waveform = self.vocoder(mel)
974
- waveform = waveform.cpu().detach().numpy()
975
- #if save:
976
- # self.save_waveform(waveform, savepath, name)
977
- return waveform
978
-
979
- @torch.no_grad()
980
- def encode_first_stage(self, x):
981
- return self.encode(x)
982
-
983
- @torch.no_grad()
984
- def decode_first_stage(self, z, predict_cids=False, force_not_quantize=False):
985
- if predict_cids:
986
- if z.dim() == 4:
987
- z = torch.argmax(z.exp(), dim=1).long()
988
- z = self.first_stage_model.quantize.get_codebook_entry(z, shape=None)
989
- z = rearrange(z, "b h w c -> b c h w").contiguous()
990
-
991
- z = 1.0 / self.scale_factor * z
992
- return self.decode(z)
993
-
994
- def decode_first_stage_withgrad(self, z):
995
- z = 1.0 / self.scale_factor * z
996
- return self.decode(z)
997
-
998
- def get_first_stage_encoding(self, encoder_posterior, use_mode=False):
999
- if isinstance(encoder_posterior, DiagonalGaussianDistribution) and not use_mode:
1000
- z = encoder_posterior.sample()
1001
- elif isinstance(encoder_posterior, DiagonalGaussianDistribution) and use_mode:
1002
- z = encoder_posterior.mode()
1003
- elif isinstance(encoder_posterior, torch.Tensor):
1004
- z = encoder_posterior
1005
- else:
1006
- raise NotImplementedError(
1007
- f"encoder_posterior of type '{type(encoder_posterior)}' not yet implemented"
1008
- )
1009
- return self.scale_factor * z
1010
-
1011
- def visualize_latent(self, input):
1012
- import matplotlib.pyplot as plt
1013
-
1014
- # for i in range(10):
1015
- # zero_input = torch.zeros_like(input) - 11.59
1016
- # zero_input[:,:,i * 16: i * 16 + 16,:16] += 13.59
1017
-
1018
- # posterior = self.encode(zero_input)
1019
- # latent = posterior.sample()
1020
- # avg_latent = torch.mean(latent, dim=1)[0]
1021
- # plt.imshow(avg_latent.cpu().detach().numpy().T)
1022
- # plt.savefig("%s.png" % i)
1023
- # plt.close()
1024
-
1025
- np.save("input.npy", input.cpu().detach().numpy())
1026
- # zero_input = torch.zeros_like(input) - 11.59
1027
- time_input = input.clone()
1028
- time_input[:, :, :, :32] *= 0
1029
- time_input[:, :, :, :32] -= 11.59
1030
-
1031
- np.save("time_input.npy", time_input.cpu().detach().numpy())
1032
-
1033
- posterior = self.encode(time_input)
1034
- latent = posterior.sample()
1035
- np.save("time_latent.npy", latent.cpu().detach().numpy())
1036
- avg_latent = torch.mean(latent, dim=1)
1037
- for i in range(avg_latent.size(0)):
1038
- plt.imshow(avg_latent[i].cpu().detach().numpy().T)
1039
- plt.savefig("freq_%s.png" % i)
1040
- plt.close()
1041
-
1042
- freq_input = input.clone()
1043
- freq_input[:, :, :512, :] *= 0
1044
- freq_input[:, :, :512, :] -= 11.59
1045
-
1046
- np.save("freq_input.npy", freq_input.cpu().detach().numpy())
1047
-
1048
- posterior = self.encode(freq_input)
1049
- latent = posterior.sample()
1050
- np.save("freq_latent.npy", latent.cpu().detach().numpy())
1051
- avg_latent = torch.mean(latent, dim=1)
1052
- for i in range(avg_latent.size(0)):
1053
- plt.imshow(avg_latent[i].cpu().detach().numpy().T)
1054
- plt.savefig("time_%s.png" % i)
1055
- plt.close()
1056
-
1057
- def get_input(self, batch):
1058
- fname, text, label_indices, waveform, stft, fbank = (
1059
- batch["fname"],
1060
- batch["text"],
1061
- batch["label_vector"],
1062
- batch["waveform"],
1063
- batch["stft"],
1064
- batch["log_mel_spec"],
1065
- )
1066
- # if(self.time_shuffle != 1):
1067
- # if(fbank.size(1) % self.time_shuffle != 0):
1068
- # pad_len = self.time_shuffle - (fbank.size(1) % self.time_shuffle)
1069
- # fbank = torch.nn.functional.pad(fbank, (0,0,0,pad_len))
1070
-
1071
- ret = {}
1072
-
1073
- ret["fbank"], ret["stft"], ret["fname"], ret["waveform"] = (
1074
- fbank.unsqueeze(1),
1075
- stft.unsqueeze(1),
1076
- fname,
1077
- waveform.unsqueeze(1),
1078
- )
1079
-
1080
- return ret
1081
-
1082
- def save_wave(self, batch_wav, fname, save_dir):
1083
- os.makedirs(save_dir, exist_ok=True)
1084
-
1085
- for wav, name in zip(batch_wav, fname):
1086
- name = os.path.basename(name)
1087
-
1088
- sf.write(os.path.join(save_dir, name), wav, samplerate=self.sampling_rate)
1089
-
1090
- def get_last_layer(self):
1091
- return self.decoder.conv_out.weight
1092
-
1093
- @torch.no_grad()
1094
- def log_images(self, batch, train=True, only_inputs=False, waveform=None, **kwargs):
1095
- log = dict()
1096
- x = batch.to(self.device)
1097
- if not only_inputs:
1098
- xrec, posterior = self(x)
1099
- log["samples"] = self.decode(posterior.sample())
1100
- log["reconstructions"] = xrec
1101
-
1102
- log["inputs"] = x
1103
- wavs = self._log_img(log, train=train, index=0, waveform=waveform)
1104
- return wavs
1105
-
1106
- def _log_img(self, log, train=True, index=0, waveform=None):
1107
- images_input = self.tensor2numpy(log["inputs"][index, 0]).T
1108
- images_reconstruct = self.tensor2numpy(log["reconstructions"][index, 0]).T
1109
- images_samples = self.tensor2numpy(log["samples"][index, 0]).T
1110
-
1111
- if train:
1112
- name = "train"
1113
- else:
1114
- name = "val"
1115
-
1116
- if self.logger is not None:
1117
- self.logger.log_image(
1118
- "img_%s" % name,
1119
- [images_input, images_reconstruct, images_samples],
1120
- caption=["input", "reconstruct", "samples"],
1121
- )
1122
-
1123
- inputs, reconstructions, samples = (
1124
- log["inputs"],
1125
- log["reconstructions"],
1126
- log["samples"],
1127
- )
1128
-
1129
- if self.image_key == "fbank":
1130
- wav_original, wav_prediction = synth_one_sample(
1131
- inputs[index],
1132
- reconstructions[index],
1133
- labels="validation",
1134
- vocoder=self.vocoder,
1135
- )
1136
- wav_original, wav_samples = synth_one_sample(
1137
- inputs[index], samples[index], labels="validation", vocoder=self.vocoder
1138
- )
1139
- wav_original, wav_samples, wav_prediction = (
1140
- wav_original[0],
1141
- wav_samples[0],
1142
- wav_prediction[0],
1143
- )
1144
- elif self.image_key == "stft":
1145
- wav_prediction = (
1146
- self.decode_to_waveform(reconstructions)[index, 0]
1147
- .cpu()
1148
- .detach()
1149
- .numpy()
1150
- )
1151
- wav_samples = (
1152
- self.decode_to_waveform(samples)[index, 0].cpu().detach().numpy()
1153
- )
1154
- wav_original = waveform[index, 0].cpu().detach().numpy()
1155
-
1156
- if self.logger is not None:
1157
- self.logger.experiment.log(
1158
- {
1159
- "original_%s"
1160
- % name: wandb.Audio(
1161
- wav_original, caption="original", sample_rate=self.sampling_rate
1162
- ),
1163
- "reconstruct_%s"
1164
- % name: wandb.Audio(
1165
- wav_prediction,
1166
- caption="reconstruct",
1167
- sample_rate=self.sampling_rate,
1168
- ),
1169
- "samples_%s"
1170
- % name: wandb.Audio(
1171
- wav_samples, caption="samples", sample_rate=self.sampling_rate
1172
- ),
1173
- }
1174
- )
1175
-
1176
- return wav_original, wav_prediction, wav_samples
1177
-
1178
- def tensor2numpy(self, tensor):
1179
- return tensor.cpu().detach().numpy()
1180
-
1181
- def to_rgb(self, x):
1182
- assert self.image_key == "segmentation"
1183
- if not hasattr(self, "colorize"):
1184
- self.register_buffer("colorize", torch.randn(3, x.shape[1], 1, 1).to(x))
1185
- x = torch.nn.functional.conv2d(x, weight=self.colorize)
1186
- x = 2.0 * (x - x.min()) / (x.max() - x.min()) - 1.0
1187
- return x
1188
-
1189
-
1190
- class IdentityFirstStage(torch.nn.Module):
1191
- def __init__(self, *args, vq_interface=False, **kwargs):
1192
- self.vq_interface = vq_interface # TODO: Should be true by default but check to not break older stuff
1193
- super().__init__()
1194
-
1195
- def encode(self, x, *args, **kwargs):
1196
- return x
1197
-
1198
- def decode(self, x, *args, **kwargs):
1199
- return x
1200
-
1201
- def quantize(self, x, *args, **kwargs):
1202
- if self.vq_interface:
1203
- return x, None, [None, None, None]
1204
- return x
1205
-
1206
- def forward(self, x, *args, **kwargs):
1207
- return x
1208
-
1209
-
1210
- def window_sumsquare(
1211
- window,
1212
- n_frames,
1213
- hop_length,
1214
- win_length,
1215
- n_fft,
1216
- dtype=np.float32,
1217
- norm=None,
1218
- ):
1219
- """
1220
- # from librosa 0.6
1221
- Compute the sum-square envelope of a window function at a given hop length.
1222
-
1223
- This is used to estimate modulation effects induced by windowing
1224
- observations in short-time fourier transforms.
1225
-
1226
- Parameters
1227
- ----------
1228
- window : string, tuple, number, callable, or list-like
1229
- Window specification, as in `get_window`
1230
-
1231
- n_frames : int > 0
1232
- The number of analysis frames
1233
-
1234
- hop_length : int > 0
1235
- The number of samples to advance between frames
1236
-
1237
- win_length : [optional]
1238
- The length of the window function. By default, this matches `n_fft`.
1239
-
1240
- n_fft : int > 0
1241
- The length of each analysis frame.
1242
-
1243
- dtype : np.dtype
1244
- The data type of the output
1245
-
1246
- Returns
1247
- -------
1248
- wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))`
1249
- The sum-squared envelope of the window function
1250
- """
1251
- if win_length is None:
1252
- win_length = n_fft
1253
-
1254
- n = n_fft + hop_length * (n_frames - 1)
1255
- x = np.zeros(n, dtype=dtype)
1256
-
1257
- # Compute the squared window at the desired length
1258
- win_sq = get_window(window, win_length, fftbins=True)
1259
- win_sq = librosa_util.normalize(win_sq, norm=norm) ** 2
1260
- win_sq = librosa_util.pad_center(win_sq, n_fft)
1261
-
1262
- # Fill the envelope
1263
- for i in range(n_frames):
1264
- sample = i * hop_length
1265
- x[sample : min(n, sample + n_fft)] += win_sq[: max(0, min(n_fft, n - sample))]
1266
- return x
1267
-
1268
- def dynamic_range_compression(x, normalize_fun=torch.log, C=1, clip_val=1e-5):
1269
- """
1270
- PARAMS
1271
- ------
1272
- C: compression factor
1273
- """
1274
- return normalize_fun(torch.clamp(x, min=clip_val) * C)
1275
-
1276
-
1277
- def dynamic_range_decompression(x, C=1):
1278
- """
1279
- PARAMS
1280
- ------
1281
- C: compression factor used to compress
1282
- """
1283
- return torch.exp(x) / C
1284
-
1285
-
1286
- class STFT(torch.nn.Module):
1287
- """adapted from Prem Seetharaman's https://github.com/pseeth/pytorch-stft"""
1288
-
1289
- def __init__(self, filter_length, hop_length, win_length, window="hann"):
1290
- super(STFT, self).__init__()
1291
- self.filter_length = filter_length
1292
- self.hop_length = hop_length
1293
- self.win_length = win_length
1294
- self.window = window
1295
- self.forward_transform = None
1296
- scale = self.filter_length / self.hop_length
1297
- fourier_basis = np.fft.fft(np.eye(self.filter_length))
1298
-
1299
- cutoff = int((self.filter_length / 2 + 1))
1300
- fourier_basis = np.vstack(
1301
- [np.real(fourier_basis[:cutoff, :]), np.imag(fourier_basis[:cutoff, :])]
1302
- )
1303
-
1304
- forward_basis = torch.FloatTensor(fourier_basis[:, None, :])
1305
- inverse_basis = torch.FloatTensor(
1306
- np.linalg.pinv(scale * fourier_basis).T[:, None, :]
1307
- )
1308
-
1309
- if window is not None:
1310
- assert filter_length >= win_length
1311
- # get window and zero center pad it to filter_length
1312
- fft_window = get_window(window, win_length, fftbins=True)
1313
- fft_window = pad_center(fft_window, size=filter_length)
1314
- fft_window = torch.from_numpy(fft_window).float()
1315
-
1316
- # window the bases
1317
- forward_basis *= fft_window
1318
- inverse_basis *= fft_window
1319
-
1320
- self.register_buffer("forward_basis", forward_basis.float())
1321
- self.register_buffer("inverse_basis", inverse_basis.float())
1322
-
1323
- def transform(self, input_data):
1324
-
1325
- device = self.forward_basis.device
1326
- input_data = input_data.to(device)
1327
-
1328
- num_batches = input_data.size(0)
1329
- num_samples = input_data.size(1)
1330
-
1331
- self.num_samples = num_samples
1332
-
1333
- # similar to librosa, reflect-pad the input
1334
- input_data = input_data.view(num_batches, 1, num_samples)
1335
- input_data = torch.nn.functional.pad(
1336
- input_data.unsqueeze(1),
1337
- (int(self.filter_length / 2), int(self.filter_length / 2), 0, 0),
1338
- mode="reflect",
1339
- )
1340
- input_data = input_data.squeeze(1)
1341
-
1342
- forward_transform = torch.nn.functional.conv1d(
1343
- input_data,
1344
- torch.autograd.Variable(self.forward_basis, requires_grad=False),
1345
- stride=self.hop_length,
1346
- padding=0,
1347
- )#.cpu()
1348
-
1349
- cutoff = int((self.filter_length / 2) + 1)
1350
- real_part = forward_transform[:, :cutoff, :]
1351
- imag_part = forward_transform[:, cutoff:, :]
1352
-
1353
- magnitude = torch.sqrt(real_part**2 + imag_part**2)
1354
- phase = torch.autograd.Variable(torch.atan2(imag_part.data, real_part.data))
1355
-
1356
- return magnitude, phase
1357
-
1358
- def inverse(self, magnitude, phase):
1359
-
1360
- device = self.forward_basis.device
1361
- magnitude, phase = magnitude.to(device), phase.to(device)
1362
-
1363
- recombine_magnitude_phase = torch.cat(
1364
- [magnitude * torch.cos(phase), magnitude * torch.sin(phase)], dim=1
1365
- )
1366
-
1367
- inverse_transform = torch.nn.functional.conv_transpose1d(
1368
- recombine_magnitude_phase,
1369
- torch.autograd.Variable(self.inverse_basis, requires_grad=False),
1370
- stride=self.hop_length,
1371
- padding=0,
1372
- )
1373
-
1374
- if self.window is not None:
1375
- window_sum = window_sumsquare(
1376
- self.window,
1377
- magnitude.size(-1),
1378
- hop_length=self.hop_length,
1379
- win_length=self.win_length,
1380
- n_fft=self.filter_length,
1381
- dtype=np.float32,
1382
- )
1383
- # remove modulation effects
1384
- approx_nonzero_indices = torch.from_numpy(
1385
- np.where(window_sum > tiny(window_sum))[0]
1386
- )
1387
- window_sum = torch.autograd.Variable(
1388
- torch.from_numpy(window_sum), requires_grad=False
1389
- )
1390
- window_sum = window_sum
1391
- inverse_transform[:, :, approx_nonzero_indices] /= window_sum[
1392
- approx_nonzero_indices
1393
- ]
1394
-
1395
- # scale by hop ratio
1396
- inverse_transform *= float(self.filter_length) / self.hop_length
1397
-
1398
- inverse_transform = inverse_transform[:, :, int(self.filter_length / 2) :]
1399
- inverse_transform = inverse_transform[:, :, : -int(self.filter_length / 2) :]
1400
-
1401
- return inverse_transform
1402
-
1403
- def forward(self, input_data):
1404
- self.magnitude, self.phase = self.transform(input_data)
1405
- reconstruction = self.inverse(self.magnitude, self.phase)
1406
- return reconstruction
1407
-
1408
-
1409
- class TacotronSTFT(torch.nn.Module):
1410
- def __init__(
1411
- self,
1412
- filter_length,
1413
- hop_length,
1414
- win_length,
1415
- n_mel_channels,
1416
- sampling_rate,
1417
- mel_fmin,
1418
- mel_fmax,
1419
- ):
1420
- super(TacotronSTFT, self).__init__()
1421
- self.n_mel_channels = n_mel_channels
1422
- self.sampling_rate = sampling_rate
1423
- self.stft_fn = STFT(filter_length, hop_length, win_length)
1424
- mel_basis = librosa_mel_fn(
1425
- sr = sampling_rate, n_fft = filter_length, n_mels = n_mel_channels, fmin = mel_fmin, fmax = mel_fmax
1426
- )
1427
- mel_basis = torch.from_numpy(mel_basis).float()
1428
- self.register_buffer("mel_basis", mel_basis)
1429
-
1430
- def spectral_normalize(self, magnitudes, normalize_fun):
1431
- output = dynamic_range_compression(magnitudes, normalize_fun)
1432
- return output
1433
-
1434
- def spectral_de_normalize(self, magnitudes):
1435
- output = dynamic_range_decompression(magnitudes)
1436
- return output
1437
-
1438
- def mel_spectrogram(self, y, normalize_fun=torch.log):
1439
- """Computes mel-spectrograms from a batch of waves
1440
- PARAMS
1441
- ------
1442
- y: Variable(torch.FloatTensor) with shape (B, T) in range [-1, 1]
1443
-
1444
- RETURNS
1445
- -------
1446
- mel_output: torch.FloatTensor of shape (B, n_mel_channels, T)
1447
- """
1448
- assert torch.min(y.data) >= -1, torch.min(y.data)
1449
- assert torch.max(y.data) <= 1, torch.max(y.data)
1450
-
1451
- magnitudes, phases = self.stft_fn.transform(y)
1452
- magnitudes = magnitudes.data
1453
- mel_output = torch.matmul(self.mel_basis, magnitudes)
1454
- mel_output = self.spectral_normalize(mel_output, normalize_fun)
1455
- energy = torch.norm(magnitudes, dim=1)
1456
-
1457
- log_magnitudes = self.spectral_normalize(magnitudes, normalize_fun)
1458
-
1459
- return mel_output, log_magnitudes, energy
1460
-
1461
-
1462
- def build_pretrained_models(ckpt):
1463
- checkpoint = torch.load(ckpt, map_location="cpu")
1464
- scale_factor = checkpoint["state_dict"]["scale_factor"].item()
1465
- print("scale_factor: ", scale_factor)
1466
-
1467
- vae_state_dict = {k[18:]: v for k, v in checkpoint["state_dict"].items() if "first_stage_model." in k}
1468
-
1469
- config = {
1470
- "preprocessing": {
1471
- "audio": {
1472
- "sampling_rate": 48000,
1473
- "max_wav_value": 32768,
1474
- "duration": 10.24
1475
- },
1476
- "stft": {
1477
- "filter_length": 2048,
1478
- "hop_length": 480,
1479
- "win_length": 2048
1480
- },
1481
- "mel": {
1482
- "n_mel_channels": 256,
1483
- "mel_fmin": 20,
1484
- "mel_fmax": 24000
1485
- }
1486
- },
1487
- "model": {
1488
- "params": {
1489
- "first_stage_config": {
1490
- "params": {
1491
- "sampling_rate": 48000,
1492
- "batchsize": 4,
1493
- "monitor": "val/rec_loss",
1494
- "image_key": "fbank",
1495
- "subband": 1,
1496
- "embed_dim": 16,
1497
- "time_shuffle": 1,
1498
- "lossconfig": {
1499
- "target": "audioldm2.latent_diffusion.modules.losses.LPIPSWithDiscriminator",
1500
- "params": {
1501
- "disc_start": 50001,
1502
- "kl_weight": 1000,
1503
- "disc_weight": 0.5,
1504
- "disc_in_channels": 1
1505
- }
1506
- },
1507
- "ddconfig": {
1508
- "double_z": True,
1509
- "mel_bins": 256,
1510
- "z_channels": 16,
1511
- "resolution": 256,
1512
- "downsample_time": False,
1513
- "in_channels": 1,
1514
- "out_ch": 1,
1515
- "ch": 128,
1516
- "ch_mult": [
1517
- 1,
1518
- 2,
1519
- 4,
1520
- 8
1521
- ],
1522
- "num_res_blocks": 2,
1523
- "attn_resolutions": [],
1524
- "dropout": 0
1525
- }
1526
- }
1527
- },
1528
- }
1529
- }
1530
- }
1531
- vae_config = config["model"]["params"]["first_stage_config"]["params"]
1532
- vae_config["scale_factor"] = scale_factor
1533
-
1534
- vae = AutoencoderKL(**vae_config)
1535
- vae.load_state_dict(vae_state_dict)
1536
-
1537
- fn_STFT = TacotronSTFT(
1538
- config["preprocessing"]["stft"]["filter_length"],
1539
- config["preprocessing"]["stft"]["hop_length"],
1540
- config["preprocessing"]["stft"]["win_length"],
1541
- config["preprocessing"]["mel"]["n_mel_channels"],
1542
- config["preprocessing"]["audio"]["sampling_rate"],
1543
- config["preprocessing"]["mel"]["mel_fmin"],
1544
- config["preprocessing"]["mel"]["mel_fmax"],
1545
- )
1546
-
1547
- vae.eval()
1548
- fn_STFT.eval()
1549
- return vae, fn_STFT
1550
-
1551
-
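For orientation, a sketch of how the pieces defined above might be wired together: `build_pretrained_models`, the `TacotronSTFT` front end, and the VAE/HiFi-GAN decode path. The checkpoint path and the placeholder waveform are hypothetical; the actual loading is handled elsewhere in the repository (e.g. `generate.py`):

```python
import torch
from tools.get_melvaehifigan48k import build_pretrained_models

# Hypothetical checkpoint path; the real 48 kHz mel-VAE checkpoint is configured elsewhere.
vae, fn_STFT = build_pretrained_models("checkpoints/melvae_48k.ckpt")

wav = torch.zeros(1, 48000 * 10)             # placeholder: 10 s of mono audio in [-1, 1]
mel, _, _ = fn_STFT.mel_spectrogram(wav)     # [B, 256, T] log-mel features
fbank = mel.transpose(1, 2).unsqueeze(1)     # [B, 1, T, 256], the layout the 2-D encoder expects

with torch.no_grad():
    z = vae.get_first_stage_encoding(vae.encode_first_stage(fbank))  # scaled latent
    recon_mel = vae.decode_first_stage(z)                            # [B, 1, T', 256]
    audio = vae.decode_to_waveform(recon_mel)                        # chunked HiFi-GAN vocoding (numpy)
```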
 
MuCodec/tools/torch_tools.py DELETED
@@ -1,100 +0,0 @@
- import torch
- import torchaudio
- import random
- import itertools
- import numpy as np
-
-
-
- def normalize_wav(waveform):
-     waveform = waveform - torch.mean(waveform)
-     waveform = waveform / (torch.max(torch.abs(waveform)) + 1e-8)
-     return waveform * 0.5
-
-
- def pad_wav(waveform, segment_length):
-     waveform_length = len(waveform)
-
-     if segment_length is None or waveform_length == segment_length:
-         return waveform
-     elif waveform_length > segment_length:
-         return waveform[:segment_length]
-     else:
-         pad_wav = torch.zeros(segment_length - waveform_length).to(waveform.device)
-         waveform = torch.cat([waveform, pad_wav])
-         return waveform
-
-
- def _pad_spec(fbank, target_length=1024):
-     batch, n_frames, channels = fbank.shape
-     p = target_length - n_frames
-     if p > 0:
-         pad = torch.zeros(batch, p, channels).to(fbank.device)
-         fbank = torch.cat([fbank, pad], 1)
-     elif p < 0:
-         fbank = fbank[:, :target_length, :]
-
-     if channels % 2 != 0:
-         fbank = fbank[:, :, :-1]
-
-     return fbank
-
-
- def read_wav_file(filename, segment_length):
-     waveform, sr = torchaudio.load(filename)  # Faster!!!
-     waveform = torchaudio.functional.resample(waveform, orig_freq=sr, new_freq=16000)[0]
-     try:
-         waveform = normalize_wav(waveform)
-     except:
-         print("Exception normalizing:", filename)
-         waveform = torch.ones(160000)
-     waveform = pad_wav(waveform, segment_length).unsqueeze(0)
-     waveform = waveform / torch.max(torch.abs(waveform))
-     waveform = 0.5 * waveform
-     return waveform
-
-
- def get_mel_from_wav(audio, _stft):
-     audio = torch.nan_to_num(torch.clip(audio, -1, 1))
-     audio = torch.autograd.Variable(audio, requires_grad=False)
-     melspec, log_magnitudes_stft, energy = _stft.mel_spectrogram(audio)
-     return melspec, log_magnitudes_stft, energy
-
-
- def wav_to_fbank(paths, target_length=1024, fn_STFT=None):
-     assert fn_STFT is not None
-
-     waveform = torch.cat([read_wav_file(path, target_length * 160) for path in paths], 0)  # hop size is 160
-
-     fbank, log_magnitudes_stft, energy = get_mel_from_wav(waveform, fn_STFT)
-     fbank = fbank.transpose(1, 2)
-     log_magnitudes_stft = log_magnitudes_stft.transpose(1, 2)
-
-     fbank, log_magnitudes_stft = _pad_spec(fbank, target_length), _pad_spec(
-         log_magnitudes_stft, target_length
-     )
-
-     return fbank, log_magnitudes_stft, waveform
-
- def wav_to_fbank2(waveform, target_length=-1, fn_STFT=None):
-     assert fn_STFT is not None
-
-     fbank, log_magnitudes_stft, energy = get_mel_from_wav(waveform, fn_STFT)
-     fbank = fbank.transpose(1, 2)
-     log_magnitudes_stft = log_magnitudes_stft.transpose(1, 2)
-     # print(fbank.shape, log_magnitudes_stft.shape)
-
-     if target_length > 0:
-         fbank, log_magnitudes_stft = _pad_spec(fbank, target_length), _pad_spec(
-             log_magnitudes_stft, target_length
-         )
-
-     return fbank, log_magnitudes_stft, waveform
-
-
- def uncapitalize(s):
-     if s:
-         return s[:1].lower() + s[1:]
-     else:
-         return ""
-