Upload model

Browse files

Files changed (4) hide show

config.json +113 -0
model.py +49 -0
model.safetensors +3 -0
model_index.json +6 -0

config.json ADDED Viewed

	@@ -0,0 +1,113 @@

+{
+  "model_type": "semantic_vocoder",
+  "auto_map": {
+    "AutoConfig": "model.SemanticVocoderConfig",
+    "AutoModel": "model.SemanticVocoder"
+  },
+  "model_config": {
+    "autoencoder": {
+      "_target_": "models.autoencoder.waveform.semanticVocoder.semanticVocoder.SemanticVocoder",
+      "encoder_name": "none",
+      "n_timesteps": 200,
+      "sample_rate": 24000,
+      "clamp_pred": true,
+      "downsampling_ratio": 960,
+      "encoder_sampling_rate": 16000,
+      "vocoder": {
+        "_target_": "models.autoencoder.waveform.semanticVocoder.flow2gan.models.generator.MaeAudioGenerator",
+        "latent_dim": 768,
+        "hop_length": 960,
+        "n_ffts": [
+          512,
+          256,
+          128
+        ],
+        "hop_lengths": [
+          320,
+          160,
+          80
+        ],
+        "channels": [
+          768,
+          512,
+          384
+        ],
+        "time_embed_channels": 512,
+        "hidden_factor": 3,
+        "conv_kernel_sizes": [
+          7,
+          7,
+          7
+        ],
+        "num_layers": [
+          8,
+          8,
+          8
+        ],
+        "use_cond_encoder": true,
+        "cond_enc_channels": 512,
+        "cond_enc_hidden_factor": 3,
+        "cond_enc_conv_kernel_size": 7,
+        "cond_enc_num_layers": 4,
+        "residual_scale": 1.0,
+        "init_noise_scale": 0.1,
+        "pred_x1": true,
+        "branch_reduction": "mean",
+        "spec_scaling_loss": true,
+        "loss_n_filters": 256,
+        "loss_n_fft": 1024,
+        "loss_hop_length": 256,
+        "loss_power": 0.5,
+        "loss_eps": 1e-07,
+        "loss_scale_min": 0.01,
+        "loss_scale_max": 100.0,
+        "branch_dropout": 0.05,
+        "max_add_noise_scale": 0.0
+      }
+    },
+    "backbone": {
+      "_target_": "models.dit.mask_dit.UDiT",
+      "img_size": 250,
+      "patch_size": 1,
+      "in_chans": 768,
+      "out_chans": 768,
+      "input_type": "1d",
+      "embed_dim": 1024,
+      "depth": 24,
+      "num_heads": 16,
+      "mlp_ratio": 4.0,
+      "qkv_bias": false,
+      "qk_scale": null,
+      "qk_norm": "layernorm",
+      "norm_layer": "layernorm",
+      "act_layer": "geglu",
+      "context_norm": true,
+      "use_checkpoint": true,
+      "time_fusion": "ada_sola_bias",
+      "ada_sola_rank": 32,
+      "ada_sola_alpha": 32,
+      "cls_dim": null,
+      "context_dim": 1024,
+      "context_fusion": "cross",
+      "context_max_length": null,
+      "context_pe_method": "none",
+      "pe_method": "none",
+      "rope_mode": "shared",
+      "use_conv": true,
+      "skip": true,
+      "skip_norm": true
+    },
+    "cfg_drop_ratio": 0.2,
+    "sample_strategy": "uniform",
+    "_target_": "models.flow_matching.SingleTaskCrossAttentionAudioFlowMatching",
+    "content_encoder": {
+      "_target_": "models.content_encoder.content_encoder.ContentEncoder",
+      "embed_dim": 1024,
+      "text_encoder": {
+        "_target_": "models.content_encoder.text_encoder.T5TextEncoder",
+        "model_name": "google/flan-t5-large",
+        "embed_dim": 1024
+      }
+    }
+  }
+}

model.py ADDED Viewed

	@@ -0,0 +1,49 @@

+from pathlib import Path
+import copy
+import torch
+import hydra
+from omegaconf import OmegaConf
+from transformers import PreTrainedModel, PretrainedConfig
+class SemanticVocoderConfig(PretrainedConfig):
+    """Configuration class for SemanticVocoder model."""
+    model_type = "semantic_vocoder"
+    def __init__(self,
+        model_config=None,
+        **kwargs):
+        super().__init__(**kwargs)
+        self.model_config = model_config
+class SemanticVocoder(PreTrainedModel):
+    """HuggingFace compatible SemanticVocoder model."""
+    config_class = SemanticVocoderConfig
+    def __init__(self, config):
+        super().__init__(config)
+        self.model = hydra.utils.instantiate(config.model_config)
+    def forward(self,
+        content,
+        num_steps=100,
+        guidance_scale=3.5,
+        guidance_rescale=0.5,
+        vocoder_steps=200,
+        latent_shape=[768, 250],
+        **kwargs):
+        """Forward pass through the model."""
+        waveform = self.model.inference(
+            content=[content],
+            condition=None,
+            task=["text_to_audio"],
+            num_steps=num_steps,
+            guidance_scale=guidance_scale,
+            guidance_rescale=guidance_rescale,
+            vocoder_steps=vocoder_steps,
+            latent_shape=latent_shape,
+            **kwargs,
+        )
+        return waveform[0][0].cpu().numpy()

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:87ed79d3dc0eec648ca0db650cdea6121038957440bca51f5be070ede931385d
+size 4430573760

model_index.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+    "auto_map": {
+      "AutoConfig": "model.SemanticVocoderConfig",
+      "AutoModel": "model.SemanticVocoder"
+    }
+  }