Upload folder using huggingface_hub

Browse files

Files changed (2) hide show

model_config.json +136 -0
stable_audio_open_ravi_2000.safetensors +3 -0

model_config.json ADDED Viewed

	@@ -0,0 +1,136 @@

+{
+  "model_type": "diffusion_cond",
+  "sample_size": 2097152,
+  "sample_rate": 44100,
+  "audio_channels": 2,
+  "model": {
+    "pretransform": {
+      "type": "autoencoder",
+      "iterate_batch": true,
+      "config": {
+        "encoder": {
+          "type": "oobleck",
+          "requires_grad": false,
+          "config": {
+            "in_channels": 2,
+            "channels": 128,
+            "c_mults": [1, 2, 4, 8, 16],
+            "strides": [2, 4, 4, 8, 8],
+            "latent_dim": 128,
+            "use_snake": true
+          }
+        },
+        "decoder": {
+          "type": "oobleck",
+          "config": {
+            "out_channels": 2,
+            "channels": 128,
+            "c_mults": [1, 2, 4, 8, 16],
+            "strides": [2, 4, 4, 8, 8],
+            "latent_dim": 64,
+            "use_snake": true,
+            "final_tanh": false
+          }
+        },
+        "bottleneck": {
+          "type": "vae"
+        },
+        "latent_dim": 64,
+        "downsampling_ratio": 2048,
+        "io_channels": 2
+      }
+    },
+    "conditioning": {
+      "configs": [
+        {
+          "id": "prompt",
+          "type": "t5",
+          "config": {
+            "t5_model_name": "t5-base",
+            "max_length": 128
+          }
+        },
+        {
+          "id": "seconds_start",
+          "type": "number",
+          "config": {
+            "min_val": 0,
+            "max_val": 512
+          }
+        },
+        {
+          "id": "seconds_total",
+          "type": "number",
+          "config": {
+            "min_val": 0,
+            "max_val": 512
+          }
+        }
+      ],
+      "cond_dim": 768
+    },
+    "diffusion": {
+      "cross_attention_cond_ids": ["prompt", "seconds_start", "seconds_total"],
+      "global_cond_ids": ["seconds_start", "seconds_total"],
+      "type": "dit",
+      "config": {
+        "io_channels": 64,
+        "embed_dim": 1536,
+        "depth": 24,
+        "num_heads": 24,
+        "cond_token_dim": 768,
+        "global_cond_dim": 1536,
+        "project_cond_tokens": false,
+        "transformer_type": "continuous_transformer"
+      }
+    },
+    "io_channels": 64
+  },
+  "training": {
+    "use_ema": true,
+    "log_loss_info": false,
+    "optimizer_configs": {
+      "diffusion": {
+        "optimizer": {
+          "type": "AdamW",
+          "config": {
+            "lr": 5e-5,
+            "betas": [0.9, 0.999],
+            "weight_decay": 1e-3
+          }
+        },
+        "scheduler": {
+          "type": "InverseLR",
+          "config": {
+            "inv_gamma": 1000000,
+            "power": 0.5,
+            "warmup": 0.99
+          }
+        }
+      }
+    },
+    "demo": {
+      "demo_every": 1000,
+      "demo_steps": 250,
+      "num_demos": 3,
+      "demo_cond": [
+        {
+          "prompt": "An audio that blends elements of pop and country music, featuring an electric bass and a banjo.",
+          "seconds_start": 0,
+          "seconds_total": 30
+        },
+        {
+          "prompt": "An intense audio with no consistent structure, featuring glitchy electronic sounds, synthesizers, and distorted instruments.",
+          "seconds_start": 0,
+          "seconds_total": 30
+        },
+        {
+          "prompt": "An audio with a fast-paced beginning and end, and a slow middle, featuring electronic instruments and a prominent keyboard, with a jazz and funk style influence.",
+          "seconds_start": 0,
+          "seconds_total": 30
+        }
+      ],
+      "demo_cfg_scales": [4, 8]
+    }
+  }
+}

stable_audio_open_ravi_2000.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8047ddc75bf15da9a6f519a44c160726c25f2f818aa4462b518f2984f6c94513
+size 4853889016