mispeech
/

Dasheng-AudioGen-Multilingual

dasheng_audiogen

feature-extraction

audio-generation

Model card Files and versions

mie237 committed 20 days ago

Commit

ab1359a

·

verified ·

1 Parent(s): eea7fa2

Upload folder using huggingface_hub

Files changed (1) hide show

config.json +64 -0

config.json ADDED Viewed

	@@ -0,0 +1,64 @@

+{
+  "model_type": "dasheng_audiogen",
+  "architectures": [
+    "DashengAudioGenModel"
+  ],
+  "auto_map": {
+    "AutoConfig": "configuration_dasheng_audiogen.DashengAudioGenConfig",
+    "AutoModel": "modeling_dasheng_audiogen.DashengAudioGenModel"
+  },
+  "text_encoder_name": "google/mt5-large",
+  "tokenizer_name": "mispeech/dashengtokenizer",
+  "use_zero_instruction": true,
+  "instruction_seq_len": 1,
+  "task_instruction_dim": 1024,
+  "sample_rate": 16000,
+  "downsampling_ratio": 640,
+  "latent_dim": 1280,
+  "content_dim": 1024,
+  "frame_resolution": 0.005,
+  "duration_offset": 1.0,
+  "tokenizer_max_length": 512,
+  "dit_img_size": 1000,
+  "dit_patch_size": 1,
+  "dit_in_chans": 1280,
+  "dit_out_chans": 1280,
+  "dit_input_type": "1d",
+  "dit_embed_dim": 1536,
+  "dit_depth": 32,
+  "dit_num_heads": 24,
+  "dit_mlp_ratio": 4.0,
+  "dit_qk_norm": "layernorm",
+  "dit_norm_layer": "layernorm",
+  "dit_act_layer": "geglu",
+  "dit_context_norm": true,
+  "dit_time_fusion": "ada",
+  "dit_ada_sola_rank": 32,
+  "dit_ada_sola_alpha": 32,
+  "dit_ta_context_dim": 1024,
+  "dit_ta_context_fusion": "add",
+  "dit_ta_context_norm": true,
+  "dit_context_dim": 1024,
+  "dit_context_fusion": "cross",
+  "dit_context_pe_method": "none",
+  "dit_pe_method": "none",
+  "dit_rope_mode": "shared",
+  "adapter_num_heads": 16,
+  "adapter_dropout": 0.2,
+  "adapter_duration_grad_scale": 0.1,
+  "duration_predictor_filter_channels": 512,
+  "duration_predictor_n_layers": 5,
+  "duration_predictor_kernel_size": 3,
+  "duration_predictor_p_dropout": 0.5,
+  "special_tokens": [
+    "<|caption|>",
+    "<|speech|>",
+    "<|sfx|>",
+    "<|music|>",
+    "<|env|>",
+    "<|asr|>",
+    "<|speech_start|>",
+    "<|speech_end|>"
+  ],
+  "train_special_tokens": true
+}