mie237 committed on
Commit
ab1359a
·
verified ·
1 Parent(s): eea7fa2

Upload folder using huggingface_hub

Browse files
Files changed (1) hide show
  1. config.json +64 -0
config.json ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "dasheng_audiogen",
3
+ "architectures": [
4
+ "DashengAudioGenModel"
5
+ ],
6
+ "auto_map": {
7
+ "AutoConfig": "configuration_dasheng_audiogen.DashengAudioGenConfig",
8
+ "AutoModel": "modeling_dasheng_audiogen.DashengAudioGenModel"
9
+ },
10
+ "text_encoder_name": "google/mt5-large",
11
+ "tokenizer_name": "mispeech/dashengtokenizer",
12
+ "use_zero_instruction": true,
13
+ "instruction_seq_len": 1,
14
+ "task_instruction_dim": 1024,
15
+ "sample_rate": 16000,
16
+ "downsampling_ratio": 640,
17
+ "latent_dim": 1280,
18
+ "content_dim": 1024,
19
+ "frame_resolution": 0.005,
20
+ "duration_offset": 1.0,
21
+ "tokenizer_max_length": 512,
22
+ "dit_img_size": 1000,
23
+ "dit_patch_size": 1,
24
+ "dit_in_chans": 1280,
25
+ "dit_out_chans": 1280,
26
+ "dit_input_type": "1d",
27
+ "dit_embed_dim": 1536,
28
+ "dit_depth": 32,
29
+ "dit_num_heads": 24,
30
+ "dit_mlp_ratio": 4.0,
31
+ "dit_qk_norm": "layernorm",
32
+ "dit_norm_layer": "layernorm",
33
+ "dit_act_layer": "geglu",
34
+ "dit_context_norm": true,
35
+ "dit_time_fusion": "ada",
36
+ "dit_ada_sola_rank": 32,
37
+ "dit_ada_sola_alpha": 32,
38
+ "dit_ta_context_dim": 1024,
39
+ "dit_ta_context_fusion": "add",
40
+ "dit_ta_context_norm": true,
41
+ "dit_context_dim": 1024,
42
+ "dit_context_fusion": "cross",
43
+ "dit_context_pe_method": "none",
44
+ "dit_pe_method": "none",
45
+ "dit_rope_mode": "shared",
46
+ "adapter_num_heads": 16,
47
+ "adapter_dropout": 0.2,
48
+ "adapter_duration_grad_scale": 0.1,
49
+ "duration_predictor_filter_channels": 512,
50
+ "duration_predictor_n_layers": 5,
51
+ "duration_predictor_kernel_size": 3,
52
+ "duration_predictor_p_dropout": 0.5,
53
+ "special_tokens": [
54
+ "<|caption|>",
55
+ "<|speech|>",
56
+ "<|sfx|>",
57
+ "<|music|>",
58
+ "<|env|>",
59
+ "<|asr|>",
60
+ "<|speech_start|>",
61
+ "<|speech_end|>"
62
+ ],
63
+ "train_special_tokens": true
64
+ }