DeepBeepMeep commited on
Commit
c51df6b
·
verified ·
1 Parent(s): 668932a

Upload 5 files

Browse files
ace_step/ace_step_v1_dcae_config.json ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "AutoencoderDC",
3
+ "_diffusers_version": "0.32.2",
4
+ "_name_or_path": "checkpoints/music_dcae_f8c8",
5
+ "attention_head_dim": 32,
6
+ "decoder_act_fns": "silu",
7
+ "decoder_block_out_channels": [
8
+ 128,
9
+ 256,
10
+ 512,
11
+ 1024
12
+ ],
13
+ "decoder_block_types": [
14
+ "ResBlock",
15
+ "ResBlock",
16
+ "ResBlock",
17
+ "EfficientViTBlock"
18
+ ],
19
+ "decoder_layers_per_block": [
20
+ 3,
21
+ 3,
22
+ 3,
23
+ 3
24
+ ],
25
+ "decoder_norm_types": "rms_norm",
26
+ "decoder_qkv_multiscales": [
27
+ [],
28
+ [],
29
+ [
30
+ 5
31
+ ],
32
+ [
33
+ 5
34
+ ]
35
+ ],
36
+ "downsample_block_type": "Conv",
37
+ "encoder_block_out_channels": [
38
+ 128,
39
+ 256,
40
+ 512,
41
+ 1024
42
+ ],
43
+ "encoder_block_types": [
44
+ "ResBlock",
45
+ "ResBlock",
46
+ "ResBlock",
47
+ "EfficientViTBlock"
48
+ ],
49
+ "encoder_layers_per_block": [
50
+ 2,
51
+ 2,
52
+ 3,
53
+ 3
54
+ ],
55
+ "encoder_qkv_multiscales": [
56
+ [],
57
+ [],
58
+ [
59
+ 5
60
+ ],
61
+ [
62
+ 5
63
+ ]
64
+ ],
65
+ "in_channels": 2,
66
+ "latent_channels": 8,
67
+ "scaling_factor": 0.41407,
68
+ "upsample_block_type": "interpolate"
69
+ }
ace_step/ace_step_v1_music_dcae_f8c8_bf16.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2b0cb469307ac50659d1880db2a99bae47d0df335cbb36853964662d4b80e8ee
3
+ size 313646516
ace_step/ace_step_v1_music_vocoder_bf16.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c92c9b46e28ab7b37b777780cf4308ad7ddac869636bb77aa61599358c4bc1c0
3
+ size 206350988
ace_step/ace_step_v1_transformer_config.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "ACEStepTransformer2DModel",
3
+ "_diffusers_version": "0.32.2",
4
+ "attention_head_dim": 128,
5
+ "in_channels": 8,
6
+ "inner_dim": 2560,
7
+ "lyric_encoder_vocab_size": 6693,
8
+ "lyric_hidden_size": 1024,
9
+ "max_height": 16,
10
+ "max_position": 32768,
11
+ "max_width": 32768,
12
+ "mlp_ratio": 2.5,
13
+ "num_attention_heads": 20,
14
+ "num_layers": 24,
15
+ "out_channels": 8,
16
+ "patch_size": [
17
+ 16,
18
+ 1
19
+ ],
20
+ "rope_theta": 1000000.0,
21
+ "speaker_embedding_dim": 512,
22
+ "ssl_encoder_depths": [
23
+ 8,
24
+ 8
25
+ ],
26
+ "ssl_latent_dims": [
27
+ 1024,
28
+ 768
29
+ ],
30
+ "ssl_names": [
31
+ "mert",
32
+ "m-hubert"
33
+ ],
34
+ "text_embedding_dim": 768
35
+ }
ace_step/ace_step_v1_vocoder_config.json ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "ADaMoSHiFiGANV1",
3
+ "_diffusers_version": "0.32.2",
4
+ "depths": [
5
+ 3,
6
+ 3,
7
+ 9,
8
+ 3
9
+ ],
10
+ "dims": [
11
+ 128,
12
+ 256,
13
+ 384,
14
+ 512
15
+ ],
16
+ "drop_path_rate": 0.0,
17
+ "f_max": 16000,
18
+ "f_min": 40,
19
+ "hop_length": 512,
20
+ "input_channels": 128,
21
+ "kernel_sizes": [
22
+ 7
23
+ ],
24
+ "n_fft": 2048,
25
+ "n_mels": 128,
26
+ "num_mels": 512,
27
+ "post_conv_kernel_size": 13,
28
+ "pre_conv_kernel_size": 13,
29
+ "resblock_dilation_sizes": [
30
+ [
31
+ 1,
32
+ 3,
33
+ 5
34
+ ],
35
+ [
36
+ 1,
37
+ 3,
38
+ 5
39
+ ],
40
+ [
41
+ 1,
42
+ 3,
43
+ 5
44
+ ],
45
+ [
46
+ 1,
47
+ 3,
48
+ 5
49
+ ]
50
+ ],
51
+ "resblock_kernel_sizes": [
52
+ 3,
53
+ 7,
54
+ 11,
55
+ 13
56
+ ],
57
+ "sampling_rate": 44100,
58
+ "upsample_initial_channel": 1024,
59
+ "upsample_kernel_sizes": [
60
+ 8,
61
+ 8,
62
+ 4,
63
+ 4,
64
+ 4,
65
+ 4,
66
+ 4
67
+ ],
68
+ "upsample_rates": [
69
+ 4,
70
+ 4,
71
+ 2,
72
+ 2,
73
+ 2,
74
+ 2,
75
+ 2
76
+ ],
77
+ "use_template": false,
78
+ "win_length": 2048
79
+ }