ZeyuXie committed on
Commit
e2da711
·
verified ·
1 Parent(s): 501ec8d

Upload model

Browse files
Files changed (4) hide show
  1. config.json +113 -0
  2. model.py +49 -0
  3. model.safetensors +3 -0
  4. model_index.json +6 -0
config.json ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "semantic_vocoder",
3
+ "auto_map": {
4
+ "AutoConfig": "model.SemanticVocoderConfig",
5
+ "AutoModel": "model.SemanticVocoder"
6
+ },
7
+ "model_config": {
8
+ "autoencoder": {
9
+ "_target_": "models.autoencoder.waveform.semanticVocoder.semanticVocoder.SemanticVocoder",
10
+ "encoder_name": "none",
11
+ "n_timesteps": 200,
12
+ "sample_rate": 24000,
13
+ "clamp_pred": true,
14
+ "downsampling_ratio": 960,
15
+ "encoder_sampling_rate": 16000,
16
+ "vocoder": {
17
+ "_target_": "models.autoencoder.waveform.semanticVocoder.flow2gan.models.generator.MaeAudioGenerator",
18
+ "latent_dim": 768,
19
+ "hop_length": 960,
20
+ "n_ffts": [
21
+ 512,
22
+ 256,
23
+ 128
24
+ ],
25
+ "hop_lengths": [
26
+ 320,
27
+ 160,
28
+ 80
29
+ ],
30
+ "channels": [
31
+ 768,
32
+ 512,
33
+ 384
34
+ ],
35
+ "time_embed_channels": 512,
36
+ "hidden_factor": 3,
37
+ "conv_kernel_sizes": [
38
+ 7,
39
+ 7,
40
+ 7
41
+ ],
42
+ "num_layers": [
43
+ 8,
44
+ 8,
45
+ 8
46
+ ],
47
+ "use_cond_encoder": true,
48
+ "cond_enc_channels": 512,
49
+ "cond_enc_hidden_factor": 3,
50
+ "cond_enc_conv_kernel_size": 7,
51
+ "cond_enc_num_layers": 4,
52
+ "residual_scale": 1.0,
53
+ "init_noise_scale": 0.1,
54
+ "pred_x1": true,
55
+ "branch_reduction": "mean",
56
+ "spec_scaling_loss": true,
57
+ "loss_n_filters": 256,
58
+ "loss_n_fft": 1024,
59
+ "loss_hop_length": 256,
60
+ "loss_power": 0.5,
61
+ "loss_eps": 1e-07,
62
+ "loss_scale_min": 0.01,
63
+ "loss_scale_max": 100.0,
64
+ "branch_dropout": 0.05,
65
+ "max_add_noise_scale": 0.0
66
+ }
67
+ },
68
+ "backbone": {
69
+ "_target_": "models.dit.mask_dit.UDiT",
70
+ "img_size": 250,
71
+ "patch_size": 1,
72
+ "in_chans": 768,
73
+ "out_chans": 768,
74
+ "input_type": "1d",
75
+ "embed_dim": 1024,
76
+ "depth": 24,
77
+ "num_heads": 16,
78
+ "mlp_ratio": 4.0,
79
+ "qkv_bias": false,
80
+ "qk_scale": null,
81
+ "qk_norm": "layernorm",
82
+ "norm_layer": "layernorm",
83
+ "act_layer": "geglu",
84
+ "context_norm": true,
85
+ "use_checkpoint": true,
86
+ "time_fusion": "ada_sola_bias",
87
+ "ada_sola_rank": 32,
88
+ "ada_sola_alpha": 32,
89
+ "cls_dim": null,
90
+ "context_dim": 1024,
91
+ "context_fusion": "cross",
92
+ "context_max_length": null,
93
+ "context_pe_method": "none",
94
+ "pe_method": "none",
95
+ "rope_mode": "shared",
96
+ "use_conv": true,
97
+ "skip": true,
98
+ "skip_norm": true
99
+ },
100
+ "cfg_drop_ratio": 0.2,
101
+ "sample_strategy": "uniform",
102
+ "_target_": "models.flow_matching.SingleTaskCrossAttentionAudioFlowMatching",
103
+ "content_encoder": {
104
+ "_target_": "models.content_encoder.content_encoder.ContentEncoder",
105
+ "embed_dim": 1024,
106
+ "text_encoder": {
107
+ "_target_": "models.content_encoder.text_encoder.T5TextEncoder",
108
+ "model_name": "google/flan-t5-large",
109
+ "embed_dim": 1024
110
+ }
111
+ }
112
+ }
113
+ }
model.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ import copy
3
+
4
+ import torch
5
+ import hydra
6
+ from omegaconf import OmegaConf
7
+ from transformers import PreTrainedModel, PretrainedConfig
8
+
9
class SemanticVocoderConfig(PretrainedConfig):
    """HuggingFace configuration wrapper for the SemanticVocoder model.

    The entire nested generation configuration is carried verbatim in
    ``model_config``; no individual hyperparameters are promoted to
    top-level attributes.
    """

    model_type = "semantic_vocoder"

    def __init__(self, model_config=None, **kwargs):
        super().__init__(**kwargs)
        # Stored as-is; later fed to hydra.utils.instantiate by the model.
        self.model_config = model_config
18
+
19
class SemanticVocoder(PreTrainedModel):
    """HuggingFace-compatible wrapper around the semantic vocoder pipeline.

    The underlying generation stack is built from the Hydra config tree
    stored in ``config.model_config``; all generation work is delegated
    to that model's ``inference`` method.
    """

    config_class = SemanticVocoderConfig

    def __init__(self, config):
        super().__init__(config)
        # Instantiate the full model from the nested ``_target_`` config
        # tree (see the accompanying config.json).
        self.model = hydra.utils.instantiate(config.model_config)

    def forward(self,
                content,
                num_steps=100,
                guidance_scale=3.5,
                guidance_rescale=0.5,
                vocoder_steps=200,
                latent_shape=None,
                **kwargs):
        """Generate a waveform for a single piece of ``content``.

        Args:
            content: One conditioning input (presumably a text prompt —
                TODO confirm against callers); it is wrapped in a
                single-element batch before inference.
            num_steps: Number of sampling steps for the main model.
            guidance_scale: Classifier-free guidance strength.
            guidance_rescale: Guidance rescaling factor.
            vocoder_steps: Number of sampling steps for the vocoder stage.
            latent_shape: Latent shape passed to ``inference``; defaults
                to ``[768, 250]`` when omitted.
            **kwargs: Forwarded verbatim to ``self.model.inference``.

        Returns:
            The generated waveform for the single batch item as a NumPy
            array on the CPU.
        """
        # Bug fix: the original used a mutable default argument
        # (``latent_shape=[768, 250]``) — a single shared list that
        # downstream code could mutate across calls. Use a None sentinel
        # and build a fresh list per call; callers see the same default.
        if latent_shape is None:
            latent_shape = [768, 250]

        waveform = self.model.inference(
            content=[content],       # batch of one
            condition=None,
            task=["text_to_audio"],
            num_steps=num_steps,
            guidance_scale=guidance_scale,
            guidance_rescale=guidance_rescale,
            vocoder_steps=vocoder_steps,
            latent_shape=latent_shape,
            **kwargs,
        )
        # Unbatch: first item, first channel, moved to host memory.
        return waveform[0][0].cpu().numpy()
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:87ed79d3dc0eec648ca0db650cdea6121038957440bca51f5be070ede931385d
3
+ size 4430573760
model_index.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "auto_map": {
3
+ "AutoConfig": "model.SemanticVocoderConfig",
4
+ "AutoModel": "model.SemanticVocoder"
5
+ }
6
+ }