Safetensors
File size: 3,637 Bytes
0d9b403
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
{
  "_target_": "modules.v2.vc_wrapper.VoiceConversionWrapper",
  "sr": 22050,
  "hop_size": 256,
  "mel_fn": {
    "_target_": "modules.audio.mel_spectrogram",
    "_partial_": true,
    "n_fft": 1024,
    "win_size": 1024,
    "hop_size": 256,
    "num_mels": 80,
    "sampling_rate": 22050,
    "fmin": 0,
    "fmax": null,
    "center": false
  },
  "cfm": {
    "_target_": "modules.v2.cfm.CFM",
    "estimator": {
      "_target_": "modules.v2.dit_wrapper.DiT",
      "time_as_token": true,
      "style_as_token": true,
      "uvit_skip_connection": false,
      "block_size": 8192,
      "depth": 13,
      "num_heads": 8,
      "hidden_dim": 512,
      "in_channels": 80,
      "content_dim": 512,
      "style_encoder_dim": 192,
      "class_dropout_prob": 0.1,
      "dropout_rate": 0.0,
      "attn_dropout_rate": 0.0
    }
  },
  "cfm_length_regulator": {
    "_target_": "modules.v2.length_regulator.InterpolateRegulator",
    "channels": 512,
    "is_discrete": true,
    "codebook_size": 2048,
    "sampling_ratios": [
      1,
      1,
      1,
      1
    ],
    "f0_condition": false
  },
  "ar": {
    "_target_": "modules.v2.ar.NaiveWrapper",
    "model": {
      "_target_": "modules.v2.ar.NaiveTransformer",
      "config": {
        "_target_": "modules.v2.ar.NaiveModelArgs",
        "dropout": 0.0,
        "rope_base": 10000.0,
        "dim": 768,
        "head_dim": 64,
        "n_local_heads": 2,
        "intermediate_size": 2304,
        "n_head": 12,
        "n_layer": 12,
        "vocab_size": 2049
      }
    }
  },
  "ar_length_regulator": {
    "_target_": "modules.v2.length_regulator.InterpolateRegulator",
    "channels": 768,
    "is_discrete": true,
    "codebook_size": 32,
    "sampling_ratios": [],
    "f0_condition": false
  },
  "style_encoder": {
    "_target_": "modules.campplus.DTDNN.CAMPPlus",
    "feat_dim": 80,
    "embedding_size": 192
  },
  "content_extractor_narrow": {
    "_target_": "modules.astral_quantization.default_model.AstralQuantizer",
    "tokenizer_name": "openai/whisper-small",
    "ssl_model_name": "facebook/hubert-large-ll60k",
    "ssl_output_layer": 18,
    "skip_ssl": true,
    "encoder": {
      "_target_": "modules.astral_quantization.convnext.ConvNeXtV2Stage",
      "dim": 512,
      "num_blocks": 12,
      "intermediate_dim": 1536,
      "dilation": 1,
      "input_dim": 1024
    },
    "quantizer": {
      "_target_": "modules.astral_quantization.bsq.BinarySphericalQuantize",
      "codebook_size": 32,
      "dim": 512,
      "entropy_loss_weight": 0.1,
      "diversity_gamma": 1.0,
      "spherical": true,
      "enable_entropy_loss": true,
      "soft_entropy_loss": true
    }
  },
  "content_extractor_wide": {
    "_target_": "modules.astral_quantization.default_model.AstralQuantizer",
    "tokenizer_name": "openai/whisper-small",
    "ssl_model_name": "facebook/hubert-large-ll60k",
    "ssl_output_layer": 18,
    "encoder": {
      "_target_": "modules.astral_quantization.convnext.ConvNeXtV2Stage",
      "dim": 512,
      "num_blocks": 12,
      "intermediate_dim": 1536,
      "dilation": 1,
      "input_dim": 1024
    },
    "quantizer": {
      "_target_": "modules.astral_quantization.bsq.BinarySphericalQuantize",
      "codebook_size": 2048,
      "dim": 512,
      "entropy_loss_weight": 0.1,
      "diversity_gamma": 1.0,
      "spherical": true,
      "enable_entropy_loss": true,
      "soft_entropy_loss": true
    }
  },
  "vocoder": {
    "_target_": "modules.bigvgan.bigvgan.BigVGAN.from_pretrained",
    "pretrained_model_name_or_path": "nvidia/bigvgan_v2_22khz_80band_256x",
    "use_cuda_kernel": false
  }
}