mthsk committed on
Commit
cef0f7e
·
1 Parent(s): f98b2b3

Add the models

Browse files
README.md CHANGED
@@ -1,5 +1,5 @@
1
  ---
2
- title: Sovits Models
3
  emoji: 🎙️
4
  colorFrom: gray
5
  colorTo: pink
@@ -8,4 +8,5 @@ sdk_version: 3.18.0
8
  app_file: app.py
9
  pinned: false
10
  license: mit
 
11
  ---
 
1
  ---
2
+ title: Sovits Vtubers
3
  emoji: 🎙️
4
  colorFrom: gray
5
  colorTo: pink
 
8
  app_file: app.py
9
  pinned: false
10
  license: mit
11
+ duplicated_from: sayashi/sovits-models
12
  ---
app.py CHANGED
@@ -31,7 +31,7 @@ gr.Audio.postprocess = audio_postprocess
31
  def create_vc_fn(model, sid):
32
  def vc_fn(input_audio, vc_transform, auto_f0, tts_text, tts_voice, tts_mode):
33
  if tts_mode:
34
- if len(tts_text) > 100 and limitation:
35
  return "Text is too long", None
36
  if tts_text is None or tts_voice is None:
37
  return "You need to enter text and select a voice", None
@@ -48,8 +48,8 @@ def create_vc_fn(model, sid):
48
  return "You need to upload an audio", None
49
  sampling_rate, audio = input_audio
50
  duration = audio.shape[0] / sampling_rate
51
- if duration > 20 and limitation:
52
- return "Please upload an audio file that is less than 20 seconds. If you need to generate a longer audio file, please use Colab.", None
53
  audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
54
  if len(audio.shape) > 1:
55
  audio = librosa.to_mono(audio.transpose(1, 0))
@@ -79,10 +79,9 @@ if __name__ == '__main__':
79
  hubert_model = utils.get_hubert_model().to(args.device)
80
  models = []
81
  others = {
82
- "rudolf": "https://huggingface.co/spaces/sayashi/sovits-rudolf",
83
- "teio": "https://huggingface.co/spaces/sayashi/sovits-teio",
84
- "goldship": "https://huggingface.co/spaces/sayashi/sovits-goldship",
85
- "tannhauser": "https://huggingface.co/spaces/sayashi/sovits-tannhauser"
86
  }
87
  voices = []
88
  tts_voice_list = asyncio.get_event_loop().run_until_complete(edge_tts.list_voices())
@@ -97,10 +96,7 @@ if __name__ == '__main__':
97
  gr.Markdown(
98
  "# <center> Sovits Models\n"
99
  "## <center> The input audio should be clean and pure voice without background music.\n"
100
- "![visitor badge](https://visitor-badge.glitch.me/badge?page_id=sayashi.Sovits-Umamusume)\n\n"
101
- "[![image](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1wfsBbMzmtLflOJeqc5ZnJiLY7L239hJW?usp=share_link)\n\n"
102
- "[![Duplicate this Space](https://huggingface.co/datasets/huggingface/badges/raw/main/duplicate-this-space-sm-dark.svg)](https://huggingface.co/spaces/sayashi/sovits-models?duplicate=true)\n\n"
103
- "[![Original Repo](https://badgen.net/badge/icon/github?icon=github&label=Original%20Repo)](https://github.com/svc-develop-team/so-vits-svc)"
104
 
105
  )
106
  with gr.Tabs():
@@ -114,11 +110,11 @@ if __name__ == '__main__':
114
  )
115
  with gr.Row():
116
  with gr.Column():
117
- vc_input = gr.Audio(label="Input audio"+' (less than 20 seconds)' if limitation else '')
118
  vc_transform = gr.Number(label="vc_transform", value=0)
119
  auto_f0 = gr.Checkbox(label="auto_f0", value=False)
120
  tts_mode = gr.Checkbox(label="tts (use edge-tts as input)", value=False)
121
- tts_text = gr.Textbox(visible=False, label="TTS text (100 words limitation)" if limitation else "TTS text")
122
  tts_voice = gr.Dropdown(choices=voices, visible=False)
123
  vc_submit = gr.Button("Generate", variant="primary")
124
  with gr.Column():
 
31
  def create_vc_fn(model, sid):
32
  def vc_fn(input_audio, vc_transform, auto_f0, tts_text, tts_voice, tts_mode):
33
  if tts_mode:
34
+ if len(tts_text) > 600 and limitation:
35
  return "Text is too long", None
36
  if tts_text is None or tts_voice is None:
37
  return "You need to enter text and select a voice", None
 
48
  return "You need to upload an audio", None
49
  sampling_rate, audio = input_audio
50
  duration = audio.shape[0] / sampling_rate
51
+ if duration > 60 and limitation:
52
+ return "Please upload an audio file that is less than 60 seconds. If you need to generate a longer audio file, please use Colab.", None
53
  audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
54
  if len(audio.shape) > 1:
55
  audio = librosa.to_mono(audio.transpose(1, 0))
 
79
  hubert_model = utils.get_hubert_model().to(args.device)
80
  models = []
81
  others = {
82
+ "100% Orange Juice": "https://huggingface.co/spaces/mthsk/sovits-100orangejuice",
83
+ "Dota 2": "https://huggingface.co/spaces/mthsk/sovits-models",
84
+ "Miscellaneous": "https://huggingface.co/spaces/mthsk/sovits-models-misc"
 
85
  }
86
  voices = []
87
  tts_voice_list = asyncio.get_event_loop().run_until_complete(edge_tts.list_voices())
 
96
  gr.Markdown(
97
  "# <center> Sovits Models\n"
98
  "## <center> The input audio should be clean and pure voice without background music.\n"
99
+ "[![Original Repo](https://badgen.net/badge/icon/github?icon=github&label=Original%20Repo)](https://github.com/svc-develop-team/so-vits-svc)\n\n"
 
 
 
100
 
101
  )
102
  with gr.Tabs():
 
110
  )
111
  with gr.Row():
112
  with gr.Column():
113
+ vc_input = gr.Audio(label="Input audio"+' (less than 60 seconds)' if limitation else '')
114
  vc_transform = gr.Number(label="vc_transform", value=0)
115
  auto_f0 = gr.Checkbox(label="auto_f0", value=False)
116
  tts_mode = gr.Checkbox(label="tts (use edge-tts as input)", value=False)
117
+ tts_text = gr.Textbox(visible=False, label="TTS text (600 words limitation)" if limitation else "TTS text")
118
  tts_voice = gr.Dropdown(choices=voices, visible=False)
119
  vc_submit = gr.Button("Generate", variant="primary")
120
  with gr.Column():
models/amelia/amelia.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3ec661a99ab27cc4a3365ab6397f276b89e45d6bde0209eeb8b6a5b9a471e99f
3
+ size 542781471
models/amelia/config.json ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 800,
5
+ "seed": 1234,
6
+ "epochs": 10000,
7
+ "learning_rate": 0.0001,
8
+ "betas": [
9
+ 0.8,
10
+ 0.99
11
+ ],
12
+ "eps": 1e-09,
13
+ "batch_size": 16,
14
+ "fp16_run": false,
15
+ "lr_decay": 0.999875,
16
+ "segment_size": 10240,
17
+ "init_lr_ratio": 1,
18
+ "warmup_epochs": 0,
19
+ "c_mel": 45,
20
+ "c_kl": 1.0,
21
+ "use_sr": true,
22
+ "max_speclen": 512,
23
+ "port": "8001",
24
+ "keep_ckpts": 3
25
+ },
26
+ "data": {
27
+ "training_files": "filelists/44k/train.txt",
28
+ "validation_files": "filelists/44k/val.txt",
29
+ "max_wav_value": 32768.0,
30
+ "sampling_rate": 44100,
31
+ "filter_length": 2048,
32
+ "hop_length": 512,
33
+ "win_length": 2048,
34
+ "n_mel_channels": 80,
35
+ "mel_fmin": 0.0,
36
+ "mel_fmax": 22050
37
+ },
38
+ "model": {
39
+ "inter_channels": 192,
40
+ "hidden_channels": 192,
41
+ "filter_channels": 768,
42
+ "n_heads": 2,
43
+ "n_layers": 6,
44
+ "kernel_size": 3,
45
+ "p_dropout": 0.1,
46
+ "resblock": "1",
47
+ "resblock_kernel_sizes": [
48
+ 3,
49
+ 7,
50
+ 11
51
+ ],
52
+ "resblock_dilation_sizes": [
53
+ [
54
+ 1,
55
+ 3,
56
+ 5
57
+ ],
58
+ [
59
+ 1,
60
+ 3,
61
+ 5
62
+ ],
63
+ [
64
+ 1,
65
+ 3,
66
+ 5
67
+ ]
68
+ ],
69
+ "upsample_rates": [
70
+ 8,
71
+ 8,
72
+ 2,
73
+ 2,
74
+ 2
75
+ ],
76
+ "upsample_initial_channel": 512,
77
+ "upsample_kernel_sizes": [
78
+ 16,
79
+ 16,
80
+ 4,
81
+ 4,
82
+ 4
83
+ ],
84
+ "n_layers_q": 3,
85
+ "use_spectral_norm": false,
86
+ "gin_channels": 256,
87
+ "ssl_dim": 256,
88
+ "n_speakers": 200
89
+ },
90
+ "spk": {
91
+ "amelia": 0
92
+ }
93
+ }
models/amelia/cover.png ADDED
models/haachama/config.json ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 800,
5
+ "seed": 1234,
6
+ "epochs": 10000,
7
+ "learning_rate": 0.0001,
8
+ "betas": [
9
+ 0.8,
10
+ 0.99
11
+ ],
12
+ "eps": 1e-09,
13
+ "batch_size": 16,
14
+ "fp16_run": false,
15
+ "lr_decay": 0.999875,
16
+ "segment_size": 10240,
17
+ "init_lr_ratio": 1,
18
+ "warmup_epochs": 0,
19
+ "c_mel": 45,
20
+ "c_kl": 1.0,
21
+ "use_sr": true,
22
+ "max_speclen": 512,
23
+ "port": "8001",
24
+ "keep_ckpts": 3
25
+ },
26
+ "data": {
27
+ "training_files": "filelists/44k/train.txt",
28
+ "validation_files": "filelists/44k/val.txt",
29
+ "max_wav_value": 32768.0,
30
+ "sampling_rate": 44100,
31
+ "filter_length": 2048,
32
+ "hop_length": 512,
33
+ "win_length": 2048,
34
+ "n_mel_channels": 80,
35
+ "mel_fmin": 0.0,
36
+ "mel_fmax": 22050
37
+ },
38
+ "model": {
39
+ "inter_channels": 192,
40
+ "hidden_channels": 192,
41
+ "filter_channels": 768,
42
+ "n_heads": 2,
43
+ "n_layers": 6,
44
+ "kernel_size": 3,
45
+ "p_dropout": 0.1,
46
+ "resblock": "1",
47
+ "resblock_kernel_sizes": [
48
+ 3,
49
+ 7,
50
+ 11
51
+ ],
52
+ "resblock_dilation_sizes": [
53
+ [
54
+ 1,
55
+ 3,
56
+ 5
57
+ ],
58
+ [
59
+ 1,
60
+ 3,
61
+ 5
62
+ ],
63
+ [
64
+ 1,
65
+ 3,
66
+ 5
67
+ ]
68
+ ],
69
+ "upsample_rates": [
70
+ 8,
71
+ 8,
72
+ 2,
73
+ 2,
74
+ 2
75
+ ],
76
+ "upsample_initial_channel": 512,
77
+ "upsample_kernel_sizes": [
78
+ 16,
79
+ 16,
80
+ 4,
81
+ 4,
82
+ 4
83
+ ],
84
+ "n_layers_q": 3,
85
+ "use_spectral_norm": false,
86
+ "gin_channels": 256,
87
+ "ssl_dim": 256,
88
+ "n_speakers": 200
89
+ },
90
+ "spk": {
91
+ "haachama": 0
92
+ }
93
+ }
models/haachama/cover.png ADDED
models/haachama/haachama.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:546e1c390ab3691de855d23e37c9c648f64d662a4c4fef2689f4aaa3f82bf983
3
+ size 542781471
models/kson/config.json ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 800,
5
+ "seed": 1234,
6
+ "epochs": 10000,
7
+ "learning_rate": 0.0001,
8
+ "betas": [
9
+ 0.8,
10
+ 0.99
11
+ ],
12
+ "eps": 1e-09,
13
+ "batch_size": 16,
14
+ "fp16_run": false,
15
+ "lr_decay": 0.999875,
16
+ "segment_size": 10240,
17
+ "init_lr_ratio": 1,
18
+ "warmup_epochs": 0,
19
+ "c_mel": 45,
20
+ "c_kl": 1.0,
21
+ "use_sr": true,
22
+ "max_speclen": 512,
23
+ "port": "8001",
24
+ "keep_ckpts": 3
25
+ },
26
+ "data": {
27
+ "training_files": "filelists/44k/train.txt",
28
+ "validation_files": "filelists/44k/val.txt",
29
+ "max_wav_value": 32768.0,
30
+ "sampling_rate": 44100,
31
+ "filter_length": 2048,
32
+ "hop_length": 512,
33
+ "win_length": 2048,
34
+ "n_mel_channels": 80,
35
+ "mel_fmin": 0.0,
36
+ "mel_fmax": 22050
37
+ },
38
+ "model": {
39
+ "inter_channels": 192,
40
+ "hidden_channels": 192,
41
+ "filter_channels": 768,
42
+ "n_heads": 2,
43
+ "n_layers": 6,
44
+ "kernel_size": 3,
45
+ "p_dropout": 0.1,
46
+ "resblock": "1",
47
+ "resblock_kernel_sizes": [
48
+ 3,
49
+ 7,
50
+ 11
51
+ ],
52
+ "resblock_dilation_sizes": [
53
+ [
54
+ 1,
55
+ 3,
56
+ 5
57
+ ],
58
+ [
59
+ 1,
60
+ 3,
61
+ 5
62
+ ],
63
+ [
64
+ 1,
65
+ 3,
66
+ 5
67
+ ]
68
+ ],
69
+ "upsample_rates": [
70
+ 8,
71
+ 8,
72
+ 2,
73
+ 2,
74
+ 2
75
+ ],
76
+ "upsample_initial_channel": 512,
77
+ "upsample_kernel_sizes": [
78
+ 16,
79
+ 16,
80
+ 4,
81
+ 4,
82
+ 4
83
+ ],
84
+ "n_layers_q": 3,
85
+ "use_spectral_norm": false,
86
+ "gin_channels": 256,
87
+ "ssl_dim": 256,
88
+ "n_speakers": 200
89
+ },
90
+ "spk": {
91
+ "kson": 0
92
+ }
93
+ }
models/kson/cover.png ADDED
models/kson/kson.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ff00adc23136d5acb1b31466ed813c0bdc55c90366e52637927d059f7ab14bd2
3
+ size 542781471