niobures commited on
Commit
b170ec5
·
verified ·
1 Parent(s): 2240006

speecht5_hifigan

Browse files
speecht5_hifigan/.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
speecht5_hifigan/README.md ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: microsoft/speecht5_hifigan
3
+ library_name: transformers.js
4
+ ---
5
+
6
+ https://huggingface.co/microsoft/speecht5_hifigan with ONNX weights to be compatible with Transformers.js.
7
+
8
+ ## Usage (Transformers.js)
9
+
10
+ If you haven't already, you can install the [Transformers.js](https://huggingface.co/docs/transformers.js) JavaScript library from [NPM](https://www.npmjs.com/package/@huggingface/transformers) using:
11
+ ```bash
12
+ npm i @huggingface/transformers
13
+ ```
14
+
15
+ **Example:** Generate speech from text.
16
+ ```js
17
+ import { AutoTokenizer, AutoProcessor, SpeechT5ForTextToSpeech, SpeechT5HifiGan, Tensor } from '@huggingface/transformers';
18
+
19
+ // Load the tokenizer and processor
20
+ const tokenizer = await AutoTokenizer.from_pretrained('Xenova/speecht5_tts');
21
+ const processor = await AutoProcessor.from_pretrained('Xenova/speecht5_tts');
22
+
23
+ // Load the models
24
+ // NOTE: We use the unquantized versions as they are more accurate
25
+ const model = await SpeechT5ForTextToSpeech.from_pretrained('Xenova/speecht5_tts', { dtype: "fp32" }); // Options: "fp32", "fp16", "q8", "q4"
26
+ const vocoder = await SpeechT5HifiGan.from_pretrained('Xenova/speecht5_hifigan', { dtype: "fp32" }); // Options: "fp32", "fp16", "q8", "q4"
27
+
28
+ // Load speaker embeddings from URL
29
+ const speaker_embeddings_data = new Float32Array(
30
+ await (await fetch('https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/speaker_embeddings.bin')).arrayBuffer()
31
+ );
32
+ const speaker_embeddings = new Tensor(
33
+ 'float32',
34
+ speaker_embeddings_data,
35
+ [1, speaker_embeddings_data.length]
36
+ )
37
+
38
+ // Run tokenization
39
+ const { input_ids } = tokenizer('Hello, my dog is cute');
40
+
41
+ // Generate waveform
42
+ const { waveform } = await model.generate_speech(input_ids, speaker_embeddings, { vocoder });
43
+ console.log(waveform);
44
+ // Tensor {
45
+ // dims: [ 26112 ],
46
+ // type: 'float32',
47
+ // size: 26112,
48
+ // data: Float32Array(26112) [ -0.00043630177970044315, -0.00018082228780258447, ... ],
49
+ // }
50
+ ```
51
+
52
+ Optionally, save the audio to a wav file (Node.js):
53
+ ```js
54
+ // Write to file (Node.js)
55
+ import wavefile from 'wavefile';
56
+ import fs from 'fs';
57
+
58
+ const wav = new wavefile.WaveFile();
59
+ wav.fromScratch(1, processor.feature_extractor.config.sampling_rate, '32f', waveform.data);
60
+ fs.writeFileSync('out.wav', wav.toBuffer());
61
+ ```
62
+
63
+ <audio controls src="https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/on1ij9Y269ne9zlYN9mdb.wav"></audio>
64
+
65
+ ---
66
+
67
+ Note: Having a separate repo for ONNX weights is intended to be a temporary solution until WebML gains more traction. If you would like to make your models web-ready, we recommend converting to ONNX using [🤗 Optimum](https://huggingface.co/docs/optimum/index) and structuring your repo like this one (with ONNX weights located in a subfolder named `onnx`).
speecht5_hifigan/config.json ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "SpeechT5HifiGan"
4
+ ],
5
+ "initializer_range": 0.01,
6
+ "leaky_relu_slope": 0.1,
7
+ "model_in_dim": 80,
8
+ "model_type": "hifigan",
9
+ "normalize_before": true,
10
+ "resblock_dilation_sizes": [
11
+ [
12
+ 1,
13
+ 3,
14
+ 5
15
+ ],
16
+ [
17
+ 1,
18
+ 3,
19
+ 5
20
+ ],
21
+ [
22
+ 1,
23
+ 3,
24
+ 5
25
+ ]
26
+ ],
27
+ "resblock_kernel_sizes": [
28
+ 3,
29
+ 7,
30
+ 11
31
+ ],
32
+ "sampling_rate": 16000,
33
+ "torch_dtype": "float32",
34
+ "transformers_version": "4.27.0.dev0",
35
+ "upsample_initial_channel": 512,
36
+ "upsample_kernel_sizes": [
37
+ 8,
38
+ 8,
39
+ 8,
40
+ 8
41
+ ],
42
+ "upsample_rates": [
43
+ 4,
44
+ 4,
45
+ 4,
46
+ 4
47
+ ]
48
+ }
speecht5_hifigan/onnx/model.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:26408d9b8b3e83e66a3ef614a5621b84cacdcabd81b3a6a2fa984eb788566e6c
3
+ size 55432026
speecht5_hifigan/onnx/model_quantized.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d092ea63bbc7839f7317657b5f531ebb68377ee05ec866ec096dba25911f9feb
3
+ size 18251672
speecht5_hifigan/pth/.gitattributes ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
speecht5_hifigan/pth/README.md ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: mit
3
+ tags:
4
+ - audio
5
+ ---
6
+
7
+ # SpeechT5 HiFi-GAN Vocoder
8
+
9
+ This is the HiFi-GAN vocoder for use with the SpeechT5 text-to-speech and voice conversion models.
10
+
11
+ SpeechT5 was first released in [this repository](https://github.com/microsoft/SpeechT5/), [original weights](https://huggingface.co/mechanicalsea/speecht5-tts). The license used is [MIT](https://github.com/microsoft/SpeechT5/blob/main/LICENSE).
12
+
13
+ Disclaimer: The team releasing SpeechT5 did not write a model card for this model so this model card has been written by the Hugging Face team.
14
+
15
+ ## Citation
16
+
17
+ **BibTeX:**
18
+
19
+ ```bibtex
20
+ @inproceedings{ao-etal-2022-speecht5,
21
+ title = {{S}peech{T}5: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing},
22
+ author = {Ao, Junyi and Wang, Rui and Zhou, Long and Wang, Chengyi and Ren, Shuo and Wu, Yu and Liu, Shujie and Ko, Tom and Li, Qing and Zhang, Yu and Wei, Zhihua and Qian, Yao and Li, Jinyu and Wei, Furu},
23
+ booktitle = {Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
24
+ month = {May},
25
+ year = {2022},
26
+ pages={5723--5738},
27
+ }
28
+ ```
speecht5_hifigan/pth/config.json ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "SpeechT5HifiGan"
4
+ ],
5
+ "initializer_range": 0.01,
6
+ "leaky_relu_slope": 0.1,
7
+ "model_in_dim": 80,
8
+ "model_type": "hifigan",
9
+ "normalize_before": true,
10
+ "resblock_dilation_sizes": [
11
+ [
12
+ 1,
13
+ 3,
14
+ 5
15
+ ],
16
+ [
17
+ 1,
18
+ 3,
19
+ 5
20
+ ],
21
+ [
22
+ 1,
23
+ 3,
24
+ 5
25
+ ]
26
+ ],
27
+ "resblock_kernel_sizes": [
28
+ 3,
29
+ 7,
30
+ 11
31
+ ],
32
+ "sampling_rate": 16000,
33
+ "torch_dtype": "float32",
34
+ "transformers_version": "4.27.0.dev0",
35
+ "upsample_initial_channel": 512,
36
+ "upsample_kernel_sizes": [
37
+ 8,
38
+ 8,
39
+ 8,
40
+ 8
41
+ ],
42
+ "upsample_rates": [
43
+ 4,
44
+ 4,
45
+ 4,
46
+ 4
47
+ ]
48
+ }
speecht5_hifigan/pth/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b171e9bcd8a2b50dc9780040478dfa26783a9ee4be012cf5776914f091d6887b
3
+ size 50672453
speecht5_hifigan/source.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ https://huggingface.co/Xenova/speecht5_hifigan
2
+ https://huggingface.co/microsoft/speecht5_hifigan