speecht5_hifigan
Browse files- speecht5_hifigan/.gitattributes +35 -0
- speecht5_hifigan/README.md +67 -0
- speecht5_hifigan/config.json +48 -0
- speecht5_hifigan/onnx/model.onnx +3 -0
- speecht5_hifigan/onnx/model_quantized.onnx +3 -0
- speecht5_hifigan/pth/.gitattributes +34 -0
- speecht5_hifigan/pth/README.md +28 -0
- speecht5_hifigan/pth/config.json +48 -0
- speecht5_hifigan/pth/pytorch_model.bin +3 -0
- speecht5_hifigan/source.txt +2 -0
speecht5_hifigan/.gitattributes
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
speecht5_hifigan/README.md
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
base_model: microsoft/speecht5_hifigan
|
| 3 |
+
library_name: transformers.js
|
| 4 |
+
---
|
| 5 |
+
|
| 6 |
+
This repository contains [microsoft/speecht5_hifigan](https://huggingface.co/microsoft/speecht5_hifigan) with ONNX weights, for compatibility with Transformers.js.
|
| 7 |
+
|
| 8 |
+
## Usage (Transformers.js)
|
| 9 |
+
|
| 10 |
+
If you haven't already, you can install the [Transformers.js](https://huggingface.co/docs/transformers.js) JavaScript library from [NPM](https://www.npmjs.com/package/@huggingface/transformers) using:
|
| 11 |
+
```bash
|
| 12 |
+
npm i @huggingface/transformers
|
| 13 |
+
```
|
| 14 |
+
|
| 15 |
+
**Example:** Generate speech from text.
|
| 16 |
+
```js
|
| 17 |
+
import { AutoTokenizer, AutoProcessor, SpeechT5ForTextToSpeech, SpeechT5HifiGan, Tensor } from '@huggingface/transformers';
|
| 18 |
+
|
| 19 |
+
// Load the tokenizer and processor
|
| 20 |
+
const tokenizer = await AutoTokenizer.from_pretrained('Xenova/speecht5_tts');
|
| 21 |
+
const processor = await AutoProcessor.from_pretrained('Xenova/speecht5_tts');
|
| 22 |
+
|
| 23 |
+
// Load the models
|
| 24 |
+
// NOTE: We use the unquantized versions as they are more accurate
|
| 25 |
+
const model = await SpeechT5ForTextToSpeech.from_pretrained('Xenova/speecht5_tts', { dtype: "fp32" }); // Options: "fp32", "fp16", "q8", "q4"
|
| 26 |
+
const vocoder = await SpeechT5HifiGan.from_pretrained('Xenova/speecht5_hifigan', { dtype: "fp32" }); // Options: "fp32", "fp16", "q8", "q4"
|
| 27 |
+
|
| 28 |
+
// Load speaker embeddings from URL
|
| 29 |
+
const speaker_embeddings_data = new Float32Array(
|
| 30 |
+
await (await fetch('https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/speaker_embeddings.bin')).arrayBuffer()
|
| 31 |
+
);
|
| 32 |
+
const speaker_embeddings = new Tensor(
|
| 33 |
+
'float32',
|
| 34 |
+
speaker_embeddings_data,
|
| 35 |
+
[1, speaker_embeddings_data.length]
|
| 36 |
+
)
|
| 37 |
+
|
| 38 |
+
// Run tokenization
|
| 39 |
+
const { input_ids } = tokenizer('Hello, my dog is cute');
|
| 40 |
+
|
| 41 |
+
// Generate waveform
|
| 42 |
+
const { waveform } = await model.generate_speech(input_ids, speaker_embeddings, { vocoder });
|
| 43 |
+
console.log(waveform);
|
| 44 |
+
// Tensor {
|
| 45 |
+
// dims: [ 26112 ],
|
| 46 |
+
// type: 'float32',
|
| 47 |
+
// size: 26112,
|
| 48 |
+
// data: Float32Array(26112) [ -0.00043630177970044315, -0.00018082228780258447, ... ],
|
| 49 |
+
// }
|
| 50 |
+
```
|
| 51 |
+
|
| 52 |
+
Optionally, save the audio to a WAV file (Node.js):
|
| 53 |
+
```js
|
| 54 |
+
// Write to file (Node.js)
|
| 55 |
+
import wavefile from 'wavefile';
|
| 56 |
+
import fs from 'fs';
|
| 57 |
+
|
| 58 |
+
const wav = new wavefile.WaveFile();
|
| 59 |
+
wav.fromScratch(1, processor.feature_extractor.config.sampling_rate, '32f', waveform.data);
|
| 60 |
+
fs.writeFileSync('out.wav', wav.toBuffer());
|
| 61 |
+
```
|
| 62 |
+
|
| 63 |
+
<audio controls src="https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/on1ij9Y269ne9zlYN9mdb.wav"></audio>
|
| 64 |
+
|
| 65 |
+
---
|
| 66 |
+
|
| 67 |
+
Note: Having a separate repo for ONNX weights is intended to be a temporary solution until WebML gains more traction. If you would like to make your models web-ready, we recommend converting to ONNX using [🤗 Optimum](https://huggingface.co/docs/optimum/index) and structuring your repo like this one (with ONNX weights located in a subfolder named `onnx`).
|
speecht5_hifigan/config.json
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"SpeechT5HifiGan"
|
| 4 |
+
],
|
| 5 |
+
"initializer_range": 0.01,
|
| 6 |
+
"leaky_relu_slope": 0.1,
|
| 7 |
+
"model_in_dim": 80,
|
| 8 |
+
"model_type": "hifigan",
|
| 9 |
+
"normalize_before": true,
|
| 10 |
+
"resblock_dilation_sizes": [
|
| 11 |
+
[
|
| 12 |
+
1,
|
| 13 |
+
3,
|
| 14 |
+
5
|
| 15 |
+
],
|
| 16 |
+
[
|
| 17 |
+
1,
|
| 18 |
+
3,
|
| 19 |
+
5
|
| 20 |
+
],
|
| 21 |
+
[
|
| 22 |
+
1,
|
| 23 |
+
3,
|
| 24 |
+
5
|
| 25 |
+
]
|
| 26 |
+
],
|
| 27 |
+
"resblock_kernel_sizes": [
|
| 28 |
+
3,
|
| 29 |
+
7,
|
| 30 |
+
11
|
| 31 |
+
],
|
| 32 |
+
"sampling_rate": 16000,
|
| 33 |
+
"torch_dtype": "float32",
|
| 34 |
+
"transformers_version": "4.27.0.dev0",
|
| 35 |
+
"upsample_initial_channel": 512,
|
| 36 |
+
"upsample_kernel_sizes": [
|
| 37 |
+
8,
|
| 38 |
+
8,
|
| 39 |
+
8,
|
| 40 |
+
8
|
| 41 |
+
],
|
| 42 |
+
"upsample_rates": [
|
| 43 |
+
4,
|
| 44 |
+
4,
|
| 45 |
+
4,
|
| 46 |
+
4
|
| 47 |
+
]
|
| 48 |
+
}
|
speecht5_hifigan/onnx/model.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:26408d9b8b3e83e66a3ef614a5621b84cacdcabd81b3a6a2fa984eb788566e6c
|
| 3 |
+
size 55432026
|
speecht5_hifigan/onnx/model_quantized.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d092ea63bbc7839f7317657b5f531ebb68377ee05ec866ec096dba25911f9feb
|
| 3 |
+
size 18251672
|
speecht5_hifigan/pth/.gitattributes
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
speecht5_hifigan/pth/README.md
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: mit
|
| 3 |
+
tags:
|
| 4 |
+
- audio
|
| 5 |
+
---
|
| 6 |
+
|
| 7 |
+
# SpeechT5 HiFi-GAN Vocoder
|
| 8 |
+
|
| 9 |
+
This is the HiFi-GAN vocoder for use with the SpeechT5 text-to-speech and voice conversion models.
|
| 10 |
+
|
| 11 |
+
SpeechT5 was first released in [this repository](https://github.com/microsoft/SpeechT5/); the original weights are available [here](https://huggingface.co/mechanicalsea/speecht5-tts). The license used is [MIT](https://github.com/microsoft/SpeechT5/blob/main/LICENSE).
|
| 12 |
+
|
| 13 |
+
Disclaimer: The team releasing SpeechT5 did not write a model card for this model, so this model card has been written by the Hugging Face team.
|
| 14 |
+
|
| 15 |
+
## Citation
|
| 16 |
+
|
| 17 |
+
**BibTeX:**
|
| 18 |
+
|
| 19 |
+
```bibtex
|
| 20 |
+
@inproceedings{ao-etal-2022-speecht5,
|
| 21 |
+
title = {{S}peech{T}5: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing},
|
| 22 |
+
author = {Ao, Junyi and Wang, Rui and Zhou, Long and Wang, Chengyi and Ren, Shuo and Wu, Yu and Liu, Shujie and Ko, Tom and Li, Qing and Zhang, Yu and Wei, Zhihua and Qian, Yao and Li, Jinyu and Wei, Furu},
|
| 23 |
+
booktitle = {Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
|
| 24 |
+
month = {May},
|
| 25 |
+
year = {2022},
|
| 26 |
+
pages={5723--5738},
|
| 27 |
+
}
|
| 28 |
+
```
|
speecht5_hifigan/pth/config.json
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"SpeechT5HifiGan"
|
| 4 |
+
],
|
| 5 |
+
"initializer_range": 0.01,
|
| 6 |
+
"leaky_relu_slope": 0.1,
|
| 7 |
+
"model_in_dim": 80,
|
| 8 |
+
"model_type": "hifigan",
|
| 9 |
+
"normalize_before": true,
|
| 10 |
+
"resblock_dilation_sizes": [
|
| 11 |
+
[
|
| 12 |
+
1,
|
| 13 |
+
3,
|
| 14 |
+
5
|
| 15 |
+
],
|
| 16 |
+
[
|
| 17 |
+
1,
|
| 18 |
+
3,
|
| 19 |
+
5
|
| 20 |
+
],
|
| 21 |
+
[
|
| 22 |
+
1,
|
| 23 |
+
3,
|
| 24 |
+
5
|
| 25 |
+
]
|
| 26 |
+
],
|
| 27 |
+
"resblock_kernel_sizes": [
|
| 28 |
+
3,
|
| 29 |
+
7,
|
| 30 |
+
11
|
| 31 |
+
],
|
| 32 |
+
"sampling_rate": 16000,
|
| 33 |
+
"torch_dtype": "float32",
|
| 34 |
+
"transformers_version": "4.27.0.dev0",
|
| 35 |
+
"upsample_initial_channel": 512,
|
| 36 |
+
"upsample_kernel_sizes": [
|
| 37 |
+
8,
|
| 38 |
+
8,
|
| 39 |
+
8,
|
| 40 |
+
8
|
| 41 |
+
],
|
| 42 |
+
"upsample_rates": [
|
| 43 |
+
4,
|
| 44 |
+
4,
|
| 45 |
+
4,
|
| 46 |
+
4
|
| 47 |
+
]
|
| 48 |
+
}
|
speecht5_hifigan/pth/pytorch_model.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b171e9bcd8a2b50dc9780040478dfa26783a9ee4be012cf5776914f091d6887b
|
| 3 |
+
size 50672453
|
speecht5_hifigan/source.txt
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
https://huggingface.co/Xenova/speecht5_hifigan
|
| 2 |
+
https://huggingface.co/microsoft/speecht5_hifigan
|