Upload README.md with huggingface_hub

# Piper TTS: en_US-ryan-medium

Medium-size US English male voice.

## Model Details

| Attribute | Value |
|---|---|
| Format | ONNX |
| Language | English (US) |
| Gender | Male |
| Model Size | medium (~63 MB ONNX, ~15M params) |
| Sample Rate | 22050 Hz |
| License | CC BY-NC-SA 4.0 |

> **Note:** Piper uses the terms "medium", "high", etc. to refer to **model size**, not output quality.
> Medium models (~63 MB, ~15M params) and high models (~114 MB, ~28M params) both produce 22.05 kHz audio.
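
The table above can be checked against the config that ships with the model. A minimal sketch, using the same repo id, filename, and config keys as the standalone example below:

```python
import json
from huggingface_hub import hf_hub_download

# Fetch the voice config published alongside the ONNX weights.
config_path = hf_hub_download("Trelis/piper-en-us-ryan-medium", "model.onnx.json")
with open(config_path) as f:
    config = json.load(f)

print(config["audio"]["sample_rate"])  # expected: 22050
```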

## Usage

### With piper-tts (GPL)

```python
from piper import PiperVoice

# Load the voice from the local ONNX file; the matching model.onnx.json
# config is expected next to it.
voice = PiperVoice.load("model.onnx")

for chunk in voice.synthesize("Hello, this is a test."):
    pass
```
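
The loop above discards the audio. To keep it, the synthesized chunks can be streamed into a WAV file. A minimal sketch, assuming each chunk yielded by recent piper-tts releases exposes `sample_rate`, `sample_width`, `sample_channels`, and `audio_int16_bytes` (attribute names can vary between versions; the `hello.wav` name is illustrative):

```python
import wave

from piper import PiperVoice

voice = PiperVoice.load("model.onnx")

with wave.open("hello.wav", "wb") as wav_file:
    params_set = False
    for chunk in voice.synthesize("Hello, this is a test."):
        if not params_set:
            # Take the format from the first chunk (22050 Hz, 16-bit mono for this voice).
            wav_file.setframerate(chunk.sample_rate)
            wav_file.setsampwidth(chunk.sample_width)
            wav_file.setnchannels(chunk.sample_channels)
            params_set = True
        wav_file.writeframes(chunk.audio_int16_bytes)
```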

### Standalone ONNX (MIT, no piper-tts dependency)

Requires `espeak-ng` installed (`brew install espeak-ng` / `apt install espeak-ng`).

```python
import json, subprocess, numpy as np, onnxruntime as ort, soundfile as sf
from huggingface_hub import hf_hub_download

model_id = "Trelis/piper-en-us-ryan-medium"
onnx_path = hf_hub_download(model_id, "model.onnx")
config_path = hf_hub_download(model_id, "model.onnx.json")

with open(config_path) as f:
    config = json.load(f)

session = ort.InferenceSession(onnx_path, providers=["CPUExecutionProvider"])
phoneme_id_map = config["phoneme_id_map"]
espeak_voice = config["espeak"]["voice"]

def phonemize(text, voice):
    # Ask espeak-ng for the phonemes of the text, one output line per sentence.
    out = subprocess.run(
        ["espeak-ng", "-v", voice, "-q", "--ipa=2", "-x", text],
        capture_output=True, text=True,
    ).stdout.strip()
    return [list(line.replace("_", " ")) for line in out.split("\n") if line.strip()]

def to_ids(phonemes, pmap):
    # Piper id layout: BOS "^", then each phoneme followed by the pad "_", then EOS "$".
    ids = [pmap["^"][0], pmap["_"][0]]
    for p in phonemes:
        if p in pmap:
            ids.extend(pmap[p])
            ids.append(pmap["_"][0])
    ids.append(pmap["$"][0])
    return ids

text = "Hello, this is a test."
audio_chunks = []
for sentence in phonemize(text, espeak_voice):
    ids = to_ids(sentence, phoneme_id_map)
    if len(ids) < 3:
        continue
    audio = session.run(None, {
        "input": np.array([ids], dtype=np.int64),
        "input_lengths": np.array([len(ids)], dtype=np.int64),
        "scales": np.array([
            config["inference"]["noise_scale"],
            config["inference"]["length_scale"],
            config["inference"]["noise_w"],
        ], dtype=np.float32),
    })[0]
    audio_chunks.append(audio.squeeze())

audio = np.concatenate(audio_chunks).astype(np.float32)
sf.write("output.wav", audio, config["audio"]["sample_rate"])
```
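
The `scales` input also gives control over delivery: in Piper, `length_scale` multiplies the predicted phoneme durations, so values above 1.0 slow speech down and values below 1.0 speed it up, while `noise_scale` and `noise_w` control prosody variation. A small continuation of the block above (the `output_slow.wav` name is illustrative) that re-runs the last sentence roughly 30% slower:

```python
# Reuses session, config, and ids from the block above.
slow_scales = np.array([
    config["inference"]["noise_scale"],
    1.3 * config["inference"]["length_scale"],  # >1.0 stretches durations: slower speech
    config["inference"]["noise_w"],
], dtype=np.float32)

slow_audio = session.run(None, {
    "input": np.array([ids], dtype=np.int64),
    "input_lengths": np.array([len(ids)], dtype=np.int64),
    "scales": slow_scales,
})[0]

sf.write("output_slow.wav", slow_audio.squeeze().astype(np.float32), config["audio"]["sample_rate"])
```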

## Fine-tuning

You can fine-tune this model on your own voice data using [Trelis Studio](https://studio.trelis.com). Piper models can be trained on custom datasets to create personalized voices.