|
|
--- |
|
|
license: apache-2.0 |
|
|
language: |
|
|
- en |
|
|
base_model: |
|
|
- neuphonic/neutts-air |
|
|
pipeline_tag: text-to-speech |
|
|
library_name: transformers |
|
|
--- |
|
|
|
|
|
# NeuTTS Express |
|
|
|
|
|
<audio controls src="https://cdn-uploads.huggingface.co/production/uploads/6599dc66eabe0f3e98de7cf6/EyqWd8BtQT-o5nyb6CavE.wav"></audio> |
|
|
|
|
|
- Compact model size (0.5B) |
|
|
- Emotional and paralinguistic controls via \(tags\) |
|
|
- Supports streaming for low-latency use cases |
|
|
- Instant voice cloning |
|
|
|
|
|
Supported tags: `(neutral)`, `(happy)`, `(sad)`, `(angry)`, `(surprised)`, `(disgusted)`, `(fearful)`, `(giggle)`, `(gasp)`, `(exhale)`, `(laugh)`, `(chuckle)`, `(sigh)` |
|
|
|
|
|
```python |
|
|
import time |
|
|
import re |
|
|
import types |
|
|
import soundfile as sf |
|
|
from neuttsair.neutts import NeuTTSAir |
|
|
|
|
|
# patch function |
|
|
def patched_to_phones(self, text: str) -> str:
    """Phonemize ``text`` while passing parenthesised tags through untouched.

    Every ``( ... )`` span in ``text`` is kept verbatim; only the text
    between those spans is sent to ``self.phonemizer``. Whitespace inside
    each phonemized chunk is collapsed to single spaces, and all pieces are
    joined with a single space in their original order.

    Args:
        text: Input sentence, optionally containing ``(tag)`` markers.

    Returns:
        The phoneme string with the original tags interleaved.
    """
    tag_pattern = r'\([^)]*\)'
    tags = [match.group() for match in re.finditer(tag_pattern, text)]

    def _phonemize(chunk: str) -> str:
        # Phonemize one chunk and collapse any run of whitespace.
        return " ".join(self.phonemizer.phonemize([chunk])[0].split())

    # No tags at all: phonemize the whole input in one go.
    if not tags:
        return _phonemize(text)

    pieces = []
    # re.split yields len(tags) + 1 chunks, so chunk i is followed by tag i
    # for every chunk except the last one.
    for index, chunk in enumerate(re.split(tag_pattern, text)):
        if chunk.strip():
            spoken = _phonemize(chunk)
            if spoken:
                pieces.append(spoken)
        if index < len(tags):
            pieces.append(tags[index])

    return " ".join(pieces)
|
|
|
|
|
# Apply the monkey patch: replace _to_phones on the class so that
# parenthesised tags are passed through instead of being phonemized.
NeuTTSAir._to_phones = patched_to_phones

# Inference: load the backbone and the codec on the GPU.
tts = NeuTTSAir(
    backbone_repo="BarryFutureman/NeuTTS-Express",
    backbone_device="cuda",
    codec_repo="neuphonic/distill-neucodec",
    codec_device="cuda",
)

input_text = "I just got the best news ever (giggle), and I couldn't be happier!"

# perf_counter() is a monotonic clock meant for measuring intervals;
# report the result so the timing is not silently discarded.
start_time = time.perf_counter()
wav = tts.infer(input_text)
end_time = time.perf_counter()
print(f"Inference took {end_time - start_time:.2f} s")

# Write the generated waveform with a 24000 Hz sample rate.
sf.write("output.wav", wav, 24000)
|
|
``` |
|
|
|
|
|
Higher values of `temperature`, `top-p`, and `top-k` are recommended. |