NeuTTS Express
- Compact model size (0.5B)
- Emotional and paralinguistic controls via (tags)
- Supports streaming for low-latency use cases
- Instant voice cloning
Supported tags: (neutral), (happy), (sad), (angry), (surprised), (disgusted), (fearful), (giggle), (gasp), (exhale), (laugh), (chuckle), (sign)
import time
import re
import types
import soundfile as sf
from neuttsair.neutts import NeuTTSAir
# patch function
def patched_to_phones(self, text: str) -> str:
    """Phonemize ``text`` while leaving parenthesised tags untouched.

    Every ``(...)`` span in ``text`` is copied verbatim into the output;
    only the text between those spans is passed to ``self.phonemizer``.
    Whitespace inside each phonemized segment is collapsed to single
    spaces, and segments and tags are joined with single spaces in their
    original order.

    Args:
        text: Input sentence, optionally containing ``(tag)`` markers.

    Returns:
        The phoneme string with the tags interleaved at their original
        positions.
    """
    tag_pattern = r'\([^)]*\)'
    tags = re.findall(tag_pattern, text)

    def normalized_phones(chunk: str) -> str:
        # phonemize() takes a batch and returns a batch; use the single
        # result and squeeze runs of whitespace down to one space.
        return " ".join(self.phonemizer.phonemize([chunk])[0].split())

    # Fast path: no tags, so the whole sentence is phonemized in one call.
    if not tags:
        return normalized_phones(text)

    # Splitting on the tag pattern yields one more segment than there are
    # tags, so segment i is followed by tag i whenever that tag exists.
    pieces = []
    for index, segment in enumerate(re.split(tag_pattern, text)):
        # Blank segments (e.g. between adjacent tags) are never sent to
        # the phonemizer.
        spoken = normalized_phones(segment) if segment.strip() else ""
        if spoken:
            pieces.append(spoken)
        if index < len(tags):
            pieces.append(tags[index])
    return " ".join(pieces)
# Apply the monkey patch: replace NeuTTSAir._to_phones with the tag-aware
# version so "(tag)" markers are kept verbatim in the phoneme string.
NeuTTSAir._to_phones = patched_to_phones

# Inference: load the backbone and the codec, both on the CUDA device.
tts = NeuTTSAir(
    backbone_repo="BarryFutureman/NeuTTS-Express",
    backbone_device="cuda",
    codec_repo="neuphonic/distill-neucodec",
    codec_device="cuda",
)
input_text = "I just got the best news ever (giggle), and I couldn't be happier!"

# Time the synthesis call. perf_counter() is a monotonic, high-resolution
# clock, so the measurement cannot be skewed by wall-clock adjustments.
start_time = time.perf_counter()
wav = tts.infer(input_text)
end_time = time.perf_counter()
# Report the measurement; previously it was computed and then discarded.
print(f"Inference took {end_time - start_time:.2f} s")

# Write the generated audio to disk; 24000 is passed as the sample rate (Hz).
sf.write("output.wav", wav, 24000)
Higher temperature, top-p, and top-k values are recommended.
- Downloads last month
- 88