File size: 2,153 Bytes
dafc205 a1dc315 335551e a1dc315 335551e 3b711a6 335551e 3b711a6 335551e 2bdbee6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 |
---
license: apache-2.0
language:
- en
base_model:
- neuphonic/neutts-air
pipeline_tag: text-to-speech
library_name: transformers
---
# NeuTTS Express
<audio controls src="https://cdn-uploads.huggingface.co/production/uploads/6599dc66eabe0f3e98de7cf6/EyqWd8BtQT-o5nyb6CavE.wav"></audio>
- Compact model size (0.5B)
- Emotional and paralinguistic controls via \(tags\)
- Supports streaming for low-latency use cases
- Instant voice cloning
Supported tags: `(neutral)`, `(happy)`, `(sad)`, `(angry)`, `(surprised)`, `(disgusted)`, `(fearful)`, `(giggle)`, `(gasp)`, `(exhale)`, `(laugh)`, `(chuckle)`, `(sigh)`
```python
import time
import re
import types
import soundfile as sf
from neuttsair.neutts import NeuTTSAir
# patch function
def patched_to_phones(self, text: str) -> str:
    """Phonemize ``text`` while passing parenthesized tags through verbatim.

    Any span matching ``(...)`` (for example ``(giggle)``) is treated as a
    control tag: it is not sent to the phonemizer and is re-inserted,
    unchanged, at its original position between the phonemized segments.

    Args:
        text: Input text, optionally containing ``(tag)`` spans.

    Returns:
        The phonemized segments and the preserved tags joined by single
        spaces, with whitespace inside each phonemized segment collapsed.
    """
    tag_pattern = r'\([^)]*\)'
    tags = re.findall(tag_pattern, text)

    # No tags: phonemize the whole text in one call.
    if not tags:
        phones = self.phonemizer.phonemize([text])[0]
        return " ".join(phones.split())

    # re.split yields len(tags) + 1 segments, so segment i is followed by
    # tag i for every tag.
    segments = re.split(tag_pattern, text)

    result = []
    for i, segment in enumerate(segments):
        # Whitespace-only segments (e.g. between adjacent tags) are skipped
        # rather than sent to the phonemizer.
        if segment.strip():
            phones = self.phonemizer.phonemize([segment])[0]
            phones = " ".join(phones.split())
            if phones:
                result.append(phones)
        if i < len(tags):
            result.append(tags[i])

    return " ".join(result)
# Apply the monkey patch: replace NeuTTSAir's phonemization step so that
# parenthesized tags reach the model unphonemized.
NeuTTSAir._to_phones = patched_to_phones

# Inference
tts = NeuTTSAir(
    backbone_repo="BarryFutureman/NeuTTS-Express",
    backbone_device="cuda",
    codec_repo="neuphonic/distill-neucodec",
    codec_device="cuda",
)

input_text = "I just got the best news ever (giggle), and I couldn't be happier!"

# perf_counter is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()
wav = tts.infer(input_text)
end_time = time.perf_counter()
print(f"Inference took {end_time - start_time:.2f} s")

# Third argument is the sample rate in Hz passed to soundfile.
sf.write("output.wav", wav, 24000)
```
Higher `temperature`, `top-p`, and `top-k` values are recommended. |