Spaces:
Sleeping
Sleeping
File size: 4,533 Bytes
"""
TTS Engine Router — routes synthesis to local models or the YourVoic API.
"""
import os
import io
import time
import tempfile
import requests
import numpy as np
import soundfile as sf
import logging
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# YourVoic API key, read once from the environment at import time;
# an empty string when the variable is unset.
YOURVOIC_API_KEY = os.environ.get("YOURVOIC_API_KEY", "")
# Endpoint that synthesize_yourvoic() POSTs its streaming TTS requests to.
YOURVOIC_STREAM_URL = "https://yourvoic.com/api/v1/tts/stream"
def synthesize_yourvoic(text, language_code, voice="Peter", speed=1.0):
    """
    Synthesize text using the YourVoic streaming API.

    Args:
        text: Text to synthesize (sent as the ``text`` field).
        language_code: Sent as the ``language`` field of the request.
        voice: Sent as the ``voice`` field.
        speed: Sent as the ``speed`` field.

    Returns:
        (audio_array, sample_rate) as decoded by ``soundfile.read`` with
        ``dtype="float32"``.

    Raises:
        RuntimeError: If YOURVOIC_API_KEY is empty, the API responds with a
            non-200 status, or the response body is empty.
        Exceptions raised by ``requests`` (network errors, timeouts) and by
        ``soundfile`` (undecodable audio) propagate unchanged.
    """
    if not YOURVOIC_API_KEY:
        raise RuntimeError(
            "YOURVOIC_API_KEY not set. Add it as a Space secret."
        )

    headers = {
        "X-API-Key": YOURVOIC_API_KEY,
        "Content-Type": "application/json",
    }
    payload = {
        "text": text,
        "voice": voice,
        "language": language_code,
        "model": "aura-prime",
        "speed": speed,
    }

    t0 = time.time()
    audio_bytes = io.BytesIO()

    # With stream=True the connection stays checked out of the pool until the
    # body is fully consumed or the response is closed. The with-block closes
    # it on every path, including the error raise below.
    with requests.post(
        YOURVOIC_STREAM_URL,
        headers=headers,
        json=payload,
        stream=True,
        timeout=60,
    ) as response:
        if response.status_code != 200:
            raise RuntimeError(
                f"YourVoic API error {response.status_code}: {response.text[:200]}"
            )

        # Collect streamed audio bytes, skipping empty keep-alive chunks.
        for chunk in response.iter_content(chunk_size=8192):
            if chunk:
                audio_bytes.write(chunk)

    elapsed = time.time() - t0
    logger.info("YourVoic TTS: %d chars, %.2fs", len(text), elapsed)

    # Fail with a clear message instead of an opaque decoder error.
    if audio_bytes.tell() == 0:
        raise RuntimeError("YourVoic API returned an empty audio response")

    # Decode the collected bytes (presumably WAV; soundfile detects the
    # container format from the data itself).
    audio_bytes.seek(0)
    audio_array, sample_rate = sf.read(audio_bytes, dtype="float32")
    return audio_array, sample_rate
def synthesize_yourvoic_to_file(text, language_code, output_path, voice="Peter", speed=1.0):
    """
    Run YourVoic synthesis and write the resulting audio to ``output_path``.

    Returns:
        (output_path, sample_rate)
    """
    samples, sample_rate = synthesize_yourvoic(
        text,
        language_code,
        voice=voice,
        speed=speed,
    )
    sf.write(output_path, samples, sample_rate)
    return output_path, sample_rate
def synthesize_local(text, tts_pipe):
    """
    Run the local HuggingFace TTS pipeline (MMS-TTS) on ``text``.

    The pipeline result is read through its ``"audio"`` and
    ``"sampling_rate"`` keys; the audio is converted to a NumPy array with
    singleton dimensions squeezed out.

    Returns:
        (audio_array, sample_rate)
    """
    started = time.time()
    output = tts_pipe(text)

    waveform = np.squeeze(np.array(output["audio"]))
    rate = output["sampling_rate"]

    elapsed = time.time() - started
    duration = len(waveform) / rate
    logger.info(f"Local TTS: {len(text)} chars, {elapsed:.2f}s, {duration:.1f}s audio")
    return waveform, rate
def synthesize_chunked(text, language_config, tts_pipe=None, sentences_per_chunk=2):
    """
    Synthesize long text by chunking it into groups of sentences.

    Sentences are split on whitespace that follows ``.``, ``!`` or ``?``.
    Each group is routed to YourVoic when ``language_config["tts_engine"]``
    is ``"yourvoic"``, otherwise to the local pipeline. A chunk that fails
    is logged and skipped (best effort), so the result may be shorter than
    the input or empty.

    Args:
        text: Full text to synthesize.
        language_config: Dict providing ``tts_engine`` and, for the YourVoic
            engine, ``yourvoic_voices`` (first entry is used, "Peter" when
            the list is empty) and ``yourvoic_lang``.
        tts_pipe: Local HuggingFace TTS pipeline (needed for local engine).
        sentences_per_chunk: How many sentences to synthesize per call;
            must be at least 1.

    Returns:
        (audio_array, sample_rate). When nothing was synthesized the array
        is empty and the sample rate falls back to 16000 if no chunk
        reported one.

    Raises:
        ValueError: If ``sentences_per_chunk`` is less than 1.
        KeyError: If ``language_config`` has no ``tts_engine`` entry.
    """
    import re

    # A step of 0 makes range() fail with an unrelated message and a negative
    # step silently yields no chunks at all; reject both explicitly.
    if sentences_per_chunk < 1:
        raise ValueError(
            f"sentences_per_chunk must be >= 1, got {sentences_per_chunk}"
        )

    sentences = [
        piece.strip()
        for piece in re.split(r'(?<=[.!?])\s+', text)
        if piece.strip()
    ]
    if not sentences:
        return np.array([], dtype=np.float32), 16000

    engine = language_config["tts_engine"]
    audio_segments = []
    output_sr = None

    for start in range(0, len(sentences), sentences_per_chunk):
        chunk_text = ' '.join(sentences[start:start + sentences_per_chunk])
        try:
            if engine == "yourvoic":
                voice = language_config["yourvoic_voices"][0] if language_config["yourvoic_voices"] else "Peter"
                lang_code = language_config["yourvoic_lang"]
                audio_seg, seg_sr = synthesize_yourvoic(chunk_text, lang_code, voice)
            else:
                if tts_pipe is None:
                    raise RuntimeError("Local TTS pipeline not loaded")
                audio_seg, seg_sr = synthesize_local(chunk_text, tts_pipe)

            if output_sr is None:
                output_sr = seg_sr
            elif seg_sr != output_sr:
                # Segments are concatenated as-is, so a differing rate would
                # play back at the wrong speed; surface it in the log.
                logger.warning(
                    "TTS chunk sample rate %s differs from output rate %s",
                    seg_sr,
                    output_sr,
                )

            if len(audio_seg) > 0:
                audio_segments.append(audio_seg)
                # Small silence after each chunk. Its trailing dimensions
                # follow the segment so multi-channel (frames, channels)
                # audio can still be concatenated.
                silence_shape = (int(0.15 * seg_sr),) + tuple(np.shape(audio_seg)[1:])
                audio_segments.append(np.zeros(silence_shape, dtype=np.float32))
        except Exception as e:
            # Best effort: skip the failed chunk and continue with the rest.
            logger.error(f"TTS chunk failed: {e}")

    if not audio_segments:
        return np.array([], dtype=np.float32), output_sr or 16000
    return np.concatenate(audio_segments), output_sr
|