Delete tts_engine.py
Browse files- tts_engine.py +0 -146
tts_engine.py
DELETED
|
@@ -1,146 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
TTS Engine Router — routes synthesis to local models or YourVoic API.
|
| 3 |
-
"""
|
| 4 |
-
|
| 5 |
-
import os
|
| 6 |
-
import io
|
| 7 |
-
import time
|
| 8 |
-
import tempfile
|
| 9 |
-
import requests
|
| 10 |
-
import numpy as np
|
| 11 |
-
import soundfile as sf
|
| 12 |
-
import logging
|
| 13 |
-
|
| 14 |
-
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
|
| 15 |
-
|
| 16 |
-
# YourVoic API key, read once at import time from the YOURVOIC_API_KEY
# environment variable; falls back to an empty string when the variable is unset.
YOURVOIC_API_KEY = os.environ.get("YOURVOIC_API_KEY", "")
|
| 17 |
-
# YourVoic streaming TTS endpoint that synthesize_yourvoic sends its POST request to.
YOURVOIC_STREAM_URL = "https://yourvoic.com/api/v1/tts/stream"
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
def synthesize_yourvoic(text, language_code, voice="Peter", speed=1.0):
    """
    Synthesize text using the YourVoic streaming API.

    Args:
        text: Text to synthesize; sent as the "text" field of the request.
        language_code: Sent as the "language" field of the request.
        voice: Sent as the "voice" field (default "Peter").
        speed: Sent as the "speed" field (default 1.0).

    Returns:
        (audio_array, sample_rate) as decoded by ``soundfile.read`` from the
        streamed response body, with samples as float32.

    Raises:
        RuntimeError: If YOURVOIC_API_KEY is empty, or the API answers with a
            status code other than 200.

    Exceptions raised by ``requests`` (timeouts, connection errors) and by
    ``soundfile`` (undecodable body) are not caught here and propagate.
    """
    if not YOURVOIC_API_KEY:
        raise RuntimeError(
            "YOURVOIC_API_KEY not set. Add it as a Space secret."
        )

    headers = {
        "X-API-Key": YOURVOIC_API_KEY,
        "Content-Type": "application/json",
    }
    payload = {
        "text": text,
        "voice": voice,
        "language": language_code,
        "model": "aura-prime",
        "speed": speed,
    }

    t0 = time.time()
    # With stream=True the connection stays checked out of the pool until the
    # body is fully consumed or the response is closed. The context manager
    # guarantees it is released on every path, including the error branch
    # below and any exception raised while streaming.
    with requests.post(
        YOURVOIC_STREAM_URL,
        headers=headers,
        json=payload,
        stream=True,
        timeout=60,
    ) as response:
        if response.status_code != 200:
            # Only the first 200 characters of the body go into the message.
            raise RuntimeError(
                f"YourVoic API error {response.status_code}: {response.text[:200]}"
            )

        # Collect streamed audio bytes
        audio_bytes = io.BytesIO()
        for chunk in response.iter_content(chunk_size=8192):
            audio_bytes.write(chunk)

    # Rewind so the decoder reads from the start of the buffer.
    audio_bytes.seek(0)

    elapsed = time.time() - t0
    logger.info(f"YourVoic TTS: {len(text)} chars, {elapsed:.2f}s")

    # Decode the collected bytes (the API is expected to return WAV data).
    audio_array, sample_rate = sf.read(audio_bytes, dtype="float32")
    return audio_array, sample_rate
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
def synthesize_yourvoic_to_file(text, language_code, output_path, voice="Peter", speed=1.0):
    """
    Run YourVoic synthesis and write the resulting audio to ``output_path``.

    Args:
        text: Text to synthesize.
        language_code: Language code forwarded to synthesize_yourvoic.
        output_path: Destination handed to ``soundfile.write``.
        voice: Voice name forwarded to synthesize_yourvoic.
        speed: Speed value forwarded to synthesize_yourvoic.

    Returns:
        (output_path, sample_rate)
    """
    samples, sample_rate = synthesize_yourvoic(
        text,
        language_code,
        voice=voice,
        speed=speed,
    )
    sf.write(output_path, samples, sample_rate)
    return output_path, sample_rate
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
def synthesize_local(text, tts_pipe):
    """
    Run a local TTS pipeline on ``text`` and return its audio.

    Args:
        text: Text to synthesize.
        tts_pipe: Callable invoked as ``tts_pipe(text)``; its result must
            provide the "audio" and "sampling_rate" keys.

    Returns:
        (audio_array, sample_rate), where the audio has been converted to a
        NumPy array with singleton dimensions squeezed out.
    """
    started = time.time()
    output = tts_pipe(text)

    # Squeeze drops singleton axes from whatever shape the pipeline produced.
    audio = np.array(output["audio"]).squeeze()
    sr = output["sampling_rate"]

    elapsed = time.time() - started
    logger.info(f"Local TTS: {len(text)} chars, {elapsed:.2f}s, {len(audio)/sr:.1f}s audio")
    return audio, sr
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
def synthesize_chunked(text, language_config, tts_pipe=None, sentences_per_chunk=2):
    """
    Synthesize long text by chunking into sentence groups.

    The text is split after ".", "!" or "?" followed by whitespace, grouped
    ``sentences_per_chunk`` sentences at a time, and each group is routed to
    YourVoic when ``language_config["tts_engine"] == "yourvoic"`` and to the
    local pipeline otherwise. A chunk whose synthesis fails is logged and
    skipped, so the result may cover only part of the text, or be empty.

    Args:
        text: Full text to synthesize
        language_config: Dict from LANGUAGES. Reads "tts_engine", and for the
            YourVoic engine also "yourvoic_voices" and "yourvoic_lang".
        tts_pipe: Local HuggingFace TTS pipeline (needed for local engine)
        sentences_per_chunk: How many sentences to synthesize per call;
            must be at least 1.

    Returns:
        (audio_array, sample_rate). The sample rate is that of the first
        successfully synthesized chunk, or 16000 when nothing was
        synthesized. 0.15 s of silence follows every non-empty chunk,
        including the last one.

    Raises:
        ValueError: If ``sentences_per_chunk`` is less than 1.
        KeyError: If ``language_config`` has no "tts_engine" entry and the
            text contains at least one sentence.
    """
    import re

    # A step of 0 would raise an opaque range() error and a negative step
    # would silently yield no audio at all, so reject both explicitly.
    if sentences_per_chunk < 1:
        raise ValueError(
            f"sentences_per_chunk must be >= 1, got {sentences_per_chunk!r}"
        )

    sentences = [
        s.strip() for s in re.split(r'(?<=[.!?])\s+', text) if s.strip()
    ]

    if not sentences:
        return np.array([], dtype=np.float32), 16000

    engine = language_config["tts_engine"]
    audio_segments = []
    output_sr = None

    for i in range(0, len(sentences), sentences_per_chunk):
        # Every sentence is non-empty here, so the joined chunk is too.
        chunk_text = ' '.join(sentences[i:i + sentences_per_chunk])

        try:
            if engine == "yourvoic":
                voices = language_config["yourvoic_voices"]
                voice = voices[0] if voices else "Peter"
                lang_code = language_config["yourvoic_lang"]
                audio_seg, seg_sr = synthesize_yourvoic(chunk_text, lang_code, voice)
            else:
                if tts_pipe is None:
                    raise RuntimeError("Local TTS pipeline not loaded")
                audio_seg, seg_sr = synthesize_local(chunk_text, tts_pipe)

            if output_sr is None:
                output_sr = seg_sr
            elif seg_sr != output_sr:
                # Segments are concatenated without resampling, so a rate
                # change would distort playback; make it visible in the logs.
                logger.warning(
                    f"TTS chunk sample rate {seg_sr} differs from output rate {output_sr}"
                )

            if len(audio_seg) > 0:
                audio_segments.append(audio_seg)
                # Small silence after the chunk. Match the segment's trailing
                # dimensions so multi-channel audio (frames, channels) still
                # concatenates instead of failing on a 1-D/2-D mismatch.
                silence_shape = (int(0.15 * seg_sr),) + tuple(np.shape(audio_seg)[1:])
                silence = np.zeros(silence_shape, dtype=np.float32)
                audio_segments.append(silence)

        except Exception as e:
            # Best effort: a failed chunk is logged and skipped.
            logger.error(f"TTS chunk failed: {e}")
            continue

    if not audio_segments:
        return np.array([], dtype=np.float32), output_sr or 16000

    return np.concatenate(audio_segments), output_sr
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|