PlotweaverModel committed on
Commit
cff4cca
·
verified ·
1 Parent(s): bdb126b

Delete tts_engine.py

Browse files
Files changed (1) hide show
  1. tts_engine.py +0 -146
tts_engine.py DELETED
@@ -1,146 +0,0 @@
1
- """
2
- TTS Engine Router — routes synthesis to local models or YourVoic API.
3
- """
4
-
5
- import os
6
- import io
7
- import time
8
- import tempfile
9
- import requests
10
- import numpy as np
11
- import soundfile as sf
12
- import logging
13
-
14
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# YourVoic credentials are read once, at import time, from the environment;
# an unset variable yields an empty string.
YOURVOIC_API_KEY = os.getenv("YOURVOIC_API_KEY", "")
# Endpoint that receives the synthesis POST requests.
YOURVOIC_STREAM_URL = "https://yourvoic.com/api/v1/tts/stream"
18
-
19
-
20
def synthesize_yourvoic(text, language_code, voice="Peter", speed=1.0):
    """
    Synthesize text using the YourVoic streaming API.

    Args:
        text: Text to synthesize.
        language_code: Value sent as the request's "language" field.
        voice: Value sent as the request's "voice" field.
        speed: Value sent as the request's "speed" field.

    Returns:
        (audio_array, sample_rate) as decoded by soundfile, with samples
        read as float32.

    Raises:
        RuntimeError: If YOURVOIC_API_KEY is empty, or the API responds with
            a status code other than 200.
    """
    if not YOURVOIC_API_KEY:
        raise RuntimeError(
            "YOURVOIC_API_KEY not set. Add it as a Space secret."
        )

    headers = {
        "X-API-Key": YOURVOIC_API_KEY,
        "Content-Type": "application/json",
    }
    payload = {
        "text": text,
        "voice": voice,
        "language": language_code,
        "model": "aura-prime",
        "speed": speed,
    }

    t0 = time.time()
    # With stream=True the connection stays checked out until the body is
    # fully consumed or the response is closed. The with-block closes it on
    # every path, including the error branch and a failure mid-stream.
    with requests.post(
        YOURVOIC_STREAM_URL,
        headers=headers,
        json=payload,
        stream=True,
        timeout=60,
    ) as response:
        if response.status_code != 200:
            raise RuntimeError(
                f"YourVoic API error {response.status_code}: {response.text[:200]}"
            )

        # Collect streamed audio bytes into an in-memory buffer
        audio_bytes = io.BytesIO()
        for chunk in response.iter_content(chunk_size=8192):
            audio_bytes.write(chunk)
    audio_bytes.seek(0)

    elapsed = time.time() - t0
    logger.info(f"YourVoic TTS: {len(text)} chars, {elapsed:.2f}s")

    # Decode the buffered audio (expected to be WAV — confirm against the
    # YourVoic API documentation).
    audio_array, sample_rate = sf.read(audio_bytes, dtype="float32")
    return audio_array, sample_rate
68
-
69
-
70
def synthesize_yourvoic_to_file(text, language_code, output_path, voice="Peter", speed=1.0):
    """
    Synthesize text via YourVoic and write the audio to output_path.

    Returns:
        (output_path, sample_rate), where sample_rate is the rate reported
        by synthesize_yourvoic and used when writing the file.
    """
    samples, sample_rate = synthesize_yourvoic(
        text, language_code, voice=voice, speed=speed
    )
    sf.write(output_path, samples, sample_rate)
    return output_path, sample_rate
75
-
76
-
77
def synthesize_local(text, tts_pipe):
    """
    Run a local HuggingFace TTS pipeline (MMS-TTS) on the given text.

    Args:
        text: Text to synthesize.
        tts_pipe: Callable that takes the text and returns a mapping with
            "audio" and "sampling_rate" entries.

    Returns:
        (audio_array, sample_rate), where audio_array is the pipeline's
        audio with all length-1 axes removed.
    """
    started = time.time()
    output = tts_pipe(text)
    # Drop singleton dimensions from whatever array shape the pipeline emits.
    audio = np.squeeze(np.array(output["audio"]))
    sr = output["sampling_rate"]
    elapsed = time.time() - started
    logger.info(f"Local TTS: {len(text)} chars, {elapsed:.2f}s, {len(audio)/sr:.1f}s audio")
    return audio, sr
89
-
90
-
91
def synthesize_chunked(text, language_config, tts_pipe=None, sentences_per_chunk=2):
    """
    Synthesize long text by chunking into sentence groups.
    Routes to either YourVoic or local TTS based on language config.

    Chunks are synthesized independently. A chunk that fails is logged and
    skipped rather than aborting the whole request. Successful segments are
    joined with 0.15 s of silence between them; no silence is added after
    the final segment.

    Args:
        text: Full text to synthesize
        language_config: Dict from LANGUAGES. "tts_engine" is always read;
            "yourvoic_voices" and "yourvoic_lang" are read when the engine
            is "yourvoic".
        tts_pipe: Local HuggingFace TTS pipeline (needed for local engine)
        sentences_per_chunk: How many sentences to synthesize per API call

    Returns:
        (audio_array, sample_rate). The sample rate is the one reported by
        the first chunk that synthesized successfully. If the text has no
        sentences, or every chunk fails, an empty float32 array is returned
        together with a fallback rate of 16000.
    """
    import re

    # Split after sentence-ending punctuation, then drop blank fragments.
    sentences = [
        s.strip() for s in re.split(r'(?<=[.!?])\s+', text) if s.strip()
    ]

    if not sentences:
        return np.array([], dtype=np.float32), 16000

    engine = language_config["tts_engine"]
    audio_segments = []
    output_sr = None

    for i in range(0, len(sentences), sentences_per_chunk):
        chunk_text = ' '.join(sentences[i:i + sentences_per_chunk])
        if not chunk_text:
            continue

        try:
            if engine == "yourvoic":
                voice = language_config["yourvoic_voices"][0] if language_config["yourvoic_voices"] else "Peter"
                lang_code = language_config["yourvoic_lang"]
                audio_seg, seg_sr = synthesize_yourvoic(chunk_text, lang_code, voice)
            else:
                if tts_pipe is None:
                    raise RuntimeError("Local TTS pipeline not loaded")
                audio_seg, seg_sr = synthesize_local(chunk_text, tts_pipe)

            if output_sr is None:
                output_sr = seg_sr
            if len(audio_seg) > 0:
                if audio_segments:
                    # Small silence between chunks only, so the result does
                    # not end with a trailing pad. Sized with the rate that
                    # is returned to the caller.
                    silence = np.zeros(int(0.15 * output_sr), dtype=np.float32)
                    audio_segments.append(silence)
                audio_segments.append(audio_seg)

        except Exception as e:
            # Best-effort: keep going with the remaining chunks, but record
            # the traceback so the failure can be diagnosed.
            logger.exception(f"TTS chunk failed: {e}")
            continue

    if not audio_segments:
        return np.array([], dtype=np.float32), output_sr or 16000

    return np.concatenate(audio_segments), output_sr