Spaces:
Sleeping
Sleeping
github-actions[bot] committed on
Commit ·
62f0a86
1
Parent(s): 74be2eb
Sync from GitHub 3b0f7c9651fbf32085152edd156d5113f1f288cc
Browse files- src/artifacts/tts_adapter.py +79 -5
src/artifacts/tts_adapter.py
CHANGED
|
@@ -71,6 +71,7 @@ class ElevenLabsTTS(TTSAdapter):
|
|
| 71 |
self.client = ElevenLabs(api_key=self.api_key)
|
| 72 |
self.default_voice = os.getenv("TTS_ELEVENLABS_VOICE_1", "Rachel")
|
| 73 |
self.default_model = os.getenv("TTS_ELEVENLABS_MODEL", "eleven_multilingual_v2")
|
|
|
|
| 74 |
self._voice_aliases = self._load_voice_aliases()
|
| 75 |
|
| 76 |
def _load_voice_aliases(self) -> dict[str, str]:
|
|
@@ -99,6 +100,83 @@ class ElevenLabsTTS(TTSAdapter):
|
|
| 99 |
candidate = self.default_voice
|
| 100 |
return self._voice_aliases.get(candidate.lower(), candidate)
|
| 101 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 102 |
def _write_audio_output(self, audio: Any, output_path: str) -> None:
|
| 103 |
"""
|
| 104 |
ElevenLabs SDK returns either bytes, file-like, or iterable chunks depending
|
|
@@ -154,11 +232,7 @@ class ElevenLabsTTS(TTSAdapter):
|
|
| 154 |
for voice_candidate in voice_candidates:
|
| 155 |
for model_candidate in model_candidates:
|
| 156 |
try:
|
| 157 |
-
audio = self.
|
| 158 |
-
text=text,
|
| 159 |
-
voice=voice_candidate,
|
| 160 |
-
model=model_candidate,
|
| 161 |
-
)
|
| 162 |
self._write_audio_output(audio, output_path)
|
| 163 |
return output_path
|
| 164 |
except Exception as exc:
|
|
|
|
| 71 |
self.client = ElevenLabs(api_key=self.api_key)
|
| 72 |
self.default_voice = os.getenv("TTS_ELEVENLABS_VOICE_1", "Rachel")
|
| 73 |
self.default_model = os.getenv("TTS_ELEVENLABS_MODEL", "eleven_multilingual_v2")
|
| 74 |
+
self.output_format = os.getenv("TTS_ELEVENLABS_OUTPUT_FORMAT", "mp3_44100_128")
|
| 75 |
self._voice_aliases = self._load_voice_aliases()
|
| 76 |
|
| 77 |
def _load_voice_aliases(self) -> dict[str, str]:
|
|
|
|
| 100 |
candidate = self.default_voice
|
| 101 |
return self._voice_aliases.get(candidate.lower(), candidate)
|
| 102 |
|
| 103 |
+
def _try_call_variants(self, method: Any, variants: list[dict[str, Any]]) -> Any:
|
| 104 |
+
"""
|
| 105 |
+
Some ElevenLabs SDK versions use different parameter names.
|
| 106 |
+
Try a small set of known-compatible signatures.
|
| 107 |
+
"""
|
| 108 |
+
last_type_error: TypeError | None = None
|
| 109 |
+
for kwargs in variants:
|
| 110 |
+
try:
|
| 111 |
+
return method(**kwargs)
|
| 112 |
+
except TypeError as exc:
|
| 113 |
+
last_type_error = exc
|
| 114 |
+
continue
|
| 115 |
+
if last_type_error:
|
| 116 |
+
raise last_type_error
|
| 117 |
+
raise RuntimeError("Unable to call ElevenLabs SDK method with known signatures.")
|
| 118 |
+
|
| 119 |
+
def _request_audio(self, text: str, voice_candidate: str, model_candidate: str) -> Any:
|
| 120 |
+
"""
|
| 121 |
+
Support both legacy and modern ElevenLabs Python SDK APIs:
|
| 122 |
+
- legacy: client.generate(...)
|
| 123 |
+
- modern: client.text_to_speech.convert(...)/convert_as_stream(...)
|
| 124 |
+
"""
|
| 125 |
+
if hasattr(self.client, "generate"):
|
| 126 |
+
return self._try_call_variants(
|
| 127 |
+
self.client.generate,
|
| 128 |
+
[
|
| 129 |
+
{"text": text, "voice": voice_candidate, "model": model_candidate},
|
| 130 |
+
{"text": text, "voice": voice_candidate, "model_id": model_candidate},
|
| 131 |
+
{"text": text, "voice_id": voice_candidate, "model_id": model_candidate},
|
| 132 |
+
],
|
| 133 |
+
)
|
| 134 |
+
|
| 135 |
+
tts_api = getattr(self.client, "text_to_speech", None)
|
| 136 |
+
if tts_api is not None:
|
| 137 |
+
if hasattr(tts_api, "convert_as_stream"):
|
| 138 |
+
return self._try_call_variants(
|
| 139 |
+
tts_api.convert_as_stream,
|
| 140 |
+
[
|
| 141 |
+
{
|
| 142 |
+
"text": text,
|
| 143 |
+
"voice_id": voice_candidate,
|
| 144 |
+
"model_id": model_candidate,
|
| 145 |
+
"output_format": self.output_format,
|
| 146 |
+
},
|
| 147 |
+
{
|
| 148 |
+
"text": text,
|
| 149 |
+
"voice_id": voice_candidate,
|
| 150 |
+
"model_id": model_candidate,
|
| 151 |
+
},
|
| 152 |
+
{"text": text, "voice": voice_candidate, "model": model_candidate},
|
| 153 |
+
],
|
| 154 |
+
)
|
| 155 |
+
|
| 156 |
+
if hasattr(tts_api, "convert"):
|
| 157 |
+
return self._try_call_variants(
|
| 158 |
+
tts_api.convert,
|
| 159 |
+
[
|
| 160 |
+
{
|
| 161 |
+
"text": text,
|
| 162 |
+
"voice_id": voice_candidate,
|
| 163 |
+
"model_id": model_candidate,
|
| 164 |
+
"output_format": self.output_format,
|
| 165 |
+
},
|
| 166 |
+
{
|
| 167 |
+
"text": text,
|
| 168 |
+
"voice_id": voice_candidate,
|
| 169 |
+
"model_id": model_candidate,
|
| 170 |
+
},
|
| 171 |
+
{"text": text, "voice": voice_candidate, "model": model_candidate},
|
| 172 |
+
],
|
| 173 |
+
)
|
| 174 |
+
|
| 175 |
+
raise AttributeError(
|
| 176 |
+
"No compatible ElevenLabs synthesis method found on client "
|
| 177 |
+
"(expected generate() or text_to_speech.convert())."
|
| 178 |
+
)
|
| 179 |
+
|
| 180 |
def _write_audio_output(self, audio: Any, output_path: str) -> None:
|
| 181 |
"""
|
| 182 |
ElevenLabs SDK returns either bytes, file-like, or iterable chunks depending
|
|
|
|
| 232 |
for voice_candidate in voice_candidates:
|
| 233 |
for model_candidate in model_candidates:
|
| 234 |
try:
|
| 235 |
+
audio = self._request_audio(text, voice_candidate, model_candidate)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 236 |
self._write_audio_output(audio, output_path)
|
| 237 |
return output_path
|
| 238 |
except Exception as exc:
|