Spaces:
Sleeping
Sleeping
github-actions[bot] committed on
Commit ·
74be2eb
1
Parent(s): 1815b1f
Sync from GitHub abaefddd1c944e527964d7379da3cfc39209b3a5
Browse files- src/artifacts/podcast_generator.py +18 -5
- src/artifacts/tts_adapter.py +98 -15
src/artifacts/podcast_generator.py
CHANGED
|
@@ -76,6 +76,7 @@ class PodcastGenerator:
|
|
| 76 |
tts_provider = tts_provider or os.getenv("TTS_PROVIDER", "edge")
|
| 77 |
self.tts = get_tts_adapter(tts_provider)
|
| 78 |
self.tts_provider = tts_provider
|
|
|
|
| 79 |
|
| 80 |
# Default settings from .env
|
| 81 |
self.default_duration = os.getenv("DEFAULT_PODCAST_DURATION", "5min")
|
|
@@ -141,13 +142,18 @@ class PodcastGenerator:
|
|
| 141 |
|
| 142 |
# 3. Synthesize audio segments
|
| 143 |
print(f"🎵 Synthesizing audio with {self.tts_provider}...")
|
|
|
|
| 144 |
audio_segments = self._synthesize_segments(script, user_id, notebook_id, hosts)
|
| 145 |
if not audio_segments:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 146 |
return {
|
| 147 |
-
"error":
|
| 148 |
-
"Transcript generated but audio synthesis failed for all segments. "
|
| 149 |
-
"Check TTS provider credentials, quota, and configured voices."
|
| 150 |
-
),
|
| 151 |
"transcript": script,
|
| 152 |
"audio_path": None,
|
| 153 |
"metadata": {
|
|
@@ -159,6 +165,7 @@ class PodcastGenerator:
|
|
| 159 |
"llm_model": self.model,
|
| 160 |
"num_segments": len(script),
|
| 161 |
"topic_focus": topic_focus,
|
|
|
|
| 162 |
"generated_at": datetime.utcnow().isoformat(),
|
| 163 |
},
|
| 164 |
}
|
|
@@ -449,6 +456,7 @@ IMPORTANT:
|
|
| 449 |
voices = voice_maps.get(self.tts_provider, voice_maps["edge"])
|
| 450 |
|
| 451 |
audio_files: List[str] = []
|
|
|
|
| 452 |
total = len(script)
|
| 453 |
|
| 454 |
for i, segment in enumerate(script, 1):
|
|
@@ -463,7 +471,12 @@ IMPORTANT:
|
|
| 463 |
audio_files.append(output_path)
|
| 464 |
print(f" ✓ Segment {i}/{total}: {speaker}")
|
| 465 |
except Exception as e:
|
| 466 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 467 |
continue
|
| 468 |
|
| 469 |
return audio_files
|
|
|
|
| 76 |
tts_provider = tts_provider or os.getenv("TTS_PROVIDER", "edge")
|
| 77 |
self.tts = get_tts_adapter(tts_provider)
|
| 78 |
self.tts_provider = tts_provider
|
| 79 |
+
self._last_tts_errors: List[str] = []
|
| 80 |
|
| 81 |
# Default settings from .env
|
| 82 |
self.default_duration = os.getenv("DEFAULT_PODCAST_DURATION", "5min")
|
|
|
|
| 142 |
|
| 143 |
# 3. Synthesize audio segments
|
| 144 |
print(f"🎵 Synthesizing audio with {self.tts_provider}...")
|
| 145 |
+
self._last_tts_errors = []
|
| 146 |
audio_segments = self._synthesize_segments(script, user_id, notebook_id, hosts)
|
| 147 |
if not audio_segments:
|
| 148 |
+
tts_error_preview = "; ".join(self._last_tts_errors[:3]).strip()
|
| 149 |
+
failure_message = (
|
| 150 |
+
"Transcript generated but audio synthesis failed for all segments. "
|
| 151 |
+
"Check TTS provider credentials, quota, and configured voices."
|
| 152 |
+
)
|
| 153 |
+
if tts_error_preview:
|
| 154 |
+
failure_message = f"{failure_message} Provider errors: {tts_error_preview}"
|
| 155 |
return {
|
| 156 |
+
"error": failure_message,
|
|
|
|
|
|
|
|
|
|
| 157 |
"transcript": script,
|
| 158 |
"audio_path": None,
|
| 159 |
"metadata": {
|
|
|
|
| 165 |
"llm_model": self.model,
|
| 166 |
"num_segments": len(script),
|
| 167 |
"topic_focus": topic_focus,
|
| 168 |
+
"tts_errors": self._last_tts_errors[:20],
|
| 169 |
"generated_at": datetime.utcnow().isoformat(),
|
| 170 |
},
|
| 171 |
}
|
|
|
|
| 456 |
voices = voice_maps.get(self.tts_provider, voice_maps["edge"])
|
| 457 |
|
| 458 |
audio_files: List[str] = []
|
| 459 |
+
self._last_tts_errors = []
|
| 460 |
total = len(script)
|
| 461 |
|
| 462 |
for i, segment in enumerate(script, 1):
|
|
|
|
| 471 |
audio_files.append(output_path)
|
| 472 |
print(f" ✓ Segment {i}/{total}: {speaker}")
|
| 473 |
except Exception as e:
|
| 474 |
+
error_detail = (
|
| 475 |
+
f"segment={i}/{total}, speaker={speaker}, voice={voice}, "
|
| 476 |
+
f"error={type(e).__name__}: {' '.join(str(e).split())}"
|
| 477 |
+
)
|
| 478 |
+
self._last_tts_errors.append(error_detail)
|
| 479 |
+
print(f" ⚠️ Failed {error_detail}")
|
| 480 |
continue
|
| 481 |
|
| 482 |
return audio_files
|
src/artifacts/tts_adapter.py
CHANGED
|
@@ -3,7 +3,7 @@ Text-to-Speech adapter supporting multiple providers.
|
|
| 3 |
"""
|
| 4 |
import os
|
| 5 |
from pathlib import Path
|
| 6 |
-
from typing import Literal, Optional
|
| 7 |
from abc import ABC, abstractmethod
|
| 8 |
from dotenv import load_dotenv
|
| 9 |
|
|
@@ -66,26 +66,109 @@ class ElevenLabsTTS(TTSAdapter):
|
|
| 66 |
from elevenlabs.client import ElevenLabs
|
| 67 |
|
| 68 |
self.api_key = api_key or os.getenv("ELEVENLABS_API_KEY")
|
|
|
|
|
|
|
| 69 |
self.client = ElevenLabs(api_key=self.api_key)
|
| 70 |
self.default_voice = os.getenv("TTS_ELEVENLABS_VOICE_1", "Rachel")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
|
| 72 |
def synthesize(self, text: str, output_path: str, voice: Optional[str] = None) -> str:
|
| 73 |
"""
|
| 74 |
Popular voices: Rachel, Domi, Bella, Antoni, Elli, Josh, Arnold, Adam, Sam
|
| 75 |
"""
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 89 |
|
| 90 |
|
| 91 |
class EdgeTTS(TTSAdapter):
|
|
@@ -156,4 +239,4 @@ if __name__ == "__main__":
|
|
| 156 |
|
| 157 |
print(f"✓ Audio generated: {output_file}")
|
| 158 |
print(f" Provider: {args.provider or os.getenv('TTS_PROVIDER', 'edge')}")
|
| 159 |
-
print(f" Voice: {args.voice or 'default'}")
|
|
|
|
| 3 |
"""
|
| 4 |
import os
|
| 5 |
from pathlib import Path
|
| 6 |
+
from typing import Any, Literal, Optional
|
| 7 |
from abc import ABC, abstractmethod
|
| 8 |
from dotenv import load_dotenv
|
| 9 |
|
|
|
|
| 66 |
from elevenlabs.client import ElevenLabs
|
| 67 |
|
| 68 |
self.api_key = api_key or os.getenv("ELEVENLABS_API_KEY")
|
| 69 |
+
if not self.api_key:
|
| 70 |
+
raise ValueError("ELEVENLABS_API_KEY environment variable not set")
|
| 71 |
self.client = ElevenLabs(api_key=self.api_key)
|
| 72 |
self.default_voice = os.getenv("TTS_ELEVENLABS_VOICE_1", "Rachel")
|
| 73 |
+
self.default_model = os.getenv("TTS_ELEVENLABS_MODEL", "eleven_multilingual_v2")
|
| 74 |
+
self._voice_aliases = self._load_voice_aliases()
|
| 75 |
+
|
| 76 |
+
def _load_voice_aliases(self) -> dict[str, str]:
    """Best-effort map of configured voice names to voice IDs.

    Calls ``self.client.voices.get_all()`` and accepts either a response
    object exposing a ``voices`` attribute or a plain iterable of voices.
    Each voice may be a dict or an object carrying ``name`` / ``voice_id``.

    Returns:
        Mapping of stripped, lower-cased voice name to stripped voice ID.
        Whatever was collected before a failure is returned (possibly an
        empty dict); this method never raises, so a failed lookup cannot
        break adapter construction.
    """
    aliases: dict[str, str] = {}
    try:
        response = self.client.voices.get_all()
        voices = getattr(response, "voices", response)
        # The loop lives inside the ``try`` on purpose: a non-iterable
        # response or one that fails while being iterated must degrade to
        # "no aliases" rather than raise out of this best-effort helper.
        for voice in voices or []:
            if isinstance(voice, dict):
                name = voice.get("name")
                voice_id = voice.get("voice_id")
            else:
                name = getattr(voice, "name", None)
                voice_id = getattr(voice, "voice_id", None)
            if name and voice_id:
                aliases[str(name).strip().lower()] = str(voice_id).strip()
    except Exception:
        # Deliberate best-effort: callers fall back to the raw voice name.
        return aliases
    return aliases
|
| 95 |
+
|
| 96 |
+
def _resolve_voice(self, voice: str) -> str:
    """Translate a voice name into a voice ID when an alias is known.

    Args:
        voice: Requested voice name or ID; may be empty or ``None``, in
            which case ``self.default_voice`` is used instead.

    Returns:
        The ID stored in ``self._voice_aliases`` for the case-insensitive,
        whitespace-stripped name, or the stripped name itself when no alias
        exists (so raw voice IDs pass through unchanged).
    """
    candidate = str(voice or "").strip()
    if not candidate:
        # Normalise the fallback exactly like an explicit request; otherwise
        # a padded default (e.g. " Rachel ") would never match its alias key.
        candidate = str(self.default_voice or "").strip()
    return self._voice_aliases.get(candidate.lower(), candidate)
|
| 101 |
+
|
| 102 |
+
def _write_audio_output(self, audio: Any, output_path: str) -> None:
    """Persist an ElevenLabs audio payload to ``output_path``.

    The SDK returns either bytes, a file-like object, or an iterable of
    chunks depending on version/options; all three shapes are accepted.

    The payload is fully collected and validated *before* the target file is
    touched, so a failed or empty response never leaves a zero-byte or
    truncated file behind for later pipeline stages to mistake for audio.

    Args:
        audio: ``bytes``/``bytearray``, an object with ``read()``, or an
            iterable yielding ``bytes``/``bytearray``/``str``/``int`` chunks
            (``None`` chunks are skipped).
        output_path: Destination file; parent directories are created.

    Raises:
        TypeError: If the payload or one of its chunks has an unsupported type.
        RuntimeError: If the payload contains no audio bytes at all.
    """
    if isinstance(audio, (bytes, bytearray)):
        payload = bytes(audio)
    elif hasattr(audio, "read"):
        data = audio.read()
        if isinstance(data, str):
            data = data.encode("utf-8")
        if not isinstance(data, (bytes, bytearray)):
            raise TypeError("ElevenLabs returned unsupported file-like payload.")
        payload = bytes(data)
    else:
        collected = bytearray()
        for chunk in audio:
            if chunk is None:
                continue
            if isinstance(chunk, int):
                # Iterating a bytes-like stream yields individual byte values.
                collected.append(chunk)
            elif isinstance(chunk, str):
                collected.extend(chunk.encode("utf-8"))
            elif isinstance(chunk, (bytes, bytearray)):
                collected.extend(chunk)
            else:
                raise TypeError(f"Unsupported ElevenLabs audio chunk type: {type(chunk)!r}")
        payload = bytes(collected)

    # Treat every empty payload shape the same way instead of only the
    # empty-iterable case.
    if not payload:
        raise RuntimeError("ElevenLabs returned an empty audio stream.")

    target = Path(output_path)
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_bytes(payload)
|
| 138 |
|
| 139 |
def synthesize(self, text: str, output_path: str, voice: Optional[str] = None) -> str:
    """
    Synthesize ``text`` into ``output_path``, retrying with fallback candidates.

    Popular voices: Rachel, Domi, Bella, Antoni, Elli, Josh, Arnold, Adam, Sam

    The resolved voice (alias -> ID) is tried first, then the voice exactly as
    requested; for each voice the configured model is tried first, then
    ``eleven_multilingual_v2``. The first combination that produces audio wins.

    Args:
        text: Text to speak.
        output_path: Destination audio file.
        voice: Voice name or ID; defaults to ``self.default_voice``.

    Returns:
        ``output_path`` once audio has been written.

    Raises:
        RuntimeError: If every voice/model combination fails. The message
            previews up to three attempts and the last underlying exception
            is attached as ``__cause__``.
    """
    requested_voice = voice or self.default_voice
    resolved_voice = self._resolve_voice(requested_voice)

    # dict.fromkeys drops duplicates while keeping preference order.
    voice_candidates = list(dict.fromkeys([resolved_voice, requested_voice]))
    model_candidates = list(dict.fromkeys([self.default_model, "eleven_multilingual_v2"]))

    # NOTE(review): newer ElevenLabs SDK releases appear to expose
    # ``text_to_speech.convert`` instead of the ``generate`` helper - confirm
    # against the pinned SDK version. ``generate`` stays the preferred path
    # whenever it exists, so behaviour on current installs is unchanged.
    generate = getattr(self.client, "generate", None)

    errors: list[str] = []
    last_error: Optional[Exception] = None
    for voice_candidate in voice_candidates:
        for model_candidate in model_candidates:
            try:
                if callable(generate):
                    audio = generate(
                        text=text,
                        voice=voice_candidate,
                        model=model_candidate,
                    )
                else:
                    audio = self.client.text_to_speech.convert(
                        text=text,
                        voice_id=voice_candidate,
                        model_id=model_candidate,
                    )
                self._write_audio_output(audio, output_path)
                return output_path
            except Exception as exc:
                last_error = exc
                errors.append(
                    f"voice={voice_candidate}, model={model_candidate}: "
                    f"{type(exc).__name__}: {exc}"
                )

    preview = " | ".join(errors[:3]) if errors else "unknown ElevenLabs error"
    # Chain the last failure so the provider traceback is not lost.
    raise RuntimeError(f"ElevenLabs synthesis failed. {preview}") from last_error
|
| 172 |
|
| 173 |
|
| 174 |
class EdgeTTS(TTSAdapter):
|
|
|
|
| 239 |
|
| 240 |
print(f"✓ Audio generated: {output_file}")
|
| 241 |
print(f" Provider: {args.provider or os.getenv('TTS_PROVIDER', 'edge')}")
|
| 242 |
+
print(f" Voice: {args.voice or 'default'}")
|