Spaces:

pbichpur
/

NotebookLMClone

Sleeping

App Files Community

github-actions[bot] committed 21 days ago

Commit

74be2eb

1 Parent(s): 1815b1f

Sync from GitHub abaefddd1c944e527964d7379da3cfc39209b3a5

Browse files

Files changed (2) hide show

src/artifacts/podcast_generator.py +18 -5
src/artifacts/tts_adapter.py +98 -15

src/artifacts/podcast_generator.py CHANGED Viewed

@@ -76,6 +76,7 @@ class PodcastGenerator:
         tts_provider = tts_provider or os.getenv("TTS_PROVIDER", "edge")
         self.tts = get_tts_adapter(tts_provider)
         self.tts_provider = tts_provider
         # Default settings from .env
         self.default_duration = os.getenv("DEFAULT_PODCAST_DURATION", "5min")
@@ -141,13 +142,18 @@ class PodcastGenerator:
         # 3. Synthesize audio segments
         print(f"🎵 Synthesizing audio with {self.tts_provider}...")
         audio_segments = self._synthesize_segments(script, user_id, notebook_id, hosts)
         if not audio_segments:
             return {
-                "error": (
-                    "Transcript generated but audio synthesis failed for all segments. "
-                    "Check TTS provider credentials, quota, and configured voices."
-                ),
                 "transcript": script,
                 "audio_path": None,
                 "metadata": {
@@ -159,6 +165,7 @@ class PodcastGenerator:
                     "llm_model": self.model,
                     "num_segments": len(script),
                     "topic_focus": topic_focus,
                     "generated_at": datetime.utcnow().isoformat(),
                 },
             }
@@ -449,6 +456,7 @@ IMPORTANT:
         voices = voice_maps.get(self.tts_provider, voice_maps["edge"])
         audio_files: List[str] = []
         total = len(script)
         for i, segment in enumerate(script, 1):
@@ -463,7 +471,12 @@ IMPORTANT:
                 audio_files.append(output_path)
                 print(f"  ✓ Segment {i}/{total}: {speaker}")
             except Exception as e:
-                print(f"  ⚠️  Failed segment {i}: {e}")
                 continue
         return audio_files

         tts_provider = tts_provider or os.getenv("TTS_PROVIDER", "edge")
         self.tts = get_tts_adapter(tts_provider)
         self.tts_provider = tts_provider
+        self._last_tts_errors: List[str] = []
         # Default settings from .env
         self.default_duration = os.getenv("DEFAULT_PODCAST_DURATION", "5min")
         # 3. Synthesize audio segments
         print(f"🎵 Synthesizing audio with {self.tts_provider}...")
+        self._last_tts_errors = []
         audio_segments = self._synthesize_segments(script, user_id, notebook_id, hosts)
         if not audio_segments:
+            tts_error_preview = "; ".join(self._last_tts_errors[:3]).strip()
+            failure_message = (
+                "Transcript generated but audio synthesis failed for all segments. "
+                "Check TTS provider credentials, quota, and configured voices."
+            )
+            if tts_error_preview:
+                failure_message = f"{failure_message} Provider errors: {tts_error_preview}"
             return {
+                "error": failure_message,
                 "transcript": script,
                 "audio_path": None,
                 "metadata": {
                     "llm_model": self.model,
                     "num_segments": len(script),
                     "topic_focus": topic_focus,
+                    "tts_errors": self._last_tts_errors[:20],
                     "generated_at": datetime.utcnow().isoformat(),
                 },
             }
         voices = voice_maps.get(self.tts_provider, voice_maps["edge"])
         audio_files: List[str] = []
+        self._last_tts_errors = []
         total = len(script)
         for i, segment in enumerate(script, 1):
                 audio_files.append(output_path)
                 print(f"  ✓ Segment {i}/{total}: {speaker}")
             except Exception as e:
+                error_detail = (
+                    f"segment={i}/{total}, speaker={speaker}, voice={voice}, "
+                    f"error={type(e).__name__}: {' '.join(str(e).split())}"
+                )
+                self._last_tts_errors.append(error_detail)
+                print(f"  ⚠️  Failed {error_detail}")
                 continue
         return audio_files

src/artifacts/tts_adapter.py CHANGED Viewed

@@ -3,7 +3,7 @@ Text-to-Speech adapter supporting multiple providers.
 """
 import os
 from pathlib import Path
-from typing import Literal, Optional
 from abc import ABC, abstractmethod
 from dotenv import load_dotenv
@@ -66,26 +66,109 @@ class ElevenLabsTTS(TTSAdapter):
         from elevenlabs.client import ElevenLabs
         self.api_key = api_key or os.getenv("ELEVENLABS_API_KEY")
         self.client = ElevenLabs(api_key=self.api_key)
         self.default_voice = os.getenv("TTS_ELEVENLABS_VOICE_1", "Rachel")
     def synthesize(self, text: str, output_path: str, voice: Optional[str] = None) -> str:
         """
         Popular voices: Rachel, Domi, Bella, Antoni, Elli, Josh, Arnold, Adam, Sam
         """
-        voice = voice or self.default_voice
-        audio = self.client.generate(
-            text=text,
-            voice=voice,
-            model="eleven_monolingual_v1"
-        )
-        with open(output_path, "wb") as f:
-            for chunk in audio:
-                f.write(chunk)
-        return output_path
 class EdgeTTS(TTSAdapter):
@@ -156,4 +239,4 @@ if __name__ == "__main__":
     print(f"✓ Audio generated: {output_file}")
     print(f"  Provider: {args.provider or os.getenv('TTS_PROVIDER', 'edge')}")
-    print(f"  Voice: {args.voice or 'default'}")

 """
 import os
 from pathlib import Path
+from typing import Any, Literal, Optional
 from abc import ABC, abstractmethod
 from dotenv import load_dotenv
         from elevenlabs.client import ElevenLabs
         self.api_key = api_key or os.getenv("ELEVENLABS_API_KEY")
+        if not self.api_key:
+            raise ValueError("ELEVENLABS_API_KEY environment variable not set")
         self.client = ElevenLabs(api_key=self.api_key)
         self.default_voice = os.getenv("TTS_ELEVENLABS_VOICE_1", "Rachel")
+        self.default_model = os.getenv("TTS_ELEVENLABS_MODEL", "eleven_multilingual_v2")
+        self._voice_aliases = self._load_voice_aliases()
+    def _load_voice_aliases(self) -> dict[str, str]:
+        """Best-effort map of configured voice names to voice IDs."""
+        try:
+            response = self.client.voices.get_all()
+            voices = getattr(response, "voices", response)
+        except Exception:
+            return {}
+        aliases: dict[str, str] = {}
+        for voice in voices or []:
+            if isinstance(voice, dict):
+                name = voice.get("name")
+                voice_id = voice.get("voice_id")
+            else:
+                name = getattr(voice, "name", None)
+                voice_id = getattr(voice, "voice_id", None)
+            if name and voice_id:
+                aliases[str(name).strip().lower()] = str(voice_id).strip()
+        return aliases
+    def _resolve_voice(self, voice: str) -> str:
+        candidate = str(voice or "").strip()
+        if not candidate:
+            candidate = self.default_voice
+        return self._voice_aliases.get(candidate.lower(), candidate)
+    def _write_audio_output(self, audio: Any, output_path: str) -> None:
+        """
+        ElevenLabs SDK returns either bytes, file-like, or iterable chunks depending
+        on version/options. Handle all supported shapes safely.
+        """
+        Path(output_path).parent.mkdir(parents=True, exist_ok=True)
+        with open(output_path, "wb") as f:
+            if isinstance(audio, (bytes, bytearray)):
+                f.write(bytes(audio))
+                return
+            if hasattr(audio, "read"):
+                data = audio.read()
+                if isinstance(data, str):
+                    data = data.encode("utf-8")
+                if not isinstance(data, (bytes, bytearray)):
+                    raise TypeError("ElevenLabs returned unsupported file-like payload.")
+                f.write(bytes(data))
+                return
+            wrote_any = False
+            for chunk in audio:
+                if chunk is None:
+                    continue
+                wrote_any = True
+                if isinstance(chunk, int):
+                    f.write(bytes([chunk]))
+                elif isinstance(chunk, str):
+                    f.write(chunk.encode("utf-8"))
+                elif isinstance(chunk, (bytes, bytearray)):
+                    f.write(bytes(chunk))
+                else:
+                    raise TypeError(f"Unsupported ElevenLabs audio chunk type: {type(chunk)!r}")
+            if not wrote_any:
+                raise RuntimeError("ElevenLabs returned an empty audio stream.")
     def synthesize(self, text: str, output_path: str, voice: Optional[str] = None) -> str:
         """
         Popular voices: Rachel, Domi, Bella, Antoni, Elli, Josh, Arnold, Adam, Sam
         """
+        requested_voice = voice or self.default_voice
+        resolved_voice = self._resolve_voice(requested_voice)
+        voice_candidates = [resolved_voice]
+        if requested_voice != resolved_voice:
+            voice_candidates.append(requested_voice)
+        model_candidates = [self.default_model]
+        if self.default_model != "eleven_multilingual_v2":
+            model_candidates.append("eleven_multilingual_v2")
+        errors: list[str] = []
+        for voice_candidate in voice_candidates:
+            for model_candidate in model_candidates:
+                try:
+                    audio = self.client.generate(
+                        text=text,
+                        voice=voice_candidate,
+                        model=model_candidate,
+                    )
+                    self._write_audio_output(audio, output_path)
+                    return output_path
+                except Exception as exc:
+                    errors.append(
+                        f"voice={voice_candidate}, model={model_candidate}: "
+                        f"{type(exc).__name__}: {exc}"
+                    )
+        preview = " | ".join(errors[:3]) if errors else "unknown ElevenLabs error"
+        raise RuntimeError(f"ElevenLabs synthesis failed. {preview}")
 class EdgeTTS(TTSAdapter):
     print(f"✓ Audio generated: {output_file}")
     print(f"  Provider: {args.provider or os.getenv('TTS_PROVIDER', 'edge')}")
+    print(f"  Voice: {args.voice or 'default'}")