Chatterbox-Multilingual-TTS

Sleeping

App Files Files Community

flozi00 committed on Dec 18, 2025

Commit

0849031

1 Parent(s): 661b10f

chatterbox only

Browse files

Files changed (6) hide show

README.md +13 -8
app.py +73 -118
engine/backends/__init__.py +0 -8
engine/backends/gemini_backend.py +0 -267
engine/tts_engine.py +3 -15
requirements.txt +0 -3

README.md CHANGED Viewed

@@ -16,7 +16,7 @@ A modular text-to-speech engine for generating professional phone announcements
 ## Features
-- 🎙️ **Standard Voices (Default)**: Google Gemini TTS prebuilt voices
 - 🌍 **23 Languages**: German, English, French, Spanish, Italian, and many more
 - 🎭 **Voice Cloning**: Uses Chatterbox Multilingual + reference audio
 - 🔌 **Modular Architecture**: Easy to swap TTS backends
@@ -46,7 +46,7 @@ engine/
 └── backends/
     ├── base.py           # Abstract backend interface
     ├── chatterbox_backend.py  # Default: Chatterbox Multilingual
-    └── gemini_backend.py      # Optional: Google Gemini TTS
 ```
 ## Usage
@@ -82,11 +82,7 @@ audio = engine.generate(
 ### Switch Backend
-```python
-# Use Gemini instead of Chatterbox (requires a Gemini API key provided per request)
-engine.set_backend("gemini")
-audio = engine.generate("Hello world!", language="en")
-```
 ### With Background Music
@@ -157,7 +153,16 @@ engine = TTSEngine(config)
 ### Environment Variables
 - `HF_TOKEN`: HuggingFace token for model downloads (Chatterbox)
-- Gemini API key: Must be provided per request via the UI; do not rely on environment variables.
 ## Supported Languages

 ## Features
+- 🎙️ **Standard Voices (Default)**: Local voice prompts from `.wav` files in `voices/`
 - 🌍 **23 Languages**: German, English, French, Spanish, Italian, and many more
 - 🎭 **Voice Cloning**: Uses Chatterbox Multilingual + reference audio
 - 🔌 **Modular Architecture**: Easy to swap TTS backends
 └── backends/
     ├── base.py           # Abstract backend interface
     ├── chatterbox_backend.py  # Default: Chatterbox Multilingual
 ```
 ## Usage
 ### Switch Backend
+This project currently ships with the Chatterbox backend.
 ### With Background Music
 ### Environment Variables
 - `HF_TOKEN`: HuggingFace token for model downloads (Chatterbox)
+- `PHONE_SPEAKER_TTS_VOICES_DIR`: Override the default voices folder (defaults to `./voices`)
+### Default Voices Folder
+Put `.wav` files into `voices/` (or the folder pointed to by `PHONE_SPEAKER_TTS_VOICES_DIR`).
+The file name (without extension) becomes the voice name.
+Example: `voices/flozi.wav` → voice `flozi`.
+If the folder contains no `.wav` files, the UI will force **Voice cloning** and require an uploaded reference sample.
 ## Supported Languages

app.py CHANGED Viewed

@@ -1,13 +1,15 @@
 """Phone Speaker TTS - Gradio Application.
 UI requirements:
-- Use Gemini TTS by default for standard voices
 - Provide a dropdown to choose a voice
 - Include a "Voice cloning" option; when selected, show reference-audio upload
     and use Chatterbox (voice cloning capable) backend.
 """
 import random
 import gradio as gr
 import numpy as np
@@ -33,14 +35,6 @@ from engine import TTSEngine
 from engine.audio_processor import AudioProcessor
 from engine.backends.chatterbox_backend import DEFAULT_VOICE_PROMPTS
-try:
-    from engine.backends.gemini_backend import GeminiBackend
-    HAS_GEMINI_BACKEND = True
-except Exception:
-    GeminiBackend = None
-    HAS_GEMINI_BACKEND = False
 # --- Configuration ---
 DEVICE = (
     "cuda"
@@ -111,19 +105,27 @@ ENGINE = None
 VOICE_CLONING_OPTION = "Voice cloning"
-def _is_gemini_ready() -> bool:
-    """Return True if Gemini backend can be used (SDK import present).
-    API key may be provided per request via UI.
-    """
-    if not HAS_GEMINI_BACKEND:
-        return False
-    try:
-        import google.genai  # noqa: F401
-        return True
-    except Exception:
-        return False
 def get_engine() -> TTSEngine:
@@ -136,16 +138,14 @@ def get_engine() -> TTSEngine:
         logger.info("Initializing TTS Engine...")
         ENGINE = TTSEngine(
             EngineConfig(
-                default_backend="gemini",
                 device=DEVICE,
                 default_language="de",
             )
         )
-        # Do not force-load backends on startup.
-        # - Gemini can be authenticated via per-request API key in the UI.
-        # - Chatterbox is heavy and should only load when voice cloning is used.
-        ENGINE.set_backend("gemini")
         logger.info("TTS Engine ready!")
@@ -197,22 +197,19 @@ def get_default_voice(language: str) -> str:
 def get_voice_choices() -> list[str]:
     """Get voice dropdown choices.
-    - Standard voices: Gemini prebuilt voices
-    - Special entry: Voice cloning (uses Chatterbox)
     """
-    voices: list[str] = []
-    if HAS_GEMINI_BACKEND and _is_gemini_ready():
-        try:
-            voices.extend(list(GeminiBackend.AVAILABLE_VOICES))
-        except Exception:
-            pass
-    # Always include the special option
-    voices.append(VOICE_CLONING_OPTION)
-    return voices
 def _resolve_backend_for_voice_choice(voice_choice: str) -> str:
-    return "chatterbox" if voice_choice == VOICE_CLONING_OPTION else "gemini"
 def get_background_music_choices() -> list[tuple[str, str]]:
@@ -239,7 +236,6 @@ def generate_announcement(
     text: str,
     language: str,
     voice_choice: str,
-    gemini_api_key: str = "",
     voice_audio: str = None,
     background_music: str = "",
     custom_music: str = None,
@@ -280,18 +276,28 @@ def generate_announcement(
             torch.cuda.manual_seed_all(seed)
     # Voice resolution:
-    # - Voice cloning: use reference audio (or fallback per-language prompt)
-    # - Standard voice: use Gemini prebuilt voice
-    voice_kwargs = {}
-    if backend_name == "chatterbox":
-        if not voice_audio or not str(voice_audio).strip():
-            voice_audio = get_default_voice(language)
     else:
-        voice_audio = None
-        if voice_choice and voice_choice != VOICE_CLONING_OPTION:
-            voice_kwargs["voice"] = voice_choice
-        if gemini_api_key and str(gemini_api_key).strip():
-            voice_kwargs["api_key"] = str(gemini_api_key).strip()
     # Determine which background music to use (custom upload takes priority)
     music_path = None
@@ -315,7 +321,6 @@ def generate_announcement(
             language=language,
             voice_audio=voice_audio,
             split_sentences=True,
-            **voice_kwargs,
         )
         # Process with background music
@@ -359,7 +364,6 @@ def generate_announcement(
             language=language,
             voice_audio=voice_audio,
             split_sentences=True,
-            **voice_kwargs,
         )
         return result
@@ -374,33 +378,16 @@ def on_language_change(language: str, voice_choice: str):
 def on_voice_choice_change(voice_choice: str):
     """Switch UI elements depending on voice selection."""
-    backend = _resolve_backend_for_voice_choice(voice_choice)
-    if backend == "gemini":
-        language_choices = get_language_choices_for_backend("gemini")
-        default_language = (
-            "de"
-            if any(v == "de" for _, v in language_choices)
-            else (language_choices[0][1] if language_choices else "en")
-        )
-        return (
-            gr.update(choices=language_choices, value=default_language),
-            gr.update(visible=False, value=None),
-            gr.update(visible=True),
-            gr.update(value=get_example_text(default_language)),
-        )
-    # Voice cloning
     language_choices = get_language_choices_for_backend("chatterbox")
     default_language = (
         "de"
         if any(v == "de" for _, v in language_choices)
         else (language_choices[0][1] if language_choices else "en")
     )
     return (
         gr.update(choices=language_choices, value=default_language),
-        gr.update(visible=True, value=None),
-        gr.update(visible=False, value=""),
         gr.update(value=get_example_text(default_language)),
     )
@@ -429,65 +416,34 @@ def create_interface():
             elem_classes=["main-title"],
         )
-        if not _is_gemini_ready():
-            gr.Markdown(
-                """
-                **Note:** Gemini is currently unavailable.
-                Please install `google-genai` or use **Voice cloning**.
-                """
-            )
-        else:
-            gr.Markdown(
-                """
-                **Tip (Public App):** You can enter your own Gemini API key.
-                This way the costs are billed to the user rather than the app operator.
-                **Note:** API keys must be supplied per request via the UI; the app does not read keys from environment variables.
-                """
-            )
         with gr.Row():
             # Left column - Input
             with gr.Column(scale=1):
                 default_voice_choice = (
-                    "Kore"
-                    if _is_gemini_ready() and "Kore" in get_voice_choices()
-                    else VOICE_CLONING_OPTION
                 )
                 voice_choice = gr.Dropdown(
-                    choices=get_voice_choices(),
                     value=default_voice_choice,
                     label="🗣️ Voice",
-                    info="Default: Gemini voices. 'Voice cloning' uses reference audio (Chatterbox).",
-                )
-                gemini_api_key = gr.Textbox(
-                    label="🔑 Gemini API Key",
-                    type="password",
-                    placeholder="Enter Gemini API key for this request",
-                    info="Provide your Gemini API key for this request; environment variables are not used.",
-                    visible=(
-                        _is_gemini_ready()
-                        and default_voice_choice != VOICE_CLONING_OPTION
-                    ),
                 )
                 language = gr.Dropdown(
-                    choices=(
-                        get_language_choices_for_backend("gemini")
-                        if _is_gemini_ready()
-                        else get_language_choices_for_backend("chatterbox")
-                    ),
-                    value=(
-                        "de"
-                        if _is_gemini_ready()
-                        and any(
-                            v == "de"
-                            for _, v in get_language_choices_for_backend("gemini")
-                        )
-                        else "de"
-                    ),
                     label="🌍 Language",
                     info="Choose the language of the announcement",
                 )
@@ -608,7 +564,7 @@ def create_interface():
         voice_choice.change(
             fn=on_voice_choice_change,
             inputs=[voice_choice],
-            outputs=[language, voice_audio, gemini_api_key, text],
             show_progress=False,
         )
@@ -625,7 +581,6 @@ def create_interface():
                 text,
                 language,
                 voice_choice,
-                gemini_api_key,
                 voice_audio,
                 background_music,
                 custom_music,

 """Phone Speaker TTS - Gradio Application.
 UI requirements:
+- Load default voices from a folder of .wav files (e.g. voices/flozi.wav -> "flozi")
 - Provide a dropdown to choose a voice
 - Include a "Voice cloning" option; when selected, show reference-audio upload
     and use Chatterbox (voice cloning capable) backend.
 """
+import os
 import random
+from pathlib import Path
 import gradio as gr
 import numpy as np
 from engine.audio_processor import AudioProcessor
 from engine.backends.chatterbox_backend import DEFAULT_VOICE_PROMPTS
 # --- Configuration ---
 DEVICE = (
     "cuda"
 VOICE_CLONING_OPTION = "Voice cloning"
+def _get_voices_dir() -> Path:
+    env_dir = os.environ.get("PHONE_SPEAKER_TTS_VOICES_DIR")
+    if env_dir and str(env_dir).strip():
+        return Path(env_dir).expanduser()
+    return Path(__file__).parent / "voices"
+def _list_default_voices() -> dict[str, Path]:
+    voices_dir = _get_voices_dir()
+    if not voices_dir.exists() or not voices_dir.is_dir():
+        return {}
+    voices: dict[str, Path] = {}
+    for wav_path in sorted(voices_dir.glob("*.wav")):
+        name = wav_path.stem.strip()
+        if name:
+            voices[name] = wav_path
+    return voices
+def _has_default_voices() -> bool:
+    return len(_list_default_voices()) > 0
 def get_engine() -> TTSEngine:
         logger.info("Initializing TTS Engine...")
         ENGINE = TTSEngine(
             EngineConfig(
+                default_backend="chatterbox",
                 device=DEVICE,
                 default_language="de",
             )
         )
+        # Do not force-load models on startup; Chatterbox is heavy and should load on demand.
+        ENGINE.set_backend("chatterbox")
         logger.info("TTS Engine ready!")
 def get_voice_choices() -> list[str]:
     """Get voice dropdown choices.
+    - Standard voices: local .wav prompts from voices folder
+    - Special entry: Voice cloning (uses Chatterbox + user provided reference)
     """
+    voices = list(_list_default_voices().keys())
+    if voices:
+        voices.append(VOICE_CLONING_OPTION)
+        return voices
+    # If there are no default voices, force voice cloning.
+    return [VOICE_CLONING_OPTION]
 def _resolve_backend_for_voice_choice(voice_choice: str) -> str:
+    return "chatterbox"
 def get_background_music_choices() -> list[tuple[str, str]]:
     text: str,
     language: str,
     voice_choice: str,
     voice_audio: str = None,
     background_music: str = "",
     custom_music: str = None,
             torch.cuda.manual_seed_all(seed)
     # Voice resolution:
+    # - Default voice: use voices/<name>.wav (local prompt)
+    # - Voice cloning: use uploaded reference audio
+    default_voices = _list_default_voices()
+    if voice_choice != VOICE_CLONING_OPTION:
+        if voice_choice not in default_voices:
+            raise gr.Error(
+                f"Unknown voice '{voice_choice}'. Add '{voice_choice}.wav' to '{_get_voices_dir()}' or select '{VOICE_CLONING_OPTION}'."
+            )
+        voice_audio = str(default_voices[voice_choice])
     else:
+        # Force voice cloning when there are no default voices.
+        if not _has_default_voices():
+            if not voice_audio or not str(voice_audio).strip():
+                raise gr.Error(
+                    f"No default voices found in '{_get_voices_dir()}'. Please upload a reference audio sample for voice cloning."
+                )
+        # If default voices exist, keep previous behavior: fall back to a per-language prompt.
+        if (
+            voice_audio is None or not str(voice_audio).strip()
+        ) and _has_default_voices():
+            voice_audio = get_default_voice(language)
     # Determine which background music to use (custom upload takes priority)
     music_path = None
             language=language,
             voice_audio=voice_audio,
             split_sentences=True,
         )
         # Process with background music
             language=language,
             voice_audio=voice_audio,
             split_sentences=True,
         )
         return result
 def on_voice_choice_change(voice_choice: str):
     """Switch UI elements depending on voice selection."""
     language_choices = get_language_choices_for_backend("chatterbox")
     default_language = (
         "de"
         if any(v == "de" for _, v in language_choices)
         else (language_choices[0][1] if language_choices else "en")
     )
+    show_voice_audio = voice_choice == VOICE_CLONING_OPTION
     return (
         gr.update(choices=language_choices, value=default_language),
+        gr.update(visible=show_voice_audio, value=None if show_voice_audio else None),
         gr.update(value=get_example_text(default_language)),
     )
             elem_classes=["main-title"],
         )
+        voices_dir = _get_voices_dir()
+        gr.Markdown(
+            f"""
+            **Default voices folder:** `{voices_dir}`
+            Put `.wav` files there named like `flozi.wav` → voice `flozi`.
+            If the folder has no `.wav` files, the UI will force **Voice cloning**.
+            """
+        )
         with gr.Row():
             # Left column - Input
             with gr.Column(scale=1):
+                voice_choices = get_voice_choices()
                 default_voice_choice = (
+                    voice_choices[0] if voice_choices else VOICE_CLONING_OPTION
                 )
                 voice_choice = gr.Dropdown(
+                    choices=voice_choices,
                     value=default_voice_choice,
                     label="🗣️ Voice",
+                    info="Default voices come from the voices folder. 'Voice cloning' uses uploaded reference audio.",
                 )
                 language = gr.Dropdown(
+                    choices=get_language_choices_for_backend("chatterbox"),
+                    value="de",
                     label="🌍 Language",
                     info="Choose the language of the announcement",
                 )
         voice_choice.change(
             fn=on_voice_choice_change,
             inputs=[voice_choice],
+            outputs=[language, voice_audio, text],
             show_progress=False,
         )
                 text,
                 language,
                 voice_choice,
                 voice_audio,
                 background_music,
                 custom_music,

engine/backends/__init__.py CHANGED Viewed

@@ -3,11 +3,3 @@ from .base import BackendConfig, TTSBackend, TTSResult
 from .chatterbox_backend import ChatterboxBackend
 __all__ = ["TTSBackend", "TTSResult", "BackendConfig", "ChatterboxBackend"]
-# Optional backends
-try:
-    from .gemini_backend import GeminiBackend
-    __all__.append("GeminiBackend")
-except ImportError:
-    pass  # google-genai not installed


3	from .chatterbox_backend import ChatterboxBackend
4
5	__all__ = ["TTSBackend", "TTSResult", "BackendConfig", "ChatterboxBackend"]

engine/backends/gemini_backend.py DELETED Viewed

@@ -1,267 +0,0 @@
-"""
-Google Gemini TTS Backend.
-Uses Google's Gemini API for text-to-speech synthesis.
-"""
-import io
-from typing import Optional
-import numpy as np
-from loguru import logger
-from .base import BackendConfig, TTSBackend, TTSResult
-class GeminiBackend(TTSBackend):
-    """
-    Google Gemini TTS Backend.
-    Features:
-    - High-quality neural TTS
-    - Multiple preset voices
-    - No voice cloning (uses preset voices)
-    Authentication:
-    - API key must be provided per request (do not rely on environment variables).
-    - Per-request keys are recommended for public apps.
-    """
-    # Available Gemini voices
-    AVAILABLE_VOICES = [
-        "Puck",
-        "Charon",
-        "Kore",
-        "Fenrir",
-        "Aoede",
-        "Leda",
-        "Orus",
-        "Zephyr",
-    ]
-    # Gemini has limited language support compared to Chatterbox
-    SUPPORTED_LANGUAGES = {
-        "en": "English",
-        "de": "German",
-        "es": "Spanish",
-        "fr": "French",
-        "it": "Italian",
-        "pt": "Portuguese",
-        "ja": "Japanese",
-        "ko": "Korean",
-        "zh": "Chinese",
-    }
-    def __init__(
-        self,
-        config: Optional[BackendConfig] = None,
-        voice: str = "Kore",
-        api_key: Optional[str] = None,
-    ):
-        super().__init__(config)
-        self._client = None
-        self._api_key: Optional[str] = api_key
-        self._api_key_fingerprint: Optional[str] = None
-        self.voice = voice if voice in self.AVAILABLE_VOICES else "Kore"
-    @property
-    def name(self) -> str:
-        return "Google Gemini TTS"
-    @property
-    def supports_voice_cloning(self) -> bool:
-        return False
-    @property
-    def supported_languages(self) -> dict[str, str]:
-        return self.SUPPORTED_LANGUAGES.copy()
-    def set_api_key(self, api_key: Optional[str]) -> None:
-        """Set (or clear) the API key used by this backend.
-        Note: This is kept in memory only.
-        """
-        api_key = (api_key or "").strip() or None
-        if api_key == self._api_key:
-            return
-        self._api_key = api_key
-        # Force re-init on next call.
-        if self._is_loaded:
-            self.unload()
-    def load(self, api_key: Optional[str] = None) -> None:
-        """Initialize the Gemini client. The API key must be provided per request."""
-        desired_key = (api_key or self._api_key or "").strip()
-        if not desired_key:
-            raise ValueError(
-                "Gemini API key missing. Provide api_key for this request (do not rely on environment variables)."
-            )
-        desired_fingerprint = f"len:{len(desired_key)}"
-        if self._is_loaded and self._client is not None:
-            if self._api_key_fingerprint == desired_fingerprint:
-                return
-            # Different key than the currently initialized client.
-            self.unload()
-        try:
-            import google.genai as genai
-            self._client = genai.Client(api_key=desired_key)
-            self._is_loaded = True
-            self._api_key_fingerprint = desired_fingerprint
-            logger.info("Gemini client initialized successfully")
-        except Exception as e:
-            logger.error(f"Failed to initialize Gemini client: {e}")
-            raise
-    def unload(self) -> None:
-        """Clean up Gemini client."""
-        self._client = None
-        self._is_loaded = False
-        self._api_key_fingerprint = None
-        logger.info("Gemini client unloaded")
-    def set_voice(self, voice: str) -> None:
-        """Set the voice to use for synthesis."""
-        if voice not in self.AVAILABLE_VOICES:
-            raise ValueError(
-                f"Unknown voice '{voice}'. Available: {self.AVAILABLE_VOICES}"
-            )
-        self.voice = voice
-    def generate(
-        self,
-        text: str,
-        language: str = "de",
-        voice_audio_path: Optional[str] = None,
-        voice: Optional[str] = None,
-        api_key: Optional[str] = None,
-        **kwargs,
-    ) -> TTSResult:
-        """
-        Generate speech from text using Gemini.
-        Args:
-            text: Text to synthesize
-            language: Language code (for text processing, voice determines actual synthesis)
-            voice_audio_path: Ignored (Gemini doesn't support voice cloning)
-            voice: Voice name to use (default: instance voice setting)
-        Returns:
-            TTSResult with audio waveform and sample rate
-        """
-        # Allow per-request key (useful for public apps where users bring their own key).
-        self.load(api_key=api_key)
-        if voice_audio_path:
-            logger.warning(
-                "Gemini backend doesn't support voice cloning, ignoring voice_audio_path"
-            )
-        from google.genai import types as genai_types
-        selected_voice = voice or self.voice
-        logger.info(
-            f"Generating speech with Gemini: voice={selected_voice}, text='{text[:50]}...'"
-        )
-        contents = [
-            genai_types.Content(
-                role="user", parts=[genai_types.Part.from_text(text=text)]
-            )
-        ]
-        config = genai_types.GenerateContentConfig(
-            temperature=1,
-            response_modalities=["audio"],
-            speech_config=genai_types.SpeechConfig(
-                voice_config=genai_types.VoiceConfig(
-                    prebuilt_voice_config=genai_types.PrebuiltVoiceConfig(
-                        voice_name=selected_voice
-                    )
-                )
-            ),
-        )
-        try:
-            audio_chunks = []
-            mime_type = None
-            for chunk in self._client.models.generate_content_stream(
-                model="gemini-2.5-pro-preview-tts",
-                contents=contents,
-                config=config,
-            ):
-                if chunk.candidates:
-                    inline_data = chunk.candidates[0].content.parts[0].inline_data
-                    audio_chunks.append(inline_data.data)
-                    if mime_type is None:
-                        mime_type = inline_data.mime_type
-            if not audio_chunks:
-                raise RuntimeError("No audio data received from Gemini API")
-            raw_audio = b"".join(audio_chunks)
-            # Convert to numpy array
-            audio_np, sample_rate = self._process_audio(raw_audio, mime_type)
-            return TTSResult(audio=audio_np, sample_rate=sample_rate)
-        except Exception as e:
-            logger.error(f"Gemini TTS generation failed: {e}")
-            raise
-    def _process_audio(
-        self, raw_audio: bytes, mime_type: str
-    ) -> tuple[np.ndarray, int]:
-        """Process raw audio data from Gemini into numpy array."""
-        from pydub import AudioSegment
-        # Parse MIME type for audio parameters
-        sample_rate = 24000  # Default
-        bits_per_sample = 16
-        if mime_type and "audio/L" in mime_type:
-            # Parse format like audio/L16;rate=24000
-            parts = mime_type.split(";")
-            for part in parts:
-                part = part.strip()
-                if part.startswith("audio/L"):
-                    try:
-                        bits_per_sample = int(part.split("L")[1])
-                    except (ValueError, IndexError):
-                        pass
-                elif part.lower().startswith("rate="):
-                    try:
-                        sample_rate = int(part.split("=")[1])
-                    except (ValueError, IndexError):
-                        pass
-            # Create AudioSegment from raw PCM
-            audio_segment = AudioSegment(
-                data=raw_audio,
-                sample_width=bits_per_sample // 8,
-                frame_rate=sample_rate,
-                channels=1,
-            )
-        elif mime_type == "audio/mpeg":
-            audio_segment = AudioSegment.from_file(io.BytesIO(raw_audio), format="mp3")
-            sample_rate = audio_segment.frame_rate
-        else:
-            # Try auto-detection
-            audio_segment = AudioSegment.from_file(io.BytesIO(raw_audio))
-            sample_rate = audio_segment.frame_rate
-        # Convert to numpy array
-        samples = np.array(audio_segment.get_array_of_samples())
-        # Normalize to float32 [-1, 1]
-        if audio_segment.sample_width == 2:  # 16-bit
-            samples = samples.astype(np.float32) / 32768.0
-        elif audio_segment.sample_width == 1:  # 8-bit
-            samples = (samples.astype(np.float32) - 128) / 128.0
-        return samples, sample_rate

engine/tts_engine.py CHANGED Viewed

@@ -28,7 +28,7 @@ class EngineConfig:
     """Configuration for the TTS Engine."""
     # Backend settings
-    default_backend: str = "gemini"
     device: str = "auto"  # "auto", "cuda", "mps", "cpu"
     # Default generation settings
@@ -63,7 +63,7 @@ class TTSEngine:
         )
         # Switch backend
-        engine.set_backend("gemini")
         audio = engine.generate("Welcome to our service.", language="en")
     """
@@ -188,16 +188,13 @@ class TTSEngine:
         # Generate voice ID for caching.
         # - Voice cloning: derive from reference audio when available
-        # - Preset voices (e.g. Gemini): include requested voice in cache key
-        requested_voice = kwargs.get("voice")
         if voice_audio:
             voice_id = (
                 Path(voice_audio).stem
                 if os.path.exists(voice_audio or "")
                 else "custom"
             )
-        elif requested_voice:
-            voice_id = f"voice-{requested_voice}"
         else:
             voice_id = "default"
@@ -299,12 +296,3 @@ class TTSEngine:
     def clear_cache(self) -> int:
         """Clear the local audio cache. Returns number of files deleted."""
         return self._cache.clear_local()
-# Register additional backends if available
-try:
-    from .backends.gemini_backend import GeminiBackend
-    TTSEngine.register_backend("gemini", GeminiBackend)
-except ImportError:
-    pass  # Gemini backend not available

     """Configuration for the TTS Engine."""
     # Backend settings
+    default_backend: str = "chatterbox"
     device: str = "auto"  # "auto", "cuda", "mps", "cpu"
     # Default generation settings
         )
         # Switch backend
+        engine.set_backend("chatterbox")
         audio = engine.generate("Welcome to our service.", language="en")
     """
         # Generate voice ID for caching.
         # - Voice cloning: derive from reference audio when available
+        # - If no reference audio: use "default"
         if voice_audio:
             voice_id = (
                 Path(voice_audio).stem
                 if os.path.exists(voice_audio or "")
                 else "custom"
             )
         else:
             voice_id = "default"
     def clear_cache(self) -> int:
         """Clear the local audio cache. Returns number of files deleted."""
         return self._cache.clear_local()

requirements.txt CHANGED Viewed

@@ -25,9 +25,6 @@ huggingface_hub>=0.20.0
 # Logging
 loguru>=0.7.0
-# Optional: Gemini backend
-google-genai>=0.3.0
 # Optional: Caching to HuggingFace Hub
 # pandas>=2.0.0

 # Logging
 loguru>=0.7.0
 # Optional: Caching to HuggingFace Hub
 # pandas>=2.0.0