flozi00 committed on
Commit
0849031
·
1 Parent(s): 661b10f

chatterbox only

Browse files
README.md CHANGED
@@ -16,7 +16,7 @@ A modular text-to-speech engine for generating professional phone announcements
16
 
17
  ## Features
18
 
19
- - 🎙️ **Standard Voices (Default)**: Google Gemini TTS prebuilt voices
20
  - 🌍 **23 Languages**: German, English, French, Spanish, Italian, and many more
21
  - 🎭 **Voice Cloning**: Uses Chatterbox Multilingual + reference audio
22
  - 🔌 **Modular Architecture**: Easy to swap TTS backends
@@ -46,7 +46,7 @@ engine/
46
  └── backends/
47
  ├── base.py # Abstract backend interface
48
  ├── chatterbox_backend.py # Default: Chatterbox Multilingual
49
- └── gemini_backend.py # Optional: Google Gemini TTS
50
  ```
51
 
52
  ## Usage
@@ -82,11 +82,7 @@ audio = engine.generate(
82
 
83
  ### Switch Backend
84
 
85
- ```python
86
- # Use Gemini instead of Chatterbox (requires a Gemini API key provided per request)
87
- engine.set_backend("gemini")
88
- audio = engine.generate("Hello world!", language="en")
89
- ```
90
 
91
  ### With Background Music
92
 
@@ -157,7 +153,16 @@ engine = TTSEngine(config)
157
  ### Environment Variables
158
 
159
  - `HF_TOKEN`: HuggingFace token for model downloads (Chatterbox)
160
- - Gemini API key: Must be provided per request via the UI; do not rely on environment variables.
 
 
 
 
 
 
 
 
 
161
 
162
 
163
  ## Supported Languages
 
16
 
17
  ## Features
18
 
19
+ - 🎙️ **Standard Voices (Default)**: Local voice prompts from `.wav` files in `voices/`
20
  - 🌍 **23 Languages**: German, English, French, Spanish, Italian, and many more
21
  - 🎭 **Voice Cloning**: Uses Chatterbox Multilingual + reference audio
22
  - 🔌 **Modular Architecture**: Easy to swap TTS backends
 
46
  └── backends/
47
  ├── base.py # Abstract backend interface
48
  ├── chatterbox_backend.py # Default: Chatterbox Multilingual
49
+
50
  ```
51
 
52
  ## Usage
 
82
 
83
  ### Switch Backend
84
 
85
+ This project currently ships with the Chatterbox backend.
 
 
 
 
86
 
87
  ### With Background Music
88
 
 
153
  ### Environment Variables
154
 
155
  - `HF_TOKEN`: HuggingFace token for model downloads (Chatterbox)
156
+ - `PHONE_SPEAKER_TTS_VOICES_DIR`: Override the default voices folder (defaults to `./voices`)
157
+
158
+ ### Default Voices Folder
159
+
160
+ Put `.wav` files into `voices/` (or the folder pointed to by `PHONE_SPEAKER_TTS_VOICES_DIR`).
161
+ The file name (without extension) becomes the voice name.
162
+
163
+ Example: `voices/flozi.wav` → voice `flozi`.
164
+
165
+ If the folder contains no `.wav` files, the UI will force **Voice cloning** and require an uploaded reference sample.
166
 
167
 
168
  ## Supported Languages
app.py CHANGED
@@ -1,13 +1,15 @@
1
  """Phone Speaker TTS - Gradio Application.
2
 
3
  UI requirements:
4
- - Use Gemini TTS by default for standard voices
5
  - Provide a dropdown to choose a voice
6
  - Include a "Voice cloning" option; when selected, show reference-audio upload
7
  and use Chatterbox (voice cloning capable) backend.
8
  """
9
 
 
10
  import random
 
11
 
12
  import gradio as gr
13
  import numpy as np
@@ -33,14 +35,6 @@ from engine import TTSEngine
33
  from engine.audio_processor import AudioProcessor
34
  from engine.backends.chatterbox_backend import DEFAULT_VOICE_PROMPTS
35
 
36
- try:
37
- from engine.backends.gemini_backend import GeminiBackend
38
-
39
- HAS_GEMINI_BACKEND = True
40
- except Exception:
41
- GeminiBackend = None
42
- HAS_GEMINI_BACKEND = False
43
-
44
  # --- Configuration ---
45
  DEVICE = (
46
  "cuda"
@@ -111,19 +105,27 @@ ENGINE = None
111
  VOICE_CLONING_OPTION = "Voice cloning"
112
 
113
 
114
- def _is_gemini_ready() -> bool:
115
- """Return True if Gemini backend can be used (SDK import present).
 
 
 
116
 
117
- API key may be provided per request via UI.
118
- """
119
- if not HAS_GEMINI_BACKEND:
120
- return False
121
- try:
122
- import google.genai # noqa: F401
123
 
124
- return True
125
- except Exception:
126
- return False
 
 
 
 
 
 
 
 
 
 
 
127
 
128
 
129
  def get_engine() -> TTSEngine:
@@ -136,16 +138,14 @@ def get_engine() -> TTSEngine:
136
  logger.info("Initializing TTS Engine...")
137
  ENGINE = TTSEngine(
138
  EngineConfig(
139
- default_backend="gemini",
140
  device=DEVICE,
141
  default_language="de",
142
  )
143
  )
144
 
145
- # Do not force-load backends on startup.
146
- # - Gemini can be authenticated via per-request API key in the UI.
147
- # - Chatterbox is heavy and should only load when voice cloning is used.
148
- ENGINE.set_backend("gemini")
149
 
150
  logger.info("TTS Engine ready!")
151
 
@@ -197,22 +197,19 @@ def get_default_voice(language: str) -> str:
197
  def get_voice_choices() -> list[str]:
198
  """Get voice dropdown choices.
199
 
200
- - Standard voices: Gemini prebuilt voices
201
- - Special entry: Voice cloning (uses Chatterbox)
202
  """
203
- voices: list[str] = []
204
- if HAS_GEMINI_BACKEND and _is_gemini_ready():
205
- try:
206
- voices.extend(list(GeminiBackend.AVAILABLE_VOICES))
207
- except Exception:
208
- pass
209
- # Always include the special option
210
- voices.append(VOICE_CLONING_OPTION)
211
- return voices
212
 
213
 
214
  def _resolve_backend_for_voice_choice(voice_choice: str) -> str:
215
- return "chatterbox" if voice_choice == VOICE_CLONING_OPTION else "gemini"
216
 
217
 
218
  def get_background_music_choices() -> list[tuple[str, str]]:
@@ -239,7 +236,6 @@ def generate_announcement(
239
  text: str,
240
  language: str,
241
  voice_choice: str,
242
- gemini_api_key: str = "",
243
  voice_audio: str = None,
244
  background_music: str = "",
245
  custom_music: str = None,
@@ -280,18 +276,28 @@ def generate_announcement(
280
  torch.cuda.manual_seed_all(seed)
281
 
282
  # Voice resolution:
283
- # - Voice cloning: use reference audio (or fallback per-language prompt)
284
- # - Standard voice: use Gemini prebuilt voice
285
- voice_kwargs = {}
286
- if backend_name == "chatterbox":
287
- if not voice_audio or not str(voice_audio).strip():
288
- voice_audio = get_default_voice(language)
 
 
 
 
289
  else:
290
- voice_audio = None
291
- if voice_choice and voice_choice != VOICE_CLONING_OPTION:
292
- voice_kwargs["voice"] = voice_choice
293
- if gemini_api_key and str(gemini_api_key).strip():
294
- voice_kwargs["api_key"] = str(gemini_api_key).strip()
 
 
 
 
 
 
295
 
296
  # Determine which background music to use (custom upload takes priority)
297
  music_path = None
@@ -315,7 +321,6 @@ def generate_announcement(
315
  language=language,
316
  voice_audio=voice_audio,
317
  split_sentences=True,
318
- **voice_kwargs,
319
  )
320
 
321
  # Process with background music
@@ -359,7 +364,6 @@ def generate_announcement(
359
  language=language,
360
  voice_audio=voice_audio,
361
  split_sentences=True,
362
- **voice_kwargs,
363
  )
364
  return result
365
 
@@ -374,33 +378,16 @@ def on_language_change(language: str, voice_choice: str):
374
 
375
  def on_voice_choice_change(voice_choice: str):
376
  """Switch UI elements depending on voice selection."""
377
- backend = _resolve_backend_for_voice_choice(voice_choice)
378
-
379
- if backend == "gemini":
380
- language_choices = get_language_choices_for_backend("gemini")
381
- default_language = (
382
- "de"
383
- if any(v == "de" for _, v in language_choices)
384
- else (language_choices[0][1] if language_choices else "en")
385
- )
386
- return (
387
- gr.update(choices=language_choices, value=default_language),
388
- gr.update(visible=False, value=None),
389
- gr.update(visible=True),
390
- gr.update(value=get_example_text(default_language)),
391
- )
392
-
393
- # Voice cloning
394
  language_choices = get_language_choices_for_backend("chatterbox")
395
  default_language = (
396
  "de"
397
  if any(v == "de" for _, v in language_choices)
398
  else (language_choices[0][1] if language_choices else "en")
399
  )
 
400
  return (
401
  gr.update(choices=language_choices, value=default_language),
402
- gr.update(visible=True, value=None),
403
- gr.update(visible=False, value=""),
404
  gr.update(value=get_example_text(default_language)),
405
  )
406
 
@@ -429,65 +416,34 @@ def create_interface():
429
  elem_classes=["main-title"],
430
  )
431
 
432
- if not _is_gemini_ready():
433
- gr.Markdown(
434
- """
435
- **Note:** Gemini is currently unavailable.
436
- Please install `google-genai` or use **Voice cloning**.
437
- """
438
- )
439
- else:
440
- gr.Markdown(
441
- """
442
- **Tip (Public App):** You can enter your own Gemini API key.
443
- This way the costs are billed to the user rather than the app operator.
444
-
445
- **Note:** API keys must be supplied per request via the UI; the app does not read keys from environment variables.
446
- """
447
- )
448
 
449
  with gr.Row():
450
  # Left column - Input
451
  with gr.Column(scale=1):
 
452
  default_voice_choice = (
453
- "Kore"
454
- if _is_gemini_ready() and "Kore" in get_voice_choices()
455
- else VOICE_CLONING_OPTION
456
  )
457
 
458
  voice_choice = gr.Dropdown(
459
- choices=get_voice_choices(),
460
  value=default_voice_choice,
461
  label="πŸ—£οΈ Voice",
462
- info="Default: Gemini voices. 'Voice cloning' uses reference audio (Chatterbox).",
463
- )
464
-
465
- gemini_api_key = gr.Textbox(
466
- label="πŸ”‘ Gemini API Key",
467
- type="password",
468
- placeholder="Enter Gemini API key for this request",
469
- info="Provide your Gemini API key for this request; environment variables are not used.",
470
- visible=(
471
- _is_gemini_ready()
472
- and default_voice_choice != VOICE_CLONING_OPTION
473
- ),
474
  )
475
 
476
  language = gr.Dropdown(
477
- choices=(
478
- get_language_choices_for_backend("gemini")
479
- if _is_gemini_ready()
480
- else get_language_choices_for_backend("chatterbox")
481
- ),
482
- value=(
483
- "de"
484
- if _is_gemini_ready()
485
- and any(
486
- v == "de"
487
- for _, v in get_language_choices_for_backend("gemini")
488
- )
489
- else "de"
490
- ),
491
  label="🌍 Language",
492
  info="Choose the language of the announcement",
493
  )
@@ -608,7 +564,7 @@ def create_interface():
608
  voice_choice.change(
609
  fn=on_voice_choice_change,
610
  inputs=[voice_choice],
611
- outputs=[language, voice_audio, gemini_api_key, text],
612
  show_progress=False,
613
  )
614
 
@@ -625,7 +581,6 @@ def create_interface():
625
  text,
626
  language,
627
  voice_choice,
628
- gemini_api_key,
629
  voice_audio,
630
  background_music,
631
  custom_music,
 
1
  """Phone Speaker TTS - Gradio Application.
2
 
3
  UI requirements:
4
+ - Load default voices from a folder of .wav files (e.g. voices/flozi.wav -> "flozi")
5
  - Provide a dropdown to choose a voice
6
  - Include a "Voice cloning" option; when selected, show reference-audio upload
7
  and use Chatterbox (voice cloning capable) backend.
8
  """
9
 
10
+ import os
11
  import random
12
+ from pathlib import Path
13
 
14
  import gradio as gr
15
  import numpy as np
 
35
  from engine.audio_processor import AudioProcessor
36
  from engine.backends.chatterbox_backend import DEFAULT_VOICE_PROMPTS
37
 
 
 
 
 
 
 
 
 
38
  # --- Configuration ---
39
  DEVICE = (
40
  "cuda"
 
105
  VOICE_CLONING_OPTION = "Voice cloning"
106
 
107
 
108
+ def _get_voices_dir() -> Path:
109
+ env_dir = os.environ.get("PHONE_SPEAKER_TTS_VOICES_DIR")
110
+ if env_dir and str(env_dir).strip():
111
+ return Path(env_dir).expanduser()
112
+ return Path(__file__).parent / "voices"
113
 
 
 
 
 
 
 
114
 
115
+ def _list_default_voices() -> dict[str, Path]:
116
+ voices_dir = _get_voices_dir()
117
+ if not voices_dir.exists() or not voices_dir.is_dir():
118
+ return {}
119
+ voices: dict[str, Path] = {}
120
+ for wav_path in sorted(voices_dir.glob("*.wav")):
121
+ name = wav_path.stem.strip()
122
+ if name:
123
+ voices[name] = wav_path
124
+ return voices
125
+
126
+
127
+ def _has_default_voices() -> bool:
128
+ return len(_list_default_voices()) > 0
129
 
130
 
131
  def get_engine() -> TTSEngine:
 
138
  logger.info("Initializing TTS Engine...")
139
  ENGINE = TTSEngine(
140
  EngineConfig(
141
+ default_backend="chatterbox",
142
  device=DEVICE,
143
  default_language="de",
144
  )
145
  )
146
 
147
+ # Do not force-load models on startup; Chatterbox is heavy and should load on demand.
148
+ ENGINE.set_backend("chatterbox")
 
 
149
 
150
  logger.info("TTS Engine ready!")
151
 
 
197
  def get_voice_choices() -> list[str]:
198
  """Get voice dropdown choices.
199
 
200
+ - Standard voices: local .wav prompts from voices folder
201
+ - Special entry: Voice cloning (uses Chatterbox + user provided reference)
202
  """
203
+ voices = list(_list_default_voices().keys())
204
+ if voices:
205
+ voices.append(VOICE_CLONING_OPTION)
206
+ return voices
207
+ # If there are no default voices, force voice cloning.
208
+ return [VOICE_CLONING_OPTION]
 
 
 
209
 
210
 
211
  def _resolve_backend_for_voice_choice(voice_choice: str) -> str:
212
+ return "chatterbox"
213
 
214
 
215
  def get_background_music_choices() -> list[tuple[str, str]]:
 
236
  text: str,
237
  language: str,
238
  voice_choice: str,
 
239
  voice_audio: str = None,
240
  background_music: str = "",
241
  custom_music: str = None,
 
276
  torch.cuda.manual_seed_all(seed)
277
 
278
  # Voice resolution:
279
+ # - Default voice: use voices/<name>.wav (local prompt)
280
+ # - Voice cloning: use uploaded reference audio
281
+ default_voices = _list_default_voices()
282
+
283
+ if voice_choice != VOICE_CLONING_OPTION:
284
+ if voice_choice not in default_voices:
285
+ raise gr.Error(
286
+ f"Unknown voice '{voice_choice}'. Add '{voice_choice}.wav' to '{_get_voices_dir()}' or select '{VOICE_CLONING_OPTION}'."
287
+ )
288
+ voice_audio = str(default_voices[voice_choice])
289
  else:
290
+ # Force voice cloning when there are no default voices.
291
+ if not _has_default_voices():
292
+ if not voice_audio or not str(voice_audio).strip():
293
+ raise gr.Error(
294
+ f"No default voices found in '{_get_voices_dir()}'. Please upload a reference audio sample for voice cloning."
295
+ )
296
+ # If default voices exist, keep previous behavior: fall back to a per-language prompt.
297
+ if (
298
+ voice_audio is None or not str(voice_audio).strip()
299
+ ) and _has_default_voices():
300
+ voice_audio = get_default_voice(language)
301
 
302
  # Determine which background music to use (custom upload takes priority)
303
  music_path = None
 
321
  language=language,
322
  voice_audio=voice_audio,
323
  split_sentences=True,
 
324
  )
325
 
326
  # Process with background music
 
364
  language=language,
365
  voice_audio=voice_audio,
366
  split_sentences=True,
 
367
  )
368
  return result
369
 
 
378
 
379
  def on_voice_choice_change(voice_choice: str):
380
  """Switch UI elements depending on voice selection."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
381
  language_choices = get_language_choices_for_backend("chatterbox")
382
  default_language = (
383
  "de"
384
  if any(v == "de" for _, v in language_choices)
385
  else (language_choices[0][1] if language_choices else "en")
386
  )
387
+ show_voice_audio = voice_choice == VOICE_CLONING_OPTION
388
  return (
389
  gr.update(choices=language_choices, value=default_language),
390
+ gr.update(visible=show_voice_audio, value=None if show_voice_audio else None),
 
391
  gr.update(value=get_example_text(default_language)),
392
  )
393
 
 
416
  elem_classes=["main-title"],
417
  )
418
 
419
+ voices_dir = _get_voices_dir()
420
+ gr.Markdown(
421
+ f"""
422
+ **Default voices folder:** `{voices_dir}`
423
+
424
+ Put `.wav` files there named like `flozi.wav` β†’ voice `flozi`.
425
+ If the folder has no `.wav` files, the UI will force **Voice cloning**.
426
+ """
427
+ )
 
 
 
 
 
 
 
428
 
429
  with gr.Row():
430
  # Left column - Input
431
  with gr.Column(scale=1):
432
+ voice_choices = get_voice_choices()
433
  default_voice_choice = (
434
+ voice_choices[0] if voice_choices else VOICE_CLONING_OPTION
 
 
435
  )
436
 
437
  voice_choice = gr.Dropdown(
438
+ choices=voice_choices,
439
  value=default_voice_choice,
440
  label="πŸ—£οΈ Voice",
441
+ info="Default voices come from the voices folder. 'Voice cloning' uses uploaded reference audio.",
 
 
 
 
 
 
 
 
 
 
 
442
  )
443
 
444
  language = gr.Dropdown(
445
+ choices=get_language_choices_for_backend("chatterbox"),
446
+ value="de",
 
 
 
 
 
 
 
 
 
 
 
 
447
  label="🌍 Language",
448
  info="Choose the language of the announcement",
449
  )
 
564
  voice_choice.change(
565
  fn=on_voice_choice_change,
566
  inputs=[voice_choice],
567
+ outputs=[language, voice_audio, text],
568
  show_progress=False,
569
  )
570
 
 
581
  text,
582
  language,
583
  voice_choice,
 
584
  voice_audio,
585
  background_music,
586
  custom_music,
engine/backends/__init__.py CHANGED
@@ -3,11 +3,3 @@ from .base import BackendConfig, TTSBackend, TTSResult
3
  from .chatterbox_backend import ChatterboxBackend
4
 
5
  __all__ = ["TTSBackend", "TTSResult", "BackendConfig", "ChatterboxBackend"]
6
-
7
- # Optional backends
8
- try:
9
- from .gemini_backend import GeminiBackend
10
-
11
- __all__.append("GeminiBackend")
12
- except ImportError:
13
- pass # google-genai not installed
 
3
  from .chatterbox_backend import ChatterboxBackend
4
 
5
  __all__ = ["TTSBackend", "TTSResult", "BackendConfig", "ChatterboxBackend"]
 
 
 
 
 
 
 
 
engine/backends/gemini_backend.py DELETED
@@ -1,267 +0,0 @@
1
- """
2
- Google Gemini TTS Backend.
3
- Uses Google's Gemini API for text-to-speech synthesis.
4
- """
5
-
6
- import io
7
- from typing import Optional
8
-
9
- import numpy as np
10
- from loguru import logger
11
-
12
- from .base import BackendConfig, TTSBackend, TTSResult
13
-
14
-
15
- class GeminiBackend(TTSBackend):
16
- """
17
- Google Gemini TTS Backend.
18
-
19
- Features:
20
- - High-quality neural TTS
21
- - Multiple preset voices
22
- - No voice cloning (uses preset voices)
23
-
24
- Authentication:
25
- - API key must be provided per request (do not rely on environment variables).
26
- - Per-request keys are recommended for public apps.
27
- """
28
-
29
- # Available Gemini voices
30
- AVAILABLE_VOICES = [
31
- "Puck",
32
- "Charon",
33
- "Kore",
34
- "Fenrir",
35
- "Aoede",
36
- "Leda",
37
- "Orus",
38
- "Zephyr",
39
- ]
40
-
41
- # Gemini has limited language support compared to Chatterbox
42
- SUPPORTED_LANGUAGES = {
43
- "en": "English",
44
- "de": "German",
45
- "es": "Spanish",
46
- "fr": "French",
47
- "it": "Italian",
48
- "pt": "Portuguese",
49
- "ja": "Japanese",
50
- "ko": "Korean",
51
- "zh": "Chinese",
52
- }
53
-
54
- def __init__(
55
- self,
56
- config: Optional[BackendConfig] = None,
57
- voice: str = "Kore",
58
- api_key: Optional[str] = None,
59
- ):
60
- super().__init__(config)
61
- self._client = None
62
- self._api_key: Optional[str] = api_key
63
- self._api_key_fingerprint: Optional[str] = None
64
- self.voice = voice if voice in self.AVAILABLE_VOICES else "Kore"
65
-
66
- @property
67
- def name(self) -> str:
68
- return "Google Gemini TTS"
69
-
70
- @property
71
- def supports_voice_cloning(self) -> bool:
72
- return False
73
-
74
- @property
75
- def supported_languages(self) -> dict[str, str]:
76
- return self.SUPPORTED_LANGUAGES.copy()
77
-
78
- def set_api_key(self, api_key: Optional[str]) -> None:
79
- """Set (or clear) the API key used by this backend.
80
-
81
- Note: This is kept in memory only.
82
- """
83
- api_key = (api_key or "").strip() or None
84
- if api_key == self._api_key:
85
- return
86
- self._api_key = api_key
87
- # Force re-init on next call.
88
- if self._is_loaded:
89
- self.unload()
90
-
91
- def load(self, api_key: Optional[str] = None) -> None:
92
- """Initialize the Gemini client. The API key must be provided per request."""
93
- desired_key = (api_key or self._api_key or "").strip()
94
- if not desired_key:
95
- raise ValueError(
96
- "Gemini API key missing. Provide api_key for this request (do not rely on environment variables)."
97
- )
98
-
99
- desired_fingerprint = f"len:{len(desired_key)}"
100
-
101
- if self._is_loaded and self._client is not None:
102
- if self._api_key_fingerprint == desired_fingerprint:
103
- return
104
- # Different key than the currently initialized client.
105
- self.unload()
106
-
107
- try:
108
- import google.genai as genai
109
-
110
- self._client = genai.Client(api_key=desired_key)
111
- self._is_loaded = True
112
- self._api_key_fingerprint = desired_fingerprint
113
- logger.info("Gemini client initialized successfully")
114
- except Exception as e:
115
- logger.error(f"Failed to initialize Gemini client: {e}")
116
- raise
117
-
118
- def unload(self) -> None:
119
- """Clean up Gemini client."""
120
- self._client = None
121
- self._is_loaded = False
122
- self._api_key_fingerprint = None
123
- logger.info("Gemini client unloaded")
124
-
125
- def set_voice(self, voice: str) -> None:
126
- """Set the voice to use for synthesis."""
127
- if voice not in self.AVAILABLE_VOICES:
128
- raise ValueError(
129
- f"Unknown voice '{voice}'. Available: {self.AVAILABLE_VOICES}"
130
- )
131
- self.voice = voice
132
-
133
- def generate(
134
- self,
135
- text: str,
136
- language: str = "de",
137
- voice_audio_path: Optional[str] = None,
138
- voice: Optional[str] = None,
139
- api_key: Optional[str] = None,
140
- **kwargs,
141
- ) -> TTSResult:
142
- """
143
- Generate speech from text using Gemini.
144
-
145
- Args:
146
- text: Text to synthesize
147
- language: Language code (for text processing, voice determines actual synthesis)
148
- voice_audio_path: Ignored (Gemini doesn't support voice cloning)
149
- voice: Voice name to use (default: instance voice setting)
150
-
151
- Returns:
152
- TTSResult with audio waveform and sample rate
153
- """
154
- # Allow per-request key (useful for public apps where users bring their own key).
155
- self.load(api_key=api_key)
156
-
157
- if voice_audio_path:
158
- logger.warning(
159
- "Gemini backend doesn't support voice cloning, ignoring voice_audio_path"
160
- )
161
-
162
- from google.genai import types as genai_types
163
-
164
- selected_voice = voice or self.voice
165
-
166
- logger.info(
167
- f"Generating speech with Gemini: voice={selected_voice}, text='{text[:50]}...'"
168
- )
169
-
170
- contents = [
171
- genai_types.Content(
172
- role="user", parts=[genai_types.Part.from_text(text=text)]
173
- )
174
- ]
175
-
176
- config = genai_types.GenerateContentConfig(
177
- temperature=1,
178
- response_modalities=["audio"],
179
- speech_config=genai_types.SpeechConfig(
180
- voice_config=genai_types.VoiceConfig(
181
- prebuilt_voice_config=genai_types.PrebuiltVoiceConfig(
182
- voice_name=selected_voice
183
- )
184
- )
185
- ),
186
- )
187
-
188
- try:
189
- audio_chunks = []
190
- mime_type = None
191
-
192
- for chunk in self._client.models.generate_content_stream(
193
- model="gemini-2.5-pro-preview-tts",
194
- contents=contents,
195
- config=config,
196
- ):
197
- if chunk.candidates:
198
- inline_data = chunk.candidates[0].content.parts[0].inline_data
199
- audio_chunks.append(inline_data.data)
200
- if mime_type is None:
201
- mime_type = inline_data.mime_type
202
-
203
- if not audio_chunks:
204
- raise RuntimeError("No audio data received from Gemini API")
205
-
206
- raw_audio = b"".join(audio_chunks)
207
-
208
- # Convert to numpy array
209
- audio_np, sample_rate = self._process_audio(raw_audio, mime_type)
210
-
211
- return TTSResult(audio=audio_np, sample_rate=sample_rate)
212
-
213
- except Exception as e:
214
- logger.error(f"Gemini TTS generation failed: {e}")
215
- raise
216
-
217
- def _process_audio(
218
- self, raw_audio: bytes, mime_type: str
219
- ) -> tuple[np.ndarray, int]:
220
- """Process raw audio data from Gemini into numpy array."""
221
- from pydub import AudioSegment
222
-
223
- # Parse MIME type for audio parameters
224
- sample_rate = 24000 # Default
225
- bits_per_sample = 16
226
-
227
- if mime_type and "audio/L" in mime_type:
228
- # Parse format like audio/L16;rate=24000
229
- parts = mime_type.split(";")
230
- for part in parts:
231
- part = part.strip()
232
- if part.startswith("audio/L"):
233
- try:
234
- bits_per_sample = int(part.split("L")[1])
235
- except (ValueError, IndexError):
236
- pass
237
- elif part.lower().startswith("rate="):
238
- try:
239
- sample_rate = int(part.split("=")[1])
240
- except (ValueError, IndexError):
241
- pass
242
-
243
- # Create AudioSegment from raw PCM
244
- audio_segment = AudioSegment(
245
- data=raw_audio,
246
- sample_width=bits_per_sample // 8,
247
- frame_rate=sample_rate,
248
- channels=1,
249
- )
250
- elif mime_type == "audio/mpeg":
251
- audio_segment = AudioSegment.from_file(io.BytesIO(raw_audio), format="mp3")
252
- sample_rate = audio_segment.frame_rate
253
- else:
254
- # Try auto-detection
255
- audio_segment = AudioSegment.from_file(io.BytesIO(raw_audio))
256
- sample_rate = audio_segment.frame_rate
257
-
258
- # Convert to numpy array
259
- samples = np.array(audio_segment.get_array_of_samples())
260
-
261
- # Normalize to float32 [-1, 1]
262
- if audio_segment.sample_width == 2: # 16-bit
263
- samples = samples.astype(np.float32) / 32768.0
264
- elif audio_segment.sample_width == 1: # 8-bit
265
- samples = (samples.astype(np.float32) - 128) / 128.0
266
-
267
- return samples, sample_rate
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
engine/tts_engine.py CHANGED
@@ -28,7 +28,7 @@ class EngineConfig:
28
  """Configuration for the TTS Engine."""
29
 
30
  # Backend settings
31
- default_backend: str = "gemini"
32
  device: str = "auto" # "auto", "cuda", "mps", "cpu"
33
 
34
  # Default generation settings
@@ -63,7 +63,7 @@ class TTSEngine:
63
  )
64
 
65
  # Switch backend
66
- engine.set_backend("gemini")
67
  audio = engine.generate("Welcome to our service.", language="en")
68
  """
69
 
@@ -188,16 +188,13 @@ class TTSEngine:
188
 
189
  # Generate voice ID for caching.
190
  # - Voice cloning: derive from reference audio when available
191
- # - Preset voices (e.g. Gemini): include requested voice in cache key
192
- requested_voice = kwargs.get("voice")
193
  if voice_audio:
194
  voice_id = (
195
  Path(voice_audio).stem
196
  if os.path.exists(voice_audio or "")
197
  else "custom"
198
  )
199
- elif requested_voice:
200
- voice_id = f"voice-{requested_voice}"
201
  else:
202
  voice_id = "default"
203
 
@@ -299,12 +296,3 @@ class TTSEngine:
299
  def clear_cache(self) -> int:
300
  """Clear the local audio cache. Returns number of files deleted."""
301
  return self._cache.clear_local()
302
-
303
-
304
- # Register additional backends if available
305
- try:
306
- from .backends.gemini_backend import GeminiBackend
307
-
308
- TTSEngine.register_backend("gemini", GeminiBackend)
309
- except ImportError:
310
- pass # Gemini backend not available
 
28
  """Configuration for the TTS Engine."""
29
 
30
  # Backend settings
31
+ default_backend: str = "chatterbox"
32
  device: str = "auto" # "auto", "cuda", "mps", "cpu"
33
 
34
  # Default generation settings
 
63
  )
64
 
65
  # Switch backend
66
+ engine.set_backend("chatterbox")
67
  audio = engine.generate("Welcome to our service.", language="en")
68
  """
69
 
 
188
 
189
  # Generate voice ID for caching.
190
  # - Voice cloning: derive from reference audio when available
191
+ # - If no reference audio: use "default"
 
192
  if voice_audio:
193
  voice_id = (
194
  Path(voice_audio).stem
195
  if os.path.exists(voice_audio or "")
196
  else "custom"
197
  )
 
 
198
  else:
199
  voice_id = "default"
200
 
 
296
  def clear_cache(self) -> int:
297
  """Clear the local audio cache. Returns number of files deleted."""
298
  return self._cache.clear_local()
 
 
 
 
 
 
 
 
 
requirements.txt CHANGED
@@ -25,9 +25,6 @@ huggingface_hub>=0.20.0
25
  # Logging
26
  loguru>=0.7.0
27
 
28
- # Optional: Gemini backend
29
- google-genai>=0.3.0
30
-
31
  # Optional: Caching to HuggingFace Hub
32
  # pandas>=2.0.0
33
 
 
25
  # Logging
26
  loguru>=0.7.0
27
 
 
 
 
28
  # Optional: Caching to HuggingFace Hub
29
  # pandas>=2.0.0
30