flozi00 committed on
Commit
d63c0fa
·
1 Parent(s): 73f01a4

Update README and app structure for Gemini TTS integration; adjust backend defaults and voice selection logic

Browse files
Files changed (4) hide show
  1. README.md +5 -5
  2. app.py +175 -16
  3. engine/tts_engine.py +11 -7
  4. requirements.txt +1 -1
README.md CHANGED
@@ -16,9 +16,9 @@ A modular text-to-speech engine for generating professional phone announcements
16
 
17
  ## Features
18
 
19
- - 🎙️ **High-Quality TTS**: Using Chatterbox Multilingual for natural speech synthesis
20
  - 🌍 **23 Languages**: German, English, French, Spanish, Italian, and many more
21
- - 🎭 **Voice Cloning**: Clone any voice from a short audio sample
22
  - 🔌 **Modular Architecture**: Easy to swap TTS backends
23
  - 🎵 **Background Music**: Optional background music mixing
24
  - 💾 **Caching**: Local and HuggingFace Hub caching support
@@ -30,7 +30,7 @@ A modular text-to-speech engine for generating professional phone announcements
30
  pip install -r requirements.txt
31
 
32
  # Run the application
33
- python app_new.py
34
  ```
35
 
36
  ## Architecture
@@ -156,8 +156,8 @@ engine = TTSEngine(config)
156
 
157
  ### Environment Variables
158
 
159
- - `HF_TOKEN`: HuggingFace token for model downloads
160
- - `GEMINI_API_KEY`: Google API key (for Gemini backend)
161
 
162
  ## Supported Languages
163
 
 
16
 
17
  ## Features
18
 
19
+ - 🎙️ **Standard Voices (Default)**: Google Gemini TTS prebuilt voices
20
  - 🌍 **23 Languages**: German, English, French, Spanish, Italian, and many more
21
+ - 🎭 **Voice Cloning**: Uses Chatterbox Multilingual + reference audio
22
  - 🔌 **Modular Architecture**: Easy to swap TTS backends
23
  - 🎵 **Background Music**: Optional background music mixing
24
  - 💾 **Caching**: Local and HuggingFace Hub caching support
 
30
  pip install -r requirements.txt
31
 
32
  # Run the application
33
+ python app.py
34
  ```
35
 
36
  ## Architecture
 
156
 
157
  ### Environment Variables
158
 
159
+ - `HF_TOKEN`: HuggingFace token for model downloads (Chatterbox)
160
+ - `GEMINI_API_KEY`: Google API key (Gemini TTS; default for standard voices)
161
 
162
  ## Supported Languages
163
 
app.py CHANGED
@@ -1,10 +1,13 @@
1
- """
2
- Telefonansagen TTS - Simplified Gradio Application
3
 
4
- A streamlined interface for generating professional phone announcements
5
- using the modular TTS engine with Chatterbox Multilingual as default backend.
 
 
 
6
  """
7
 
 
8
  import random
9
 
10
  import gradio as gr
@@ -31,6 +34,14 @@ from engine import TTSEngine
31
  from engine.audio_processor import AudioProcessor
32
  from engine.backends.chatterbox_backend import DEFAULT_VOICE_PROMPTS
33
 
 
 
 
 
 
 
 
 
34
  # --- Configuration ---
35
  DEVICE = (
36
  "cuda"
@@ -98,6 +109,21 @@ EXAMPLE_TEXTS = {
98
  ENGINE = None
99
 
100
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
  def get_engine() -> TTSEngine:
102
  """Get or initialize the TTS engine."""
103
  global ENGINE
@@ -108,13 +134,23 @@ def get_engine() -> TTSEngine:
108
  logger.info("Initializing TTS Engine...")
109
  ENGINE = TTSEngine(
110
  EngineConfig(
111
- default_backend="chatterbox",
112
  device=DEVICE,
113
  default_language="de",
114
  )
115
  )
116
- # Pre-load the model
117
- ENGINE.load_backend()
 
 
 
 
 
 
 
 
 
 
118
  logger.info("TTS Engine ready!")
119
 
120
  return ENGINE
@@ -141,6 +177,17 @@ def get_language_choices() -> list[tuple[str, str]]:
141
  return choices
142
 
143
 
 
 
 
 
 
 
 
 
 
 
 
144
  def get_example_text(language: str) -> str:
145
  """Get example text for a language."""
146
  return EXAMPLE_TEXTS.get(language, EXAMPLE_TEXTS["en"])
@@ -151,6 +198,27 @@ def get_default_voice(language: str) -> str:
151
  return DEFAULT_VOICE_PROMPTS.get(language)
152
 
153
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
154
  def get_background_music_choices() -> list[tuple[str, str]]:
155
  """Get available background music choices."""
156
  processor = AudioProcessor()
@@ -174,6 +242,7 @@ def get_background_music_choices() -> list[tuple[str, str]]:
174
  def generate_announcement(
175
  text: str,
176
  language: str,
 
177
  voice_audio: str = None,
178
  background_music: str = "",
179
  custom_music: str = None,
@@ -201,6 +270,10 @@ def generate_announcement(
201
  """
202
  engine = get_engine()
203
 
 
 
 
 
204
  # Set seed for reproducibility
205
  if seed != 0:
206
  torch.manual_seed(seed)
@@ -209,9 +282,17 @@ def generate_announcement(
209
  if DEVICE == "cuda":
210
  torch.cuda.manual_seed_all(seed)
211
 
212
- # Use default voice if none provided
213
- if not voice_audio or not str(voice_audio).strip():
214
- voice_audio = get_default_voice(language)
 
 
 
 
 
 
 
 
215
 
216
  # Determine which background music to use (custom upload takes priority)
217
  music_path = None
@@ -235,6 +316,7 @@ def generate_announcement(
235
  language=language,
236
  voice_audio=voice_audio,
237
  split_sentences=True,
 
238
  )
239
 
240
  # Process with background music
@@ -278,13 +360,48 @@ def generate_announcement(
278
  language=language,
279
  voice_audio=voice_audio,
280
  split_sentences=True,
 
281
  )
282
  return result
283
 
284
 
285
- def on_language_change(language: str):
286
  """Handle language selection change."""
287
- return get_example_text(language), get_default_voice(language)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
288
 
289
 
290
  # --- Gradio Interface ---
@@ -311,12 +428,45 @@ def create_interface():
311
  elem_classes=["main-title"],
312
  )
313
 
 
 
 
 
 
 
 
 
314
  with gr.Row():
315
  # Left column - Input
316
  with gr.Column(scale=1):
 
 
 
 
 
 
 
 
 
 
 
 
 
317
  language = gr.Dropdown(
318
- choices=get_language_choices(),
319
- value="de",
 
 
 
 
 
 
 
 
 
 
 
 
320
  label="🌍 Sprache / Language",
321
  info="Wählen Sie die Sprache der Ansage",
322
  )
@@ -335,7 +485,8 @@ def create_interface():
335
  sources=["upload", "microphone"],
336
  type="filepath",
337
  label="Referenz-Audio für Stimmklonung",
338
- value=get_default_voice("de"),
 
339
  )
340
  gr.Markdown(
341
  """
@@ -433,9 +584,16 @@ def create_interface():
433
  )
434
 
435
  # Event handlers
 
 
 
 
 
 
 
436
  language.change(
437
  fn=on_language_change,
438
- inputs=[language],
439
  outputs=[text, voice_audio],
440
  show_progress=False,
441
  )
@@ -445,6 +603,7 @@ def create_interface():
445
  inputs=[
446
  text,
447
  language,
 
448
  voice_audio,
449
  background_music,
450
  custom_music,
 
1
+ """Telefonansagen TTS - Gradio Application.
 
2
 
3
+ UI requirements:
4
+ - Use Gemini TTS by default for standard voices
5
+ - Provide a dropdown to choose a voice
6
+ - Include a "Voice cloning" option; when selected, show reference-audio upload
7
+ and use Chatterbox (voice cloning capable) backend.
8
  """
9
 
10
+ import os
11
  import random
12
 
13
  import gradio as gr
 
34
  from engine.audio_processor import AudioProcessor
35
  from engine.backends.chatterbox_backend import DEFAULT_VOICE_PROMPTS
36
 
37
+ try:
38
+ from engine.backends.gemini_backend import GeminiBackend
39
+
40
+ HAS_GEMINI_BACKEND = True
41
+ except Exception:
42
+ GeminiBackend = None
43
+ HAS_GEMINI_BACKEND = False
44
+
45
  # --- Configuration ---
46
  DEVICE = (
47
  "cuda"
 
109
  ENGINE = None
110
 
111
 
112
+ VOICE_CLONING_OPTION = "Voice cloning"
113
+
114
+
115
def _is_gemini_ready() -> bool:
    """Report whether the Gemini backend looks usable.

    Returns:
        True only when the Gemini backend class was importable at module
        load (``HAS_GEMINI_BACKEND``), the ``google.genai`` package can be
        imported, and the ``GEMINI_API_KEY`` environment variable is set to
        a non-empty value. False in every other case.
    """
    if HAS_GEMINI_BACKEND:
        try:
            import google.genai  # noqa: F401
        except Exception:
            # Best-effort probe: any import problem just means "not ready".
            return False
        api_key = os.environ.get("GEMINI_API_KEY")
        return bool(api_key)
    return False
125
+
126
+
127
  def get_engine() -> TTSEngine:
128
  """Get or initialize the TTS engine."""
129
  global ENGINE
 
134
  logger.info("Initializing TTS Engine...")
135
  ENGINE = TTSEngine(
136
  EngineConfig(
137
+ default_backend="gemini",
138
  device=DEVICE,
139
  default_language="de",
140
  )
141
  )
142
+
143
+ # Pre-load preferred backend if possible; fall back to chatterbox.
144
+ try:
145
+ ENGINE.load_backend("gemini")
146
+ ENGINE.set_backend("gemini")
147
+ except Exception as e:
148
+ logger.warning(
149
+ f"Gemini backend not ready ({e}). Falling back to chatterbox."
150
+ )
151
+ ENGINE.set_backend("chatterbox")
152
+ ENGINE.load_backend("chatterbox")
153
+
154
  logger.info("TTS Engine ready!")
155
 
156
  return ENGINE
 
177
  return choices
178
 
179
 
180
def get_language_choices_for_backend(backend: str) -> list[tuple[str, str]]:
    """Build the language dropdown choices for one backend.

    Args:
        backend: Backend name forwarded to
            ``engine.get_supported_languages(backend=...)``.

    Returns:
        A list of ``(display_label, language_code)`` tuples. The label comes
        from ``LANGUAGE_DISPLAY`` when the code is known there; otherwise it
        is built from the value the engine reports for that code
        (presumably the language name - verify against the engine) followed
        by the code in parentheses. German ("de") is sorted first, the rest
        by display label.
    """
    supported = get_engine().get_supported_languages(backend=backend)
    choices = [
        (LANGUAGE_DISPLAY.get(code, f"{name} ({code})"), code)
        for code, name in supported.items()
    ]
    # False sorts before True, so the "de" entry is pinned to the top.
    choices.sort(key=lambda choice: (choice[1] != "de", choice[0]))
    return choices
189
+
190
+
191
  def get_example_text(language: str) -> str:
192
  """Get example text for a language."""
193
  return EXAMPLE_TEXTS.get(language, EXAMPLE_TEXTS["en"])
 
198
  return DEFAULT_VOICE_PROMPTS.get(language)
199
 
200
 
201
def get_voice_choices() -> list[str]:
    """Get voice dropdown choices.

    - Standard voices: Gemini prebuilt voices
      (``GeminiBackend.AVAILABLE_VOICES``), included only when the Gemini
      backend module was importable.
    - Special entry: Voice cloning (uses Chatterbox), always appended last.

    Returns:
        The list of voice names followed by ``VOICE_CLONING_OPTION``.
    """
    voices: list[str] = []
    if HAS_GEMINI_BACKEND:
        try:
            voices.extend(GeminiBackend.AVAILABLE_VOICES)
        except Exception as e:
            # Stay best-effort (the dropdown must still render), but leave a
            # trace so a missing voice list is diagnosable.
            logger.warning(f"Could not read Gemini voice list ({e}).")
    # Always include the special option
    voices.append(VOICE_CLONING_OPTION)
    return voices
216
+
217
+
218
def _resolve_backend_for_voice_choice(voice_choice: str) -> str:
    """Map a voice dropdown value to the backend name that serves it.

    Returns "chatterbox" for the voice cloning entry and "gemini" for every
    other value.
    """
    if voice_choice == VOICE_CLONING_OPTION:
        return "chatterbox"
    return "gemini"
220
+
221
+
222
  def get_background_music_choices() -> list[tuple[str, str]]:
223
  """Get available background music choices."""
224
  processor = AudioProcessor()
 
242
  def generate_announcement(
243
  text: str,
244
  language: str,
245
+ voice_choice: str,
246
  voice_audio: str = None,
247
  background_music: str = "",
248
  custom_music: str = None,
 
270
  """
271
  engine = get_engine()
272
 
273
+ # Select backend based on voice choice
274
+ backend_name = _resolve_backend_for_voice_choice(voice_choice)
275
+ engine.set_backend(backend_name)
276
+
277
  # Set seed for reproducibility
278
  if seed != 0:
279
  torch.manual_seed(seed)
 
282
  if DEVICE == "cuda":
283
  torch.cuda.manual_seed_all(seed)
284
 
285
+ # Voice resolution:
286
+ # - Voice cloning: use reference audio (or fallback per-language prompt)
287
+ # - Standard voice: use Gemini prebuilt voice
288
+ voice_kwargs = {}
289
+ if backend_name == "chatterbox":
290
+ if not voice_audio or not str(voice_audio).strip():
291
+ voice_audio = get_default_voice(language)
292
+ else:
293
+ voice_audio = None
294
+ if voice_choice and voice_choice != VOICE_CLONING_OPTION:
295
+ voice_kwargs["voice"] = voice_choice
296
 
297
  # Determine which background music to use (custom upload takes priority)
298
  music_path = None
 
316
  language=language,
317
  voice_audio=voice_audio,
318
  split_sentences=True,
319
+ **voice_kwargs,
320
  )
321
 
322
  # Process with background music
 
360
  language=language,
361
  voice_audio=voice_audio,
362
  split_sentences=True,
363
+ **voice_kwargs,
364
  )
365
  return result
366
 
367
 
368
def on_language_change(language: str, voice_choice: str):
    """React to a change of the language dropdown.

    Args:
        language: Newly selected language code.
        voice_choice: Current value of the voice dropdown.

    Returns:
        A ``(text, audio_update)`` pair: the example text for ``language``
        and a Gradio update for the reference-audio component. In voice
        cloning mode the update sets the audio value to ``None``; otherwise
        it is an argument-less ``gr.update()``.
    """
    example_text = get_example_text(language)
    if voice_choice == VOICE_CLONING_OPTION:
        # Reference audio is only touched while voice cloning is active.
        audio_update = gr.update(value=None)
    else:
        audio_update = gr.update()
    return example_text, audio_update
374
+
375
+
376
def on_voice_choice_change(voice_choice: str):
    """Switch UI elements depending on voice selection.

    Args:
        voice_choice: Current value of the voice dropdown.

    Returns:
        A 3-tuple of Gradio updates, in this order:
        1. language dropdown - choices for the resolved backend, with the
           value reset to "de" when offered, else the first choice, else
           "en";
        2. reference-audio component - value cleared, visible only when the
           resolved backend is not "gemini" (i.e. voice cloning);
        3. text box - example text for the selected default language.
    """
    backend = _resolve_backend_for_voice_choice(voice_choice)
    language_choices = get_language_choices_for_backend(backend)

    # Prefer German; otherwise fall back to the first offered language.
    codes = [code for _, code in language_choices]
    if "de" in codes:
        default_language = "de"
    elif codes:
        default_language = codes[0]
    else:
        default_language = "en"

    return (
        gr.update(choices=language_choices, value=default_language),
        gr.update(visible=(backend != "gemini"), value=None),
        gr.update(value=get_example_text(default_language)),
    )
405
 
406
 
407
  # --- Gradio Interface ---
 
428
  elem_classes=["main-title"],
429
  )
430
 
431
+ if not _is_gemini_ready():
432
+ gr.Markdown(
433
+ """
434
+ **Hinweis:** Gemini ist aktuell nicht verfügbar.
435
+ Bitte `google-genai` installieren und `GEMINI_API_KEY` setzen, oder **Voice cloning** verwenden.
436
+ """
437
+ )
438
+
439
  with gr.Row():
440
  # Left column - Input
441
  with gr.Column(scale=1):
442
+ default_voice_choice = (
443
+ "Kore"
444
+ if _is_gemini_ready() and "Kore" in get_voice_choices()
445
+ else VOICE_CLONING_OPTION
446
+ )
447
+
448
+ voice_choice = gr.Dropdown(
449
+ choices=get_voice_choices(),
450
+ value=default_voice_choice,
451
+ label="🗣️ Stimme / Voice",
452
+ info="Standard: Gemini Stimmen. 'Voice cloning' nutzt Referenz-Audio (Chatterbox).",
453
+ )
454
+
455
  language = gr.Dropdown(
456
+ choices=(
457
+ get_language_choices_for_backend("gemini")
458
+ if _is_gemini_ready()
459
+ else get_language_choices_for_backend("chatterbox")
460
+ ),
461
+ value=(
462
+ "de"
463
+ if _is_gemini_ready()
464
+ and any(
465
+ v == "de"
466
+ for _, v in get_language_choices_for_backend("gemini")
467
+ )
468
+ else "de"
469
+ ),
470
  label="🌍 Sprache / Language",
471
  info="Wählen Sie die Sprache der Ansage",
472
  )
 
485
  sources=["upload", "microphone"],
486
  type="filepath",
487
  label="Referenz-Audio für Stimmklonung",
488
+ visible=(default_voice_choice == VOICE_CLONING_OPTION),
489
+ value=None,
490
  )
491
  gr.Markdown(
492
  """
 
584
  )
585
 
586
  # Event handlers
587
+ voice_choice.change(
588
+ fn=on_voice_choice_change,
589
+ inputs=[voice_choice],
590
+ outputs=[language, voice_audio, text],
591
+ show_progress=False,
592
+ )
593
+
594
  language.change(
595
  fn=on_language_change,
596
+ inputs=[language, voice_choice],
597
  outputs=[text, voice_audio],
598
  show_progress=False,
599
  )
 
603
  inputs=[
604
  text,
605
  language,
606
+ voice_choice,
607
  voice_audio,
608
  background_music,
609
  custom_music,
engine/tts_engine.py CHANGED
@@ -28,7 +28,7 @@ class EngineConfig:
28
  """Configuration for the TTS Engine."""
29
 
30
  # Backend settings
31
- default_backend: str = "chatterbox"
32
  device: str = "auto" # "auto", "cuda", "mps", "cpu"
33
 
34
  # Default generation settings
@@ -186,16 +186,20 @@ class TTSEngine:
186
  language = language or self.config.default_language
187
  backend = self.current_backend
188
 
189
- # Generate voice ID for caching
190
- voice_id = (
191
- "default"
192
- if not voice_audio
193
- else (
 
194
  Path(voice_audio).stem
195
  if os.path.exists(voice_audio or "")
196
  else "custom"
197
  )
198
- )
 
 
 
199
 
200
  # Check cache
201
  if use_cache and self._cache.config.enabled:
 
28
  """Configuration for the TTS Engine."""
29
 
30
  # Backend settings
31
+ default_backend: str = "gemini"
32
  device: str = "auto" # "auto", "cuda", "mps", "cpu"
33
 
34
  # Default generation settings
 
186
  language = language or self.config.default_language
187
  backend = self.current_backend
188
 
189
+ # Generate voice ID for caching.
190
+ # - Voice cloning: derive from reference audio when available
191
+ # - Preset voices (e.g. Gemini): include requested voice in cache key
192
+ requested_voice = kwargs.get("voice")
193
+ if voice_audio:
194
+ voice_id = (
195
  Path(voice_audio).stem
196
  if os.path.exists(voice_audio or "")
197
  else "custom"
198
  )
199
+ elif requested_voice:
200
+ voice_id = f"voice-{requested_voice}"
201
+ else:
202
+ voice_id = "default"
203
 
204
  # Check cache
205
  if use_cache and self._cache.config.enabled:
requirements.txt CHANGED
@@ -26,7 +26,7 @@ huggingface_hub>=0.20.0
26
  loguru>=0.7.0
27
 
28
  # Optional: Gemini backend
29
- # google-genai>=0.3.0
30
 
31
  # Optional: Caching to HuggingFace Hub
32
  # pandas>=2.0.0
 
26
  loguru>=0.7.0
27
 
28
  # Optional: Gemini backend
29
+ google-genai>=0.3.0
30
 
31
  # Optional: Caching to HuggingFace Hub
32
  # pandas>=2.0.0