flozi00 committed on
Commit
87b184a
·
1 Parent(s): 5ca9916

Refactor code structure for improved readability and maintainability

Browse files
Files changed (3) hide show
  1. app.py +156 -16
  2. engine/backends/base.py +110 -0
  3. engine/tts_engine.py +43 -7
app.py CHANGED
@@ -28,6 +28,7 @@ except ImportError:
28
  from loguru import logger
29
 
30
  from engine import TTSEngine
 
31
  from engine.backends.chatterbox_backend import DEFAULT_VOICE_PROMPTS
32
 
33
  # --- Configuration ---
@@ -150,21 +151,46 @@ def get_default_voice(language: str) -> str:
150
  return DEFAULT_VOICE_PROMPTS.get(language)
151
 
152
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153
  # --- Main Generation Function ---
154
  @spaces.GPU
155
  def generate_announcement(
156
  text: str,
157
  language: str,
158
  voice_audio: str = None,
 
 
 
 
 
159
  seed: int = 0,
160
  ) -> tuple[int, np.ndarray]:
161
  """
162
  Generate a phone announcement.
163
 
164
  Args:
165
- text: Text to synthesize (max 500 characters)
166
  language: Language code
167
  voice_audio: Optional path to reference audio for voice cloning
 
 
 
 
 
168
  seed: Random seed (0 = random)
169
 
170
  Returns:
@@ -180,23 +206,77 @@ def generate_announcement(
180
  if DEVICE == "cuda":
181
  torch.cuda.manual_seed_all(seed)
182
 
183
- # Truncate text
184
- text = text[:500]
185
-
186
  # Use default voice if none provided
187
  if not voice_audio or not str(voice_audio).strip():
188
  voice_audio = get_default_voice(language)
189
 
190
- logger.info(f"Generating: lang={language}, text='{text[:50]}...'")
191
-
192
- # Generate audio
193
- result = engine.generate(
194
- text=text,
195
- language=language,
196
- voice_audio=voice_audio,
 
 
 
 
197
  )
198
 
199
- return result
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
200
 
201
 
202
  def on_language_change(language: str):
@@ -243,8 +323,8 @@ def create_interface():
243
  label="📝 Text der Ansage",
244
  placeholder="Geben Sie hier den Text Ihrer Telefonansage ein...",
245
  lines=5,
246
- max_lines=10,
247
- info="Maximal 500 Zeichen",
248
  )
249
 
250
  with gr.Accordion("🎤 Stimmeinstellungen (Optional)", open=False):
@@ -261,6 +341,55 @@ def create_interface():
261
  """
262
  )
263
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
264
  with gr.Accordion("⚙️ Erweiterte Einstellungen", open=False):
265
  seed = gr.Number(
266
  value=0,
@@ -286,8 +415,9 @@ def create_interface():
286
  ### ℹ️ Hinweise
287
 
288
  - Die Generierung kann einige Sekunden dauern
289
- - Für beste Ergebnisse verwenden Sie klare, kurze Sätze
290
  - Referenz-Audio sollte 5-15 Sekunden lang sein
 
291
 
292
  ---
293
 
@@ -309,7 +439,17 @@ def create_interface():
309
 
310
  generate_btn.click(
311
  fn=generate_announcement,
312
- inputs=[text, language, voice_audio, seed],
 
 
 
 
 
 
 
 
 
 
313
  outputs=[audio_output],
314
  )
315
 
 
28
  from loguru import logger
29
 
30
  from engine import TTSEngine
31
+ from engine.audio_processor import AudioProcessor
32
  from engine.backends.chatterbox_backend import DEFAULT_VOICE_PROMPTS
33
 
34
  # --- Configuration ---
 
151
  return DEFAULT_VOICE_PROMPTS.get(language)
152
 
153
 
154
def get_background_music_choices() -> list[tuple[str, str]]:
    """
    Build the (label, value) pairs offered for background music selection.

    The first entry is always the "no background music" option, whose value
    is the empty string. Every name reported by
    ``AudioProcessor().list_available_music()`` follows, in the order it was
    returned, with the unmodified name as the value.

    Returns:
        List of ``(display_label, value)`` tuples.
    """
    # NOTE(review): assumes list_available_music() yields plain string names
    # -- confirm against AudioProcessor.
    track_names = AudioProcessor().list_available_music()

    options = [("🔇 Keine Hintergrundmusik", "")]
    # Labels are prettified: underscores/hyphens become spaces, then title-cased.
    options.extend(
        (f"🎵 {track.replace('_', ' ').replace('-', ' ').title()}", track)
        for track in track_names
    )
    return options
167
+
168
+
169
  # --- Main Generation Function ---
170
  @spaces.GPU
171
  def generate_announcement(
172
  text: str,
173
  language: str,
174
  voice_audio: str = None,
175
+ background_music: str = "",
176
+ custom_music: str = None,
177
+ music_volume: float = -15.0,
178
+ fade_in: float = 0.5,
179
+ fade_out: float = 0.5,
180
  seed: int = 0,
181
  ) -> tuple[int, np.ndarray]:
182
  """
183
  Generate a phone announcement.
184
 
185
  Args:
186
+ text: Text to synthesize (supports long text with automatic sentence splitting)
187
  language: Language code
188
  voice_audio: Optional path to reference audio for voice cloning
189
+ background_music: Name of preset background music file
190
+ custom_music: Path to custom uploaded background music
191
+ music_volume: Volume of background music in dB (default: -15)
192
+ fade_in: Fade in duration in seconds
193
+ fade_out: Fade out duration in seconds
194
  seed: Random seed (0 = random)
195
 
196
  Returns:
 
206
  if DEVICE == "cuda":
207
  torch.cuda.manual_seed_all(seed)
208
 
 
 
 
209
  # Use default voice if none provided
210
  if not voice_audio or not str(voice_audio).strip():
211
  voice_audio = get_default_voice(language)
212
 
213
+ # Determine which background music to use (custom upload takes priority)
214
+ music_path = None
215
+ if custom_music and str(custom_music).strip():
216
+ music_path = custom_music
217
+ logger.info(f"Using custom background music: {music_path}")
218
+ elif background_music and str(background_music).strip():
219
+ music_path = background_music
220
+ logger.info(f"Using preset background music: {music_path}")
221
+
222
+ logger.info(
223
+ f"Generating: lang={language}, text='{text[:50]}...' ({len(text)} chars)"
224
  )
225
 
226
+ # Generate audio (engine handles sentence splitting automatically)
227
+ # If we have background music, we need to process the audio
228
+ if music_path:
229
+ # Generate raw audio first (with sentence splitting for long texts)
230
+ result = engine.generate_raw(
231
+ text=text,
232
+ language=language,
233
+ voice_audio=voice_audio,
234
+ split_sentences=True,
235
+ )
236
+
237
+ # Process with background music
238
+ from engine.audio_processor import AudioProcessingConfig, AudioProcessor
239
+
240
+ processor = AudioProcessor(
241
+ AudioProcessingConfig(
242
+ background_music_path=music_path,
243
+ music_volume_db=music_volume,
244
+ fade_in_ms=int(fade_in * 1000),
245
+ fade_out_ms=int(fade_out * 1000),
246
+ padding_start_ms=int(
247
+ fade_in * 1000 * 1.2
248
+ ), # Slightly longer padding for fades
249
+ padding_end_ms=int(fade_out * 1000 * 1.2),
250
+ )
251
+ )
252
+
253
+ # Process and get bytes
254
+ processed_bytes = processor.process(
255
+ audio=result.audio,
256
+ sample_rate=result.sample_rate,
257
+ )
258
+
259
+ # Convert back to numpy for Gradio
260
+ import io
261
+
262
+ from pydub import AudioSegment
263
+
264
+ audio_segment = AudioSegment.from_mp3(io.BytesIO(processed_bytes))
265
+ samples = np.array(audio_segment.get_array_of_samples())
266
+
267
+ # Convert to float32 normalized
268
+ samples = samples.astype(np.float32) / 32768.0
269
+
270
+ return (audio_segment.frame_rate, samples)
271
+ else:
272
+ # No background music, use direct generation
273
+ result = engine.generate(
274
+ text=text,
275
+ language=language,
276
+ voice_audio=voice_audio,
277
+ split_sentences=True,
278
+ )
279
+ return result
280
 
281
 
282
  def on_language_change(language: str):
 
323
  label="📝 Text der Ansage",
324
  placeholder="Geben Sie hier den Text Ihrer Telefonansage ein...",
325
  lines=5,
326
+ max_lines=15,
327
+ info="Lange Texte werden automatisch in Sätze aufgeteilt",
328
  )
329
 
330
  with gr.Accordion("🎤 Stimmeinstellungen (Optional)", open=False):
 
341
  """
342
  )
343
 
344
+ with gr.Accordion("🎵 Hintergrundmusik (Optional)", open=False):
345
+ background_music = gr.Dropdown(
346
+ choices=get_background_music_choices(),
347
+ value="",
348
+ label="Voreingestellte Musik",
349
+ info="Wählen Sie eine Hintergrundmusik aus der Bibliothek",
350
+ )
351
+
352
+ custom_music = gr.Audio(
353
+ sources=["upload"],
354
+ type="filepath",
355
+ label="Oder eigene Musik hochladen",
356
+ elem_id="custom_music",
357
+ )
358
+
359
+ music_volume = gr.Slider(
360
+ minimum=-30,
361
+ maximum=0,
362
+ value=-15,
363
+ step=1,
364
+ label="🔊 Musiklautstärke (dB)",
365
+ info="Lautstärke der Hintergrundmusik relativ zur Sprache",
366
+ )
367
+
368
+ with gr.Row():
369
+ fade_in = gr.Slider(
370
+ minimum=0,
371
+ maximum=3,
372
+ value=0.5,
373
+ step=0.1,
374
+ label="⏫ Einblenden (Sek.)",
375
+ info="Fade-In Dauer",
376
+ )
377
+ fade_out = gr.Slider(
378
+ minimum=0,
379
+ maximum=3,
380
+ value=0.5,
381
+ step=0.1,
382
+ label="⏬ Ausblenden (Sek.)",
383
+ info="Fade-Out Dauer",
384
+ )
385
+
386
+ gr.Markdown(
387
+ """
388
+ 💡 **Hinweis:** Eigene hochgeladene Musik hat Vorrang vor der Auswahl.
389
+ Die Musik wird automatisch geloopt und auf die Länge der Ansage zugeschnitten.
390
+ """
391
+ )
392
+
393
  with gr.Accordion("⚙️ Erweiterte Einstellungen", open=False):
394
  seed = gr.Number(
395
  value=0,
 
415
  ### ℹ️ Hinweise
416
 
417
  - Die Generierung kann einige Sekunden dauern
418
+ - Lange Texte werden automatisch in Sätze aufgeteilt
419
  - Referenz-Audio sollte 5-15 Sekunden lang sein
420
+ - Hintergrundmusik wird automatisch geloopt
421
 
422
  ---
423
 
 
439
 
440
  generate_btn.click(
441
  fn=generate_announcement,
442
+ inputs=[
443
+ text,
444
+ language,
445
+ voice_audio,
446
+ background_music,
447
+ custom_music,
448
+ music_volume,
449
+ fade_in,
450
+ fade_out,
451
+ seed,
452
+ ],
453
  outputs=[audio_output],
454
  )
455
 
engine/backends/base.py CHANGED
@@ -3,6 +3,7 @@ Abstract base class for TTS backends.
3
  All TTS backends must implement this interface to be compatible with the engine.
4
  """
5
 
 
6
  from abc import ABC, abstractmethod
7
  from dataclasses import dataclass
8
  from typing import Optional
@@ -10,6 +11,63 @@ from typing import Optional
10
  import numpy as np
11
 
12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  @dataclass
14
  class TTSResult:
15
  """Result from TTS generation."""
@@ -124,6 +182,58 @@ class TTSBackend(ABC):
124
  """
125
  pass
126
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
127
  def __repr__(self) -> str:
128
  status = "loaded" if self._is_loaded else "not loaded"
129
  return f"{self.__class__.__name__}(name='{self.name}', status={status})"
 
3
  All TTS backends must implement this interface to be compatible with the engine.
4
  """
5
 
6
+ import re
7
  from abc import ABC, abstractmethod
8
  from dataclasses import dataclass
9
  from typing import Optional
 
11
  import numpy as np
12
 
13
 
14
def _split_oversized(part: str, max_chars: int) -> list[str]:
    """Break a punctuation-free fragment into pieces of at most ``max_chars``.

    Words are packed greedily; a single word longer than ``max_chars`` is
    sliced into fixed-size pieces as a last resort.
    """
    pieces = []
    current = ""
    for word in part.split():
        # Last resort: a single token longer than the limit is cut mid-word.
        while len(word) > max_chars:
            if current:
                pieces.append(current)
                current = ""
            pieces.append(word[:max_chars])
            word = word[max_chars:]
        if not word:
            continue
        if not current:
            current = word
        elif len(current) + 1 + len(word) <= max_chars:
            current = f"{current} {word}"
        else:
            pieces.append(current)
            current = word
    if current:
        pieces.append(current)
    return pieces


def split_into_sentences(text: str, max_chars: int = 250) -> list[str]:
    """
    Split text into chunks for better TTS quality on long texts.

    The text is split at sentence-ending punctuation followed by whitespace.
    Sentences longer than ``max_chars`` are split again at clause punctuation
    (comma, semicolon, colon, dashes), and any fragment that is still too
    long is split at word boundaries (or mid-word if a single word exceeds
    the limit). Adjacent pieces are then merged, joined by a single space,
    as long as the merged chunk stays within ``max_chars``.

    Args:
        text: Input text to split
        max_chars: Maximum characters per chunk (default: 250)

    Returns:
        List of text chunks. If ``text`` already fits within ``max_chars`` it
        is returned unchanged as the only element; otherwise every chunk is
        stripped and at most ``max_chars`` characters long.

    Raises:
        ValueError: If ``max_chars`` is smaller than 1.
    """
    if max_chars < 1:
        raise ValueError("max_chars must be a positive integer")

    if len(text) <= max_chars:
        return [text]

    # Sentence-ending punctuation (including non-Latin equivalents),
    # followed by whitespace.
    sentence_enders = r"(?<=[.!?。?!،؟])\s+"
    # Softer break points used only for sentences that are too long.
    clause_breaks = r"(?<=[,;:،–—])\s+"

    # Step 1: flatten the text into pieces that each fit within max_chars.
    pieces = []
    for sentence in re.split(sentence_enders, text):
        sentence = sentence.strip()
        if not sentence:
            continue
        if len(sentence) <= max_chars:
            pieces.append(sentence)
            continue
        for part in re.split(clause_breaks, sentence):
            part = part.strip()
            if not part:
                continue
            if len(part) <= max_chars:
                pieces.append(part)
            else:
                # No usable punctuation: fall back to word/character splitting
                # so the per-chunk limit is still honoured.
                pieces.extend(_split_oversized(part, max_chars))

    # Step 2: greedily merge neighbouring pieces up to the limit
    # (the +1 accounts for the joining space).
    chunks = []
    current_chunk = ""
    for piece in pieces:
        if len(current_chunk) + len(piece) + 1 <= max_chars:
            current_chunk = f"{current_chunk} {piece}".strip()
        else:
            if current_chunk:
                chunks.append(current_chunk)
            current_chunk = piece

    if current_chunk:
        chunks.append(current_chunk)

    return chunks if chunks else [text]
69
+
70
+
71
  @dataclass
72
  class TTSResult:
73
  """Result from TTS generation."""
 
182
  """
183
  pass
184
 
185
def generate_long(
    self,
    text: str,
    language: str = "de",
    voice_audio_path: Optional[str] = None,
    max_chars_per_chunk: int = 250,
    silence_between_ms: int = 300,
    **kwargs,
) -> "TTSResult":
    """
    Synthesize long text chunk by chunk and join the audio.

    The text is divided with ``split_into_sentences``. When that yields a
    single chunk, the call is forwarded unchanged to ``self.generate``.
    Otherwise each chunk is synthesized in order and the resulting waveforms
    are concatenated, with a run of zero samples inserted between
    consecutive chunks.

    Args:
        text: The text to synthesize (can be long)
        language: Language code (e.g., "de", "en")
        voice_audio_path: Optional path to reference audio for voice cloning
        max_chars_per_chunk: Maximum characters per chunk (default: 250)
        silence_between_ms: Silence between chunks in milliseconds
            (default: 300); values <= 0 disable the gap
        **kwargs: Backend-specific parameters forwarded to ``generate``

    Returns:
        TTSResult with the concatenated waveform and the sample rate
        reported for the first chunk
    """
    from loguru import logger

    pieces = split_into_sentences(text, max_chars_per_chunk)
    total = len(pieces)

    # Nothing to stitch together: hand the original text straight through.
    if total == 1:
        return self.generate(text, language, voice_audio_path, **kwargs)

    logger.info(f"Splitting text into {total} chunks for generation")

    waveforms = []
    rate = None
    for position, piece in enumerate(pieces, start=1):
        logger.debug(f"Generating chunk {position}/{total}: '{piece[:50]}...'")
        rendered = self.generate(piece, language, voice_audio_path, **kwargs)
        waveforms.append(rendered.audio)

        # The first chunk's sample rate is used for the whole result.
        # NOTE(review): assumes every chunk comes back at the same rate -- confirm.
        if rate is None:
            rate = rendered.sample_rate

        # No gap after the final chunk, or when gaps are disabled.
        if position == total or not silence_between_ms > 0:
            continue

        gap_length = int(rate * silence_between_ms / 1000)
        # NOTE(review): a 1-D gap assumes 1-D audio arrays -- confirm.
        waveforms.append(np.zeros(gap_length, dtype=rendered.audio.dtype))

    return TTSResult(audio=np.concatenate(waveforms), sample_rate=rate)
236
+
237
  def __repr__(self) -> str:
238
  status = "loaded" if self._is_loaded else "not loaded"
239
  return f"{self.__class__.__name__}(name='{self.name}', status={status})"
engine/tts_engine.py CHANGED
@@ -160,6 +160,8 @@ class TTSEngine:
160
  background_music: Optional[str] = None,
161
  output_path: Optional[str] = None,
162
  use_cache: bool = True,
 
 
163
  **kwargs,
164
  ) -> Union[bytes, str, tuple[int, np.ndarray]]:
165
  """
@@ -172,6 +174,8 @@ class TTSEngine:
172
  background_music: Name/path of background music file
173
  output_path: Optional path to save output file
174
  use_cache: Whether to use caching (default: True)
 
 
175
  **kwargs: Additional backend-specific parameters
176
 
177
  Returns:
@@ -203,11 +207,21 @@ class TTSEngine:
203
  return output_path
204
  return cached
205
 
206
- # Generate audio
207
  logger.info(f"Generating TTS: backend={backend.name}, lang={language}")
208
- result = backend.generate(
209
- text=text, language=language, voice_audio_path=voice_audio, **kwargs
210
- )
 
 
 
 
 
 
 
 
 
 
211
 
212
  # Determine if we need post-processing
213
  use_music = background_music or (
@@ -239,18 +253,40 @@ class TTSEngine:
239
  text: str,
240
  language: Optional[str] = None,
241
  voice_audio: Optional[str] = None,
 
 
242
  **kwargs,
243
  ) -> TTSResult:
244
  """
245
  Generate raw audio without post-processing.
246
 
 
 
 
 
 
 
 
 
247
  Returns:
248
  TTSResult with audio array and sample rate
249
  """
250
  language = language or self.config.default_language
251
- return self.current_backend.generate(
252
- text=text, language=language, voice_audio_path=voice_audio, **kwargs
253
- )
 
 
 
 
 
 
 
 
 
 
 
 
254
 
255
  def list_background_music(self) -> list[str]:
256
  """List available background music files."""
 
160
  background_music: Optional[str] = None,
161
  output_path: Optional[str] = None,
162
  use_cache: bool = True,
163
+ split_sentences: bool = True,
164
+ max_chars_per_chunk: int = 250,
165
  **kwargs,
166
  ) -> Union[bytes, str, tuple[int, np.ndarray]]:
167
  """
 
174
  background_music: Name/path of background music file
175
  output_path: Optional path to save output file
176
  use_cache: Whether to use caching (default: True)
177
+ split_sentences: Auto-split long text into sentences (default: True)
178
+ max_chars_per_chunk: Max chars per chunk when splitting (default: 250)
179
  **kwargs: Additional backend-specific parameters
180
 
181
  Returns:
 
207
  return output_path
208
  return cached
209
 
210
+ # Generate audio (use sentence splitting for long texts)
211
  logger.info(f"Generating TTS: backend={backend.name}, lang={language}")
212
+ if split_sentences and len(text) > max_chars_per_chunk:
213
+ logger.info(f"Text is {len(text)} chars, splitting into sentences")
214
+ result = backend.generate_long(
215
+ text=text,
216
+ language=language,
217
+ voice_audio_path=voice_audio,
218
+ max_chars_per_chunk=max_chars_per_chunk,
219
+ **kwargs,
220
+ )
221
+ else:
222
+ result = backend.generate(
223
+ text=text, language=language, voice_audio_path=voice_audio, **kwargs
224
+ )
225
 
226
  # Determine if we need post-processing
227
  use_music = background_music or (
 
253
  text: str,
254
  language: Optional[str] = None,
255
  voice_audio: Optional[str] = None,
256
+ split_sentences: bool = True,
257
+ max_chars_per_chunk: int = 250,
258
  **kwargs,
259
  ) -> TTSResult:
260
  """
261
  Generate raw audio without post-processing.
262
 
263
+ Args:
264
+ text: Text to synthesize
265
+ language: Language code (default from config)
266
+ voice_audio: Path/URL to reference audio for voice cloning
267
+ split_sentences: Auto-split long text into sentences (default: True)
268
+ max_chars_per_chunk: Max chars per chunk when splitting (default: 250)
269
+ **kwargs: Additional backend-specific parameters
270
+
271
  Returns:
272
  TTSResult with audio array and sample rate
273
  """
274
  language = language or self.config.default_language
275
+ backend = self.current_backend
276
+
277
+ if split_sentences and len(text) > max_chars_per_chunk:
278
+ logger.info(f"Text is {len(text)} chars, splitting into sentences")
279
+ return backend.generate_long(
280
+ text=text,
281
+ language=language,
282
+ voice_audio_path=voice_audio,
283
+ max_chars_per_chunk=max_chars_per_chunk,
284
+ **kwargs,
285
+ )
286
+ else:
287
+ return backend.generate(
288
+ text=text, language=language, voice_audio_path=voice_audio, **kwargs
289
+ )
290
 
291
  def list_background_music(self) -> list[str]:
292
  """List available background music files."""