Spaces:
Sleeping
Sleeping
import gradio as gr
import numpy as np
import soundfile as sf
import tempfile
import traceback
import os
from pydub import AudioSegment, silence
from concurrent.futures import ThreadPoolExecutor, TimeoutError

# Bark (top-level) — important: do NOT import from bark.generation
from bark import SAMPLE_RATE, generate_audio, preload_models

# More stable on HF Spaces: keep torch on a single thread.
import torch
torch.set_num_threads(1)
# --------- Helper ---------
def build_padded_prompt(w: str) -> str:
    """Embed a word/short phrase in a short spoken context for better prosody."""
    word = w.strip()
    return f"Bitte wiederhole deutlich: {word}. Noch einmal: {word}!"
def synth_core(text: str, history_prompt: str | None = None):
    """Thin wrapper around Bark's generate_audio (the actual synthesis call)."""
    audio = generate_audio(text, history_prompt=history_prompt)
    return audio
def synth_with_timeout(text: str, history_prompt: str | None = None, timeout_s: int = 120):
    """Run Bark in a worker thread so the UI does not hang indefinitely.

    Raises concurrent.futures.TimeoutError when generation exceeds timeout_s.

    BUG FIX: the original used `with ThreadPoolExecutor(...)`, whose __exit__
    calls shutdown(wait=True) — so after the timeout fired, the caller still
    blocked until the Bark call finished, defeating the timeout entirely.
    We now shut down with wait=False so the caller returns immediately;
    the worker thread finishes (and is discarded) in the background.
    """
    executor = ThreadPoolExecutor(max_workers=1)
    future = executor.submit(synth_core, text, history_prompt)
    try:
        return future.result(timeout=timeout_s)
    finally:
        # Do not wait for the (possibly still running) synthesis thread.
        executor.shutdown(wait=False)
# --------- TTS Callback ---------
def tts_bark(text, preset_choice, custom_preset, seed, export_mp3, word_mode):
    """Generate speech with Bark and return values for the three Gradio outputs.

    Returns a 3-tuple:
    - (SAMPLE_RATE, np.float32 array) for the audio preview, or None on error
    - path to the exported WAV/MP3 file, or None on error
    - a user-facing status message (German)
    """
    try:
        if not text or not text.strip():
            return None, None, "Bitte Text eingeben."

        # Preset controls only the voice timbre; the language follows the text.
        history_prompt = None
        if custom_preset and custom_preset.strip():
            history_prompt = custom_preset.strip()
        elif preset_choice and preset_choice != "Auto (kein Preset)":
            history_prompt = preset_choice

        # Optional reproducibility. NOTE(review): seeding NumPy alone may not
        # make Bark fully deterministic (torch keeps its own RNG) — best effort.
        if seed is not None and str(seed).strip() != "":
            try:
                np.random.seed(int(seed))
            except (TypeError, ValueError):
                pass  # non-numeric seed: ignore, keep going

        raw_text = text.strip()
        prompt = raw_text

        # Single-word optimization: add spoken context now, auto-trim later.
        do_trim = False
        if word_mode or len(raw_text.split()) <= 2:
            prompt = build_padded_prompt(raw_text)
            do_trim = True

        # Timeout guard (e.g. 120 s) so the UI does not hang forever.
        try:
            audio_array = synth_with_timeout(prompt, history_prompt=history_prompt, timeout_s=120)
        except TimeoutError:
            return None, None, (
                "Timeout bei der Generierung.\n"
                "Tipps:\n"
                "- Läuft der Space auf GPU? (Settings → Hardware → T4)\n"
                "- Direkt nach Neustart ist der erste Lauf langsamer (Modelle laden)\n"
                "- Teste mit sehr kurzem Text\n"
            )

        # Sanitize & clip to the valid [-1, 1] float range.
        audio_array = np.asarray(audio_array, dtype=np.float32)
        audio_array = np.clip(audio_array, -1.0, 1.0)

        # Write the WAV. BUG FIX: mkstemp returns (fd, path) with the fd OPEN;
        # the original `tempfile.mkstemp(...)[1]` leaked one descriptor per call.
        fd, wav_path = tempfile.mkstemp(suffix=".wav")
        os.close(fd)
        sf.write(wav_path, audio_array, SAMPLE_RATE)
        out_path = wav_path

        # Word mode: auto-trim to the best non-silent segment (robust).
        if do_trim:
            trimmed_path = _trim_best_segment(wav_path)
            if trimmed_path is not None:
                out_path = trimmed_path

        # Optional MP3 export (needs ffmpeg via pydub).
        if export_mp3:
            try:
                mp3_path = out_path.replace(".wav", ".mp3")
                AudioSegment.from_wav(out_path).export(mp3_path, format="mp3")
                out_path = mp3_path
            except Exception as e:
                # If MP3 fails, at least deliver the WAV plus a hint.
                return (SAMPLE_RATE, audio_array), wav_path, f"MP3-Export fehlgeschlagen (ffmpeg?). WAV ist bereit. {e}"

        return (SAMPLE_RATE, audio_array), out_path, "Fertig."
    except Exception:
        # Top-level UI boundary: surface the traceback instead of crashing.
        tb = traceback.format_exc()
        print("### EXCEPTION ###\n", tb)
        return None, None, f"Fehler:\n{tb}"


def _trim_best_segment(wav_path):
    """Extract the best (longest) non-silent segment of *wav_path* to a new file.

    Returns the path of the trimmed WAV, or None when nothing was found or
    trimming failed — callers then fall back to the original file.
    """
    try:
        audio = AudioSegment.from_wav(wav_path)
        # 1) First try a normal, slightly generous silence split.
        chunks = silence.split_on_silence(
            audio,
            min_silence_len=120,             # a bit shorter
            silence_thresh=audio.dBFS - 18,  # more tolerant
            keep_silence=20,
        )
        best_seg = None
        if chunks:
            best_seg = max(chunks, key=len)
        else:
            # 2) Fallback: detect non-silent spans ourselves.
            spans = silence.detect_nonsilent(
                audio,
                min_silence_len=120,
                silence_thresh=audio.dBFS - 18,
            )
            if spans:
                # Pick the longest non-silent span.
                start, end = max(spans, key=lambda s: s[1] - s[0])
                best_seg = audio[start:end]
        if best_seg:
            # Light post-processing: normalize with a little headroom.
            best_seg = best_seg.normalize(headroom=1.0)
            trimmed_path = wav_path.replace(".wav", "_word.wav")
            best_seg.export(trimmed_path, format="wav")
            return trimmed_path
        print("[Trim] Kein Segment gefunden – liefere Original-WAV zurück.")
        return None
    except Exception as e:
        # Best-effort: trimming problems must never break synthesis.
        print(f"[Trim] Hinweis: {e}")
        return None
# --------- App (UI) ---------
def warmup():
    """Preload the Bark models (makes the first real call noticeably faster)."""
    try:
        preload_models()
        generate_audio("ok")  # minimal call to populate the caches
    except Exception as e:
        # Warmup is best-effort only; never block app startup.
        print(f"[Warmup] Hinweis: {e}")
# Voice presets offered in the dropdown; the first entry disables presets.
COMMON_PRESETS = [
    "Auto (kein Preset)",
    "v2/en_speaker_6",
    "v2/en_speaker_9",
    "v2/de_speaker_3",
    "v2/de_speaker_9",
]
with gr.Blocks() as demo:
    gr.Markdown(
        "# Suno Bark – robuster Space\n"
        "- Für einzelne Wörter: **Ein‑Wort‑Optimierung** aktivieren (Kontext + Auto‑Trim).\n"
        "- Am schnellsten auf **GPU (T4)**. Direkt nach Neustart ist der erste Lauf langsam (Warmup).\n"
        "- MP3 erst aktivieren, wenn das WAV passt (spart Zeit)."
    )
    text = gr.Textbox(label="Text (z. B. Igbo / Russisch / Englisch ...)", placeholder="Ndeewo! Kedu ka i mere?", lines=3)
    with gr.Row():
        # BUG FIX: the label was accidentally set to the default value
        # ("v2/de_speaker_9"); use a descriptive label instead.
        preset_choice = gr.Dropdown(COMMON_PRESETS, value="v2/de_speaker_9", label="Preset (Stimmfarbe)")
        custom_preset = gr.Textbox(label="Eigenes Preset (optional)", placeholder="z. B. v2/en_speaker_0")
    with gr.Row():
        seed = gr.Number(value=42, precision=0, label="Seed (optional)")
        export_mp3 = gr.Checkbox(value=True, label="MP3 zusätzlich exportieren")
        word_mode = gr.Checkbox(value=True, label="Ein‑Wort‑Optimierung (Kontext + Auto‑Trim)")
    btn = gr.Button("Generieren")
    audio_out = gr.Audio(label="Vorschau", type="numpy")
    file_out = gr.File(label="Download (WAV/MP3)")
    status = gr.Textbox(label="Status / Hinweise", interactive=False)
    btn.click(
        tts_bark,
        inputs=[text, preset_choice, custom_preset, seed, export_mp3, word_mode],
        outputs=[audio_out, file_out, status]
    )
    # Warm up on page load; pass the function directly (no lambda wrapper needed).
    demo.load(fn=warmup, inputs=None, outputs=None)

# Queue guards against hangs under parallel requests (Gradio >= 4.x API).
demo.queue(max_size=4).launch()