Spaces:

BIBLETUM
/

Audio_itits

Sleeping

App Files Files Community

BIBLETUM committed on Oct 28, 2025

Commit

44e7908

verified ·

1 Parent(s): 7242553

Update app.py

Browse files

Files changed (1) hide show

app.py +163 -53

app.py CHANGED Viewed

@@ -10,17 +10,15 @@ import gradio as gr
 OUTDIR = Path("outputs")
 OUTDIR.mkdir(parents=True, exist_ok=True)
 def slug(s: str) -> str:
     """Turn arbitrary text into a filename-friendly slug.

     Each character for which ``str.isalnum()`` is false is replaced by an
     underscore; alphanumeric characters (non-ASCII letters and digits
     included) are kept unchanged. The result is cut to 80 characters and
     then stripped of leading/trailing underscores. ``None`` is treated as
     an empty string, so the returned slug may be empty.
     """
     text = "" if s is None else s
     mapped = [ch if ch.isalnum() else "_" for ch in text]
     return "".join(mapped)[:80].strip("_")
 def save_wav(path: Path, sr: int, audio):
     import numpy as np
-    import scipy.io.wavfile as wav
     if hasattr(audio, "detach"):
         audio = audio.detach().cpu().numpy()
@@ -28,13 +26,12 @@ def save_wav(path: Path, sr: int, audio):
     a = np.squeeze(a)
     if a.ndim == 2 and a.shape[0] < a.shape[1]:
         a = a.T
-    # normalize if needed
     max_abs = np.max(np.abs(a)) if a.size else 1.0
     if np.isfinite(max_abs) and max_abs > 1.0:
         a = a / max_abs
     wav.write(str(path), int(sr), a)
 MODEL_NAMES = {
     "suno/bark-small": "bark",
     "facebook/mms-tts-rus": "mms",
@@ -44,7 +41,6 @@ MODEL_NAMES = {
 _model_cache: Dict[str, object] = {}
 _device_hint = "auto"
 def _load_bark():
     from transformers import pipeline
     pipe = pipeline("text-to-speech", model="suno/bark-small", device_map=_device_hint)
@@ -57,7 +53,6 @@ def _load_bark():
     return generate
 def _load_mms():
     from transformers import pipeline
     pipe = pipeline("text-to-speech", model="facebook/mms-tts-rus", device_map=_device_hint)
@@ -70,7 +65,6 @@ def _load_mms():
     return generate
 def _load_seamless():
     import torch
     import numpy as np
@@ -81,7 +75,6 @@ def _load_seamless():
     device = "cuda" if torch.cuda.is_available() else "cpu"
-    # КЛЮЧЕВОЕ: use_fast=False, чтобы не требовался tiktoken
     proc = AutoProcessor.from_pretrained(
         "facebook/seamless-m4t-v2-large",
         use_fast=False
@@ -98,7 +91,6 @@ def _load_seamless():
     return generate
 def get_generator(kind: str):
     if kind in _model_cache:
         return _model_cache[kind]
@@ -113,25 +105,22 @@ def get_generator(kind: str):
     _model_cache[kind] = gen
     return gen
 DEFAULT_PROMPTS = (
     "Привет! Это короткий тест русского TTS.\n"
     "Сегодня мы проверяем интонации, паузы и четкость дикции.\n"
     "Немного сложнее: числа 3.14 и 2025 читаем правильно."
 )
 def run_tts(
     prompts_text: str,
     split_lines: bool,
     model_choice: str,
-) -> tuple:
-    """Main Gradio callback.
     Returns:
-        files: list[str] — файловые пути для скачивания
-        df:    pd.DataFrame — таблица с метаданными
-        last_audio: tuple[int, np.ndarray] | None — предпросмотр последнего файла
     """
     text_items: List[str] = []
     if split_lines:
@@ -147,12 +136,12 @@ def run_tts(
     kind = MODEL_NAMES[model_choice]
     gen = get_generator(kind)
-    stamp_dir = OUTDIR / time.strftime("%Y%m%d-%H%M%S")
     stamp_dir.mkdir(parents=True, exist_ok=True)
     rows = []
     file_paths: List[str] = []
-    last_audio_payload = None
     for p in text_items:
         t0 = time.time()
@@ -162,6 +151,7 @@ def run_tts(
         save_wav(path, sr, audio)
         rows.append({
             "model": model_choice,
             "prompt": p,
             "file": str(path),
@@ -169,57 +159,177 @@ def run_tts(
             "gen_time_s": round(dt, 3),
         })
         file_paths.append(str(path))
-        last_audio_payload = str(path)
     df = pd.DataFrame(rows)
-    return file_paths, df, last_audio_payload
-description_md = (
     """
     Russian TTS Bench: выберите модель и введите один или несколько промптов.\
-    По умолчанию каждая строка — отдельный промпт. Результаты сохраняются в `outputs/…`.
     **Модели:**
     - `suno/bark-small` — небольшой мультиязычный TTS.
     - `facebook/mms-tts-rus` — русская TTS из проекта MMS.
-    - `facebook/seamless-m4t-v2-large` — крупная модель перевода/говорения; тяжёлая для CPU.\
     """
 )
-with gr.Blocks(title="Russian TTS Bench") as demo:
-    gr.Markdown("# 🗣️ Russian TTS Bench")
-    gr.Markdown(description_md)
-    with gr.Row():
-        model_choice = gr.Dropdown(
-            label="Модель",
-            choices=list(MODEL_NAMES.keys()),
-            value="suno/bark-small",
         )
-        split_lines = gr.Checkbox(value=True, label="Одна строка = один промпт")
-    prompts = gr.Textbox(
-        label="Промпты",
-        value=DEFAULT_PROMPTS,
-        lines=6,
-        placeholder="Каждая строка — отдельный промпт…",
-    )
-    run_btn = gr.Button("Сгенерировать", variant="primary")
-    with gr.Row():
-        files = gr.Files(label="Файлы .wav для скачивания")
-    with gr.Row():
-        df_out = gr.Dataframe(label="Таблица результатов", interactive=False)
-    with gr.Row():
-        preview = gr.Audio(label="Предпросмотр последнего семпла", autoplay=False)
-    run_btn.click(
-        fn=run_tts,
-        inputs=[prompts, split_lines, model_choice],
-        outputs=[files, df_out, preview],
-    )
 if __name__ == "__main__":
-    demo.launch()

 OUTDIR = Path("outputs")
 OUTDIR.mkdir(parents=True, exist_ok=True)
 def slug(s: str) -> str:
     """Turn arbitrary text into a filename-friendly slug.

     Each character for which ``str.isalnum()`` is false is replaced by an
     underscore; alphanumeric characters (non-ASCII letters and digits
     included) are kept unchanged. The result is cut to 80 characters and
     then stripped of leading/trailing underscores. ``None`` is treated as
     an empty string, so the returned slug may be empty.
     """
     text = "" if s is None else s
     mapped = [ch if ch.isalnum() else "_" for ch in text]
     return "".join(mapped)[:80].strip("_")
 def save_wav(path: Path, sr: int, audio):
     import numpy as np
+    from scipy.io import wavfile as wav
     if hasattr(audio, "detach"):
         audio = audio.detach().cpu().numpy()
     a = np.squeeze(a)
     if a.ndim == 2 and a.shape[0] < a.shape[1]:
         a = a.T
+    # normalize if needed (safety)
     max_abs = np.max(np.abs(a)) if a.size else 1.0
     if np.isfinite(max_abs) and max_abs > 1.0:
         a = a / max_abs
     wav.write(str(path), int(sr), a)
 MODEL_NAMES = {
     "suno/bark-small": "bark",
     "facebook/mms-tts-rus": "mms",
 _model_cache: Dict[str, object] = {}
 _device_hint = "auto"
 def _load_bark():
     from transformers import pipeline
     pipe = pipeline("text-to-speech", model="suno/bark-small", device_map=_device_hint)
     return generate
 def _load_mms():
     from transformers import pipeline
     pipe = pipeline("text-to-speech", model="facebook/mms-tts-rus", device_map=_device_hint)
     return generate
 def _load_seamless():
     import torch
     import numpy as np
     device = "cuda" if torch.cuda.is_available() else "cpu"
     proc = AutoProcessor.from_pretrained(
         "facebook/seamless-m4t-v2-large",
         use_fast=False
     return generate
 def get_generator(kind: str):
     if kind in _model_cache:
         return _model_cache[kind]
     _model_cache[kind] = gen
     return gen
 DEFAULT_PROMPTS = (
     "Привет! Это короткий тест русского TTS.\n"
     "Сегодня мы проверяем интонации, паузы и четкость дикции.\n"
     "Немного сложнее: числа 3.14 и 2025 читаем правильно."
 )
 def run_tts(
     prompts_text: str,
     split_lines: bool,
     model_choice: str,
+):
+    """Main Gradio callback: TTS.
     Returns:
+        files: list[str] — пути к wav
+        df:    pd.DataFrame — таблица метаданных
+        last_audio: str | None — путь к последнему файлу для предпросмотра
     """
     text_items: List[str] = []
     if split_lines:
     kind = MODEL_NAMES[model_choice]
     gen = get_generator(kind)
+    stamp_dir = OUTDIR / "tts" / time.strftime("%Y%m%d-%H%M%S")
     stamp_dir.mkdir(parents=True, exist_ok=True)
     rows = []
     file_paths: List[str] = []
+    last_audio_path = None
     for p in text_items:
         t0 = time.time()
         save_wav(path, sr, audio)
         rows.append({
+            "task": "tts",
             "model": model_choice,
             "prompt": p,
             "file": str(path),
             "gen_time_s": round(dt, 3),
         })
         file_paths.append(str(path))
+        last_audio_path = str(path)
+    df = pd.DataFrame(rows)
+    return file_paths, df, last_audio_path
+_music_pipes: Dict[str, object] = {}  # cache of built pipelines keyed by model name; filled by get_music_pipe
+MUSIC_MODELS = [  # model ids offered in the music dropdown; the first entry is used as its default value
+    "facebook/musicgen-small",  # NOTE(review): music_description_md mentions uncommenting `facebook/musicgen-stereo-small`, but no commented-out entry is visible here
+]
+def get_music_pipe(model_name: str):
+    if model_name in _music_pipes:
+        return _music_pipes[model_name]
+    from transformers import pipeline
+    pipe = pipeline("text-to-audio", model=model_name, device_map=_device_hint)
+    _music_pipes[model_name] = pipe
+    return pipe
+MUSIC_DEFAULT_PROMPTS = (
+    "High-energy 90s rock track with distorted electric guitars, driving bass, and hard-hitting acoustic drums\n"
+    "Modern electronic dance track with punchy kick, bright synth lead, and sidechained pads, 128 BPM\n"
+    "Dark industrial electro with gritty bass, sharp snares, and mechanical percussion"
+)
+def run_music(
+    prompts_text: str,
+    split_lines: bool,
+    model_name: str,
+    do_sample: bool,
+):
+    """Main Gradio callback: MusicGen."""
+    text_items: List[str] = []
+    if split_lines:
+        for line in [s.strip() for s in prompts_text.splitlines()]:
+            if line:
+                text_items.append(line)
+    else:
+        text_items = [prompts_text.strip()] if prompts_text.strip() else []
+    if not text_items:
+        return [], pd.DataFrame(), None
+    pipe = get_music_pipe(model_name)
+    stamp_dir = OUTDIR / "music" / slug(model_name) / time.strftime("%Y%m%d-%H%M%S")
+    stamp_dir.mkdir(parents=True, exist_ok=True)
+    rows = []
+    file_paths: List[str] = []
+    last_audio_path = None
+    for p in text_items:
+        t0 = time.time()
+        # Параметры генерации держим минимальными и совместимыми
+        out = pipe(p, forward_params={"do_sample": bool(do_sample)})
+        dt = time.time() - t0
+        sr = int(out["sampling_rate"])
+        audio = np.asarray(out["audio"], dtype=np.float32)
+        path = stamp_dir / f"{slug(p)}.wav"
+        save_wav(path, sr, audio)
+        rows.append({
+            "task": "music",
+            "model": model_name,
+            "prompt": p,
+            "file": str(path),
+            "sr": sr,
+            "gen_time_s": round(dt, 3),
+        })
+        file_paths.append(str(path))
+        last_audio_path = str(path)
     df = pd.DataFrame(rows)
+    return file_paths, df, last_audio_path
+tts_description_md = (
     """
     Russian TTS Bench: выберите модель и введите один или несколько промптов.\
+    По умолчанию каждая строка — отдельный промпт. Результаты сохраняются в `outputs/tts/…`.
     **Модели:**
     - `suno/bark-small` — небольшой мультиязычный TTS.
     - `facebook/mms-tts-rus` — русская TTS из проекта MMS.
+    - `facebook/seamless-m4t-v2-large` — крупная модель перевода/говорения; тяжёлая для CPU.
+    """
+)
+music_description_md = (
+    """
+    **Music Gen:** текст → музыка на базе MusicGen. По умолчанию каждая строка — отдельный промпт.\
+    Результаты сохраняются в `outputs/music/<model>/…`.
+    **Модели:**
+    - `facebook/musicgen-small`
+    - (опционально) `facebook/musicgen-stereo-small` — раскомментируйте в коде.
     """
 )
+with gr.Blocks(title="Speech & Music Bench") as demo:  # top-level UI; `demo` is what the __main__ guard launches
+    gr.Markdown("# 🎙️🪄 Speech & Music Bench")
+    with gr.Tab("🗣️ TTS"):  # tab 1: text-to-speech, wired to run_tts
+        gr.Markdown(tts_description_md)
+        with gr.Row():
+            model_choice = gr.Dropdown(
+                label="Модель TTS",
+                choices=list(MODEL_NAMES.keys()),  # dropdown values are the MODEL_NAMES keys; run_tts maps the choice back through MODEL_NAMES
+                value="suno/bark-small",
+            )
+            split_lines_tts = gr.Checkbox(value=True, label="Одна строка = один промпт")  # label reads "one line = one prompt"
+        prompts_tts = gr.Textbox(
+            label="Промпты",
+            value=DEFAULT_PROMPTS,
+            lines=6,
+            placeholder="Каждая строка — отдельный промпт…",
+        )
+        run_btn_tts = gr.Button("Сгенерировать речь", variant="primary")
+        with gr.Row():
+            files_tts = gr.Files(label="Файлы .wav для скачивания")
+        with gr.Row():
+            df_out_tts = gr.Dataframe(label="Таблица результатов", interactive=False)
+        with gr.Row():
+            preview_tts = gr.Audio(label="Предпросмотр последнего семпла", autoplay=False)
+        run_btn_tts.click(
+            fn=run_tts,
+            inputs=[prompts_tts, split_lines_tts, model_choice],  # order matches run_tts(prompts_text, split_lines, model_choice)
+            outputs=[files_tts, df_out_tts, preview_tts],  # order matches run_tts's (file_paths, df, last_audio_path) return
         )
+    with gr.Tab("🎵 Music"):  # tab 2: text-to-music, wired to run_music
+        gr.Markdown(music_description_md)
+        with gr.Row():
+            music_model = gr.Dropdown(
+                label="Модель MusicGen",
+                choices=MUSIC_MODELS,
+                value=MUSIC_MODELS[0],  # first listed model is the default selection
+            )
+            split_lines_music = gr.Checkbox(value=True, label="Одна строка = один промпт")  # label reads "one line = one prompt"
+            do_sample = gr.Checkbox(value=True, label="do_sample")  # passed through to run_music's do_sample argument
+        prompts_music = gr.Textbox(
+            label="Музыкальные промпты",
+            value=MUSIC_DEFAULT_PROMPTS,
+            lines=6,
+            placeholder="Каждая строка — отдельный промпт…",
+        )
+        run_btn_music = gr.Button("Сгенерировать музыку", variant="primary")
+        with gr.Row():
+            files_music = gr.Files(label="Файлы .wav для скачивания")
+        with gr.Row():
+            df_out_music = gr.Dataframe(label="Таблица результатов", interactive=False)
+        with gr.Row():
+            preview_music = gr.Audio(label="Предпросмотр последнего трека", autoplay=False)
+        run_btn_music.click(
+            fn=run_music,
+            inputs=[prompts_music, split_lines_music, music_model, do_sample],  # order matches run_music(prompts_text, split_lines, model_name, do_sample)
+            outputs=[files_music, df_out_music, preview_music],  # order matches run_music's (file_paths, df, last_audio_path) return
+        )
 if __name__ == "__main__":
+    demo.launch()