Spaces:

artificialguybr
/

fish-s2-pro-zero

Running on Zero

App Files Files Community

artificialguybr committed 8 days ago

Commit

84a2fb0

verified ·

1 Parent(s): 6795799

Create app.py

Browse files

Files changed (1) hide show

app.py +201 -0

app.py ADDED Viewed

	@@ -0,0 +1,201 @@

+import os
+import sys
+import traceback
+from pathlib import Path
+import gradio as gr
+import numpy as np
+import spaces
+import torch
+from huggingface_hub import snapshot_download
+sys.path.append(str(Path(__file__).parent))
+from fish_speech.models.text2semantic.inference import (
+    init_model,
+    generate_long,
+    load_codec_model,
+    decode_to_audio,
+    encode_audio
+)
+# Start-up sequence: download the weights, load the language model, set up its
+# caches, then load the codec. The names bound here (device, llama_model,
+# decode_one_token, codec_model) are read by tts_inference at request time.
+
+# Use CUDA when PyTorch reports it as available, otherwise fall back to CPU.
+device = "cuda" if torch.cuda.is_available() else "cpu"
+# Precision passed to both model loaders below.
+precision = torch.bfloat16
+print("Downloading Fish Audio S2 Pro weights...")
+# The value returned by snapshot_download is used as the checkpoint directory.
+checkpoint_dir = snapshot_download(repo_id="fishaudio/s2-pro")
+print("Loading LLAMA model...")
+# init_model returns the model together with the per-token decode callable
+# that generate_long is later given as decode_one_token.
+llama_model, decode_one_token = init_model(
+    checkpoint_path=checkpoint_dir,
+    device=device,
+    precision=precision,
+    compile=False
+)
+# Set up the model caches for a batch of one at the configured maximum
+# sequence length, using the dtype of the model's own parameters. Running
+# inside torch.device(device) makes `device` the default for tensors created
+# in this block.
+with torch.device(device):
+    llama_model.setup_caches(
+        max_batch_size=1,
+        max_seq_len=llama_model.config.max_seq_len,
+        dtype=next(llama_model.parameters()).dtype,
+    )
+print("Loading Codec model...")
+# The codec checkpoint is the file "codec.pth" inside the downloaded directory.
+codec_checkpoint = os.path.join(checkpoint_dir, "codec.pth")
+codec_model = load_codec_model(codec_checkpoint, device=device, precision=precision)
+print("All models loaded successfully!")
+@spaces.GPU(duration=120)
+def tts_inference(
+    text,
+    ref_audio,
+    ref_text,
+    max_new_tokens,
+    chunk_length,
+    top_p,
+    repetition_penalty,
+    temperature
+):
+    """
+    Main TTS Generation function decorated with @spaces.GPU
+    to request GPU allocation only during execution.
+    """
+    try:
+        prompt_tokens_list = None
+        if ref_audio is not None and ref_text:
+            prompt_tokens_list = [encode_audio(ref_audio, codec_model, device).cpu()]
+        generator = generate_long(
+            model=llama_model,
+            device=device,
+            decode_one_token=decode_one_token,
+            text=text,
+            num_samples=1,
+            max_new_tokens=max_new_tokens,
+            top_p=top_p,
+            top_k=30,
+            temperature=temperature,
+            repetition_penalty=repetition_penalty,
+            compile=False,
+            iterative_prompt=True,
+            chunk_length=chunk_length,
+            prompt_text=[ref_text] if ref_text else None,
+            prompt_tokens=prompt_tokens_list,
+        )
+        codes = []
+        for response in generator:
+            if response.action == "sample":
+                codes.append(response.codes)
+            elif response.action == "next":
+                break
+        if not codes:
+            raise gr.Error("No audio generated. Please check your text.")
+        merged_codes = torch.cat(codes, dim=1)
+        audio_waveform = decode_to_audio(merged_codes.to(device), codec_model)
+        audio_np = audio_waveform.cpu().float().numpy()
+        return (codec_model.sample_rate, audio_np)
+    except Exception as e:
+        traceback.print_exc()
+        raise gr.Error(f"Inference Error: {str(e)}")
+# Theme for the UI: Gradio's Soft theme with blue/indigo hues and the Inter
+# Google font (followed by generic sans-serif fallbacks), then a few
+# overrides for title weight, block border, and block/button shadows.
+custom_theme = gr.themes.Soft(
+    primary_hue="blue",
+    secondary_hue="indigo",
+    font=[gr.themes.GoogleFont("Inter"), "ui-sans-serif", "system-ui", "sans-serif"],
+).set(
+    block_title_text_weight="600",
+    block_border_width="1px",
+    block_shadow="0px 2px 4px rgba(0, 0, 0, 0.05)",
+    button_shadow="0px 2px 4px rgba(0, 0, 0, 0.1)",
+)
+# Page layout: header, an input column and an output column side by side,
+# a set of examples, and the click handler that runs tts_inference.
+with gr.Blocks(theme=custom_theme, title="Fish Audio S2 Pro") as app:
+    # Centered title and tagline, written as inline HTML inside Markdown.
+    gr.Markdown(
+        """
+        <div style="text-align: center; max-width: 800px; margin: 0 auto; padding: 20px 0;">
+            <h1 style="font-size: 2.5rem; font-weight: 800; color: #1E3A8A; margin-bottom: 10px;">
+                🐟 Fish Audio S2 Pro
+            </h1>
+            <p style="font-size: 1.1rem; color: #4B5563;">
+                State-of-the-Art Dual-Autoregressive Text-to-Speech.
+                Supports 80+ languages, emotional inline control (e.g., <code>[laugh]</code>, <code>[whisper]</code>), and zero-shot voice cloning.
+            </p>
+        </div>
+        """
+    )
+    with gr.Row():
+        # Input column: text box, optional cloning reference, sampling
+        # controls, and the generate button.
+        with gr.Column(scale=5):
+            gr.Markdown("### ✍️ Text Input")
+            text_input = gr.Textbox(
+                show_label=False,
+                placeholder="Enter the text you want to synthesize here.\nTry adding tags like [laugh], [whisper], or [angry]!",
+                lines=7
+            )
+            with gr.Accordion("🎙️ Voice Cloning (Optional Reference)", open=False):
+                gr.Markdown("Upload a 5-10 second clear audio clip and type its exact transcription to clone the voice.")
+                # type="filepath": the callback receives the audio as a file path.
+                ref_audio = gr.Audio(label="Reference Audio", type="filepath")
+                ref_text = gr.Textbox(label="Reference Text", placeholder="Transcription of the reference audio...")
+            with gr.Accordion("⚙️ Advanced Settings", open=False):
+                # Slider positional arguments are (minimum, maximum, initial value).
+                with gr.Row():
+                    max_new_tokens = gr.Slider(0, 2048, 1024, step=8, label="Max New Tokens (0 = unlimited)")
+                    chunk_length = gr.Slider(100, 400, 200, step=8, label="Chunk Length")
+                with gr.Row():
+                    top_p = gr.Slider(0.1, 1.0, 0.7, step=0.01, label="Top-P")
+                    repetition_penalty = gr.Slider(0.9, 2.0, 1.2, step=0.01, label="Repetition Penalty")
+                    temperature = gr.Slider(0.1, 1.0, 0.7, step=0.01, label="Temperature")
+            generate_btn = gr.Button("🚀 Generate Speech", variant="primary", size="lg")
+        # Output column: read-only audio player (autoplay on) and usage tips.
+        with gr.Column(scale=4):
+            gr.Markdown("### 🎧 Output")
+            audio_output = gr.Audio(label="Generated Audio", type="numpy", interactive=False, autoplay=True)
+            gr.Markdown(
+                """
+                <div style="background-color: #EFF6FF; padding: 15px; border-radius: 8px; margin-top: 20px;">
+                    <h4 style="margin-top: 0; color: #1D4ED8;">💡 Pro Tips</h4>
+                    <ul style="margin-bottom: 0; color: #1E3A8A; font-size: 0.95rem;">
+                        <li>You don't need phonemes, the model understands raw text seamlessly.</li>
+                        <li>Try wrapping specific words in brackets for inline emotional control.</li>
+                        <li>For cloning, the closer the transcript matches the audio, the better the result.</li>
+                    </ul>
+                </div>
+                """
+            )
+    gr.Markdown("### 🌟 Examples")
+    # Each example row lists one value per component in `inputs`, in the same
+    # order as the tts_inference parameters. None of the examples uses a
+    # reference voice (audio is None, transcript is empty).
+    gr.Examples(
+        examples=[
+            ["Hello world! This is a test of the Fish Audio S2 Pro model.", None, "", 1024, 200, 0.7, 1.2, 0.7],
+            ["I can't believe it! [laugh] This is absolutely amazing!", None, "", 1024, 200, 0.7, 1.2, 0.7],
+            ["[whisper in small voice] I have a secret to tell you... promise you won't tell anyone?", None, "", 1024, 200, 0.7, 1.2, 0.7]
+        ],
+        inputs=[text_input, ref_audio, ref_text, max_new_tokens, chunk_length, top_p, repetition_penalty, temperature],
+        outputs=[audio_output],
+        fn=tts_inference,
+        cache_examples=False,
+    )
+    # The button sends the same eight inputs to tts_inference and routes the
+    # returned audio to audio_output.
+    generate_btn.click(
+        fn=tts_inference,
+        inputs=[text_input, ref_audio, ref_text, max_new_tokens, chunk_length, top_p, repetition_penalty, temperature],
+        outputs=[audio_output]
+    )
+# Start the Gradio server only when this file is executed as a script.
+if __name__ == "__main__":
+    app.launch()