# Hugging Face Space: Respair/Voice_Design (status: Running)
# File size: 16,794 Bytes

import gradio as gr
from gradio_client import Client
import os
import random
import numpy as np
import scipy.io.wavfile as wavfile




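# NOTE: Creation of the backend client is currently commented out (the UI
# banner states that the demo is closed). While these lines stay disabled the
# module has no `client` global, so generation requests cannot reach a server.
# try:
#     client = Client(os.environ['src'])
# except:
#     client = Client("http://localhost:7861/")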

# Custom stylesheet handed to gr.Blocks(css=css). In order, the rules cover:
#   * placeholder text colour for inputs and textareas,
#   * inline <code> highlighting,
#   * checkbox label colour,
#   * header colour of the element with id "advanced-accordion",
#   * preset buttons (normal and hover) inside "#voice-preset-container",
#   * a fixed full-page background image drawn through body::before.
# The text is sent to the browser verbatim, so notes live here rather than
# inside the string.
css = """
.gradio-container input::placeholder,
.gradio-container textarea::placeholder {
    color: #333333 !important;
}
code {
    background-color: #ffde9f;
    padding: 2px 4px;
    border-radius: 3px;
}

.gr-checkbox label span,
.gr-check-radio label span,
[data-testid="checkbox"] label span,
.checkbox-container span {
    color: #ECF2F7 !important;
}

#advanced-accordion > button,
#advanced-accordion > button span,
#advanced-accordion > div > button,
#advanced-accordion > div > button span,
#advanced-accordion .label-wrap,
#advanced-accordion .label-wrap span,
#advanced-accordion > .open,
#advanced-accordion > .open span {
    color: #FFD700 !important;
}

#voice-preset-container .gallery button,
#voice-preset-container .gr-examples button,
#voice-preset-container .examples button,
#voice-preset-container button.sample {
    background-color: #c8b8d4 !important;
    border: 1px solid #b8a8c4 !important;
    color: #1a1a1a !important;
    font-weight: 500 !important;
    margin: 4px !important;
    padding: 10px 14px !important;
    border-radius: 6px !important;
    transition: background-color 0.2s ease !important;
}

#voice-preset-container .gallery button:hover,
#voice-preset-container .gr-examples button:hover,
#voice-preset-container .examples button:hover,
#voice-preset-container button.sample:hover {
    background-color: #baadc9 !important;
    border-color: #a89ab8 !important;
}

body {
    background: none !important;
}

body::before {
    content: "";
    position: fixed;
    top: 0;
    left: 0;
    width: 100%;
    height: 100%;
    z-index: -1;
    pointer-events: none;
    background: url('https://i.postimg.cc/1smD6GPf/gradio-theme-rin2.png') center center / cover no-repeat;
}

"""

# Preset label -> instruction text describing the desired voice.
# The trailing comment on each entry links the pre-generated sample for that
# preset. The two "冷たい女性" entries intentionally share the same instruction
# text but map to different audio files.
VOICE_EXAMPLES = {
    "甘えた女の子 / ゆっくり": "かわいくて高い声の女の子が、甘えながらゆっくりのんびりしゃべってる感じの音声がほしい。", # https://huggingface.co/spaces/Respair/Voice_Design/blob/main/samples/onnanoko_amai.wav
    "激怒する女性 / 感情爆発": "低くて激しい声の女性が、感情を抑えきれずに怒りを爆発させながら、早口でまくしたてるような声で読んでほしい。", # https://huggingface.co/spaces/Respair/Voice_Design/blob/main/samples/angry.wav
    "落ち着いた男性 / 呆れ気味": "落ち着いた低めの声の男性が、相手の言動に少し呆れつつも感情を表に出さず、静かで平坦なトーンで淡々と話してるような声で読んでほしい。", # https://huggingface.co/spaces/Respair/Voice_Design/blob/main/samples/guy_cool.wav
    "Calm man / mildly exasperated (EN)": "Read this in the voice of a calm, low-pitched man who sounds mildly exasperated but keeps his emotions in check, speaking in a flat, even tone without much expression.", # no pre-generated sample is listed for this preset
    "冷たい女性 / 憎しみ (1)": "低くて冷たい声の女性が、怒りを内に秘めながら憎しみのこもった口調で、淡々と早めに話してるような声で読んでほしい。", # https://huggingface.co/spaces/Respair/Voice_Design/blob/main/samples/woman_cold_frustrated_2.wav
    "冷たい女性 / 憎しみ (2)": "低くて冷たい声の女性が、怒りを内に秘めながら憎しみのこもった口調で、淡々と早めに話してるような声で読んでほしい。", # same text, different result --> https://huggingface.co/spaces/Respair/Voice_Design/blob/main/samples/woman_cold_frustrated.wav
}

# (label, instruction) pairs in insertion order. In the visible code this is
# referenced only by the commented-out random-description button handler.
VOICE_PRESET_LIST = list(VOICE_EXAMPLES.items())

# Preset label -> relative path of its pre-generated WAV file (ship these in
# your Space repo under samples/). Keys reuse the labels of VOICE_EXAMPLES;
# the English preset has no entry here. These keys also drive the clickable
# list in the Examples tab.
PREGENERATED_AUDIO = {
    "甘えた女の子 / ゆっくり": "samples/onnanoko_amai.wav",
    "激怒する女性 / 感情爆発": "samples/angry.wav",
    "落ち着いた男性 / 呆れ気味": "samples/guy_cool.wav",
    "冷たい女性 / 憎しみ (1)": "samples/woman_cold_frustrated_2.wav",
    "冷たい女性 / 憎しみ (2)": "samples/woman_cold_frustrated.wav",
}

def load_pregenerated_to_main(label):
    """Handle a click on a preset in the Examples tab.

    Looks up the instruction text and the pre-generated WAV path for
    ``label`` and returns a 3-tuple matching the handler's outputs:

    1. an update that puts the instruction text into the Instruction box,
    2. ``(sample_rate, samples)`` for the main-tab audio player, or ``None``
       when no file is available,
    3. a status message string.
    """
    instruction = VOICE_EXAMPLES.get(label, "")
    wav_path = PREGENERATED_AUDIO.get(label)

    # Guard clause: unknown label, or the mapped file is not on disk.
    if not wav_path or not os.path.exists(wav_path):
        return (
            gr.update(value=instruction),
            None,
            f"Status: No pre-generated audio found for: {label}"
        )

    sample_rate, samples = wavfile.read(wav_path)

    # A 2-D array whose first axis has length 1 or 2 and is shorter than the
    # second is presumably laid out as (channels, frames); transpose it so the
    # short axis comes last.
    short_axis_first = (
        isinstance(samples, np.ndarray)
        and samples.ndim == 2
        and samples.shape[0] in (1, 2)
        and samples.shape[0] < samples.shape[1]
    )
    if short_axis_first:
        samples = samples.T

    return (
        gr.update(value=instruction),   # voice_desc_input
        (sample_rate, samples),         # audio_output (main tab)
        f"Status: Loaded pre-generated sample: {label}"
    )


def run_generation_pipeline_client(
    raw_text,
    voice_description,
    cfg_text,
    cfg_style,
    min_temp,
    max_temp,
    top_k,
    min_p,
    dry_multiplier,
    seed,
):
    """Send a generation request to the remote backend and normalise the reply.

    All ten arguments are forwarded positionally, followed by an empty
    string, to the ``/run_generation_pipeline`` endpoint of the module-level
    ``client`` object.

    Returns:
        A 2-tuple ``(audio, status)``. ``audio`` is ``(sample_rate, samples)``
        when the server returned usable audio and ``None`` otherwise;
        ``status`` is a human-readable message.

    The function never raises: a missing client, a failing request, or an
    unreadable reply is reported through ``status``.
    """
    # Resolve the client at call time. When the module defines no `client`
    # (or sets it to None) a bare reference would raise NameError and surface
    # as a misleading "Connection error"; report the real cause instead.
    backend = globals().get("client")
    if backend is None:
        return None, "Status: Generation backend is not available (no client configured)"

    try:
        result = backend.predict(
            raw_text,
            voice_description,
            cfg_text,
            cfg_style,
            min_temp,
            max_temp,
            top_k,
            min_p,
            dry_multiplier,
            seed,
            "",  # eleventh positional argument sent to the endpoint; its meaning is not visible here
            api_name="/run_generation_pipeline"
        )

        if result is None:
            return None, "Status: No response from server"

        # Only a two-item sequence (audio, status message) is accepted.
        if not (isinstance(result, (list, tuple)) and len(result) == 2):
            return None, "Status: Unexpected response format from server"

        audio_result, status_msg = result
        if audio_result is None:
            return None, status_msg

        if isinstance(audio_result, str) and os.path.exists(audio_result):
            # The reply is a path to an existing file: read it as WAV.
            sr, data = wavfile.read(audio_result)
        elif isinstance(audio_result, (list, tuple)) and len(audio_result) >= 2:
            # The reply is a sequence: first item is the rate, second the samples.
            sr = audio_result[0]
            data = np.array(audio_result[1]) if isinstance(audio_result[1], list) else audio_result[1]
        else:
            return None, status_msg

        # A 2-D array whose first axis has length 1 or 2 and is shorter than
        # the second is presumably (channels, frames); transpose it.
        if isinstance(data, np.ndarray) and data.ndim == 2 and data.shape[0] in (1, 2) and data.shape[0] < data.shape[1]:
            data = data.T

        return (sr, data), status_msg

    except Exception as e:
        # UI boundary: convert any failure into a status message rather than
        # letting the handler raise.
        return None, f"Status: Connection error: {str(e)}"


with gr.Blocks(theme="Respair/Shiki@10.1.0", css=css) as demo:
    # Notice banner rendered above the tabs. It tells visitors that the demo
    # is closed and points them to the Examples tab.
    gr.Markdown(
    """
    <div style="text-align: left;">
    Demo is closed until further notice; thank you for using it. Feel free to check the pre-generated samples at the <code>Examples</code> tab. <br>
    </div>
    """
    )
    with gr.Tabs():

        with gr.TabItem("Speech Generation"):
            # Main tab. Left column: text, instruction, the collapsible
            # parameter panel and the Generate button. Right column: status
            # box and audio player.
            with gr.Row():
                with gr.Column(scale=2):
                    text_input = gr.Textbox(
                        label="Text",
                        lines=5,
                        max_length=125,
                        value="準備もできましたけど、いきなり本題に入ると分かりにくいかもしれないので、まずは今日やることを短く整理して、手順を一つずつ確認しながら進めていきますね。途中で気になるところがあったら、その都度止めて大丈夫です。",
                    )

                    with gr.Column(elem_id="voice-desc-wrap"):
                        voice_desc_input = gr.Textbox(
                            label="Instruction",
                            value="低くて激しい声の女性が、感情を抑えきれずに怒りを爆発させながら、早口でまくしたてるような声で読んでほしい。",
                            lines=2,
                        )
                    with gr.Row(equal_height=False):
                        # elem_id hooks this panel up to the
                        # `#advanced-accordion` selectors in the `css` string;
                        # without it those colour rules match nothing.
                        with gr.Accordion(
                            "----------------------------------⭐ 🛠️ ⭐",
                            open=False,
                            elem_id="advanced-accordion",
                        ):

                            seed_slider = gr.Slider(
                                label="Seed (-1 for random)", minimum=-1, maximum=2700000000, value=2700000000, step=1
                            )
                            gr.Markdown('<h3 style="color: #FFD700;">Style / CFG Parameters</h3>')
                            cfg_text_slider = gr.Slider(
                                label="CFG Text", minimum=0.5, maximum=3.0, value=1.15, step=0.05,
                            )
                            cfg_style_slider = gr.Slider(
                                label="CFG Style",
                                minimum=0.5, maximum=3.0, value=1.2, step=0.1,
                            )
                            gr.Markdown('<h3 style="color: #FFD700;">Sampling Parameters</h3>')
                            min_temp_slider = gr.Slider(
                                label="Min Temperature (adaptive)", minimum=0.0, maximum=2.0, value=0.25, step=0.05,
                            )
                            max_temp_slider = gr.Slider(
                                label="Max Temperature (adaptive)", minimum=0.0, maximum=2.0, value=1.0, step=0.05,
                            )
                            top_k_slider = gr.Slider(
                                label="Top K (0 = off)", minimum=0, maximum=200, value=0, step=5,
                            )
                            min_p_slider = gr.Slider(
                                label="Min P (0 = off)", minimum=0.0, maximum=1.0, value=0.0, step=0.01,
                            )

                            gr.Markdown('<h3 style="color: #FFD700;">Repetition Control</h3>')

                            dry_multiplier_slider = gr.Slider(
                                label="DRY Multiplier (0 = off)", minimum=0.0, maximum=5.0, value=0.8, step=0.1,
                            )

                            # gr.Markdown('<h3 style="color: #FFD700;">Other</h3>')

                        with gr.Column(scale=1):
                            generate_button = gr.Button("🎤 Generate", variant="primary", size="lg")

                with gr.Column(scale=1):
                    # These two components are also targeted by the Examples
                    # tab and by the page-load handler.
                    status_output = gr.Textbox(label="Status", interactive=False)
                    audio_output = gr.Audio(
                        label="Generated Speech",
                        interactive=False
                    )

            # random_desc_button.click(
            #     fn=lambda: random.choice(VOICE_PRESET_LIST)[1],
            #     inputs=[],
            #     outputs=[voice_desc_input],
            # )

            # The input order must match the parameter order of
            # run_generation_pipeline_client.
            generate_button.click(
                fn=run_generation_pipeline_client,
                inputs=[
                    text_input,
                    voice_desc_input,
                    cfg_text_slider,
                    cfg_style_slider,
                    min_temp_slider,
                    max_temp_slider,
                    top_k_slider,
                    min_p_slider,
                    dry_multiplier_slider,
                    seed_slider,
                ],
                outputs=[audio_output, status_output],
                concurrency_limit=4,
            )

        with gr.TabItem("Examples"):
            # Bilingual hint explaining what a click on a preset does.
            gr.HTML("""
            <div style="background-color: rgba(255, 255, 255, 0.025); padding: 20px; border-radius: 12px; backdrop-filter: blur(10px); box-shadow: 0 4px 6px rgba(0,0,0,0.5); margin-top: 8px;">
                <p style="color: #1a1a1a; font-weight: 500; line-height: 1.6; font-size: 14px; text-align: center; margin: 0;">
                クリックするとメインタブの音声プレイヤーにプリジェネ音声がロードされます。 / Click a preset to load the pre-generated audio into the main tab player.
                </p>
            </div>
            """)

            with gr.Row():
                with gr.Column(elem_id="voice-preset-container", scale=1):
                    # Small heading card above the preset list.
                    gr.HTML("""
                    <div style="background-color: rgba(255, 255, 255, 0.55); padding: 8px 12px; border-radius: 8px; backdrop-filter: blur(10px); box-shadow: 0 2px 4px rgba(0,0,0,0.08); text-align: center; max-width: 220px; margin: 0 auto 12px auto;">
                        <h3 style="color: #000000; margin: 0; font-size: 16px;">Examples</h3>
                    </div>
                    """)
                    # Invisible textbox that receives the clicked preset label
                    # and feeds it to the handler.
                    example_label_holder = gr.Textbox(visible=False)

                    # One single-column row per preset that has a WAV file.
                    # The handler writes to components of the main tab.
                    gr.Examples(
                        label="Click to load a pre-generated sample",
                        examples=[[preset_name] for preset_name in PREGENERATED_AUDIO],
                        examples_per_page=10,
                        inputs=[example_label_holder],
                        fn=load_pregenerated_to_main,
                        outputs=[voice_desc_input, audio_output, status_output],
                        run_on_click=True,
                        cache_examples=False,
                    )

        with gr.TabItem("Info"):
            # Static information tab: a title followed by one card holding a
            # Japanese and an English description side by side (flex layout).
            gr.HTML('<h1 style="text-align: center;">🌸 Takane - Voice Design 🎨 </h1>')

            # Both columns describe the model backbone and give advice on the
            # "CFG Style" setting; the English column also notes that the
            # model is Japanese-only.
            gr.HTML("""
            <div style="background-color: rgba(255, 255, 255, 0.525); padding: 30px; border-radius: 12px; backdrop-filter: blur(5px); max-width: 100%; box-shadow: 0 4px 6px rgba(0,0,0,0.5);">
            <div style="display: flex; gap: 24px; flex-wrap: wrap; justify-content: center;">
                
                <div style="flex: 1; min-width: 280px;">
                <h3 style="color: #000000; margin: 0 0 12px 0; font-size: 20px; text-align: center;">日本語</h3>
                <p style="color: #1a1a1a; font-weight: 500; line-height: 1.8; font-size: 16px; margin: 0; text-align: center;">
                    本モデルのバックボーンは
                    <a href="https://huggingface.co/spaces/Respair/Takane" target="_blank" rel="noopener noreferrer"
                    style="color: #b45309; text-decoration: none; font-weight: 600;">
                    Takane
                    </a>
                    を改良したもので、ネイティブ 44.1kHz コーデックを備えた完全自回帰のエンコーダ・デコーダ型 Transformer です。<br><br>
                    <strong>CFG Style</strong> を上げると指示への追従が強くなりますが、上げすぎると過剰な条件付け（over-conditioning）が起きて音質が劣化する場合があります。
                </p>
                </div>

                <div style="flex: 1; min-width: 280px;">
                <h3 style="color: #000000; margin: 0 0 12px 0; font-size: 20px; text-align: center;">English</h3>
                <p style="color: #1a1a1a; font-weight: 500; line-height: 1.8; font-size: 16px; margin: 0; text-align: center;">
                    The backbone is a modified version of
                    <a href="https://huggingface.co/spaces/Respair/Takane" target="_blank" rel="noopener noreferrer"
                    style="color: #b45309; text-decoration: none; font-weight: 600;">
                    Takane
                    </a>,
                    a fully autoregressive encoder-decoder transformer with a native 44.1khz codec.<br><br>
                    Raise <strong>CFG Style</strong> if you want stronger adherence; pushing it too high can cause over-conditioning and degrade quality. <br><br>
                    <code>This model is only in Japanese</code>, if you enjoy anime, this is yours to play with.
                </p>
                </div>

            </div>
            </div>
            """)

    def load_default():
        """Populate the main tab with a fixed preset when the page loads.

        Returns a 3-tuple for (instruction box, audio player, status box):
        an update carrying the preset's instruction text, the decoded
        ``(sample_rate, samples)`` pair or ``None`` if the file is missing,
        and an update carrying a status message.
        """
        default_label = "激怒する女性 / 感情爆発"
        instruction = VOICE_EXAMPLES.get(default_label, "")
        wav_path = PREGENERATED_AUDIO.get(default_label)

        # Guard clause: no path mapped, or the file is not on disk.
        if not (wav_path and os.path.exists(wav_path)):
            return (
                gr.update(value=instruction),
                None,
                gr.update(value=f"Status: Default sample missing: {default_label}"),
            )

        sample_rate, samples = wavfile.read(wav_path)

        # A 2-D array whose first axis has length 1 or 2 and is shorter than
        # the second is presumably (channels, frames); transpose it.
        short_axis_first = (
            isinstance(samples, np.ndarray)
            and samples.ndim == 2
            and samples.shape[0] in (1, 2)
            and samples.shape[0] < samples.shape[1]
        )
        if short_axis_first:
            samples = samples.T

        return (
            gr.update(value=instruction),
            (sample_rate, samples),
            gr.update(value=f"Status: Loaded default sample: {default_label}"),
        )

    # Run load_default on page load and route its three results to the
    # instruction box, the audio player and the status box.
    demo.load(
        fn=load_default,
        outputs=[voice_desc_input, audio_output, status_output],
        inputs=None,
    )

if __name__ == "__main__":
    # Script entry point: configure the queue (api_open=False, max_size=15),
    # then launch whatever object queue() hands back.
    queued_app = demo.queue(max_size=15, api_open=False)
    queued_app.launch()