"""Gradio front-end for the Takane voice-design demo.

The UI forwards generation requests to a remote backend Space through
``gradio_client``. While the demo is closed the backend is disconnected
(``client`` is None) and only the pre-generated samples on the Examples
tab are available.
"""

import gradio as gr
from gradio_client import Client
import os
import random
import numpy as np
import scipy.io.wavfile as wavfile

# Remote generation backend. The demo is currently closed, so no connection
# is attempted and ``client`` stays None; run_generation_pipeline_client()
# reports this explicitly instead of raising NameError. Restore the block
# below (``src`` env var or a local server) to re-enable live generation.
client = None
# try:
#     client = Client(os.environ['src'])
# except Exception:
#     client = Client("http://localhost:7861/")

css = """
.gradio-container input::placeholder,
.gradio-container textarea::placeholder {
    color: #333333 !important;
}
code {
    background-color: #ffde9f;
    padding: 2px 4px;
    border-radius: 3px;
}
.gr-checkbox label span,
.gr-check-radio label span,
[data-testid="checkbox"] label span,
.checkbox-container span {
    color: #ECF2F7 !important;
}
#advanced-accordion > button,
#advanced-accordion > button span,
#advanced-accordion > div > button,
#advanced-accordion > div > button span,
#advanced-accordion .label-wrap,
#advanced-accordion .label-wrap span,
#advanced-accordion > .open,
#advanced-accordion > .open span {
    color: #FFD700 !important;
}
#voice-preset-container .gallery button,
#voice-preset-container .gr-examples button,
#voice-preset-container .examples button,
#voice-preset-container button.sample {
    background-color: #c8b8d4 !important;
    border: 1px solid #b8a8c4 !important;
    color: #1a1a1a !important;
    font-weight: 500 !important;
    margin: 4px !important;
    padding: 10px 14px !important;
    border-radius: 6px !important;
    transition: background-color 0.2s ease !important;
}
#voice-preset-container .gallery button:hover,
#voice-preset-container .gr-examples button:hover,
#voice-preset-container .examples button:hover,
#voice-preset-container button.sample:hover {
    background-color: #baadc9 !important;
    border-color: #a89ab8 !important;
}
body {
    background: none !important;
}
body::before {
    content: "";
    position: fixed;
    top: 0;
    left: 0;
    width: 100%;
    height: 100%;
    z-index: -1;
    pointer-events: none;
    background: url('https://i.postimg.cc/1smD6GPf/gradio-theme-rin2.png') center center / cover no-repeat;
}
"""

# Preset label -> instruction text fed to the model. The trailing comments
# point at the matching pre-generated sample on the Hugging Face Space.
VOICE_EXAMPLES = {
    "甘えた女の子 / ゆっくり": "かわいくて高い声の女の子が、甘えながらゆっくりのんびりしゃべってる感じの音声がほしい。",  # https://huggingface.co/spaces/Respair/Voice_Design/blob/main/samples/onnanoko_amai.wav
    "激怒する女性 / 感情爆発": "低くて激しい声の女性が、感情を抑えきれずに怒りを爆発させながら、早口でまくしたてるような声で読んでほしい。",  # https://huggingface.co/spaces/Respair/Voice_Design/blob/main/samples/angry.wav
    "落ち着いた男性 / 呆れ気味": "落ち着いた低めの声の男性が、相手の言動に少し呆れつつも感情を表に出さず、静かで平坦なトーンで淡々と話してるような声で読んでほしい。",  # https://huggingface.co/spaces/Respair/Voice_Design/blob/main/samples/guy_cool.wav
    "Calm man / mildly exasperated (EN)": "Read this in the voice of a calm, low-pitched man who sounds mildly exasperated but keeps his emotions in check, speaking in a flat, even tone without much expression.",  # Nothing
    "冷たい女性 / 憎しみ (1)": "低くて冷たい声の女性が、怒りを内に秘めながら憎しみのこもった口調で、淡々と早めに話してるような声で読んでほしい。",  # https://huggingface.co/spaces/Respair/Voice_Design/blob/main/samples/woman_cold_frustrated_2.wav
    "冷たい女性 / 憎しみ (2)": "低くて冷たい声の女性が、怒りを内に秘めながら憎しみのこもった口調で、淡々と早めに話してるような声で読んでほしい。",  # same text, different result --> https://huggingface.co/spaces/Respair/Voice_Design/blob/main/samples/woman_cold_frustrated.wav
}

VOICE_PRESET_LIST = list(VOICE_EXAMPLES.items())

# label -> local file path (ship these in your Space repo under samples/)
PREGENERATED_AUDIO = {
    "甘えた女の子 / ゆっくり": "samples/onnanoko_amai.wav",
    "激怒する女性 / 感情爆発": "samples/angry.wav",
    "落ち着いた男性 / 呆れ気味": "samples/guy_cool.wav",
    "冷たい女性 / 憎しみ (1)": "samples/woman_cold_frustrated_2.wav",
    "冷たい女性 / 憎しみ (2)": "samples/woman_cold_frustrated.wav",
}


def _channels_last(data):
    """Return wav data with a samples-first layout for gr.Audio.

    Some files come back as (channels, samples); detect that shape (1 or 2
    rows, wider than tall) and transpose it. Anything else passes through
    unchanged.
    """
    if (
        isinstance(data, np.ndarray)
        and data.ndim == 2
        and data.shape[0] in (1, 2)
        and data.shape[0] < data.shape[1]
    ):
        return data.T
    return data


def _load_sample(label):
    """Read the pre-generated wav for *label*.

    Returns ``(description, audio, found)`` where *audio* is a
    ``(sample_rate, np.ndarray)`` tuple, or ``None`` when the file is
    missing.
    """
    desc = VOICE_EXAMPLES.get(label, "")
    path = PREGENERATED_AUDIO.get(label)
    if path and os.path.exists(path):
        sr, data = wavfile.read(path)
        return desc, (sr, _channels_last(data)), True
    return desc, None, False


def load_pregenerated_to_main(label):
    """
    Click handler from Examples tab: loads instruction text into the
    Instruction box (optional) and loads the pre-generated WAV into the
    MAIN tab audio_output.
    """
    desc, audio, found = _load_sample(label)
    if found:
        return (
            gr.update(value=desc),  # voice_desc_input
            audio,                  # audio_output (MAIN TAB)
            f"Status: Loaded pre-generated sample: {label}",
        )
    return (
        gr.update(value=desc),
        None,
        f"Status: No pre-generated audio found for: {label}",
    )


def run_generation_pipeline_client(
    raw_text,
    voice_description,
    cfg_text,
    cfg_style,
    min_temp,
    max_temp,
    top_k,
    min_p,
    dry_multiplier,
    seed,
):
    """Proxy one generation request to the backend Space.

    Returns a ``(audio, status)`` pair matching the Generate button's
    outputs wiring: *audio* is ``(sample_rate, np.ndarray)`` on success,
    ``None`` on any failure, with a human-readable status string.
    """
    if client is None:
        # Backend intentionally disconnected (demo closed) — fail with a
        # clear message instead of a NameError caught by the except below.
        return None, "Status: Demo is closed; the generation backend is not connected."
    try:
        result = client.predict(
            raw_text,
            voice_description,
            cfg_text,
            cfg_style,
            min_temp,
            max_temp,
            top_k,
            min_p,
            dry_multiplier,
            seed,
            "",
            api_name="/run_generation_pipeline",
        )
        if result is None:
            return None, "Status: No response from server"
        if isinstance(result, (list, tuple)) and len(result) == 2:
            audio_result, status_msg = result
            if audio_result is not None:
                # The backend may answer with a file path or an inline
                # (sample_rate, samples) pair; normalize both forms.
                if isinstance(audio_result, str) and os.path.exists(audio_result):
                    sr, data = wavfile.read(audio_result)
                elif isinstance(audio_result, (list, tuple)) and len(audio_result) >= 2:
                    sr = audio_result[0]
                    data = (
                        np.array(audio_result[1])
                        if isinstance(audio_result[1], list)
                        else audio_result[1]
                    )
                else:
                    return None, status_msg
                return (sr, _channels_last(data)), status_msg
            return None, status_msg
        return None, "Status: Unexpected response format from server"
    except Exception as e:
        return None, f"Status: Connection error: {str(e)}"


def load_default():
    """Populate the main tab with a default preset when the page loads."""
    label = "激怒する女性 / 感情爆発"
    desc, audio, found = _load_sample(label)
    if found:
        return (
            gr.update(value=desc),
            audio,
            gr.update(value=f"Status: Loaded default sample: {label}"),
        )
    return (
        gr.update(value=desc),
        None,
        gr.update(value=f"Status: Default sample missing: {label}"),
    )


with gr.Blocks(theme="Respair/Shiki@10.1.0", css=css) as demo:
    gr.Markdown(
        """
Demo is closed until further notice; thank you for using it. Feel free to check the pre-generated samples at the Examples tab.
"""
    )

    with gr.Tabs():
        with gr.TabItem("Speech Generation"):
            with gr.Row():
                with gr.Column(scale=2):
                    text_input = gr.Textbox(
                        label="Text",
                        lines=5,
                        max_length=125,
                        value="準備もできましたけど、いきなり本題に入ると分かりにくいかもしれないので、まずは今日やることを短く整理して、手順を一つずつ確認しながら進めていきますね。途中で気になるところがあったら、その都度止めて大丈夫です。",
                    )
                    with gr.Column(elem_id="voice-desc-wrap"):
                        voice_desc_input = gr.Textbox(
                            label="Instruction",
                            value="低くて激しい声の女性が、感情を抑えきれずに怒りを爆発させながら、早口でまくしたてるような声で読んでほしい。",
                            lines=2,
                        )
                    with gr.Row(equal_height=False):
                        with gr.Accordion("----------------------------------⭐ 🛠️ ⭐", open=False):
                            seed_slider = gr.Slider(
                                label="Seed (-1 for random)",
                                minimum=-1,
                                maximum=2700000000,
                                value=2700000000,
                                step=1,
                            )
                            gr.Markdown("**Style / CFG Parameters**")
                            cfg_text_slider = gr.Slider(
                                label="CFG Text",
                                minimum=0.5,
                                maximum=3.0,
                                value=1.15,
                                step=0.05,
                            )
                            cfg_style_slider = gr.Slider(
                                label="CFG Style",
                                minimum=0.5,
                                maximum=3.0,
                                value=1.2,
                                step=0.1,
                            )
                            gr.Markdown("**Sampling Parameters**")
                            min_temp_slider = gr.Slider(
                                label="Min Temperature (adaptive)",
                                minimum=0.0,
                                maximum=2.0,
                                value=0.25,
                                step=0.05,
                            )
                            max_temp_slider = gr.Slider(
                                label="Max Temperature (adaptive)",
                                minimum=0.0,
                                maximum=2.0,
                                value=1.0,
                                step=0.05,
                            )
                            top_k_slider = gr.Slider(
                                label="Top K (0 = off)",
                                minimum=0,
                                maximum=200,
                                value=0,
                                step=5,
                            )
                            min_p_slider = gr.Slider(
                                label="Min P (0 = off)",
                                minimum=0.0,
                                maximum=1.0,
                                value=0.0,
                                step=0.01,
                            )
                            gr.Markdown("**Repetition Control**")
                            dry_multiplier_slider = gr.Slider(
                                label="DRY Multiplier (0 = off)",
                                minimum=0.0,
                                maximum=5.0,
                                value=0.8,
                                step=0.1,
                            )
                with gr.Column(scale=1):
                    generate_button = gr.Button("🎤 Generate", variant="primary", size="lg")
                with gr.Column(scale=1):
                    status_output = gr.Textbox(label="Status", interactive=False)
                    audio_output = gr.Audio(
                        label="Generated Speech",
                        interactive=False,
                    )

            generate_button.click(
                fn=run_generation_pipeline_client,
                inputs=[
                    text_input,
                    voice_desc_input,
                    cfg_text_slider,
                    cfg_style_slider,
                    min_temp_slider,
                    max_temp_slider,
                    top_k_slider,
                    min_p_slider,
                    dry_multiplier_slider,
                    seed_slider,
                ],
                outputs=[audio_output, status_output],
                concurrency_limit=4,
            )

        with gr.TabItem("Examples"):
            gr.HTML(
                """
クリックするとメインタブの音声プレイヤーにプリジェネ音声がロードされます。 / Click a preset to load the pre-generated audio into the main tab player.
"""
            )
            with gr.Row():
                with gr.Column(scale=1, elem_id="voice-preset-container"):
                    gr.HTML(
                        """
Examples
"""
                    )
                    # Hidden textbox that carries the clicked preset label into
                    # load_pregenerated_to_main via run_on_click.
                    example_label_holder = gr.Textbox(visible=False)
                    gr.Examples(
                        examples=[[label] for label in PREGENERATED_AUDIO.keys()],
                        inputs=[example_label_holder],
                        outputs=[voice_desc_input, audio_output, status_output],  # <-- MAIN TAB outputs
                        fn=load_pregenerated_to_main,
                        label="Click to load a pre-generated sample",
                        cache_examples=False,
                        run_on_click=True,
                        examples_per_page=10,
                    )

        with gr.TabItem("Info"):
            gr.HTML(
                """
🌸 Takane - Voice Design 🎨
"""
            )
            gr.HTML(
                """
日本語

本モデルのバックボーンは Takane を改良したもので、ネイティブ 44.1kHz コーデックを備えた完全自回帰のエンコーダ・デコーダ型 Transformer です。

CFG Style を上げると指示への追従が強くなりますが、上げすぎると過剰な条件付け(over-conditioning)が起きて音質が劣化する場合があります。

English

The backbone is a modified version of Takane , a fully autoregressive encoder-decoder transformer with a native 44.1khz codec.

Raise CFG Style if you want stronger adherence; pushing it too high can cause over-conditioning and degrade quality.

This model is only in Japanese, if you enjoy anime, this is yours to play with.
"""
            )

    # Event wiring must live inside the Blocks context.
    demo.load(
        fn=load_default,
        inputs=None,
        outputs=[voice_desc_input, audio_output, status_output],
    )


if __name__ == "__main__":
    demo.queue(api_open=False, max_size=15).launch()