import gradio as gr from gradio_client import Client import os import random import numpy as np import scipy.io.wavfile as wavfile # try: # client = Client(os.environ['src']) # except: # client = Client("http://localhost:7861/") css = """ .gradio-container input::placeholder, .gradio-container textarea::placeholder { color: #333333 !important; } code { background-color: #ffde9f; padding: 2px 4px; border-radius: 3px; } .gr-checkbox label span, .gr-check-radio label span, [data-testid="checkbox"] label span, .checkbox-container span { color: #ECF2F7 !important; } #advanced-accordion > button, #advanced-accordion > button span, #advanced-accordion > div > button, #advanced-accordion > div > button span, #advanced-accordion .label-wrap, #advanced-accordion .label-wrap span, #advanced-accordion > .open, #advanced-accordion > .open span { color: #FFD700 !important; } #voice-preset-container .gallery button, #voice-preset-container .gr-examples button, #voice-preset-container .examples button, #voice-preset-container button.sample { background-color: #c8b8d4 !important; border: 1px solid #b8a8c4 !important; color: #1a1a1a !important; font-weight: 500 !important; margin: 4px !important; padding: 10px 14px !important; border-radius: 6px !important; transition: background-color 0.2s ease !important; } #voice-preset-container .gallery button:hover, #voice-preset-container .gr-examples button:hover, #voice-preset-container .examples button:hover, #voice-preset-container button.sample:hover { background-color: #baadc9 !important; border-color: #a89ab8 !important; } body { background: none !important; } body::before { content: ""; position: fixed; top: 0; left: 0; width: 100%; height: 100%; z-index: -1; pointer-events: none; background: url('https://i.postimg.cc/1smD6GPf/gradio-theme-rin2.png') center center / cover no-repeat; } """ VOICE_EXAMPLES = { "甘えた女の子 / ゆっくり": "かわいくて高い声の女の子が、甘えながらゆっくりのんびりしゃべってる感じの音声がほしい。", # 
https://huggingface.co/spaces/Respair/Voice_Design/blob/main/samples/onnanoko_amai.wav "激怒する女性 / 感情爆発": "低くて激しい声の女性が、感情を抑えきれずに怒りを爆発させながら、早口でまくしたてるような声で読んでほしい。", # https://huggingface.co/spaces/Respair/Voice_Design/blob/main/samples/angry.wav "落ち着いた男性 / 呆れ気味": "落ち着いた低めの声の男性が、相手の言動に少し呆れつつも感情を表に出さず、静かで平坦なトーンで淡々と話してるような声で読んでほしい。", # https://huggingface.co/spaces/Respair/Voice_Design/blob/main/samples/guy_cool.wav "Calm man / mildly exasperated (EN)": "Read this in the voice of a calm, low-pitched man who sounds mildly exasperated but keeps his emotions in check, speaking in a flat, even tone without much expression.", # Nothing "冷たい女性 / 憎しみ (1)": "低くて冷たい声の女性が、怒りを内に秘めながら憎しみのこもった口調で、淡々と早めに話してるような声で読んでほしい。", # https://huggingface.co/spaces/Respair/Voice_Design/blob/main/samples/woman_cold_frustrated_2.wav "冷たい女性 / 憎しみ (2)": "低くて冷たい声の女性が、怒りを内に秘めながら憎しみのこもった口調で、淡々と早めに話してるような声で読んでほしい。", # same text different result --> https://huggingface.co/spaces/Respair/Voice_Design/blob/main/samples/woman_cold_frustrated.wav } VOICE_PRESET_LIST = list(VOICE_EXAMPLES.items()) # label -> local file path (ship these in your Space repo under samples/) PREGENERATED_AUDIO = { "甘えた女の子 / ゆっくり": "samples/onnanoko_amai.wav", "激怒する女性 / 感情爆発": "samples/angry.wav", "落ち着いた男性 / 呆れ気味": "samples/guy_cool.wav", "冷たい女性 / 憎しみ (1)": "samples/woman_cold_frustrated_2.wav", "冷たい女性 / 憎しみ (2)": "samples/woman_cold_frustrated.wav", } def load_pregenerated_to_main(label): """ Click handler from Examples tab: loads instruction text into the Instruction box (optional) and loads the pre-generated WAV into the MAIN tab audio_output. 
""" desc = VOICE_EXAMPLES.get(label, "") path = PREGENERATED_AUDIO.get(label) if path and os.path.exists(path): sr, data = wavfile.read(path) if isinstance(data, np.ndarray) and data.ndim == 2 and data.shape[0] in (1, 2) and data.shape[0] < data.shape[1]: data = data.T return ( gr.update(value=desc), # voice_desc_input (sr, data), # audio_output (MAIN TAB) f"Status: Loaded pre-generated sample: {label}" ) return ( gr.update(value=desc), None, f"Status: No pre-generated audio found for: {label}" ) def run_generation_pipeline_client( raw_text, voice_description, cfg_text, cfg_style, min_temp, max_temp, top_k, min_p, dry_multiplier, seed, ): try: result = client.predict( raw_text, voice_description, cfg_text, cfg_style, min_temp, max_temp, top_k, min_p, dry_multiplier, seed, "", api_name="/run_generation_pipeline" ) if result is None: return None, "Status: No response from server" if isinstance(result, (list, tuple)) and len(result) == 2: audio_result, status_msg = result if audio_result is not None: if isinstance(audio_result, str) and os.path.exists(audio_result): sr, data = wavfile.read(audio_result) elif isinstance(audio_result, (list, tuple)) and len(audio_result) >= 2: sr = audio_result[0] data = np.array(audio_result[1]) if isinstance(audio_result[1], list) else audio_result[1] else: return None, status_msg if isinstance(data, np.ndarray) and data.ndim == 2 and data.shape[0] in (1, 2) and data.shape[0] < data.shape[1]: data = data.T return (sr, data), status_msg return None, status_msg return None, "Status: Unexpected response format from server" except Exception as e: return None, f"Status: Connection error: {str(e)}" with gr.Blocks(theme="Respair/Shiki@10.1.0", css=css) as demo: gr.Markdown( """
Examples tab. クリックするとメインタブの音声プレイヤーにプリジェネ音声がロードされます。 / Click a preset to load the pre-generated audio into the main tab player.
本モデルのバックボーンは
Takane
を改良したもので、ネイティブ 44.1kHz コーデックを備えた完全自回帰のエンコーダ・デコーダ型 Transformer です。
CFG Style を上げると指示への追従が強くなりますが、上げすぎると過剰な条件付け(over-conditioning)が起きて音質が劣化する場合があります。
The backbone is a modified version of
Takane
,
a fully autoregressive encoder-decoder transformer with a native 44.1 kHz codec.
Raise CFG Style if you want stronger adherence; pushing it too high can cause over-conditioning and degrade quality.
This model supports Japanese only; if you enjoy anime, this is yours to play with.