import gradio as gr
from gradio_client import Client
import os
import random
import numpy as np
import scipy.io.wavfile as wavfile
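# The remote backend connection below is currently disabled, so no module-level
# `client` exists and run_generation_pipeline_client can only return an error status.
# try:
# client = Client(os.environ['src'])
# except:
# client = Client("http://localhost:7861/")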
# Custom stylesheet handed to gr.Blocks(css=...). It recolours input placeholders,
# inline <code> spans, checkbox labels and the example buttons inside
# #voice-preset-container, and paints a fixed full-page background image behind the app.
# NOTE(review): the #advanced-accordion rules only match a component that is given
# elem_id="advanced-accordion"; no component in the visible file sets that id — confirm.
css = """
.gradio-container input::placeholder,
.gradio-container textarea::placeholder {
color: #333333 !important;
}
code {
background-color: #ffde9f;
padding: 2px 4px;
border-radius: 3px;
}
.gr-checkbox label span,
.gr-check-radio label span,
[data-testid="checkbox"] label span,
.checkbox-container span {
color: #ECF2F7 !important;
}
#advanced-accordion > button,
#advanced-accordion > button span,
#advanced-accordion > div > button,
#advanced-accordion > div > button span,
#advanced-accordion .label-wrap,
#advanced-accordion .label-wrap span,
#advanced-accordion > .open,
#advanced-accordion > .open span {
color: #FFD700 !important;
}
#voice-preset-container .gallery button,
#voice-preset-container .gr-examples button,
#voice-preset-container .examples button,
#voice-preset-container button.sample {
background-color: #c8b8d4 !important;
border: 1px solid #b8a8c4 !important;
color: #1a1a1a !important;
font-weight: 500 !important;
margin: 4px !important;
padding: 10px 14px !important;
border-radius: 6px !important;
transition: background-color 0.2s ease !important;
}
#voice-preset-container .gallery button:hover,
#voice-preset-container .gr-examples button:hover,
#voice-preset-container .examples button:hover,
#voice-preset-container button.sample:hover {
background-color: #baadc9 !important;
border-color: #a89ab8 !important;
}
body {
background: none !important;
}
body::before {
content: "";
position: fixed;
top: 0;
left: 0;
width: 100%;
height: 100%;
z-index: -1;
pointer-events: none;
background: url('https://i.postimg.cc/1smD6GPf/gradio-theme-rin2.png') center center / cover no-repeat;
}
"""
# Preset label -> instruction text describing the target voice.
# The trailing comment on each entry links to its pre-generated sample, where one exists.
VOICE_EXAMPLES = {
"甘えた女の子 / ゆっくり": "かわいくて高い声の女の子が、甘えながらゆっくりのんびりしゃべってる感じの音声がほしい。", # https://huggingface.co/spaces/Respair/Voice_Design/blob/main/samples/onnanoko_amai.wav
"激怒する女性 / 感情爆発": "低くて激しい声の女性が、感情を抑えきれずに怒りを爆発させながら、早口でまくしたてるような声で読んでほしい。", # https://huggingface.co/spaces/Respair/Voice_Design/blob/main/samples/angry.wav
"落ち着いた男性 / 呆れ気味": "落ち着いた低めの声の男性が、相手の言動に少し呆れつつも感情を表に出さず、静かで平坦なトーンで淡々と話してるような声で読んでほしい。", # https://huggingface.co/spaces/Respair/Voice_Design/blob/main/samples/guy_cool.wav
"Calm man / mildly exasperated (EN)": "Read this in the voice of a calm, low-pitched man who sounds mildly exasperated but keeps his emotions in check, speaking in a flat, even tone without much expression.", # no pre-generated sample for this entry (it has no key in PREGENERATED_AUDIO)
"冷たい女性 / 憎しみ (1)": "低くて冷たい声の女性が、怒りを内に秘めながら憎しみのこもった口調で、淡々と早めに話してるような声で読んでほしい。", # https://huggingface.co/spaces/Respair/Voice_Design/blob/main/samples/woman_cold_frustrated_2.wav
"冷たい女性 / 憎しみ (2)": "低くて冷たい声の女性が、怒りを内に秘めながら憎しみのこもった口調で、淡々と早めに話してるような声で読んでほしい。", # same instruction text as (1), different generated result --> https://huggingface.co/spaces/Respair/Voice_Design/blob/main/samples/woman_cold_frustrated.wav
}
# (label, instruction) pairs in insertion order; in the visible file this is only
# referenced from the commented-out random-description button handler.
VOICE_PRESET_LIST = list(VOICE_EXAMPLES.items())
# label -> local file path (ship these in your Space repo under samples/)
# The keys of this dict are what the Examples tab lists as clickable presets.
PREGENERATED_AUDIO = {
"甘えた女の子 / ゆっくり": "samples/onnanoko_amai.wav",
"激怒する女性 / 感情爆発": "samples/angry.wav",
"落ち着いた男性 / 呆れ気味": "samples/guy_cool.wav",
"冷たい女性 / 憎しみ (1)": "samples/woman_cold_frustrated_2.wav",
"冷たい女性 / 憎しみ (2)": "samples/woman_cold_frustrated.wav",
}
def load_pregenerated_to_main(label):
    """Load a preset's instruction text and pre-generated WAV into the main tab.

    Click handler for the Examples tab.

    Args:
        label: Preset label; looked up in ``VOICE_EXAMPLES`` (instruction text)
            and ``PREGENERATED_AUDIO`` (WAV path).

    Returns:
        A 3-tuple matching the handler's outputs:
        an update for the Instruction textbox (empty text for unknown labels),
        ``(sample_rate, samples)`` for the audio player or ``None`` when no
        audio could be loaded, and a status message string.
    """
    desc = VOICE_EXAMPLES.get(label, "")
    path = PREGENERATED_AUDIO.get(label)

    # Unknown label, or the sample file is not shipped with the app.
    if not path or not os.path.exists(path):
        return (
            gr.update(value=desc),
            None,
            f"Status: No pre-generated audio found for: {label}",
        )

    # A corrupt or unsupported WAV must not crash the click handler; report it
    # through the status box like every other failure in this module.
    try:
        sr, data = wavfile.read(path)
    except (OSError, ValueError) as exc:
        return (
            gr.update(value=desc),
            None,
            f"Status: Could not read pre-generated sample for {label}: {exc}",
        )

    # A 2-D array whose first axis has length 1 or 2 and is the shorter one is
    # presumably channels-first; transpose so samples run along axis 0.
    if (
        isinstance(data, np.ndarray)
        and data.ndim == 2
        and data.shape[0] in (1, 2)
        and data.shape[0] < data.shape[1]
    ):
        data = data.T

    return (
        gr.update(value=desc),  # voice_desc_input
        (sr, data),  # audio_output (MAIN TAB)
        f"Status: Loaded pre-generated sample: {label}",
    )
def run_generation_pipeline_client(
    raw_text,
    voice_description,
    cfg_text,
    cfg_style,
    min_temp,
    max_temp,
    top_k,
    min_p,
    dry_multiplier,
    seed,
):
    """Forward a generation request to the remote backend and decode its reply.

    Every argument is passed through positionally, unchanged, to the
    ``/run_generation_pipeline`` endpoint of the module-level ``client``.

    Returns:
        A ``(audio, status)`` pair. ``audio`` is ``(sample_rate, samples)`` when
        the backend returned usable audio and ``None`` otherwise; ``status`` is
        a message string (the backend's own message when one was returned).
        Ordinary exceptions are converted into a status message rather than
        propagated.
    """
    # The code that creates ``client`` is commented out in this file, so look the
    # name up dynamically and report a clear status instead of surfacing a
    # NameError as a misleading "Connection error".
    backend = globals().get("client")
    if backend is None:
        return None, "Status: Generation backend is not connected"

    try:
        result = backend.predict(
            raw_text,
            voice_description,
            cfg_text,
            cfg_style,
            min_temp,
            max_temp,
            top_k,
            min_p,
            dry_multiplier,
            seed,
            "",  # extra trailing argument sent to the endpoint; its meaning is not visible here
            api_name="/run_generation_pipeline",
        )
        if result is None:
            return None, "Status: No response from server"
        if not (isinstance(result, (list, tuple)) and len(result) == 2):
            return None, "Status: Unexpected response format from server"

        audio_result, status_msg = result
        if audio_result is None:
            return None, status_msg

        if isinstance(audio_result, str) and os.path.exists(audio_result):
            # The backend handed back a path to a WAV file.
            sr, data = wavfile.read(audio_result)
        elif isinstance(audio_result, (list, tuple)) and len(audio_result) >= 2:
            # The backend handed back (sample_rate, samples); lists become arrays.
            sr = audio_result[0]
            samples = audio_result[1]
            data = np.array(samples) if isinstance(samples, list) else samples
        else:
            return None, status_msg

        # A 2-D array whose first axis has length 1 or 2 and is the shorter one
        # is presumably channels-first; transpose so samples run along axis 0.
        if (
            isinstance(data, np.ndarray)
            and data.ndim == 2
            and data.shape[0] in (1, 2)
            and data.shape[0] < data.shape[1]
        ):
            data = data.T
        return (sr, data), status_msg
    except Exception as e:
        # UI boundary: never let a backend or decoding failure crash the handler.
        return None, f"Status: Connection error: {str(e)}"
# Build the UI with the custom stylesheet defined above. The banner tells visitors
# that the live demo is closed and points them to the Examples tab.
with gr.Blocks(theme="Respair/Shiki@10.1.0", css=css) as demo:
gr.Markdown(
"""
<div style="text-align: left;">
Demo is closed until further notice; thank you for using it. Feel free to check the pre-generated samples at the <code>Examples</code> tab. <br>
</div>
"""
)
# Tabs: "Speech Generation" (this section), "Examples" and "Info".
with gr.Tabs():
with gr.TabItem("Speech Generation"):
with gr.Row():
with gr.Column(scale=2):
# Text to synthesise; input is capped at 125 characters.
text_input = gr.Textbox(
label="Text",
lines=5,
max_length=125,
value="準備もできましたけど、いきなり本題に入ると分かりにくいかもしれないので、まずは今日やることを短く整理して、手順を一つずつ確認しながら進めていきますね。途中で気になるところがあったら、その都度止めて大丈夫です。",
)
with gr.Column(elem_id="voice-desc-wrap"):
# Free-form voice/style instruction; the default text is identical to the
# second preset in VOICE_EXAMPLES (the one whose sample is samples/angry.wav).
voice_desc_input = gr.Textbox(
label="Instruction",
value="低くて激しい声の女性が、感情を抑えきれずに怒りを爆発させながら、早口でまくしたてるような声で読んでほしい。",
lines=2,
)
with gr.Row(equal_height=False):
# Panel holding the advanced controls, collapsed by default (open=False).
# NOTE(review): no elem_id is set here although the stylesheet defines
# #advanced-accordion rules — confirm whether elem_id="advanced-accordion" was intended.
with gr.Accordion("----------------------------------⭐ 🛠️ ⭐", open=False):
# The default seed sits at the slider maximum; per the label, -1 requests a random seed.
seed_slider = gr.Slider(
label="Seed (-1 for random)", minimum=-1, maximum=2700000000, value=2700000000, step=1
)
gr.Markdown('<h3 style="color: #FFD700;">Style / CFG Parameters</h3>')
cfg_text_slider = gr.Slider(
label="CFG Text", minimum=0.5, maximum=3.0, value=1.15, step=0.05,
)
cfg_style_slider = gr.Slider(
label="CFG Style",
minimum=0.5, maximum=3.0, value=1.2, step=0.1,
)
gr.Markdown('<h3 style="color: #FFD700;">Sampling Parameters</h3>')
min_temp_slider = gr.Slider(
label="Min Temperature (adaptive)", minimum=0.0, maximum=2.0, value=0.25, step=0.05,
)
max_temp_slider = gr.Slider(
label="Max Temperature (adaptive)", minimum=0.0, maximum=2.0, value=1.0, step=0.05,
)
top_k_slider = gr.Slider(
label="Top K (0 = off)", minimum=0, maximum=200, value=0, step=5,
)
min_p_slider = gr.Slider(
label="Min P (0 = off)", minimum=0.0, maximum=1.0, value=0.0, step=0.01,
)
gr.Markdown('<h3 style="color: #FFD700;">Repetition Control</h3>')
dry_multiplier_slider = gr.Slider(
label="DRY Multiplier (0 = off)", minimum=0.0, maximum=5.0, value=0.8, step=0.1,
)
# gr.Markdown('<h3 style="color: #FFD700;">Other</h3>')
with gr.Column(scale=1):
generate_button = gr.Button("🎤 Generate", variant="primary", size="lg")
with gr.Column(scale=1):
# Read-only outputs; they are also written by the Examples-tab handler and by the page-load handler.
status_output = gr.Textbox(label="Status", interactive=False)
audio_output = gr.Audio(
label="Generated Speech",
interactive=False
)
# random_desc_button.click(
# fn=lambda: random.choice(VOICE_PRESET_LIST)[1],
# inputs=[],
# outputs=[voice_desc_input],
# )
# Wire the button to the remote-generation wrapper. The inputs are listed in the same
# order as the wrapper's parameters; concurrency_limit=4 caps simultaneous runs of this event.
generate_button.click(
fn=run_generation_pipeline_client,
inputs=[
text_input,
voice_desc_input,
cfg_text_slider,
cfg_style_slider,
min_temp_slider,
max_temp_slider,
top_k_slider,
min_p_slider,
dry_multiplier_slider,
seed_slider,
],
outputs=[audio_output, status_output],
concurrency_limit=4,
)
# "Examples" tab: clicking a preset loads its pre-generated audio into the main tab's player.
with gr.TabItem("Examples"):
gr.HTML("""
<div style="background-color: rgba(255, 255, 255, 0.025); padding: 20px; border-radius: 12px; backdrop-filter: blur(10px); box-shadow: 0 4px 6px rgba(0,0,0,0.5); margin-top: 8px;">
<p style="color: #1a1a1a; font-weight: 500; line-height: 1.6; font-size: 14px; text-align: center; margin: 0;">
クリックするとメインタブの音声プレイヤーにプリジェネ音声がロードされます。 / Click a preset to load the pre-generated audio into the main tab player.
</p>
</div>
""")
with gr.Row():
# elem_id matches the #voice-preset-container button rules in the stylesheet.
with gr.Column(scale=1, elem_id="voice-preset-container"):
gr.HTML("""
<div style="background-color: rgba(255, 255, 255, 0.55); padding: 8px 12px; border-radius: 8px; backdrop-filter: blur(10px); box-shadow: 0 2px 4px rgba(0,0,0,0.08); text-align: center; max-width: 220px; margin: 0 auto 12px auto;">
<h3 style="color: #000000; margin: 0; font-size: 16px;">Examples</h3>
</div>
""")
# Hidden textbox that receives the clicked example's label and feeds it to the handler.
example_label_holder = gr.Textbox(visible=False)
# One example row per pre-generated sample; with run_on_click=True and caching disabled,
# the handler is invoked on each click.
gr.Examples(
examples=[[label] for label in PREGENERATED_AUDIO.keys()],
inputs=[example_label_holder],
outputs=[voice_desc_input, audio_output, status_output], # <-- MAIN TAB outputs
fn=load_pregenerated_to_main,
label="Click to load a pre-generated sample",
cache_examples=False,
run_on_click=True,
examples_per_page=10,
)
# "Info" tab: static title plus a two-column (Japanese / English) description of the model.
with gr.TabItem("Info"):
gr.HTML('<h1 style="text-align: center;">🌸 Takane - Voice Design 🎨 </h1>')
gr.HTML("""
<div style="background-color: rgba(255, 255, 255, 0.525); padding: 30px; border-radius: 12px; backdrop-filter: blur(5px); max-width: 100%; box-shadow: 0 4px 6px rgba(0,0,0,0.5);">
<div style="display: flex; gap: 24px; flex-wrap: wrap; justify-content: center;">
<div style="flex: 1; min-width: 280px;">
<h3 style="color: #000000; margin: 0 0 12px 0; font-size: 20px; text-align: center;">日本語</h3>
<p style="color: #1a1a1a; font-weight: 500; line-height: 1.8; font-size: 16px; margin: 0; text-align: center;">
本モデルのバックボーンは
<a href="https://huggingface.co/spaces/Respair/Takane" target="_blank" rel="noopener noreferrer"
style="color: #b45309; text-decoration: none; font-weight: 600;">
Takane
</a>
を改良したもので、ネイティブ 44.1kHz コーデックを備えた完全自回帰のエンコーダ・デコーダ型 Transformer です。<br><br>
<strong>CFG Style</strong> を上げると指示への追従が強くなりますが、上げすぎると過剰な条件付け(over-conditioning)が起きて音質が劣化する場合があります。
</p>
</div>
<div style="flex: 1; min-width: 280px;">
<h3 style="color: #000000; margin: 0 0 12px 0; font-size: 20px; text-align: center;">English</h3>
<p style="color: #1a1a1a; font-weight: 500; line-height: 1.8; font-size: 16px; margin: 0; text-align: center;">
The backbone is a modified version of
<a href="https://huggingface.co/spaces/Respair/Takane" target="_blank" rel="noopener noreferrer"
style="color: #b45309; text-decoration: none; font-weight: 600;">
Takane
</a>,
a fully autoregressive encoder-decoder transformer with a native 44.1khz codec.<br><br>
Raise <strong>CFG Style</strong> if you want stronger adherence; pushing it too high can cause over-conditioning and degrade quality. <br><br>
<code>This model is only in Japanese</code>, if you enjoy anime, this is yours to play with.
</p>
</div>
</div>
</div>
""")
def load_default():
    """Return the default preset's instruction text, audio and status.

    Uses the preset whose sample path is ``samples/angry.wav``.

    Returns:
        A 3-tuple: an update for the Instruction textbox,
        ``(sample_rate, samples)`` or ``None`` when the sample cannot be
        loaded, and an update carrying the status message.
    """
    label = "激怒する女性 / 感情爆発"
    desc = VOICE_EXAMPLES.get(label, "")
    path = PREGENERATED_AUDIO.get(label)

    # The sample file is not shipped with the app (or the label is unmapped).
    if not path or not os.path.exists(path):
        return (
            gr.update(value=desc),
            None,
            gr.update(value=f"Status: Default sample missing: {label}"),
        )

    # A corrupt or unsupported WAV must not make the handler raise; report it
    # through the status box instead.
    try:
        sr, data = wavfile.read(path)
    except (OSError, ValueError) as exc:
        return (
            gr.update(value=desc),
            None,
            gr.update(value=f"Status: Could not read default sample {label}: {exc}"),
        )

    # A 2-D array whose first axis has length 1 or 2 and is the shorter one is
    # presumably channels-first; transpose so samples run along axis 0.
    if (
        isinstance(data, np.ndarray)
        and data.ndim == 2
        and data.shape[0] in (1, 2)
        and data.shape[0] < data.shape[1]
    ):
        data = data.T

    return (
        gr.update(value=desc),
        (sr, data),
        gr.update(value=f"Status: Loaded default sample: {label}"),
    )
# Register load_default on the app's load event so the Instruction box, audio player
# and status box start out populated with the default sample.
demo.load(
fn=load_default,
inputs=None,
outputs=[voice_desc_input, audio_output, status_output],
)
if __name__ == "__main__":
    # Enable the request queue (at most 15 queued events, API access to the
    # queue closed) and start the server.
    demo.queue(api_open=False, max_size=15).launch()