Update app.py
Browse files
app.py
CHANGED
|
@@ -1,16 +1,11 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
from gradio_client import Client
|
| 3 |
import os
|
| 4 |
-
import csv
|
| 5 |
import numpy as np
|
| 6 |
import scipy.io.wavfile as wavfile
|
| 7 |
-
import tempfile
|
| 8 |
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
client = Client(os.environ['src'])
|
| 12 |
-
except:
|
| 13 |
-
client = Client("http://localhost:7860/")
|
| 14 |
|
| 15 |
css = """
|
| 16 |
.gradio-container input::placeholder,
|
|
@@ -22,13 +17,6 @@ code {
|
|
| 22 |
padding: 2px 4px;
|
| 23 |
border-radius: 3px;
|
| 24 |
}
|
| 25 |
-
#settings-accordion summary {
|
| 26 |
-
justify-content: center;
|
| 27 |
-
}
|
| 28 |
-
.examples-holder > .label {
|
| 29 |
-
color: #b45309 !important;
|
| 30 |
-
font-weight: 600;
|
| 31 |
-
}
|
| 32 |
|
| 33 |
.gr-checkbox label span,
|
| 34 |
.gr-check-radio label span,
|
|
@@ -36,11 +24,6 @@ code {
|
|
| 36 |
.checkbox-container span {
|
| 37 |
color: #ECF2F7 !important;
|
| 38 |
}
|
| 39 |
-
.gr-checkbox label span.selected,
|
| 40 |
-
.gr-check-radio label span.selected,
|
| 41 |
-
[data-testid="checkbox"].selected span {
|
| 42 |
-
color: #FFD700 !important;
|
| 43 |
-
}
|
| 44 |
|
| 45 |
#advanced-accordion > button,
|
| 46 |
#advanced-accordion > button span,
|
|
@@ -53,145 +36,6 @@ code {
|
|
| 53 |
color: #FFD700 !important;
|
| 54 |
}
|
| 55 |
|
| 56 |
-
.examples-table {
|
| 57 |
-
border-collapse: collapse !important;
|
| 58 |
-
width: 100% !important;
|
| 59 |
-
}
|
| 60 |
-
|
| 61 |
-
.examples-table tbody tr {
|
| 62 |
-
background-color: #d4c896 !important;
|
| 63 |
-
border-bottom: 2px solid #c4b886 !important;
|
| 64 |
-
}
|
| 65 |
-
|
| 66 |
-
.examples-table tbody tr:hover {
|
| 67 |
-
background-color: #c9bd8b !important;
|
| 68 |
-
}
|
| 69 |
-
|
| 70 |
-
.examples-table tbody td {
|
| 71 |
-
background-color: #d4c896 !important;
|
| 72 |
-
padding: 12px 16px !important;
|
| 73 |
-
color: #1a1a1a !important;
|
| 74 |
-
font-weight: 500 !important;
|
| 75 |
-
border: none !important;
|
| 76 |
-
}
|
| 77 |
-
|
| 78 |
-
.examples-table thead th {
|
| 79 |
-
background-color: #bfb07a !important;
|
| 80 |
-
color: #1a1a1a !important;
|
| 81 |
-
font-weight: 600 !important;
|
| 82 |
-
padding: 10px 16px !important;
|
| 83 |
-
}
|
| 84 |
-
|
| 85 |
-
.gallery,
|
| 86 |
-
.gr-examples,
|
| 87 |
-
.examples {
|
| 88 |
-
background: transparent !important;
|
| 89 |
-
}
|
| 90 |
-
|
| 91 |
-
.gallery > div,
|
| 92 |
-
.gr-examples > div,
|
| 93 |
-
.examples > div {
|
| 94 |
-
background: transparent !important;
|
| 95 |
-
}
|
| 96 |
-
|
| 97 |
-
.gallery button,
|
| 98 |
-
.gr-examples button,
|
| 99 |
-
.examples button,
|
| 100 |
-
.gr-sample-textbox,
|
| 101 |
-
button.sample {
|
| 102 |
-
background-color: #d4c896 !important;
|
| 103 |
-
border: 1px solid #c4b886 !important;
|
| 104 |
-
color: #1a1a1a !important;
|
| 105 |
-
font-weight: 500 !important;
|
| 106 |
-
margin: 4px !important;
|
| 107 |
-
padding: 10px 14px !important;
|
| 108 |
-
border-radius: 6px !important;
|
| 109 |
-
transition: background-color 0.2s ease !important;
|
| 110 |
-
}
|
| 111 |
-
|
| 112 |
-
.gallery button:hover,
|
| 113 |
-
.gr-examples button:hover,
|
| 114 |
-
.examples button:hover,
|
| 115 |
-
.gr-sample-textbox:hover,
|
| 116 |
-
button.sample:hover {
|
| 117 |
-
background-color: #c9bd8b !important;
|
| 118 |
-
border-color: #b4a876 !important;
|
| 119 |
-
}
|
| 120 |
-
|
| 121 |
-
#mono-examples-container .gallery button,
|
| 122 |
-
#mono-examples-container .gr-examples button,
|
| 123 |
-
#mono-examples-container .examples button,
|
| 124 |
-
#mono-examples-container button.sample {
|
| 125 |
-
background-color: #d4c896 !important;
|
| 126 |
-
border-color: #c4b886 !important;
|
| 127 |
-
}
|
| 128 |
-
|
| 129 |
-
#stereo-examples-container .gallery button,
|
| 130 |
-
#stereo-examples-container .gr-examples button,
|
| 131 |
-
#stereo-examples-container .examples button,
|
| 132 |
-
#stereo-examples-container button.sample {
|
| 133 |
-
background-color: #c8d4a6 !important;
|
| 134 |
-
border-color: #b8c496 !important;
|
| 135 |
-
}
|
| 136 |
-
|
| 137 |
-
#stereo-examples-container .gallery button:hover,
|
| 138 |
-
#stereo-examples-container .gr-examples button:hover,
|
| 139 |
-
#stereo-examples-container .examples button:hover,
|
| 140 |
-
#stereo-examples-container button.sample:hover {
|
| 141 |
-
background-color: #bdc9a0 !important;
|
| 142 |
-
border-color: #a8b486 !important;
|
| 143 |
-
}
|
| 144 |
-
|
| 145 |
-
.gr-examples table,
|
| 146 |
-
.examples table,
|
| 147 |
-
table.examples-table {
|
| 148 |
-
width: 100% !important;
|
| 149 |
-
border-collapse: collapse !important;
|
| 150 |
-
}
|
| 151 |
-
|
| 152 |
-
.gr-examples table tr,
|
| 153 |
-
.examples table tr {
|
| 154 |
-
background-color: #d4c896 !important;
|
| 155 |
-
}
|
| 156 |
-
|
| 157 |
-
.gr-examples table tr:hover,
|
| 158 |
-
.examples table tr:hover {
|
| 159 |
-
background-color: #c9bd8b !important;
|
| 160 |
-
}
|
| 161 |
-
|
| 162 |
-
.gr-examples table td,
|
| 163 |
-
.examples table td {
|
| 164 |
-
background-color: inherit !important;
|
| 165 |
-
padding: 12px 16px !important;
|
| 166 |
-
color: #1a1a1a !important;
|
| 167 |
-
font-weight: 500 !important;
|
| 168 |
-
cursor: pointer !important;
|
| 169 |
-
}
|
| 170 |
-
|
| 171 |
-
.gr-examples .gr-samples-table,
|
| 172 |
-
.examples .gr-samples-table {
|
| 173 |
-
background: transparent !important;
|
| 174 |
-
}
|
| 175 |
-
|
| 176 |
-
.gr-examples .gr-samples-table tr,
|
| 177 |
-
.examples .gr-samples-table tr {
|
| 178 |
-
background-color: #d4c896 !important;
|
| 179 |
-
margin-bottom: 4px !important;
|
| 180 |
-
}
|
| 181 |
-
|
| 182 |
-
.gr-examples > div > div,
|
| 183 |
-
.examples > div > div {
|
| 184 |
-
background-color: #d4c896 !important;
|
| 185 |
-
border-radius: 6px !important;
|
| 186 |
-
margin: 4px !important;
|
| 187 |
-
padding: 8px 12px !important;
|
| 188 |
-
}
|
| 189 |
-
|
| 190 |
-
.gr-examples > div > div:hover,
|
| 191 |
-
.examples > div > div:hover {
|
| 192 |
-
background-color: #c9bd8b !important;
|
| 193 |
-
}
|
| 194 |
-
|
| 195 |
body {
|
| 196 |
background: none !important;
|
| 197 |
}
|
|
@@ -207,199 +51,45 @@ body::before {
|
|
| 207 |
pointer-events: none;
|
| 208 |
background: url('https://i.postimg.cc/1smD6GPf/gradio-theme-rin2.png') center center / cover no-repeat;
|
| 209 |
}
|
| 210 |
-
|
| 211 |
"""
|
| 212 |
|
| 213 |
|
| 214 |
-
def save_audio_to_temp(audio_data):
|
| 215 |
-
if audio_data is None:
|
| 216 |
-
return None
|
| 217 |
-
sample_rate, audio_array = audio_data
|
| 218 |
-
if isinstance(audio_array, list):
|
| 219 |
-
audio_array = np.array(audio_array)
|
| 220 |
-
if audio_array.dtype == np.float32 or audio_array.dtype == np.float64:
|
| 221 |
-
audio_array = (audio_array * 32767).astype(np.int16)
|
| 222 |
-
tmp = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
|
| 223 |
-
wavfile.write(tmp.name, sample_rate, audio_array)
|
| 224 |
-
tmp.close()
|
| 225 |
-
return tmp.name
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
def cleanup_temp(path):
|
| 229 |
-
if path is not None:
|
| 230 |
-
try:
|
| 231 |
-
os.unlink(path)
|
| 232 |
-
except:
|
| 233 |
-
pass
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
def load_examples(csv_path):
|
| 237 |
-
examples = []
|
| 238 |
-
if not os.path.exists(csv_path):
|
| 239 |
-
return examples
|
| 240 |
-
try:
|
| 241 |
-
with open(csv_path, 'r', encoding='utf-8') as f:
|
| 242 |
-
reader = csv.reader(f, delimiter=',', quotechar='"', doublequote=True)
|
| 243 |
-
for row in reader:
|
| 244 |
-
if len(row) >= 2:
|
| 245 |
-
audio_path = row[0].strip()
|
| 246 |
-
text = row[1].strip()
|
| 247 |
-
if text.startswith('"') and text.endswith('"'):
|
| 248 |
-
text = text[1:-1]
|
| 249 |
-
elif text.startswith("'") and text.endswith("'"):
|
| 250 |
-
text = text[1:-1]
|
| 251 |
-
if text.startswith('\u201c') and text.endswith('\u201d'):
|
| 252 |
-
text = text[1:-1]
|
| 253 |
-
if text.startswith('\u300c') and text.endswith('\u300d'):
|
| 254 |
-
text = text[1:-1]
|
| 255 |
-
|
| 256 |
-
speaker_id = 1
|
| 257 |
-
if len(row) >= 3 and row[2].strip():
|
| 258 |
-
try:
|
| 259 |
-
speaker_id = int(row[2].strip())
|
| 260 |
-
except ValueError:
|
| 261 |
-
speaker_id = 1
|
| 262 |
-
|
| 263 |
-
pregenerated_audio = None
|
| 264 |
-
if audio_path and audio_path.lower() != "none" and audio_path != "":
|
| 265 |
-
if not os.path.isabs(audio_path):
|
| 266 |
-
base_dir = os.path.dirname(csv_path)
|
| 267 |
-
audio_path = os.path.join(base_dir, audio_path)
|
| 268 |
-
if os.path.exists(audio_path):
|
| 269 |
-
pregenerated_audio = audio_path
|
| 270 |
-
examples.append([text, pregenerated_audio, speaker_id])
|
| 271 |
-
except Exception as e:
|
| 272 |
-
pass
|
| 273 |
-
return examples
|
| 274 |
-
|
| 275 |
-
|
| 276 |
def run_generation_pipeline_client(
|
| 277 |
raw_text,
|
| 278 |
-
|
| 279 |
-
|
| 280 |
-
|
| 281 |
-
|
| 282 |
-
temperature,
|
| 283 |
-
min_temp,
|
| 284 |
-
max_temp,
|
| 285 |
-
top_k,
|
| 286 |
-
top_p,
|
| 287 |
-
min_p,
|
| 288 |
-
dry_multiplier,
|
| 289 |
-
max_tokens,
|
| 290 |
-
pan_idx,
|
| 291 |
-
width_idx,
|
| 292 |
-
seed,
|
| 293 |
-
):
|
| 294 |
-
try:
|
| 295 |
-
audio_path = save_audio_to_temp(audio_prompt)
|
| 296 |
-
audio_for_api = None
|
| 297 |
-
if audio_path is not None:
|
| 298 |
-
audio_for_api = {"path": audio_path, "meta": {"_type": "gradio.FileData"}}
|
| 299 |
-
|
| 300 |
-
result = client.predict(
|
| 301 |
-
raw_text,
|
| 302 |
-
audio_for_api,
|
| 303 |
-
use_stereo,
|
| 304 |
-
speaker_id,
|
| 305 |
-
cfg_scale,
|
| 306 |
-
temperature,
|
| 307 |
-
min_temp,
|
| 308 |
-
max_temp,
|
| 309 |
-
top_k,
|
| 310 |
-
top_p,
|
| 311 |
-
min_p,
|
| 312 |
-
dry_multiplier,
|
| 313 |
-
max_tokens,
|
| 314 |
-
pan_idx,
|
| 315 |
-
width_idx,
|
| 316 |
-
seed,
|
| 317 |
-
"",
|
| 318 |
-
api_name="/run_generation_pipeline"
|
| 319 |
-
)
|
| 320 |
-
|
| 321 |
-
cleanup_temp(audio_path)
|
| 322 |
-
|
| 323 |
-
if result is None:
|
| 324 |
-
return None, "Status: No response from server"
|
| 325 |
-
|
| 326 |
-
if isinstance(result, (list, tuple)) and len(result) == 2:
|
| 327 |
-
audio_result, status_msg = result
|
| 328 |
-
if audio_result is not None:
|
| 329 |
-
if isinstance(audio_result, str) and os.path.exists(audio_result):
|
| 330 |
-
sr, data = wavfile.read(audio_result)
|
| 331 |
-
return (sr, data), status_msg
|
| 332 |
-
elif isinstance(audio_result, (list, tuple)) and len(audio_result) >= 2:
|
| 333 |
-
sr = audio_result[0]
|
| 334 |
-
data = audio_result[1]
|
| 335 |
-
if isinstance(data, list):
|
| 336 |
-
data = np.array(data)
|
| 337 |
-
return (sr, data), status_msg
|
| 338 |
-
return None, status_msg
|
| 339 |
-
|
| 340 |
-
return None, "Status: Unexpected response format from server"
|
| 341 |
-
|
| 342 |
-
except Exception as e:
|
| 343 |
-
cleanup_temp(audio_path if 'audio_path' in dir() else None)
|
| 344 |
-
return None, f"Status: Connection error: {str(e)}"
|
| 345 |
-
|
| 346 |
-
|
| 347 |
-
def run_alchemy_pipeline_client(
|
| 348 |
-
raw_text,
|
| 349 |
-
ref_audio_1,
|
| 350 |
-
ref_audio_2,
|
| 351 |
-
mix_ratio,
|
| 352 |
-
cfg_scale,
|
| 353 |
-
speaker_cfg_scale,
|
| 354 |
-
speaker_adaln_scale,
|
| 355 |
temperature,
|
| 356 |
min_temp,
|
| 357 |
max_temp,
|
| 358 |
temp_exponent,
|
| 359 |
top_k,
|
| 360 |
-
top_p,
|
| 361 |
min_p,
|
| 362 |
dry_multiplier,
|
| 363 |
max_tokens,
|
| 364 |
seed,
|
| 365 |
):
|
| 366 |
try:
|
| 367 |
-
ref1_path = save_audio_to_temp(ref_audio_1)
|
| 368 |
-
ref2_path = save_audio_to_temp(ref_audio_2)
|
| 369 |
-
|
| 370 |
-
ref1_for_api = None
|
| 371 |
-
if ref1_path is not None:
|
| 372 |
-
ref1_for_api = {"path": ref1_path, "meta": {"_type": "gradio.FileData"}}
|
| 373 |
-
|
| 374 |
-
ref2_for_api = None
|
| 375 |
-
if ref2_path is not None:
|
| 376 |
-
ref2_for_api = {"path": ref2_path, "meta": {"_type": "gradio.FileData"}}
|
| 377 |
-
|
| 378 |
result = client.predict(
|
| 379 |
raw_text,
|
| 380 |
-
|
| 381 |
-
|
| 382 |
-
|
| 383 |
-
|
| 384 |
-
speaker_cfg_scale,
|
| 385 |
-
speaker_adaln_scale,
|
| 386 |
temperature,
|
| 387 |
min_temp,
|
| 388 |
max_temp,
|
| 389 |
temp_exponent,
|
| 390 |
top_k,
|
| 391 |
-
top_p,
|
| 392 |
min_p,
|
| 393 |
dry_multiplier,
|
| 394 |
max_tokens,
|
| 395 |
seed,
|
| 396 |
-
"",
|
| 397 |
-
api_name="/
|
| 398 |
)
|
| 399 |
|
| 400 |
-
cleanup_temp(ref1_path)
|
| 401 |
-
cleanup_temp(ref2_path)
|
| 402 |
-
|
| 403 |
if result is None:
|
| 404 |
return None, "Status: No response from server"
|
| 405 |
|
|
@@ -420,169 +110,90 @@ def run_alchemy_pipeline_client(
|
|
| 420 |
return None, "Status: Unexpected response format from server"
|
| 421 |
|
| 422 |
except Exception as e:
|
| 423 |
-
cleanup_temp(ref1_path if 'ref1_path' in dir() else None)
|
| 424 |
-
cleanup_temp(ref2_path if 'ref2_path' in dir() else None)
|
| 425 |
return None, f"Status: Connection error: {str(e)}"
|
| 426 |
|
| 427 |
|
| 428 |
-
examples_mono_csv_path = "samples/examples_mono.csv"
|
| 429 |
-
examples_stereo_csv_path = "samples/examples_stereo.csv"
|
| 430 |
-
|
| 431 |
-
example_list_mono = load_examples(examples_mono_csv_path)
|
| 432 |
-
example_list_stereo = load_examples(examples_stereo_csv_path)
|
| 433 |
-
|
| 434 |
-
|
| 435 |
with gr.Blocks(theme="Respair/Shiki@10.1.0", css=css) as demo:
|
| 436 |
|
| 437 |
-
# gr.Markdown('<h1 style="text-align: center;">🪷 Takane - Mirei 「美嶺」 </h1>')
|
| 438 |
-
# gr.HTML('<h1 style="text-align: center;">🪷 Takane - Mirei 「美嶺」 </h1>')
|
| 439 |
-
# gr.HTML('<h1 style="text-align: center; font-size: 24px;">🪷 Takane - Mirei 「美嶺」 </h1>')
|
| 440 |
gr.Markdown(
|
| 441 |
-
|
| 442 |
-
|
| 443 |
-
|
| 444 |
-
|
| 445 |
-
|
| 446 |
-
|
| 447 |
-
|
| 448 |
-
|
| 449 |
-
</div>
|
| 450 |
-
"""
|
| 451 |
)
|
| 452 |
|
| 453 |
with gr.Tabs():
|
| 454 |
|
|
|
|
| 455 |
with gr.TabItem("Speech Generation"):
|
| 456 |
with gr.Row():
|
| 457 |
with gr.Column(scale=2):
|
| 458 |
text_input = gr.Textbox(
|
| 459 |
-
label="Text
|
| 460 |
lines=5,
|
| 461 |
-
max_length=125,
|
| 462 |
placeholder="ここでテキストを入力してください...\n\n"
|
| 463 |
)
|
| 464 |
|
| 465 |
-
|
| 466 |
-
label="
|
| 467 |
-
|
| 468 |
-
|
| 469 |
)
|
| 470 |
|
| 471 |
with gr.Row(equal_height=False):
|
| 472 |
with gr.Accordion("----------------------------------⭐ 🛠️ ⭐", open=False):
|
| 473 |
|
| 474 |
-
|
| 475 |
-
label="Audio Prompt (Optional — Please set Speaker ID to 1)",
|
| 476 |
-
sources=["upload", "microphone"],
|
| 477 |
-
type="numpy"
|
| 478 |
-
)
|
| 479 |
|
| 480 |
-
|
| 481 |
-
label="
|
| 482 |
-
minimum=1,
|
| 483 |
-
maximum=2000,
|
| 484 |
-
value=1,
|
| 485 |
-
step=1
|
| 486 |
)
|
| 487 |
-
|
| 488 |
-
|
| 489 |
-
|
| 490 |
-
|
| 491 |
-
|
| 492 |
-
minimum=1.0,
|
| 493 |
-
maximum=3.0,
|
| 494 |
-
value=1.15,
|
| 495 |
-
step=0.05
|
| 496 |
)
|
| 497 |
|
|
|
|
|
|
|
| 498 |
temperature_slider = gr.Slider(
|
| 499 |
-
label="Temperature
|
| 500 |
-
minimum=0.0,
|
| 501 |
-
maximum=2.0,
|
| 502 |
-
value=1.0,
|
| 503 |
-
step=0.05
|
| 504 |
)
|
| 505 |
-
|
| 506 |
min_temp_slider = gr.Slider(
|
| 507 |
-
label="Min Temperature (0 =
|
| 508 |
-
minimum=0.0,
|
| 509 |
-
maximum=2.0,
|
| 510 |
-
value=0.25,
|
| 511 |
-
step=0.05
|
| 512 |
)
|
| 513 |
-
|
| 514 |
max_temp_slider = gr.Slider(
|
| 515 |
-
label="Max Temperature (0 =
|
| 516 |
-
minimum=0.0,
|
| 517 |
-
maximum=2.0,
|
| 518 |
-
value=1.0,
|
| 519 |
-
step=0.05
|
| 520 |
)
|
| 521 |
-
|
| 522 |
-
|
| 523 |
-
label="Top K (0 = off)",
|
| 524 |
-
minimum=0,
|
| 525 |
-
maximum=200,
|
| 526 |
-
value=0,
|
| 527 |
-
step=1
|
| 528 |
)
|
| 529 |
-
|
| 530 |
-
|
| 531 |
-
label="Top P",
|
| 532 |
-
minimum=0.0,
|
| 533 |
-
maximum=1.0,
|
| 534 |
-
value=1.0,
|
| 535 |
-
step=0.01
|
| 536 |
)
|
| 537 |
-
|
| 538 |
min_p_slider = gr.Slider(
|
| 539 |
-
label="Min P (0 = off)",
|
| 540 |
-
minimum=0.0,
|
| 541 |
-
maximum=1.0,
|
| 542 |
-
value=0.0,
|
| 543 |
-
step=0.01
|
| 544 |
)
|
| 545 |
|
| 546 |
-
|
| 547 |
-
label="Max Tokens",
|
| 548 |
-
minimum=100,
|
| 549 |
-
maximum=1500,
|
| 550 |
-
value=768,
|
| 551 |
-
step=10
|
| 552 |
-
)
|
| 553 |
|
| 554 |
dry_multiplier_slider = gr.Slider(
|
| 555 |
-
label="DRY Multiplier (0 = off)",
|
| 556 |
-
minimum=0.0,
|
| 557 |
-
maximum=2.0,
|
| 558 |
-
value=0.8,
|
| 559 |
-
step=0.1
|
| 560 |
)
|
| 561 |
|
| 562 |
-
|
| 563 |
-
label="Seed (-1 for random)",
|
| 564 |
-
minimum=-1,
|
| 565 |
-
maximum=2700000000,
|
| 566 |
-
value=-1,
|
| 567 |
-
step=1
|
| 568 |
-
)
|
| 569 |
|
| 570 |
-
|
| 571 |
-
|
| 572 |
-
pan_idx_slider = gr.Slider(
|
| 573 |
-
label="Pan (0=Left, 5=Center, 10=Right)",
|
| 574 |
-
minimum=0,
|
| 575 |
-
maximum=10,
|
| 576 |
-
value=2,
|
| 577 |
-
step=1
|
| 578 |
)
|
| 579 |
-
|
| 580 |
-
|
| 581 |
-
label="Stereo Width (0=Narrow, 10=Wide)",
|
| 582 |
-
minimum=0,
|
| 583 |
-
maximum=10,
|
| 584 |
-
value=5,
|
| 585 |
-
step=1
|
| 586 |
)
|
| 587 |
|
| 588 |
with gr.Column(scale=1):
|
|
@@ -595,545 +206,122 @@ with gr.Blocks(theme="Respair/Shiki@10.1.0", css=css) as demo:
|
|
| 595 |
interactive=False
|
| 596 |
)
|
| 597 |
|
| 598 |
-
def on_stereo_toggle(use_stereo):
|
| 599 |
-
if use_stereo:
|
| 600 |
-
temp, min_t, max_t, min_p_val = 1.0, 0.0, 0.0, 0.05
|
| 601 |
-
if len(example_list_stereo) >= 2:
|
| 602 |
-
ex = example_list_stereo[1]
|
| 603 |
-
text = ex[0]
|
| 604 |
-
sid = ex[2]
|
| 605 |
-
path = ex[1]
|
| 606 |
-
if path and os.path.exists(path):
|
| 607 |
-
sr, data = wavfile.read(path)
|
| 608 |
-
return (gr.update(value=temp), gr.update(value=min_t), gr.update(value=max_t), gr.update(value=min_p_val),
|
| 609 |
-
gr.update(value=text), gr.update(value=sid), (sr, data), "Status: Stereo example loaded / ステレオ例を読み込みました")
|
| 610 |
-
return (gr.update(value=temp), gr.update(value=min_t), gr.update(value=max_t), gr.update(value=min_p_val),
|
| 611 |
-
gr.update(), gr.update(), gr.update(), gr.update())
|
| 612 |
-
else:
|
| 613 |
-
temp, min_t, max_t, min_p_val = 1.0, 0.25, 1.0, 0.0
|
| 614 |
-
if len(example_list_mono) >= 2:
|
| 615 |
-
ex = example_list_mono[-1]
|
| 616 |
-
text = ex[0]
|
| 617 |
-
sid = ex[2]
|
| 618 |
-
path = ex[1]
|
| 619 |
-
if path and os.path.exists(path):
|
| 620 |
-
sr, data = wavfile.read(path)
|
| 621 |
-
return (gr.update(value=temp), gr.update(value=min_t), gr.update(value=max_t), gr.update(value=min_p_val),
|
| 622 |
-
gr.update(value=text), gr.update(value=sid), (sr, data), "Status: Mono example loaded / モノラル例を読み込みました")
|
| 623 |
-
return (gr.update(value=temp), gr.update(value=min_t), gr.update(value=max_t), gr.update(value=min_p_val),
|
| 624 |
-
gr.update(), gr.update(), gr.update(), gr.update())
|
| 625 |
-
|
| 626 |
-
|
| 627 |
-
use_stereo_checkbox.change(
|
| 628 |
-
fn=on_stereo_toggle,
|
| 629 |
-
inputs=[use_stereo_checkbox],
|
| 630 |
-
outputs=[temperature_slider, min_temp_slider, max_temp_slider, min_p_slider,
|
| 631 |
-
text_input, speaker_id_slider, audio_output, status_output]
|
| 632 |
-
)
|
| 633 |
-
|
| 634 |
generate_button.click(
|
| 635 |
fn=run_generation_pipeline_client,
|
| 636 |
inputs=[
|
| 637 |
text_input,
|
| 638 |
-
|
| 639 |
-
|
| 640 |
-
|
| 641 |
-
|
| 642 |
temperature_slider,
|
| 643 |
min_temp_slider,
|
| 644 |
max_temp_slider,
|
|
|
|
| 645 |
top_k_slider,
|
| 646 |
-
top_p_slider,
|
| 647 |
min_p_slider,
|
| 648 |
dry_multiplier_slider,
|
| 649 |
max_tokens_slider,
|
| 650 |
-
pan_idx_slider,
|
| 651 |
-
width_idx_slider,
|
| 652 |
seed_slider,
|
| 653 |
],
|
| 654 |
outputs=[audio_output, status_output],
|
| 655 |
-
concurrency_limit=4
|
| 656 |
)
|
| 657 |
|
| 658 |
-
|
| 659 |
-
|
| 660 |
-
|
| 661 |
-
# <div style="background-color: rgba(255, 255, 255, 0.025); padding: 20px; border-radius: 12px; backdrop-filter: blur(10px); box-shadow: 0 4px 6px rgba(0,0,0,0.5); margin-top: 8px;">
|
| 662 |
-
# <p style="color: #1a1a1a; font-weight: 500; line-height: 1.6; font-size: 14px; text-align: center; margin: 0;">
|
| 663 |
-
# Upload audio references and mix them to create new voices.
|
| 664 |
-
# Upload two references and adjust the mix ratio, or use a single reference.
|
| 665 |
-
# </p>
|
| 666 |
-
# </div>
|
| 667 |
-
# """)
|
| 668 |
-
|
| 669 |
with gr.Row():
|
| 670 |
-
with gr.Column(scale=2):
|
| 671 |
-
alch_text_input = gr.Textbox(
|
| 672 |
-
label="Synthesize",
|
| 673 |
-
lines=5,
|
| 674 |
-
placeholder="日本語のテキストを入力してください...\n\n",
|
| 675 |
-
value="睡眠は、心身の健康を保つために欠かせない大切な時間です。良質な睡眠をとることで、一日の疲れを癒やし、脳内の情報を整理することができます。寝る前にスマートフォンを控えたり、温かい飲み物を飲んでリラックスすることで、より深く眠れるようになります。明日も元気に過ごすために、今夜はゆっくりと体を休めましょうね。"
|
| 676 |
-
)
|
| 677 |
-
|
| 678 |
-
alch_mix_ratio = gr.Slider(
|
| 679 |
-
minimum=0.0,
|
| 680 |
-
maximum=1.0,
|
| 681 |
-
value=0.5,
|
| 682 |
-
step=0.05,
|
| 683 |
-
label="Mix Ratio",
|
| 684 |
-
)
|
| 685 |
-
|
| 686 |
-
with gr.Row():
|
| 687 |
-
with gr.Column():
|
| 688 |
-
gr.HTML("""
|
| 689 |
-
<div style="background-color: rgba(255, 255, 255, 0.55); padding: 8px 12px; border-radius: 8px; backdrop-filter: blur(10px); box-shadow: 0 2px 4px rgba(0,0,0,0.08); text-align: center; max-width: 180px; margin: 0 auto;">
|
| 690 |
-
<h3 style="color: #000000; margin: 0; font-size: 16px;">🔊 Reference 1</h3>
|
| 691 |
-
</div>
|
| 692 |
-
""")
|
| 693 |
-
alch_ref_audio_1 = gr.Audio(
|
| 694 |
-
label="Reference Audio 1",
|
| 695 |
-
sources=["upload", "microphone"],
|
| 696 |
-
type="numpy",
|
| 697 |
-
value="samples/sample_01.mp3"
|
| 698 |
-
)
|
| 699 |
-
with gr.Column():
|
| 700 |
-
gr.HTML("""
|
| 701 |
-
<div style="background-color: rgba(255, 255, 255, 0.525); padding: 8px 12px; border-radius: 8px; backdrop-filter: blur(10px); box-shadow: 0 2px 4px rgba(0,0,0,0.08); text-align: center; max-width: 250px; margin: 0 auto;">
|
| 702 |
-
<h3 style="color: #000000; margin: 0; font-size: 16px;">🔊 Reference 2 (Optional)</h3>
|
| 703 |
-
</div>
|
| 704 |
-
""")
|
| 705 |
-
alch_ref_audio_2 = gr.Audio(
|
| 706 |
-
label="Reference Audio 2 (Optional)",
|
| 707 |
-
sources=["upload", "microphone"],
|
| 708 |
-
type="numpy",
|
| 709 |
-
value="samples/sample_02.mp3"
|
| 710 |
-
)
|
| 711 |
-
|
| 712 |
-
with gr.Row(equal_height=False):
|
| 713 |
-
with gr.Accordion("----------------------------------⭐ 🛠️ ⭐", open=False):
|
| 714 |
-
|
| 715 |
-
gr.Markdown('<h3 style="color: #FFD700;">Style Parameters</h3>')
|
| 716 |
-
|
| 717 |
-
alch_cfg_scale = gr.Slider(
|
| 718 |
-
label="CFG Scale",
|
| 719 |
-
minimum=0.5,
|
| 720 |
-
maximum=3.0,
|
| 721 |
-
value=1.4,
|
| 722 |
-
step=0.05
|
| 723 |
-
)
|
| 724 |
-
|
| 725 |
-
alch_speaker_cfg_scale = gr.Slider(
|
| 726 |
-
label="Speaker CFG Scale (Legacy - just rely AdaLN scale)",
|
| 727 |
-
minimum=0.5,
|
| 728 |
-
maximum=3.0,
|
| 729 |
-
value=1.,
|
| 730 |
-
step=0.1
|
| 731 |
-
)
|
| 732 |
-
|
| 733 |
-
alch_speaker_adaln_scale = gr.Slider(
|
| 734 |
-
label="Speaker AdaLN Scale (値を上げると参照音声に近づきますが、不自然になる可能性があります)",
|
| 735 |
-
minimum=0.0,
|
| 736 |
-
maximum=3.0,
|
| 737 |
-
value=2.,
|
| 738 |
-
step=0.1
|
| 739 |
-
)
|
| 740 |
-
|
| 741 |
-
gr.Markdown('<h3 style="color: #FFD700;">Sampling Parameters</h3>')
|
| 742 |
-
|
| 743 |
-
alch_temperature = gr.Slider(
|
| 744 |
-
label="Temperature (Min/Max Temp の両方が 0 に設定されている場合のみ有効です)",
|
| 745 |
-
minimum=0.0,
|
| 746 |
-
maximum=2.0,
|
| 747 |
-
value=0.0,
|
| 748 |
-
step=0.05
|
| 749 |
-
)
|
| 750 |
-
|
| 751 |
-
alch_min_temp = gr.Slider(
|
| 752 |
-
label="Min Temperature (0 = off)",
|
| 753 |
-
minimum=0.0,
|
| 754 |
-
maximum=1.0,
|
| 755 |
-
value=0.1,
|
| 756 |
-
step=0.05
|
| 757 |
-
)
|
| 758 |
-
|
| 759 |
-
alch_max_temp = gr.Slider(
|
| 760 |
-
label="Max Temperature (0 = off)",
|
| 761 |
-
minimum=0.0,
|
| 762 |
-
maximum=2.0,
|
| 763 |
-
value=1.0,
|
| 764 |
-
step=0.1
|
| 765 |
-
)
|
| 766 |
-
|
| 767 |
-
alch_temp_exponent = gr.Slider(
|
| 768 |
-
label="Temperature Exponent",
|
| 769 |
-
minimum=0.5,
|
| 770 |
-
maximum=2.0,
|
| 771 |
-
value=1.0,
|
| 772 |
-
step=0.1
|
| 773 |
-
)
|
| 774 |
-
|
| 775 |
-
alch_top_k = gr.Slider(
|
| 776 |
-
label="Top K (0 = off)",
|
| 777 |
-
minimum=0,
|
| 778 |
-
maximum=200,
|
| 779 |
-
value=0,
|
| 780 |
-
step=5
|
| 781 |
-
)
|
| 782 |
-
|
| 783 |
-
alch_top_p = gr.Slider(
|
| 784 |
-
label="Top P",
|
| 785 |
-
minimum=0.0,
|
| 786 |
-
maximum=1.0,
|
| 787 |
-
value=1.0,
|
| 788 |
-
step=0.01
|
| 789 |
-
)
|
| 790 |
-
|
| 791 |
-
alch_min_p = gr.Slider(
|
| 792 |
-
label="Min P (0 = off)",
|
| 793 |
-
minimum=0.0,
|
| 794 |
-
maximum=1.0,
|
| 795 |
-
value=0.0,
|
| 796 |
-
step=0.01
|
| 797 |
-
)
|
| 798 |
-
|
| 799 |
-
alch_max_tokens = gr.Slider(
|
| 800 |
-
label="Max Tokens",
|
| 801 |
-
minimum=100,
|
| 802 |
-
maximum=1500,
|
| 803 |
-
value=1024,
|
| 804 |
-
step=10
|
| 805 |
-
)
|
| 806 |
-
|
| 807 |
-
alch_dry_multiplier = gr.Slider(
|
| 808 |
-
label="DRY Multiplier (0 = off)",
|
| 809 |
-
minimum=0.0,
|
| 810 |
-
maximum=2.0,
|
| 811 |
-
value=0.8,
|
| 812 |
-
step=0.1
|
| 813 |
-
)
|
| 814 |
-
|
| 815 |
-
alch_seed = gr.Slider(
|
| 816 |
-
label="Seed (-1 for random)",
|
| 817 |
-
minimum=-1,
|
| 818 |
-
maximum=2700000000,
|
| 819 |
-
value=42,
|
| 820 |
-
step=1
|
| 821 |
-
)
|
| 822 |
-
|
| 823 |
-
with gr.Column(scale=1):
|
| 824 |
-
alch_generate_button = gr.Button("⚗️ Generate", variant="primary", size="lg")
|
| 825 |
-
|
| 826 |
with gr.Column(scale=1):
|
| 827 |
-
|
| 828 |
-
|
| 829 |
-
|
| 830 |
-
interactive=False,
|
| 831 |
-
value="samples/audio_62.wav"
|
| 832 |
-
|
| 833 |
-
)
|
| 834 |
-
|
| 835 |
-
alch_generate_button.click(
|
| 836 |
-
fn=run_alchemy_pipeline_client,
|
| 837 |
-
inputs=[
|
| 838 |
-
alch_text_input,
|
| 839 |
-
alch_ref_audio_1,
|
| 840 |
-
alch_ref_audio_2,
|
| 841 |
-
alch_mix_ratio,
|
| 842 |
-
alch_cfg_scale,
|
| 843 |
-
alch_speaker_cfg_scale,
|
| 844 |
-
alch_speaker_adaln_scale,
|
| 845 |
-
alch_temperature,
|
| 846 |
-
alch_min_temp,
|
| 847 |
-
alch_max_temp,
|
| 848 |
-
alch_temp_exponent,
|
| 849 |
-
alch_top_k,
|
| 850 |
-
alch_top_p,
|
| 851 |
-
alch_min_p,
|
| 852 |
-
alch_dry_multiplier,
|
| 853 |
-
alch_max_tokens,
|
| 854 |
-
alch_seed,
|
| 855 |
-
],
|
| 856 |
-
outputs=[alch_audio_output, alch_status_output],
|
| 857 |
-
concurrency_limit=4
|
| 858 |
-
)
|
| 859 |
|
| 860 |
-
|
| 861 |
-
gr.HTML("""
|
| 862 |
-
<div style="background-color: rgba(255, 255, 255, 0.025); padding: 60px 30px; border-radius: 12px; backdrop-filter: blur(5px); max-width: 100%; box-shadow: 0 4px 6px rgba(0,0,0,0.1); text-align: center;">
|
| 863 |
-
<p style="color: #1a1a1a; font-weight: 600; line-height: 1.8; margin-bottom: 8px; font-size: 24px;">
|
| 864 |
-
Soon...
|
| 865 |
-
</p>
|
| 866 |
-
<p style="color: #555; font-weight: 400; line-height: 1.6; margin-bottom: 24px; font-size: 14px;">
|
| 867 |
-
Here, you can create the voice you want by describing it with a prompt (gender, tone, characteristic, emotion etc.)
|
| 868 |
-
</p>
|
| 869 |
-
<p style="color: #1a1a1a; font-weight: 500; line-height: 1.8; margin-bottom: 8px; font-size: 20px;">
|
| 870 |
-
近日公開...
|
| 871 |
-
</p>
|
| 872 |
-
<p style="color: #555; font-weight: 400; line-height: 1.6; font-size: 13px;">
|
| 873 |
このモードでは、性別・口調・感情などをプロンプトで指示するだけで、好みの声を作成できます。
|
|
|
|
| 874 |
</p>
|
| 875 |
-
</div>
|
| 876 |
-
""")
|
| 877 |
-
with gr.TabItem("Examples"):
|
| 878 |
-
gr.HTML("""
|
| 879 |
-
<div style="background-color: rgba(255, 255, 255, 0.025); padding: 20px; border-radius: 12px; backdrop-filter: blur(10px); box-shadow: 0 4px 6px rgba(0,0,0,0.5); margin-top: 8px;">
|
| 880 |
-
<p style="color: #1a1a1a; font-weight: 500; line-height: 1.6; font-size: 14px; text-align: center; margin: 0;">
|
| 881 |
-
Click on any example below to load the text and hear the pre-generated audio. / 下の例をクリックすると、テキストが読み込まれ、生成済みの音声を聞くことができます。
|
| 882 |
-
</p>
|
| 883 |
-
</div>
|
| 884 |
-
""")
|
| 885 |
-
|
| 886 |
-
|
| 887 |
-
example_text_holder = gr.Textbox(visible=False)
|
| 888 |
-
|
| 889 |
-
def load_mono_example_fn(text):
|
| 890 |
-
for ex in example_list_mono:
|
| 891 |
-
if ex[0] == text:
|
| 892 |
-
pregenerated_path = ex[1]
|
| 893 |
-
sid = ex[2]
|
| 894 |
-
if pregenerated_path and os.path.exists(pregenerated_path):
|
| 895 |
-
try:
|
| 896 |
-
sample_rate, audio_data = wavfile.read(pregenerated_path)
|
| 897 |
-
status = "Status: Mono example loaded / モノラル例を読み込みました"
|
| 898 |
-
return text, False, sid, (sample_rate, audio_data), status
|
| 899 |
-
except Exception as e:
|
| 900 |
-
return text, False, sid, None, f"Status: Error loading audio: {str(e)}"
|
| 901 |
-
else:
|
| 902 |
-
return text, False, sid, None, "Status: Mono example loaded (no pre-generated audio)"
|
| 903 |
-
return text, False, 1, None, "Status: Mono example loaded"
|
| 904 |
-
|
| 905 |
-
def load_stereo_example_fn(text):
    """Resolve a clicked stereo example into values for the UI outputs.

    Looks up the first entry of ``example_list_stereo`` whose first element
    equals ``text`` and returns a 5-tuple
    ``(text, True, sid, audio, status)``, where the second item is always
    ``True`` for stereo, ``audio`` is ``(sample_rate, audio_data)`` as read
    by ``wavfile.read`` or ``None``, and ``status`` is a message string.
    When no entry matches, ``sid`` falls back to ``1``.
    """
    # First entry whose text matches; None when the text is unknown.
    entry = next((ex for ex in example_list_stereo if ex[0] == text), None)
    if entry is None:
        return text, True, 1, None, "Status: Stereo example loaded"

    wav_path = entry[1]
    speaker = entry[2]

    # No usable file on disk: still load the text and speaker id.
    if not (wav_path and os.path.exists(wav_path)):
        return text, True, speaker, None, "Status: Stereo example loaded (no pre-generated audio)"

    try:
        rate, samples = wavfile.read(wav_path)
    except Exception as e:
        # Report the read failure in the status field instead of raising.
        return text, True, speaker, None, f"Status: Error loading audio: {str(e)}"

    return (
        text,
        True,
        speaker,
        (rate, samples),
        "Status: Stereo example loaded / ステレオ例を読み込みました",
    )
|
| 920 |
|
| 921 |
-
|
| 922 |
-
|
| 923 |
-
|
| 924 |
-
|
| 925 |
-
<
|
| 926 |
-
|
| 927 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 928 |
""")
|
| 929 |
-
|
| 930 |
-
if example_list_mono:
|
| 931 |
-
mono_example_texts = [[ex[0]] for ex in example_list_mono]
|
| 932 |
-
gr.Examples(
|
| 933 |
-
examples=mono_example_texts,
|
| 934 |
-
inputs=[example_text_holder],
|
| 935 |
-
outputs=[text_input, use_stereo_checkbox, speaker_id_slider, audio_output, status_output],
|
| 936 |
-
fn=load_mono_example_fn,
|
| 937 |
-
label="Click to load a mono example",
|
| 938 |
-
cache_examples=False,
|
| 939 |
-
run_on_click=True,
|
| 940 |
-
examples_per_page=15
|
| 941 |
-
)
|
| 942 |
-
else:
|
| 943 |
-
gr.Markdown("*No mono examples available*")
|
| 944 |
|
| 945 |
-
with gr.Column(scale=1
|
| 946 |
-
|
| 947 |
gr.HTML("""
|
| 948 |
-
|
| 949 |
-
|
| 950 |
-
</div>
|
| 951 |
-
""")
|
| 952 |
-
|
| 953 |
-
|
| 954 |
-
if example_list_stereo:
|
| 955 |
-
stereo_example_texts = [[ex[0]] for ex in example_list_stereo]
|
| 956 |
-
gr.Examples(
|
| 957 |
-
examples=stereo_example_texts,
|
| 958 |
-
inputs=[example_text_holder],
|
| 959 |
-
outputs=[text_input, use_stereo_checkbox, speaker_id_slider, audio_output, status_output],
|
| 960 |
-
fn=load_stereo_example_fn,
|
| 961 |
-
label="Click to load a stereo example",
|
| 962 |
-
cache_examples=False,
|
| 963 |
-
run_on_click=True,
|
| 964 |
-
examples_per_page=15
|
| 965 |
-
)
|
| 966 |
-
else:
|
| 967 |
-
gr.Markdown("*No stereo examples available*")
|
| 968 |
-
|
| 969 |
-
with gr.TabItem("Guide"):
|
| 970 |
-
gr.HTML('<h1 style="text-align: center;">🪷 Takane - Mirei 「美嶺」 </h1>')
|
| 971 |
-
with gr.Row():
|
| 972 |
-
with gr.Column(scale=1):
|
| 973 |
-
|
| 974 |
-
gr.HTML("""
|
| 975 |
-
<div style="background-color: rgba(255, 255, 255, 0.525); padding: 30px; border-radius: 12px; backdrop-filter: blur(5px); max-width: 100%; box-shadow: 0 4px 6px rgba(0,0,0,0.5);">
|
| 976 |
-
<h2 style="color: #000000; margin-bottom: 20px; font-size: 28px;">日本語</h2>
|
| 977 |
-
|
| 978 |
-
<div style="background-color: rgba(255, 200, 200, 0.45); padding: 16px 20px; border-radius: 8px; border-left: 4px solid #cc4444; margin-bottom: 20px;">
|
| 979 |
-
<p style="color: #1a1a1a; font-weight: 600; line-height: 1.8; font-size: 14px; margin: 0;">
|
| 980 |
-
⚠️ サーバー負荷を軽くするため、オリジナルのTakaneとは違い、このデモではハルシネーション対策などの安全策を省いています。
|
| 981 |
-
そのため、モデルの出力が崩れたり、デモが止まるリスクは通常より高いです。
|
| 982 |
-
たまにメンテしますが、あくまでWIPということでご了承ください。
|
| 983 |
-
</p>
|
| 984 |
-
</div>
|
| 985 |
-
|
| 986 |
-
<p style="color: #1a1a1a; font-weight: 500; line-height: 1.8; margin-bottom: 20px; font-size: 16px;">
|
| 987 |
-
Mirei「美嶺」 は、Takane の系譜に連なる最新モデルであり、高品質なアニメっぽい日本語音声生成のためにゼロから再構築・再設計されたモデルです。より大規模で高速、かつクリーンな音質を実現し、機能も増え、学習コストも大幅に抑えられています。
|
| 988 |
-
</p>
|
| 989 |
-
|
| 990 |
-
<p style="color: #1a1a1a; font-weight: 500; line-height: 1.8; margin-bottom: 20px; font-size: 16px;">
|
| 991 |
-
これは、現代の最高峰の技術を駆使し、究極の日本語音声合成モデルを開発するという私の目標に向けた、新たな一歩です。
|
| 992 |
-
</p>
|
| 993 |
-
|
| 994 |
-
<h3 style="color: #000000; margin-top: 30px; margin-bottom: 15px; font-size: 20px;">初代 Takane との比較:</h3>
|
| 995 |
-
<ul style="color: #1a1a1a; font-weight: 500; line-height: 1.8; font-size: 15px;">
|
| 996 |
-
<li style="margin: 8px 0;">大幅な容量拡大: 500M → 1.2B (Mono) / 3B (Stereo) パラメータ</li>
|
| 997 |
-
<li style="margin: 8px 0;">より多くのデータでの学習</li>
|
| 998 |
-
<li style="margin: 8px 0;">実験的な制御可能 End-to-end Stereo 生成</li>
|
| 999 |
-
<li style="margin: 8px 0;">Promptable Description Guided (TBA)</li>
|
| 1000 |
-
<li style="margin: 8px 0;">FiLM ベースの文字起こし不要なゼロショット・オーディオプロンプティング (Alchemy mode)</li>
|
| 1001 |
-
<li style="margin: 8px 0;">この種のモデルとしては最も包括的なサンプリングツールキット</li>
|
| 1002 |
-
<li style="margin: 8px 0;">アーティファクトを極限まで抑え、鮮明な音質を実現する新しい 44.1khz - 25hz コーデック</li>
|
| 1003 |
-
</ul>
|
| 1004 |
-
|
| 1005 |
-
<hr style="border: none; border-top: 1px solid rgba(0,0,0,0.15); margin: 25px 0;">
|
| 1006 |
-
|
| 1007 |
-
<h3 style="color: #000000; margin-top: 20px; margin-bottom: 15px; font-size: 20px;">注意事項:</h3>
|
| 1008 |
-
<ul style="color: #1a1a1a; font-weight: 500; line-height: 1.8; font-size: 14px;">
|
| 1009 |
-
<li style="margin: 8px 0;"><strong>Stereo</strong> は概念実証(PoC)段階です。Pan と Width で方向と深度を制御できますが、Width は現時点では近似的なものです。</li>
|
| 1010 |
-
<li style="margin: 8px 0;"><strong>Speaker IDs</strong>: <code>speaker_id = 1</code> はランダムを意味します。多くの学習データ(特に Stereo)には話者タグがありません。未知の話者は分布外(OOD)になる可能性があるため品質にばらつきが出ますが、Stereo でも機能はします。</li>
|
| 1011 |
-
<li style="margin: 8px 0;"><strong>データセットのバイアス / 制限</strong>: 左耳へのパンニング(<code>pan = 1–4</code>)や浅い深度(<code>< 5</code>)のデータが過多です。Hover/rotation はここではサポートされていません。</li>
|
| 1012 |
-
<li style="margin: 8px 0;"><strong>絵文字</strong>: サポートは最小限ですが、ASMR プロンプトの方向付けに役立ちます(例:😮💨 🌬️ 👂 🤭 🍭 💋)。NSFW の場合、テキスト(またはテキスト+絵文字)を主に使用してください。絵文字のみのプロンプトはあまりうまくいきません。</li>
|
| 1013 |
-
<li style="margin: 8px 0;">絵文字は特定の音に必ずしも対応しているわけではないため、配置場所はそれほど重要ではありませんが、全体的な雰囲気(vibe)に影響を与えます。</li>
|
| 1014 |
-
<li style="margin: 8px 0;"><strong>多数の設定項目</strong>: 基本的にはそのままで素晴らしい結果が得られますが、場合によっては調整が必要になることもあります。</li>
|
| 1015 |
-
<li style="margin: 8px 0;"><strong><code><asmr></code> タグ</strong>: 任意ですが、出力が変化するため、有無の両方を試してみてください。</li>
|
| 1016 |
-
<li style="margin: 8px 0;"><strong>NSFW</strong>: Mono の方が一般的に制御しやすいです(データ量が多いため)。入力テキストの先頭に <code>♡♡</code> を付けることを推奨します。</li>
|
| 1017 |
-
<li style="margin: 8px 0;"><strong>Audio prompting</strong>: ベースモードでは、プロンプトは6秒以内かつ綺麗にトリミングされたものにしてください。Alchemy mode では、Prefill とは異なる手法を用いているため、この制約はそれほど重要ではありません。</li>
|
| 1018 |
-
<li style="margin: 8px 0;"><strong>長さ制限</strong>: 最大出力は29.9秒です。このデモでは長時間の生成はサポートされていません。</li>
|
| 1019 |
-
<li style="margin: 8px 0;"><strong>再生環境</strong>: 音場表現のしっかりしたヘッドホンの使用を推奨します。</li>
|
| 1020 |
-
<li style="margin: 8px 0;"><strong>多様性の調整</strong>: バリエーションの激しい出力の場合、<code>Temperature</code> を <code>~0.8–1.0</code> に上げ、<code>DRY = 0</code> に設定してください。</li>
|
| 1021 |
-
<li style="margin: 8px 0;"><code>DRY</code> をオフにすると、出力とテキストの相関が弱い場合にモデルが崩壊する可能性があります。</li>
|
| 1022 |
-
<li style="margin: 8px 0;">Takane シリーズはアニメスタイルの出力に特化していますが、Mirei バリアントでは通常の日本語もよりサポートされています。</li>
|
| 1023 |
-
</ul>
|
| 1024 |
-
|
| 1025 |
-
<h3 style="color: #000000; margin-top: 30px; margin-bottom: 15px; font-size: 20px;">Prefilling と RyuseiNet についての注記</h3>
|
| 1026 |
-
<p style="color: #1a1a1a; font-weight: 500; line-height: 1.8; font-size: 14px;">
|
| 1027 |
-
コーデックベースのモデルにおける Audio prompting のデフォルトモードは Prefilling です。これは、ボイスクリップからオーディオトークンを抽出し、その文字起こしと共に入力に結合する方法です。
|
| 1028 |
-
問題点はコンテキストを消費してしまうことです。モデルは入力を、プロンプトの自然な続きとして扱うためです。
|
| 1029 |
-
</p>
|
| 1030 |
-
<p style="color: #1a1a1a; font-weight: 500; line-height: 1.8; font-size: 14px;">
|
| 1031 |
-
例えば、モデルが主に30秒のチャンクで学習されている場合、リファレンス+出力の合計長は30秒未満である必要があります。また、安全のためにバッファを持たせることも推奨されます。例えば、出力が約20秒になる場合、オーディオプロンプトは5〜6秒程度に収めるべきです。
|
| 1032 |
-
</p>
|
| 1033 |
-
<p style="color: #1a1a1a; font-weight: 500; line-height: 1.8; font-size: 14px;">
|
| 1034 |
-
RyuseiNet (Alchemy mode) では、この制限はありません。それに話者の類似性がより高く、入力テキストに対してより頑健です。ただし、参考のために両方の手法を残しています。
|
| 1035 |
-
</p>
|
| 1036 |
-
|
| 1037 |
-
<div style="margin-top: 40px; padding-top: 20px; border-top: 1px solid rgba(0,0,0,0.1);">
|
| 1038 |
-
<p style="color: #666; font-size: 14px; text-align: center;">
|
| 1039 |
-
|
| 1040 |
-
</p>
|
| 1041 |
-
</div>
|
| 1042 |
-
|
| 1043 |
-
</div>
|
| 1044 |
-
""")
|
| 1045 |
-
with gr.Column(scale=1):
|
| 1046 |
-
gr.HTML("""
|
| 1047 |
-
<div style="background-color: rgba(255, 255, 255, 0.525); padding: 30px; border-radius: 12px; backdrop-filter: blur(5px); max-width: 100%; box-shadow: 0 4px 6px rgba(0,0,0,0.5);">
|
| 1048 |
-
|
| 1049 |
-
<h2 style="color: #000000; margin-bottom: 20px; font-size: 28px;">English</h2>
|
| 1050 |
-
|
| 1051 |
-
<p style="color: #1a1a1a; font-weight: 500; line-height: 1.8; margin-bottom: 20px; font-size: 16px;">
|
| 1052 |
-
Mirei (美嶺) is the continuation of my work from Takane, rebuilt and redesigned from the ground up for high-quality, anime-style Japanese voice generation. It's larger, faster, sounds cleaner, has more features, and is also much cheaper to train.
|
| 1053 |
-
</p>
|
| 1054 |
-
|
| 1055 |
-
<p style="color: #1a1a1a; font-weight: 500; line-height: 1.8; margin-bottom: 20px; font-size: 16px;">
|
| 1056 |
-
It is another step toward my goal of developing the ultimate Japanese speech and audio synthesis model, pushing as far as today's best technology can take it.
|
| 1057 |
-
</p>
|
| 1058 |
|
| 1059 |
-
|
| 1060 |
-
|
| 1061 |
-
|
| 1062 |
-
|
| 1063 |
-
<li style="margin: 8px 0;">Experimental controllable end-to-end stereo generation</li>
|
| 1064 |
-
<li style="margin: 8px 0;">Promptable Description Guided (TBA)</li>
|
| 1065 |
-
<li style="margin: 8px 0;">FiLM-based, transcription-less zero-shot audio prompting (Alchemy mode)</li>
|
| 1066 |
-
<li style="margin: 8px 0;">The most comprehensive sampling toolkit in any model of this kind</li>
|
| 1067 |
-
<li style="margin: 8px 0;">A new 44.1khz - 25hz codec that sounds crisp with as few artifacts as possible</li>
|
| 1068 |
-
</ul>
|
| 1069 |
-
|
| 1070 |
-
<hr style="border: none; border-top: 1px solid rgba(0,0,0,0.15); margin: 25px 0;">
|
| 1071 |
-
|
| 1072 |
-
<h3 style="color: #000000; margin-top: 20px; margin-bottom: 15px; font-size: 20px;">Notes:</h3>
|
| 1073 |
-
<ul style="color: #1a1a1a; font-weight: 500; line-height: 1.8; font-size: 14px;">
|
| 1074 |
-
<li style="margin: 8px 0;"><strong>Stereo</strong> is a proof of concept. Use pan and width to control direction and depth, but width is approximate right now (until I come up with a better extraction algorithm).</li>
|
| 1075 |
-
<li style="margin: 8px 0;"><strong>Speaker IDs</strong>: <code>speaker_id = 1</code> means random. Most training data (especially stereo) lacks speaker tags. Unseen speakers may be out-of-distribution, so quality can vary (stereo can still work).</li>
|
| 1076 |
-
<li style="margin: 8px 0;"><strong>Dataset bias / limits</strong>: Left-ear panning (<code>pan = 1–4</code>) and shallower depth (<code>< 5</code>) are overrepresented. Hover/rotation isn't supported here (but it is technically feasible).</li>
|
| 1077 |
-
<li style="margin: 8px 0;"><strong>Emoji</strong> support is minimal but can help ground ASMR prompts (e.g., 😮💨 🌬️ 👂 🤭 🍭 💋). For NSFW, rely more on text (or text + emoji). Emoji-only prompts rarely work well.</li>
|
| 1078 |
-
<li style="margin: 8px 0;">Emojis don't necessarily correlate with specific sounds, so their placement doesn't matter much; but they affect the overall vibe.</li>
|
| 1079 |
-
<li style="margin: 8px 0;"><strong>Lots of knobs</strong>: You'll get great results most of the time, but the remaining cases may take some tuning.</li>
|
| 1080 |
-
<li style="margin: 8px 0;"><strong><code><asmr></code> tag</strong>: Optional, but it does change the output — try with and without it.</li>
|
| 1081 |
-
<li style="margin: 8px 0;"><strong>NSFW</strong>: Mono is generally easier to steer (more data). Prepending the input text with <code>♡♡</code> is recommended.</li>
|
| 1082 |
-
<li style="margin: 8px 0;"><strong>Audio prompting</strong>: In base mode, keep prompts ≤ 6s, cleanly trimmed (no abrupt cuts). In Alchemy mode, this constraint doesn't matter as much (we use a different method than prefill).</li>
|
| 1083 |
-
<li style="margin: 8px 0;"><strong>Length limit</strong>: Max output is 29.9s; long-form generation isn't supported in this demo.</li>
|
| 1084 |
-
<li style="margin: 8px 0;"><strong>Playback</strong>: Headphones with a decent soundstage are recommended; otherwise spatial effects may feel weaker.</li>
|
| 1085 |
-
<li style="margin: 8px 0;"><strong>Diversity tuning</strong>: For high-variance outputs (laughter, extreme emotions, aegi/chupa), raise <code>temperature</code> to <code>~0.8–1.0</code> and set <code>DRY = 0</code>.</li>
|
| 1086 |
-
<li style="margin: 8px 0;">Turning off <code>DRY</code> can cause the model to collapse when the output is weakly correlated with the text.</li>
|
| 1087 |
-
<li style="margin: 8px 0;">While the Takane model family is geared towards anime-style outputs, normal Japanese is also better supported with the Mirei variant (I have provided an example).</li>
|
| 1088 |
-
</ul>
|
| 1089 |
-
|
| 1090 |
-
<h3 style="color: #000000; margin-top: 30px; margin-bottom: 15px; font-size: 20px;">A Note about Prefilling vs RyuseiNet</h3>
|
| 1091 |
-
<p style="color: #1a1a1a; font-weight: 500; line-height: 1.8; font-size: 14px;">
|
| 1092 |
-
The default mode for audio prompting in any codec-based model is prefilling — where you extract audio tokens from your voice clip and glue them together with the transcription as your input.
|
| 1093 |
-
The problem is it eats up your context, because the model treats your input as if it is a natural continuation of your prompt.
|
| 1094 |
-
</p>
|
| 1095 |
-
<p style="color: #1a1a1a; font-weight: 500; line-height: 1.8; font-size: 14px;">
|
| 1096 |
-
For example, if your model was trained on mostly 30-second chunks, the length of your reference + the output should be less than 30 seconds. It's also recommended to keep a safety buffer. For example, if your output will be roughly 20 seconds, your audio prompt should be around 5–6 seconds to be in the safe zone.
|
| 1097 |
-
</p>
|
| 1098 |
-
<p style="color: #1a1a1a; font-weight: 500; line-height: 1.8; font-size: 14px;">
|
| 1099 |
-
RyuseiNet (Alchemy mode) does not have this limitation; it also has better speaker similarity and is more robust to your input text. I am keeping both methods for reference, though.
|
| 1100 |
-
</p>
|
| 1101 |
-
|
| 1102 |
-
<div style="margin-top: 40px; padding-top: 20px; border-top: 1px solid rgba(0,0,0,0.1);">
|
| 1103 |
-
<p style="color: #666; font-size: 14px; text-align: center;">
|
| 1104 |
-
|
| 1105 |
-
</p>
|
| 1106 |
-
</div>
|
| 1107 |
|
| 1108 |
-
|
| 1109 |
-
|
| 1110 |
-
|
| 1111 |
-
<
|
| 1112 |
-
|
| 1113 |
-
|
| 1114 |
-
|
| 1115 |
-
|
| 1116 |
-
|
| 1117 |
-
|
| 1118 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1119 |
""")
|
| 1120 |
-
# def load_default_example():
|
| 1121 |
-
# if len(example_list_stereo) >= 2:
|
| 1122 |
-
# return load_stereo_example_fn(example_list_stereo[1][0])
|
| 1123 |
-
# return gr.update(), gr.update(), gr.update(), None, ""
|
| 1124 |
|
| 1125 |
-
|
| 1126 |
-
|
| 1127 |
-
|
| 1128 |
-
|
| 1129 |
-
|
| 1130 |
-
|
| 1131 |
-
|
| 1132 |
-
|
| 1133 |
-
|
| 1134 |
-
|
| 1135 |
|
| 1136 |
if __name__ == "__main__":
|
| 1137 |
-
demo.queue(api_open=False, max_size=15).launch()
|
| 1138 |
-
|
| 1139 |
-
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
from gradio_client import Client
|
| 3 |
import os
|
|
|
|
| 4 |
import numpy as np
|
| 5 |
import scipy.io.wavfile as wavfile
|
|
|
|
| 6 |
|
| 7 |
+
client = Client(os.environ['src'])
|
| 8 |
+
# client = Client("http://localhost:7861/")
|
|
|
|
|
|
|
|
|
|
| 9 |
|
| 10 |
css = """
|
| 11 |
.gradio-container input::placeholder,
|
|
|
|
| 17 |
padding: 2px 4px;
|
| 18 |
border-radius: 3px;
|
| 19 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
|
| 21 |
.gr-checkbox label span,
|
| 22 |
.gr-check-radio label span,
|
|
|
|
| 24 |
.checkbox-container span {
|
| 25 |
color: #ECF2F7 !important;
|
| 26 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
|
| 28 |
#advanced-accordion > button,
|
| 29 |
#advanced-accordion > button span,
|
|
|
|
| 36 |
color: #FFD700 !important;
|
| 37 |
}
|
| 38 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
body {
|
| 40 |
background: none !important;
|
| 41 |
}
|
|
|
|
| 51 |
pointer-events: none;
|
| 52 |
background: url('https://i.postimg.cc/1smD6GPf/gradio-theme-rin2.png') center center / cover no-repeat;
|
| 53 |
}
|
|
|
|
| 54 |
"""
|
| 55 |
|
| 56 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
def run_generation_pipeline_client(
|
| 58 |
raw_text,
|
| 59 |
+
voice_description,
|
| 60 |
+
cfg_text,
|
| 61 |
+
cfg_style,
|
| 62 |
+
description_adaln_scale,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
temperature,
|
| 64 |
min_temp,
|
| 65 |
max_temp,
|
| 66 |
temp_exponent,
|
| 67 |
top_k,
|
|
|
|
| 68 |
min_p,
|
| 69 |
dry_multiplier,
|
| 70 |
max_tokens,
|
| 71 |
seed,
|
| 72 |
):
|
| 73 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
result = client.predict(
|
| 75 |
raw_text,
|
| 76 |
+
voice_description,
|
| 77 |
+
cfg_text,
|
| 78 |
+
cfg_style,
|
| 79 |
+
description_adaln_scale,
|
|
|
|
|
|
|
| 80 |
temperature,
|
| 81 |
min_temp,
|
| 82 |
max_temp,
|
| 83 |
temp_exponent,
|
| 84 |
top_k,
|
|
|
|
| 85 |
min_p,
|
| 86 |
dry_multiplier,
|
| 87 |
max_tokens,
|
| 88 |
seed,
|
| 89 |
+
"", # user_ip
|
| 90 |
+
api_name="/run_generation_pipeline"
|
| 91 |
)
|
| 92 |
|
|
|
|
|
|
|
|
|
|
| 93 |
if result is None:
|
| 94 |
return None, "Status: No response from server"
|
| 95 |
|
|
|
|
| 110 |
return None, "Status: Unexpected response format from server"
|
| 111 |
|
| 112 |
except Exception as e:
|
|
|
|
|
|
|
| 113 |
return None, f"Status: Connection error: {str(e)}"
|
| 114 |
|
| 115 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 116 |
with gr.Blocks(theme="Respair/Shiki@10.1.0", css=css) as demo:
|
| 117 |
|
|
|
|
|
|
|
|
|
|
| 118 |
gr.Markdown(
|
| 119 |
+
"""
|
| 120 |
+
<div style="text-align: left;">
|
| 121 |
+
モデルの性能を発揮し、ハルシネーションや不自然な生成を防ぐには、学習データに沿った入力スタイルを守る必要があります。
|
| 122 |
+
使用前に <code>Guide</code> タブをよくチェックしてください。
|
| 123 |
+
I hope you enjoy! <br>
|
| 124 |
+
もしSpaceが正しく読み込めない(Errorと表示される)場合は、何度かページをリロードしてみてください。<br>
|
| 125 |
+
</div>
|
| 126 |
+
"""
|
|
|
|
|
|
|
| 127 |
)
|
| 128 |
|
| 129 |
with gr.Tabs():
|
| 130 |
|
| 131 |
+
# ==================== Speech Generation Tab ====================
|
| 132 |
with gr.TabItem("Speech Generation"):
|
| 133 |
with gr.Row():
|
| 134 |
with gr.Column(scale=2):
|
| 135 |
text_input = gr.Textbox(
|
| 136 |
+
label="Text",
|
| 137 |
lines=5,
|
|
|
|
| 138 |
placeholder="ここでテキストを入力してください...\n\n"
|
| 139 |
)
|
| 140 |
|
| 141 |
+
voice_desc_input = gr.Textbox(
|
| 142 |
+
label="Voice Description",
|
| 143 |
+
lines=3,
|
| 144 |
+
placeholder="Describe the voice you want, e.g. 'a calm, warm female voice speaking softly'...",
|
| 145 |
)
|
| 146 |
|
| 147 |
with gr.Row(equal_height=False):
|
| 148 |
with gr.Accordion("----------------------------------⭐ 🛠️ ⭐", open=False):
|
| 149 |
|
| 150 |
+
gr.Markdown('<h3 style="color: #FFD700;">Style / CFG Parameters</h3>')
|
|
|
|
|
|
|
|
|
|
|
|
|
| 151 |
|
| 152 |
+
cfg_text_slider = gr.Slider(
|
| 153 |
+
label="CFG Text", minimum=0.5, maximum=3.0, value=1.4, step=0.05,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 154 |
)
|
| 155 |
+
cfg_style_slider = gr.Slider(
|
| 156 |
+
label="CFG Style (値を上げると記述への忠実度が増しますが、不自然になる可能性があります)",
|
| 157 |
+
minimum=0.5, maximum=5.0, value=2.0, step=0.1,
|
| 158 |
+
)
|
| 159 |
+
desc_adaln_scale_slider = gr.Slider(
|
| 160 |
+
label="Description AdaLN Scale", minimum=0.0, maximum=3.0, value=1.0, step=0.1,
|
|
|
|
|
|
|
|
|
|
| 161 |
)
|
| 162 |
|
| 163 |
+
gr.Markdown('<h3 style="color: #FFD700;">Sampling Parameters</h3>')
|
| 164 |
+
|
| 165 |
temperature_slider = gr.Slider(
|
| 166 |
+
label="Temperature", minimum=0.0, maximum=2.0, value=0.4, step=0.05,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 167 |
)
|
|
|
|
| 168 |
min_temp_slider = gr.Slider(
|
| 169 |
+
label="Min Temperature (adaptive)", minimum=0.0, maximum=2.0, value=0.8, step=0.05,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 170 |
)
|
|
|
|
| 171 |
max_temp_slider = gr.Slider(
|
| 172 |
+
label="Max Temperature (adaptive)", minimum=0.0, maximum=2.0, value=1.0, step=0.05,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 173 |
)
|
| 174 |
+
temp_exponent_slider = gr.Slider(
|
| 175 |
+
label="Temperature Exponent", minimum=0.5, maximum=2.0, value=1.0, step=0.1,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 176 |
)
|
| 177 |
+
top_k_slider = gr.Slider(
|
| 178 |
+
label="Top K (0 = off)", minimum=0, maximum=200, value=0, step=5,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 179 |
)
|
|
|
|
| 180 |
min_p_slider = gr.Slider(
|
| 181 |
+
label="Min P (0 = off)", minimum=0.0, maximum=1.0, value=0.0, step=0.01,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 182 |
)
|
| 183 |
|
| 184 |
+
gr.Markdown('<h3 style="color: #FFD700;">Repetition Control (DRY)</h3>')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 185 |
|
| 186 |
dry_multiplier_slider = gr.Slider(
|
| 187 |
+
label="DRY Multiplier (0 = off)", minimum=0.0, maximum=5.0, value=3.8, step=0.1,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 188 |
)
|
| 189 |
|
| 190 |
+
gr.Markdown('<h3 style="color: #FFD700;">Other</h3>')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 191 |
|
| 192 |
+
max_tokens_slider = gr.Slider(
|
| 193 |
+
label="Max Tokens", minimum=100, maximum=1500, value=1024, step=10,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 194 |
)
|
| 195 |
+
seed_slider = gr.Slider(
|
| 196 |
+
label="Seed (-1 for random)", minimum=-1, maximum=2700000000, value=42, step=1,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 197 |
)
|
| 198 |
|
| 199 |
with gr.Column(scale=1):
|
|
|
|
| 206 |
interactive=False
|
| 207 |
)
|
| 208 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 209 |
generate_button.click(
|
| 210 |
fn=run_generation_pipeline_client,
|
| 211 |
inputs=[
|
| 212 |
text_input,
|
| 213 |
+
voice_desc_input,
|
| 214 |
+
cfg_text_slider,
|
| 215 |
+
cfg_style_slider,
|
| 216 |
+
desc_adaln_scale_slider,
|
| 217 |
temperature_slider,
|
| 218 |
min_temp_slider,
|
| 219 |
max_temp_slider,
|
| 220 |
+
temp_exponent_slider,
|
| 221 |
top_k_slider,
|
|
|
|
| 222 |
min_p_slider,
|
| 223 |
dry_multiplier_slider,
|
| 224 |
max_tokens_slider,
|
|
|
|
|
|
|
| 225 |
seed_slider,
|
| 226 |
],
|
| 227 |
outputs=[audio_output, status_output],
|
| 228 |
+
concurrency_limit=4,
|
| 229 |
)
|
| 230 |
|
| 231 |
+
# ==================== Guide Tab ====================
|
| 232 |
+
with gr.TabItem("Guide"):
|
| 233 |
+
gr.HTML('<h1 style="text-align: center;">🪷 Takane - Mirei 「美嶺」 </h1>')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 234 |
with gr.Row():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 235 |
with gr.Column(scale=1):
|
| 236 |
+
gr.HTML("""
|
| 237 |
+
<div style="background-color: rgba(255, 255, 255, 0.525); padding: 30px; border-radius: 12px; backdrop-filter: blur(5px); max-width: 100%; box-shadow: 0 4px 6px rgba(0,0,0,0.5);">
|
| 238 |
+
<h2 style="color: #000000; margin-bottom: 20px; font-size: 28px;">日本語</h2>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 239 |
|
| 240 |
+
<p style="color: #1a1a1a; font-weight: 500; line-height: 1.8; margin-bottom: 20px; font-size: 16px;">
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 241 |
このモードでは、性別・口調・感情などをプロンプトで指示するだけで、好みの声を作成できます。
|
| 242 |
+
テキストを入力し、声の特徴を自然言語で記述すると、モデルがその記述に合った音声を合成します。
|
| 243 |
</p>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 244 |
|
| 245 |
+
<h3 style="color: #000000; margin-top: 30px; margin-bottom: 15px; font-size: 20px;">使い方:</h3>
|
| 246 |
+
<ul style="color: #1a1a1a; font-weight: 500; line-height: 1.8; font-size: 15px;">
|
| 247 |
+
<li style="margin: 8px 0;">1. 読み上げたい日本語テキストを入力</li>
|
| 248 |
+
<li style="margin: 8px 0;">2. 声の特徴を記述(例:「かわいい元気な女の子の声」「落ち着いた低い男性ナレーター」)</li>
|
| 249 |
+
<li style="margin: 8px 0;">3. Generate をクリック!</li>
|
| 250 |
+
</ul>
|
| 251 |
+
|
| 252 |
+
<hr style="border: none; border-top: 1px solid rgba(0,0,0,0.15); margin: 25px 0;">
|
| 253 |
+
|
| 254 |
+
<h3 style="color: #000000; margin-top: 20px; margin-bottom: 15px; font-size: 20px;">注意事項:</h3>
|
| 255 |
+
<ul style="color: #1a1a1a; font-weight: 500; line-height: 1.8; font-size: 14px;">
|
| 256 |
+
<li style="margin: 8px 0;"><strong>Style Tags</strong>: テキストの先頭に <code><asmr></code> のようなスタイルタグを付けることで出力スタイルを制御できます。</li>
|
| 257 |
+
<li style="margin: 8px 0;"><strong>DRY Multiplier</strong>: 繰り返し防止のペナルティです。0にすると出力が崩壊する可能性があります。</li>
|
| 258 |
+
<li style="margin: 8px 0;"><strong>長さ制限</strong>: 最大入力400トークン、最大出力は約29.9秒です。</li>
|
| 259 |
+
</ul>
|
| 260 |
+
</div>
|
| 261 |
""")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 262 |
|
| 263 |
+
with gr.Column(scale=1):
|
|
|
|
| 264 |
gr.HTML("""
|
| 265 |
+
<div style="background-color: rgba(255, 255, 255, 0.525); padding: 30px; border-radius: 12px; backdrop-filter: blur(5px); max-width: 100%; box-shadow: 0 4px 6px rgba(0,0,0,0.5);">
|
| 266 |
+
<h2 style="color: #000000; margin-bottom: 20px; font-size: 28px;">English</h2>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 267 |
|
| 268 |
+
<p style="color: #1a1a1a; font-weight: 500; line-height: 1.8; margin-bottom: 20px; font-size: 16px;">
|
| 269 |
+
In this mode, you can create the voice you want by describing it with a prompt — gender, tone, emotion, characteristics, and more.
|
| 270 |
+
Enter the Japanese text you want spoken and describe the desired voice in natural language.
|
| 271 |
+
</p>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 272 |
|
| 273 |
+
<h3 style="color: #000000; margin-top: 30px; margin-bottom: 15px; font-size: 20px;">How to use:</h3>
|
| 274 |
+
<ul style="color: #1a1a1a; font-weight: 500; line-height: 1.8; font-size: 15px;">
|
| 275 |
+
<li style="margin: 8px 0;">1. Enter the Japanese text you want spoken</li>
|
| 276 |
+
<li style="margin: 8px 0;">2. Describe the voice (e.g. "a cute, energetic young girl voice", "a calm deep male narrator")</li>
|
| 277 |
+
<li style="margin: 8px 0;">3. Click Generate!</li>
|
| 278 |
+
</ul>
|
| 279 |
+
|
| 280 |
+
<hr style="border: none; border-top: 1px solid rgba(0,0,0,0.15); margin: 25px 0;">
|
| 281 |
+
|
| 282 |
+
<h3 style="color: #000000; margin-top: 20px; margin-bottom: 15px; font-size: 20px;">Parameter Guide:</h3>
|
| 283 |
+
|
| 284 |
+
<h4 style="color: #333; margin-top: 20px;">Style / CFG Parameters:</h4>
|
| 285 |
+
<ul style="color: #1a1a1a; font-weight: 500; line-height: 1.8; font-size: 14px;">
|
| 286 |
+
<li style="margin: 8px 0;"><strong>CFG Text:</strong> Classifier-free guidance strength for text conditioning</li>
|
| 287 |
+
<li style="margin: 8px 0;"><strong>CFG Style:</strong> Guidance strength for the voice description. Higher = more faithful but may sound unnatural.</li>
|
| 288 |
+
<li style="margin: 8px 0;"><strong>Description AdaLN Scale:</strong> Controls influence of the description embedding via adaptive layer norm</li>
|
| 289 |
+
</ul>
|
| 290 |
+
|
| 291 |
+
<h4 style="color: #333; margin-top: 20px;">Sampling Parameters:</h4>
|
| 292 |
+
<ul style="color: #1a1a1a; font-weight: 500; line-height: 1.8; font-size: 14px;">
|
| 293 |
+
<li style="margin: 8px 0;"><strong>Temperature:</strong> Controls randomness (lower = more deterministic)</li>
|
| 294 |
+
<li style="margin: 8px 0;"><strong>Min/Max Temperature:</strong> Adaptive temperature range based on entropy</li>
|
| 295 |
+
<li style="margin: 8px 0;"><strong>Temperature Exponent:</strong> Adaptive temperature mapping curve (1.0 = linear)</li>
|
| 296 |
+
<li style="margin: 8px 0;"><strong>Top K:</strong> Keeps only K most likely tokens (0 = disabled)</li>
|
| 297 |
+
<li style="margin: 8px 0;"><strong>Min P:</strong> Minimum probability threshold (0 = disabled)</li>
|
| 298 |
+
<li style="margin: 8px 0;"><strong>DRY Multiplier:</strong> Repetition penalty (0 = off, higher = less repetition). Turning it off can cause collapse.</li>
|
| 299 |
+
</ul>
|
| 300 |
+
|
| 301 |
+
<h4 style="color: #333; margin-top: 20px;">Notes:</h4>
|
| 302 |
+
<ul style="color: #1a1a1a; font-weight: 500; line-height: 1.8; font-size: 14px;">
|
| 303 |
+
<li style="margin: 8px 0;"><strong>Style tags</strong>: Start text with <code><asmr></code> etc. to control generation style.</li>
|
| 304 |
+
<li style="margin: 8px 0;">Maximum input: 400 tokens. Maximum output: ~29.9 seconds.</li>
|
| 305 |
+
</ul>
|
| 306 |
+
|
| 307 |
+
<div style="margin-top: 40px; padding-top: 20px; border-top: 1px solid rgba(0,0,0,0.1);">
|
| 308 |
+
<p style="color: #666; font-size: 14px; text-align: center;">
|
| 309 |
+
🌸 Takane Kiwami — Description-Guided Japanese Text-to-Speech
|
| 310 |
+
</p>
|
| 311 |
+
</div>
|
| 312 |
+
</div>
|
| 313 |
""")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 314 |
|
| 315 |
+
gr.HTML("""
|
| 316 |
+
<div style="text-align: center; margin-top: 30px; padding: 20px; background-color: rgba(255, 255, 255, 0.35); border-radius: 10px; backdrop-filter: blur(5px);">
|
| 317 |
+
<p style="color: #1a1a1a; font-size: 17px; font-weight: 500;">
|
| 318 |
+
If you need help or have questions, feel free to contact me on
|
| 319 |
+
<a href="https://x.com/MystiqCaleid" target="_blank" style="color: #b45309; text-decoration: none; font-weight: 600;">X / Twitter</a>
|
| 320 |
+
or
|
| 321 |
+
<a href="https://discord.com/users/349236707167698944" target="_blank" style="color: #5865F2; text-decoration: none; font-weight: 600;">Discord (@soshyant)</a>
|
| 322 |
+
</p>
|
| 323 |
+
</div>
|
| 324 |
+
""")
|
| 325 |
|
| 326 |
if __name__ == "__main__":
|
| 327 |
+
demo.queue(api_open=False, max_size=15).launch()
|
|
|
|
|
|