Respair commited on
Commit
16b7f44
·
verified ·
1 Parent(s): af3fef5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +140 -952
app.py CHANGED
@@ -1,16 +1,11 @@
1
  import gradio as gr
2
  from gradio_client import Client
3
  import os
4
- import csv
5
  import numpy as np
6
  import scipy.io.wavfile as wavfile
7
- import tempfile
8
 
9
-
10
- try:
11
- client = Client(os.environ['src'])
12
- except:
13
- client = Client("http://localhost:7860/")
14
 
15
  css = """
16
  .gradio-container input::placeholder,
@@ -22,13 +17,6 @@ code {
22
  padding: 2px 4px;
23
  border-radius: 3px;
24
  }
25
- #settings-accordion summary {
26
- justify-content: center;
27
- }
28
- .examples-holder > .label {
29
- color: #b45309 !important;
30
- font-weight: 600;
31
- }
32
 
33
  .gr-checkbox label span,
34
  .gr-check-radio label span,
@@ -36,11 +24,6 @@ code {
36
  .checkbox-container span {
37
  color: #ECF2F7 !important;
38
  }
39
- .gr-checkbox label span.selected,
40
- .gr-check-radio label span.selected,
41
- [data-testid="checkbox"].selected span {
42
- color: #FFD700 !important;
43
- }
44
 
45
  #advanced-accordion > button,
46
  #advanced-accordion > button span,
@@ -53,145 +36,6 @@ code {
53
  color: #FFD700 !important;
54
  }
55
 
56
- .examples-table {
57
- border-collapse: collapse !important;
58
- width: 100% !important;
59
- }
60
-
61
- .examples-table tbody tr {
62
- background-color: #d4c896 !important;
63
- border-bottom: 2px solid #c4b886 !important;
64
- }
65
-
66
- .examples-table tbody tr:hover {
67
- background-color: #c9bd8b !important;
68
- }
69
-
70
- .examples-table tbody td {
71
- background-color: #d4c896 !important;
72
- padding: 12px 16px !important;
73
- color: #1a1a1a !important;
74
- font-weight: 500 !important;
75
- border: none !important;
76
- }
77
-
78
- .examples-table thead th {
79
- background-color: #bfb07a !important;
80
- color: #1a1a1a !important;
81
- font-weight: 600 !important;
82
- padding: 10px 16px !important;
83
- }
84
-
85
- .gallery,
86
- .gr-examples,
87
- .examples {
88
- background: transparent !important;
89
- }
90
-
91
- .gallery > div,
92
- .gr-examples > div,
93
- .examples > div {
94
- background: transparent !important;
95
- }
96
-
97
- .gallery button,
98
- .gr-examples button,
99
- .examples button,
100
- .gr-sample-textbox,
101
- button.sample {
102
- background-color: #d4c896 !important;
103
- border: 1px solid #c4b886 !important;
104
- color: #1a1a1a !important;
105
- font-weight: 500 !important;
106
- margin: 4px !important;
107
- padding: 10px 14px !important;
108
- border-radius: 6px !important;
109
- transition: background-color 0.2s ease !important;
110
- }
111
-
112
- .gallery button:hover,
113
- .gr-examples button:hover,
114
- .examples button:hover,
115
- .gr-sample-textbox:hover,
116
- button.sample:hover {
117
- background-color: #c9bd8b !important;
118
- border-color: #b4a876 !important;
119
- }
120
-
121
- #mono-examples-container .gallery button,
122
- #mono-examples-container .gr-examples button,
123
- #mono-examples-container .examples button,
124
- #mono-examples-container button.sample {
125
- background-color: #d4c896 !important;
126
- border-color: #c4b886 !important;
127
- }
128
-
129
- #stereo-examples-container .gallery button,
130
- #stereo-examples-container .gr-examples button,
131
- #stereo-examples-container .examples button,
132
- #stereo-examples-container button.sample {
133
- background-color: #c8d4a6 !important;
134
- border-color: #b8c496 !important;
135
- }
136
-
137
- #stereo-examples-container .gallery button:hover,
138
- #stereo-examples-container .gr-examples button:hover,
139
- #stereo-examples-container .examples button:hover,
140
- #stereo-examples-container button.sample:hover {
141
- background-color: #bdc9a0 !important;
142
- border-color: #a8b486 !important;
143
- }
144
-
145
- .gr-examples table,
146
- .examples table,
147
- table.examples-table {
148
- width: 100% !important;
149
- border-collapse: collapse !important;
150
- }
151
-
152
- .gr-examples table tr,
153
- .examples table tr {
154
- background-color: #d4c896 !important;
155
- }
156
-
157
- .gr-examples table tr:hover,
158
- .examples table tr:hover {
159
- background-color: #c9bd8b !important;
160
- }
161
-
162
- .gr-examples table td,
163
- .examples table td {
164
- background-color: inherit !important;
165
- padding: 12px 16px !important;
166
- color: #1a1a1a !important;
167
- font-weight: 500 !important;
168
- cursor: pointer !important;
169
- }
170
-
171
- .gr-examples .gr-samples-table,
172
- .examples .gr-samples-table {
173
- background: transparent !important;
174
- }
175
-
176
- .gr-examples .gr-samples-table tr,
177
- .examples .gr-samples-table tr {
178
- background-color: #d4c896 !important;
179
- margin-bottom: 4px !important;
180
- }
181
-
182
- .gr-examples > div > div,
183
- .examples > div > div {
184
- background-color: #d4c896 !important;
185
- border-radius: 6px !important;
186
- margin: 4px !important;
187
- padding: 8px 12px !important;
188
- }
189
-
190
- .gr-examples > div > div:hover,
191
- .examples > div > div:hover {
192
- background-color: #c9bd8b !important;
193
- }
194
-
195
  body {
196
  background: none !important;
197
  }
@@ -207,199 +51,45 @@ body::before {
207
  pointer-events: none;
208
  background: url('https://i.postimg.cc/1smD6GPf/gradio-theme-rin2.png') center center / cover no-repeat;
209
  }
210
-
211
  """
212
 
213
 
214
- def save_audio_to_temp(audio_data):
215
- if audio_data is None:
216
- return None
217
- sample_rate, audio_array = audio_data
218
- if isinstance(audio_array, list):
219
- audio_array = np.array(audio_array)
220
- if audio_array.dtype == np.float32 or audio_array.dtype == np.float64:
221
- audio_array = (audio_array * 32767).astype(np.int16)
222
- tmp = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
223
- wavfile.write(tmp.name, sample_rate, audio_array)
224
- tmp.close()
225
- return tmp.name
226
-
227
-
228
- def cleanup_temp(path):
229
- if path is not None:
230
- try:
231
- os.unlink(path)
232
- except:
233
- pass
234
-
235
-
236
- def load_examples(csv_path):
237
- examples = []
238
- if not os.path.exists(csv_path):
239
- return examples
240
- try:
241
- with open(csv_path, 'r', encoding='utf-8') as f:
242
- reader = csv.reader(f, delimiter=',', quotechar='"', doublequote=True)
243
- for row in reader:
244
- if len(row) >= 2:
245
- audio_path = row[0].strip()
246
- text = row[1].strip()
247
- if text.startswith('"') and text.endswith('"'):
248
- text = text[1:-1]
249
- elif text.startswith("'") and text.endswith("'"):
250
- text = text[1:-1]
251
- if text.startswith('\u201c') and text.endswith('\u201d'):
252
- text = text[1:-1]
253
- if text.startswith('\u300c') and text.endswith('\u300d'):
254
- text = text[1:-1]
255
-
256
- speaker_id = 1
257
- if len(row) >= 3 and row[2].strip():
258
- try:
259
- speaker_id = int(row[2].strip())
260
- except ValueError:
261
- speaker_id = 1
262
-
263
- pregenerated_audio = None
264
- if audio_path and audio_path.lower() != "none" and audio_path != "":
265
- if not os.path.isabs(audio_path):
266
- base_dir = os.path.dirname(csv_path)
267
- audio_path = os.path.join(base_dir, audio_path)
268
- if os.path.exists(audio_path):
269
- pregenerated_audio = audio_path
270
- examples.append([text, pregenerated_audio, speaker_id])
271
- except Exception as e:
272
- pass
273
- return examples
274
-
275
-
276
  def run_generation_pipeline_client(
277
  raw_text,
278
- audio_prompt,
279
- use_stereo,
280
- speaker_id,
281
- cfg_scale,
282
- temperature,
283
- min_temp,
284
- max_temp,
285
- top_k,
286
- top_p,
287
- min_p,
288
- dry_multiplier,
289
- max_tokens,
290
- pan_idx,
291
- width_idx,
292
- seed,
293
- ):
294
- try:
295
- audio_path = save_audio_to_temp(audio_prompt)
296
- audio_for_api = None
297
- if audio_path is not None:
298
- audio_for_api = {"path": audio_path, "meta": {"_type": "gradio.FileData"}}
299
-
300
- result = client.predict(
301
- raw_text,
302
- audio_for_api,
303
- use_stereo,
304
- speaker_id,
305
- cfg_scale,
306
- temperature,
307
- min_temp,
308
- max_temp,
309
- top_k,
310
- top_p,
311
- min_p,
312
- dry_multiplier,
313
- max_tokens,
314
- pan_idx,
315
- width_idx,
316
- seed,
317
- "",
318
- api_name="/run_generation_pipeline"
319
- )
320
-
321
- cleanup_temp(audio_path)
322
-
323
- if result is None:
324
- return None, "Status: No response from server"
325
-
326
- if isinstance(result, (list, tuple)) and len(result) == 2:
327
- audio_result, status_msg = result
328
- if audio_result is not None:
329
- if isinstance(audio_result, str) and os.path.exists(audio_result):
330
- sr, data = wavfile.read(audio_result)
331
- return (sr, data), status_msg
332
- elif isinstance(audio_result, (list, tuple)) and len(audio_result) >= 2:
333
- sr = audio_result[0]
334
- data = audio_result[1]
335
- if isinstance(data, list):
336
- data = np.array(data)
337
- return (sr, data), status_msg
338
- return None, status_msg
339
-
340
- return None, "Status: Unexpected response format from server"
341
-
342
- except Exception as e:
343
- cleanup_temp(audio_path if 'audio_path' in dir() else None)
344
- return None, f"Status: Connection error: {str(e)}"
345
-
346
-
347
- def run_alchemy_pipeline_client(
348
- raw_text,
349
- ref_audio_1,
350
- ref_audio_2,
351
- mix_ratio,
352
- cfg_scale,
353
- speaker_cfg_scale,
354
- speaker_adaln_scale,
355
  temperature,
356
  min_temp,
357
  max_temp,
358
  temp_exponent,
359
  top_k,
360
- top_p,
361
  min_p,
362
  dry_multiplier,
363
  max_tokens,
364
  seed,
365
  ):
366
  try:
367
- ref1_path = save_audio_to_temp(ref_audio_1)
368
- ref2_path = save_audio_to_temp(ref_audio_2)
369
-
370
- ref1_for_api = None
371
- if ref1_path is not None:
372
- ref1_for_api = {"path": ref1_path, "meta": {"_type": "gradio.FileData"}}
373
-
374
- ref2_for_api = None
375
- if ref2_path is not None:
376
- ref2_for_api = {"path": ref2_path, "meta": {"_type": "gradio.FileData"}}
377
-
378
  result = client.predict(
379
  raw_text,
380
- ref1_for_api,
381
- ref2_for_api,
382
- mix_ratio,
383
- cfg_scale,
384
- speaker_cfg_scale,
385
- speaker_adaln_scale,
386
  temperature,
387
  min_temp,
388
  max_temp,
389
  temp_exponent,
390
  top_k,
391
- top_p,
392
  min_p,
393
  dry_multiplier,
394
  max_tokens,
395
  seed,
396
- "",
397
- api_name="/run_alchemy_pipeline"
398
  )
399
 
400
- cleanup_temp(ref1_path)
401
- cleanup_temp(ref2_path)
402
-
403
  if result is None:
404
  return None, "Status: No response from server"
405
 
@@ -420,169 +110,90 @@ def run_alchemy_pipeline_client(
420
  return None, "Status: Unexpected response format from server"
421
 
422
  except Exception as e:
423
- cleanup_temp(ref1_path if 'ref1_path' in dir() else None)
424
- cleanup_temp(ref2_path if 'ref2_path' in dir() else None)
425
  return None, f"Status: Connection error: {str(e)}"
426
 
427
 
428
- examples_mono_csv_path = "samples/examples_mono.csv"
429
- examples_stereo_csv_path = "samples/examples_stereo.csv"
430
-
431
- example_list_mono = load_examples(examples_mono_csv_path)
432
- example_list_stereo = load_examples(examples_stereo_csv_path)
433
-
434
-
435
  with gr.Blocks(theme="Respair/Shiki@10.1.0", css=css) as demo:
436
 
437
- # gr.Markdown('<h1 style="text-align: center;">🪷 Takane - Mirei 「美嶺」 </h1>')
438
- # gr.HTML('<h1 style="text-align: center;">🪷 Takane - Mirei 「美嶺」 </h1>')
439
- # gr.HTML('<h1 style="text-align: center; font-size: 24px;">🪷 Takane - Mirei 「美嶺」 </h1>')
440
  gr.Markdown(
441
- """
442
- <div style="text-align: left;">
443
- モデルの性能を発揮し、ハルシネーションや不自然な生成を防ぐには、学習データに沿った入力スタイルを守る必要があります。
444
- 使用前に <code>Guide</code> と <code>Examples</code> タブをよくチェックしてください。
445
- I hope you enjoy! <br>
446
- 音声プロンプトを使たい場合は、Alchemy モドの使用強く推奨ます。<br>
447
- 実嶺はゼロショットモデルではなく、話者ID付きで学習しているため、音声プロンプトとの話者類似性はどのみち低めになります。<br>
448
-
449
- </div>
450
- """
451
  )
452
 
453
  with gr.Tabs():
454
 
 
455
  with gr.TabItem("Speech Generation"):
456
  with gr.Row():
457
  with gr.Column(scale=2):
458
  text_input = gr.Textbox(
459
- label="Text here",
460
  lines=5,
461
- max_length=125,
462
  placeholder="ここでテキストを入力してください...\n\n"
463
  )
464
 
465
- use_stereo_checkbox = gr.Checkbox(
466
- label="🎧 Use Stereo Mode",
467
- value=False,
468
- info="Using a headphone is recommended. This mode is 2-3x slower, please disable it for regular usage, mono is more suitable for most usages. <br> 処理が2〜3倍重くなるため、普段はオフにしてください。大抵の用途では Mono の方が適しています。"
469
  )
470
 
471
  with gr.Row(equal_height=False):
472
  with gr.Accordion("----------------------------------⭐ 🛠️ ⭐", open=False):
473
 
474
- audio_prompt_input = gr.Audio(
475
- label="Audio Prompt (Optional — Please set Speaker ID to 1)",
476
- sources=["upload", "microphone"],
477
- type="numpy"
478
- )
479
 
480
- speaker_id_slider = gr.Slider(
481
- label="Speaker ID",
482
- minimum=1,
483
- maximum=2000,
484
- value=1,
485
- step=1
486
  )
487
-
488
- gr.Markdown('<h3 style="color: #FFD700;">Common Parameters</h3>')
489
-
490
- cfg_scale_slider = gr.Slider(
491
- label="CFG Scale",
492
- minimum=1.0,
493
- maximum=3.0,
494
- value=1.15,
495
- step=0.05
496
  )
497
 
 
 
498
  temperature_slider = gr.Slider(
499
- label="Temperature (Min/Max Temp の両方が 0 に設定されている場合のみ有効です)",
500
- minimum=0.0,
501
- maximum=2.0,
502
- value=1.0,
503
- step=0.05
504
  )
505
-
506
  min_temp_slider = gr.Slider(
507
- label="Min Temperature (0 = off; ignored in stereo mode)",
508
- minimum=0.0,
509
- maximum=2.0,
510
- value=0.25,
511
- step=0.05
512
  )
513
-
514
  max_temp_slider = gr.Slider(
515
- label="Max Temperature (0 = off; ignored in stereo mode)",
516
- minimum=0.0,
517
- maximum=2.0,
518
- value=1.0,
519
- step=0.05
520
  )
521
-
522
- top_k_slider = gr.Slider(
523
- label="Top K (0 = off)",
524
- minimum=0,
525
- maximum=200,
526
- value=0,
527
- step=1
528
  )
529
-
530
- top_p_slider = gr.Slider(
531
- label="Top P",
532
- minimum=0.0,
533
- maximum=1.0,
534
- value=1.0,
535
- step=0.01
536
  )
537
-
538
  min_p_slider = gr.Slider(
539
- label="Min P (0 = off)",
540
- minimum=0.0,
541
- maximum=1.0,
542
- value=0.0,
543
- step=0.01
544
  )
545
 
546
- max_tokens_slider = gr.Slider(
547
- label="Max Tokens",
548
- minimum=100,
549
- maximum=1500,
550
- value=768,
551
- step=10
552
- )
553
 
554
  dry_multiplier_slider = gr.Slider(
555
- label="DRY Multiplier (0 = off)",
556
- minimum=0.0,
557
- maximum=2.0,
558
- value=0.8,
559
- step=0.1
560
  )
561
 
562
- seed_slider = gr.Slider(
563
- label="Seed (-1 for random)",
564
- minimum=-1,
565
- maximum=2700000000,
566
- value=-1,
567
- step=1
568
- )
569
 
570
- gr.Markdown('<h3 style="color: #FFD700;">Stereo-Only Parameters</h3>')
571
-
572
- pan_idx_slider = gr.Slider(
573
- label="Pan (0=Left, 5=Center, 10=Right)",
574
- minimum=0,
575
- maximum=10,
576
- value=2,
577
- step=1
578
  )
579
-
580
- width_idx_slider = gr.Slider(
581
- label="Stereo Width (0=Narrow, 10=Wide)",
582
- minimum=0,
583
- maximum=10,
584
- value=5,
585
- step=1
586
  )
587
 
588
  with gr.Column(scale=1):
@@ -595,545 +206,122 @@ with gr.Blocks(theme="Respair/Shiki@10.1.0", css=css) as demo:
595
  interactive=False
596
  )
597
 
598
- def on_stereo_toggle(use_stereo):
599
- if use_stereo:
600
- temp, min_t, max_t, min_p_val = 1.0, 0.0, 0.0, 0.05
601
- if len(example_list_stereo) >= 2:
602
- ex = example_list_stereo[1]
603
- text = ex[0]
604
- sid = ex[2]
605
- path = ex[1]
606
- if path and os.path.exists(path):
607
- sr, data = wavfile.read(path)
608
- return (gr.update(value=temp), gr.update(value=min_t), gr.update(value=max_t), gr.update(value=min_p_val),
609
- gr.update(value=text), gr.update(value=sid), (sr, data), "Status: Stereo example loaded / ステレオ例を読み込みました")
610
- return (gr.update(value=temp), gr.update(value=min_t), gr.update(value=max_t), gr.update(value=min_p_val),
611
- gr.update(), gr.update(), gr.update(), gr.update())
612
- else:
613
- temp, min_t, max_t, min_p_val = 1.0, 0.25, 1.0, 0.0
614
- if len(example_list_mono) >= 2:
615
- ex = example_list_mono[-1]
616
- text = ex[0]
617
- sid = ex[2]
618
- path = ex[1]
619
- if path and os.path.exists(path):
620
- sr, data = wavfile.read(path)
621
- return (gr.update(value=temp), gr.update(value=min_t), gr.update(value=max_t), gr.update(value=min_p_val),
622
- gr.update(value=text), gr.update(value=sid), (sr, data), "Status: Mono example loaded / モノラル例を読み込みました")
623
- return (gr.update(value=temp), gr.update(value=min_t), gr.update(value=max_t), gr.update(value=min_p_val),
624
- gr.update(), gr.update(), gr.update(), gr.update())
625
-
626
-
627
- use_stereo_checkbox.change(
628
- fn=on_stereo_toggle,
629
- inputs=[use_stereo_checkbox],
630
- outputs=[temperature_slider, min_temp_slider, max_temp_slider, min_p_slider,
631
- text_input, speaker_id_slider, audio_output, status_output]
632
- )
633
-
634
  generate_button.click(
635
  fn=run_generation_pipeline_client,
636
  inputs=[
637
  text_input,
638
- audio_prompt_input,
639
- use_stereo_checkbox,
640
- speaker_id_slider,
641
- cfg_scale_slider,
642
  temperature_slider,
643
  min_temp_slider,
644
  max_temp_slider,
 
645
  top_k_slider,
646
- top_p_slider,
647
  min_p_slider,
648
  dry_multiplier_slider,
649
  max_tokens_slider,
650
- pan_idx_slider,
651
- width_idx_slider,
652
  seed_slider,
653
  ],
654
  outputs=[audio_output, status_output],
655
- concurrency_limit=4
656
  )
657
 
658
- with gr.TabItem("⚗️ Alchemy ⚗️"):
659
-
660
- # gr.HTML("""
661
- # <div style="background-color: rgba(255, 255, 255, 0.025); padding: 20px; border-radius: 12px; backdrop-filter: blur(10px); box-shadow: 0 4px 6px rgba(0,0,0,0.5); margin-top: 8px;">
662
- # <p style="color: #1a1a1a; font-weight: 500; line-height: 1.6; font-size: 14px; text-align: center; margin: 0;">
663
- # Upload audio references and mix them to create new voices.
664
- # Upload two references and adjust the mix ratio, or use a single reference.
665
- # </p>
666
- # </div>
667
- # """)
668
-
669
  with gr.Row():
670
- with gr.Column(scale=2):
671
- alch_text_input = gr.Textbox(
672
- label="Synthesize",
673
- lines=5,
674
- placeholder="日本語のテキストを入力してください...\n\n",
675
- value="睡眠は、心身の健康を保つために欠かせない大切な時間です。良質な睡眠をとることで、一日の疲れを癒やし、脳内の情報を整理することができます。寝る前にスマートフォンを控えたり、温かい飲み物を飲んでリラックスすることで、より深く眠れるようになります。明日も元気に過ごすために、今夜はゆっくりと体を休めましょうね。"
676
- )
677
-
678
- alch_mix_ratio = gr.Slider(
679
- minimum=0.0,
680
- maximum=1.0,
681
- value=0.5,
682
- step=0.05,
683
- label="Mix Ratio",
684
- )
685
-
686
- with gr.Row():
687
- with gr.Column():
688
- gr.HTML("""
689
- <div style="background-color: rgba(255, 255, 255, 0.55); padding: 8px 12px; border-radius: 8px; backdrop-filter: blur(10px); box-shadow: 0 2px 4px rgba(0,0,0,0.08); text-align: center; max-width: 180px; margin: 0 auto;">
690
- <h3 style="color: #000000; margin: 0; font-size: 16px;">🔊 Reference 1</h3>
691
- </div>
692
- """)
693
- alch_ref_audio_1 = gr.Audio(
694
- label="Reference Audio 1",
695
- sources=["upload", "microphone"],
696
- type="numpy",
697
- value="samples/sample_01.mp3"
698
- )
699
- with gr.Column():
700
- gr.HTML("""
701
- <div style="background-color: rgba(255, 255, 255, 0.525); padding: 8px 12px; border-radius: 8px; backdrop-filter: blur(10px); box-shadow: 0 2px 4px rgba(0,0,0,0.08); text-align: center; max-width: 250px; margin: 0 auto;">
702
- <h3 style="color: #000000; margin: 0; font-size: 16px;">🔊 Reference 2 (Optional)</h3>
703
- </div>
704
- """)
705
- alch_ref_audio_2 = gr.Audio(
706
- label="Reference Audio 2 (Optional)",
707
- sources=["upload", "microphone"],
708
- type="numpy",
709
- value="samples/sample_02.mp3"
710
- )
711
-
712
- with gr.Row(equal_height=False):
713
- with gr.Accordion("----------------------------------⭐ 🛠️ ⭐", open=False):
714
-
715
- gr.Markdown('<h3 style="color: #FFD700;">Style Parameters</h3>')
716
-
717
- alch_cfg_scale = gr.Slider(
718
- label="CFG Scale",
719
- minimum=0.5,
720
- maximum=3.0,
721
- value=1.4,
722
- step=0.05
723
- )
724
-
725
- alch_speaker_cfg_scale = gr.Slider(
726
- label="Speaker CFG Scale (Legacy - just rely AdaLN scale)",
727
- minimum=0.5,
728
- maximum=3.0,
729
- value=1.,
730
- step=0.1
731
- )
732
-
733
- alch_speaker_adaln_scale = gr.Slider(
734
- label="Speaker AdaLN Scale (値を上げると参照音声に近づきますが、不自然になる可能性があります)",
735
- minimum=0.0,
736
- maximum=3.0,
737
- value=2.,
738
- step=0.1
739
- )
740
-
741
- gr.Markdown('<h3 style="color: #FFD700;">Sampling Parameters</h3>')
742
-
743
- alch_temperature = gr.Slider(
744
- label="Temperature (Min/Max Temp の両方が 0 に設定されている場合のみ有効です)",
745
- minimum=0.0,
746
- maximum=2.0,
747
- value=0.0,
748
- step=0.05
749
- )
750
-
751
- alch_min_temp = gr.Slider(
752
- label="Min Temperature (0 = off)",
753
- minimum=0.0,
754
- maximum=1.0,
755
- value=0.1,
756
- step=0.05
757
- )
758
-
759
- alch_max_temp = gr.Slider(
760
- label="Max Temperature (0 = off)",
761
- minimum=0.0,
762
- maximum=2.0,
763
- value=1.0,
764
- step=0.1
765
- )
766
-
767
- alch_temp_exponent = gr.Slider(
768
- label="Temperature Exponent",
769
- minimum=0.5,
770
- maximum=2.0,
771
- value=1.0,
772
- step=0.1
773
- )
774
-
775
- alch_top_k = gr.Slider(
776
- label="Top K (0 = off)",
777
- minimum=0,
778
- maximum=200,
779
- value=0,
780
- step=5
781
- )
782
-
783
- alch_top_p = gr.Slider(
784
- label="Top P",
785
- minimum=0.0,
786
- maximum=1.0,
787
- value=1.0,
788
- step=0.01
789
- )
790
-
791
- alch_min_p = gr.Slider(
792
- label="Min P (0 = off)",
793
- minimum=0.0,
794
- maximum=1.0,
795
- value=0.0,
796
- step=0.01
797
- )
798
-
799
- alch_max_tokens = gr.Slider(
800
- label="Max Tokens",
801
- minimum=100,
802
- maximum=1500,
803
- value=1024,
804
- step=10
805
- )
806
-
807
- alch_dry_multiplier = gr.Slider(
808
- label="DRY Multiplier (0 = off)",
809
- minimum=0.0,
810
- maximum=2.0,
811
- value=0.8,
812
- step=0.1
813
- )
814
-
815
- alch_seed = gr.Slider(
816
- label="Seed (-1 for random)",
817
- minimum=-1,
818
- maximum=2700000000,
819
- value=42,
820
- step=1
821
- )
822
-
823
- with gr.Column(scale=1):
824
- alch_generate_button = gr.Button("⚗️ Generate", variant="primary", size="lg")
825
-
826
  with gr.Column(scale=1):
827
- alch_status_output = gr.Textbox(label="Status", interactive=False)
828
- alch_audio_output = gr.Audio(
829
- label="Transmuted Speech",
830
- interactive=False,
831
- value="samples/audio_62.wav"
832
-
833
- )
834
-
835
- alch_generate_button.click(
836
- fn=run_alchemy_pipeline_client,
837
- inputs=[
838
- alch_text_input,
839
- alch_ref_audio_1,
840
- alch_ref_audio_2,
841
- alch_mix_ratio,
842
- alch_cfg_scale,
843
- alch_speaker_cfg_scale,
844
- alch_speaker_adaln_scale,
845
- alch_temperature,
846
- alch_min_temp,
847
- alch_max_temp,
848
- alch_temp_exponent,
849
- alch_top_k,
850
- alch_top_p,
851
- alch_min_p,
852
- alch_dry_multiplier,
853
- alch_max_tokens,
854
- alch_seed,
855
- ],
856
- outputs=[alch_audio_output, alch_status_output],
857
- concurrency_limit=4
858
- )
859
 
860
- with gr.TabItem("Description Guided"):
861
- gr.HTML("""
862
- <div style="background-color: rgba(255, 255, 255, 0.025); padding: 60px 30px; border-radius: 12px; backdrop-filter: blur(5px); max-width: 100%; box-shadow: 0 4px 6px rgba(0,0,0,0.1); text-align: center;">
863
- <p style="color: #1a1a1a; font-weight: 600; line-height: 1.8; margin-bottom: 8px; font-size: 24px;">
864
- Soon...
865
- </p>
866
- <p style="color: #555; font-weight: 400; line-height: 1.6; margin-bottom: 24px; font-size: 14px;">
867
- Here, you can create the voice you want by describing it with a prompt (gender, tone, characteristic, emotion etc.)
868
- </p>
869
- <p style="color: #1a1a1a; font-weight: 500; line-height: 1.8; margin-bottom: 8px; font-size: 20px;">
870
- 近日公開...
871
- </p>
872
- <p style="color: #555; font-weight: 400; line-height: 1.6; font-size: 13px;">
873
  このモードでは、性別・口調・感情などをプロンプトで指示するだけで、好みの声を作成できます。
 
874
  </p>
875
- </div>
876
- """)
877
- with gr.TabItem("Examples"):
878
- gr.HTML("""
879
- <div style="background-color: rgba(255, 255, 255, 0.025); padding: 20px; border-radius: 12px; backdrop-filter: blur(10px); box-shadow: 0 4px 6px rgba(0,0,0,0.5); margin-top: 8px;">
880
- <p style="color: #1a1a1a; font-weight: 500; line-height: 1.6; font-size: 14px; text-align: center; margin: 0;">
881
- Click on any example below to load the text and hear the pre-generated audio. / 下の例をクリックすると、テキストが読み込まれ、生成済みの音声を聞くことができます。
882
- </p>
883
- </div>
884
- """)
885
-
886
-
887
- example_text_holder = gr.Textbox(visible=False)
888
-
889
- def load_mono_example_fn(text):
890
- for ex in example_list_mono:
891
- if ex[0] == text:
892
- pregenerated_path = ex[1]
893
- sid = ex[2]
894
- if pregenerated_path and os.path.exists(pregenerated_path):
895
- try:
896
- sample_rate, audio_data = wavfile.read(pregenerated_path)
897
- status = "Status: Mono example loaded / モノラル例を読み込みました"
898
- return text, False, sid, (sample_rate, audio_data), status
899
- except Exception as e:
900
- return text, False, sid, None, f"Status: Error loading audio: {str(e)}"
901
- else:
902
- return text, False, sid, None, "Status: Mono example loaded (no pre-generated audio)"
903
- return text, False, 1, None, "Status: Mono example loaded"
904
-
905
- def load_stereo_example_fn(text):
906
- for ex in example_list_stereo:
907
- if ex[0] == text:
908
- pregenerated_path = ex[1]
909
- sid = ex[2]
910
- if pregenerated_path and os.path.exists(pregenerated_path):
911
- try:
912
- sample_rate, audio_data = wavfile.read(pregenerated_path)
913
- status = "Status: Stereo example loaded / ステレオ例を読み込みました"
914
- return text, True, sid, (sample_rate, audio_data), status
915
- except Exception as e:
916
- return text, True, sid, None, f"Status: Error loading audio: {str(e)}"
917
- else:
918
- return text, True, sid, None, "Status: Stereo example loaded (no pre-generated audio)"
919
- return text, True, 1, None, "Status: Stereo example loaded"
920
 
921
- with gr.Row():
922
- with gr.Column(scale=1, elem_id="mono-examples-container"):
923
-
924
- gr.HTML("""
925
- <div style="background-color: rgba(255, 255, 255, 0.55); padding: 8px 12px; border-radius: 8px; backdrop-filter: blur(10px); box-shadow: 0 2px 4px rgba(0,0,0,0.08); text-align: center; max-width: 180px; margin: 0 auto;">
926
- <h3 style="color: #000000; margin: 0; font-size: 16px;">🔊 Mono </h3>
927
- </div>
 
 
 
 
 
 
 
 
 
928
  """)
929
-
930
- if example_list_mono:
931
- mono_example_texts = [[ex[0]] for ex in example_list_mono]
932
- gr.Examples(
933
- examples=mono_example_texts,
934
- inputs=[example_text_holder],
935
- outputs=[text_input, use_stereo_checkbox, speaker_id_slider, audio_output, status_output],
936
- fn=load_mono_example_fn,
937
- label="Click to load a mono example",
938
- cache_examples=False,
939
- run_on_click=True,
940
- examples_per_page=15
941
- )
942
- else:
943
- gr.Markdown("*No mono examples available*")
944
 
945
- with gr.Column(scale=1, elem_id="stereo-examples-container"):
946
-
947
  gr.HTML("""
948
- <div style="background-color: rgba(255, 255, 255, 0.55); padding: 8px 12px; border-radius: 8px; backdrop-filter: blur(10px); box-shadow: 0 2px 4px rgba(0,0,0,0.08); text-align: center; max-width: 180px; margin: 0 auto;">
949
- <h3 style="color: #000000; margin: 0; font-size: 16px;">🎧 Stereo </h3>
950
- </div>
951
- """)
952
-
953
-
954
- if example_list_stereo:
955
- stereo_example_texts = [[ex[0]] for ex in example_list_stereo]
956
- gr.Examples(
957
- examples=stereo_example_texts,
958
- inputs=[example_text_holder],
959
- outputs=[text_input, use_stereo_checkbox, speaker_id_slider, audio_output, status_output],
960
- fn=load_stereo_example_fn,
961
- label="Click to load a stereo example",
962
- cache_examples=False,
963
- run_on_click=True,
964
- examples_per_page=15
965
- )
966
- else:
967
- gr.Markdown("*No stereo examples available*")
968
-
969
- with gr.TabItem("Guide"):
970
- gr.HTML('<h1 style="text-align: center;">🪷 Takane - Mirei 「美嶺」 </h1>')
971
- with gr.Row():
972
- with gr.Column(scale=1):
973
-
974
- gr.HTML("""
975
- <div style="background-color: rgba(255, 255, 255, 0.525); padding: 30px; border-radius: 12px; backdrop-filter: blur(5px); max-width: 100%; box-shadow: 0 4px 6px rgba(0,0,0,0.5);">
976
- <h2 style="color: #000000; margin-bottom: 20px; font-size: 28px;">日本語</h2>
977
-
978
- <div style="background-color: rgba(255, 200, 200, 0.45); padding: 16px 20px; border-radius: 8px; border-left: 4px solid #cc4444; margin-bottom: 20px;">
979
- <p style="color: #1a1a1a; font-weight: 600; line-height: 1.8; font-size: 14px; margin: 0;">
980
- ⚠️ サーバー負荷を軽くするため、オリジナルのTakaneとは違い、このデモではハルシネーション対策などの安全策を省いています。
981
- そのため、モデルの出力が崩れたり、デモが止まるリスクは通常より高いです。
982
- たまにメンテしますが、あくまでWIPということでご了承ください。
983
- </p>
984
- </div>
985
-
986
- <p style="color: #1a1a1a; font-weight: 500; line-height: 1.8; margin-bottom: 20px; font-size: 16px;">
987
- Mirei「美嶺」 は、Takane の系譜に連なる最新モデルであり、高品質なアニメっぽい日本語音声生成のためにゼロから再構築・再設計されたモデルです。より大規模で高速、かつクリーンな音質を実現し、機能も増え、学習コストも大幅に抑えられています。
988
- </p>
989
-
990
- <p style="color: #1a1a1a; font-weight: 500; line-height: 1.8; margin-bottom: 20px; font-size: 16px;">
991
- これは、現代の最高峰の技術を駆使し、究極の日本語音声合成モデルを開発するという私の目標に向けた、新たな一歩です。
992
- </p>
993
-
994
- <h3 style="color: #000000; margin-top: 30px; margin-bottom: 15px; font-size: 20px;">初代 Takane との比較:</h3>
995
- <ul style="color: #1a1a1a; font-weight: 500; line-height: 1.8; font-size: 15px;">
996
- <li style="margin: 8px 0;">大幅な容量拡大: 500M → 1.2B (Mono) / 3B (Stereo) パラメータ</li>
997
- <li style="margin: 8px 0;">より多くのデータでの学習</li>
998
- <li style="margin: 8px 0;">実験的な制御可能 End-to-end Stereo 生成</li>
999
- <li style="margin: 8px 0;">Promptable Description Guided (TBA)</li>
1000
- <li style="margin: 8px 0;">FiLM ベースの文字起こし不要なゼロショット・オーディオプロンプティング (Alchemy mode)</li>
1001
- <li style="margin: 8px 0;">この種のモデルとしては最も包括的なサンプリングツールキット</li>
1002
- <li style="margin: 8px 0;">アーティファクトを極限まで抑え、鮮明な音質を実現する新しい 44.1khz - 25hz コーデック</li>
1003
- </ul>
1004
-
1005
- <hr style="border: none; border-top: 1px solid rgba(0,0,0,0.15); margin: 25px 0;">
1006
-
1007
- <h3 style="color: #000000; margin-top: 20px; margin-bottom: 15px; font-size: 20px;">注意事項:</h3>
1008
- <ul style="color: #1a1a1a; font-weight: 500; line-height: 1.8; font-size: 14px;">
1009
- <li style="margin: 8px 0;"><strong>Stereo</strong> は概念実証(PoC)段階です。Pan と Width で方向と深度を制御できますが、Width は現時点では近似的なものです。</li>
1010
- <li style="margin: 8px 0;"><strong>Speaker IDs</strong>: <code>speaker_id = 1</code> はランダムを意味します。多くの学習データ(特に Stereo)には話者タグがありません。未知の話者は分布外(OOD)になる可能性があるため品質にばらつきが出ますが、Stereo でも機能はします。</li>
1011
- <li style="margin: 8px 0;"><strong>データセットのバイアス / 制限</strong>: 左耳へのパンニング(<code>pan = 1–4</code>)や浅い深度(<code>&lt; 5</code>)のデータが過多です。Hover/rotation はここではサポートされていません。</li>
1012
- <li style="margin: 8px 0;"><strong>絵文字</strong>: サポートは最小限ですが、ASMR プロンプトの方向付けに役立ちます(例:😮‍💨 🌬️ 👂 🤭 🍭 💋)。NSFW の場合、テキスト(またはテキスト+絵文字)を主に使用してください。絵文字のみのプロンプトはあまりうまくいきません。</li>
1013
- <li style="margin: 8px 0;">絵文字は特定の音に必ずしも対応しているわけではないため、配置場所はそれほど重要ではありませんが、全体的な雰囲気(vibe)に影響を与えます。</li>
1014
- <li style="margin: 8px 0;"><strong>多数の設定項目</strong>: 基本的にはそのままで素晴らしい結果が得られますが、場合によっては調整が必要になることもあります。</li>
1015
- <li style="margin: 8px 0;"><strong><code>&lt;asmr&gt;</code> タグ</strong>: 任意ですが、出力が変化するため、有無の両方を試してみてください。</li>
1016
- <li style="margin: 8px 0;"><strong>NSFW</strong>: Mono の方が一般的に制御しやすいです(データ量が多いため)。入力テキストの先頭に <code>♡♡</code> を付けることを推奨します。</li>
1017
- <li style="margin: 8px 0;"><strong>Audio prompting</strong>: ベースモードでは、プロンプトは6秒以内かつ綺麗にトリミングされたものにしてください。Alchemy mode では、Prefill とは異なる手法を用いているため、この制約はそれほど重要ではありません。</li>
1018
- <li style="margin: 8px 0;"><strong>長さ制限</strong>: 最大出力は29.9秒です。このデモでは長時間の生成はサポートされていません。</li>
1019
- <li style="margin: 8px 0;"><strong>再生環境</strong>: 音場表現のしっかりしたヘッドホンの使用を推奨します。</li>
1020
- <li style="margin: 8px 0;"><strong>多様性の調整</strong>: バリエーションの激しい出力の場合、<code>Temperature</code> を <code>~0.8–1.0</code> に上げ、<code>DRY = 0</code> に設定してください。</li>
1021
- <li style="margin: 8px 0;"><code>DRY</code> をオフにすると、出力とテキストの相関が弱い場合にモデルが崩壊する可能性があります。</li>
1022
- <li style="margin: 8px 0;">Takane シリーズはアニメスタイルの出力に特化していますが、Mirei バリアントでは通常の日本語もよりサポートされています。</li>
1023
- </ul>
1024
-
1025
- <h3 style="color: #000000; margin-top: 30px; margin-bottom: 15px; font-size: 20px;">Prefilling と RyuseiNet についての注記</h3>
1026
- <p style="color: #1a1a1a; font-weight: 500; line-height: 1.8; font-size: 14px;">
1027
- コーデックベースのモデルにおける Audio prompting のデフォルトモードは Prefilling です。これは、ボイスクリップからオーディオトークンを抽出し、その文字起こしと共に入力に結合する方法です。
1028
- 問題点はコンテキストを消費してしまうことです。モデルは入力を、プロンプトの自然な続きとして扱うためです。
1029
- </p>
1030
- <p style="color: #1a1a1a; font-weight: 500; line-height: 1.8; font-size: 14px;">
1031
- 例えば、モデルが主に30秒のチャンクで学習されている場合、リファレンス+出力の合計長は30秒未満である必要があります。また、安全のためにバッファを持たせることも推奨されます。例えば、出力が約20秒になる場合、オーディオプロンプトは5〜6秒程度に収めるべきです。
1032
- </p>
1033
- <p style="color: #1a1a1a; font-weight: 500; line-height: 1.8; font-size: 14px;">
1034
- RyuseiNet (Alchemy mode) では、この制限はありません。それに話者の類似性がより高く、入力テキストに対してより頑健です。ただし、参考のために両方の手法を残しています。
1035
- </p>
1036
-
1037
- <div style="margin-top: 40px; padding-top: 20px; border-top: 1px solid rgba(0,0,0,0.1);">
1038
- <p style="color: #666; font-size: 14px; text-align: center;">
1039
-
1040
- </p>
1041
- </div>
1042
-
1043
- </div>
1044
- """)
1045
- with gr.Column(scale=1):
1046
- gr.HTML("""
1047
- <div style="background-color: rgba(255, 255, 255, 0.525); padding: 30px; border-radius: 12px; backdrop-filter: blur(5px); max-width: 100%; box-shadow: 0 4px 6px rgba(0,0,0,0.5);">
1048
-
1049
- <h2 style="color: #000000; margin-bottom: 20px; font-size: 28px;">English</h2>
1050
-
1051
- <p style="color: #1a1a1a; font-weight: 500; line-height: 1.8; margin-bottom: 20px; font-size: 16px;">
1052
- Mirei (美嶺) is the continuation of my work from Takane, rebuilt and redesigned from the ground up for high-quality, anime-style Japanese voice generation. It's larger, faster, sounds cleaner, has more features, and is also much cheaper to train.
1053
- </p>
1054
-
1055
- <p style="color: #1a1a1a; font-weight: 500; line-height: 1.8; margin-bottom: 20px; font-size: 16px;">
1056
- It is another step toward my goal of developing the ultimate Japanese speech and audio synthesis model, pushing as far as today's best technology can take.
1057
- </p>
1058
 
1059
- <h3 style="color: #000000; margin-top: 30px; margin-bottom: 15px; font-size: 20px;">Compared to the base Takane, Mirei adds:</h3>
1060
- <ul style="color: #1a1a1a; font-weight: 500; line-height: 1.8; font-size: 15px;">
1061
- <li style="margin: 8px 0;">Significantly larger capacity: 500M → 1.2B (mono) / 3B (stereo) parameters</li>
1062
- <li style="margin: 8px 0;">Trained on more data</li>
1063
- <li style="margin: 8px 0;">Experimental controllable end-to-end stereo generation</li>
1064
- <li style="margin: 8px 0;">Promptable Description Guided (TBA)</li>
1065
- <li style="margin: 8px 0;">FiLM-based, transcription-less zero-shot audio prompting (Alchemy mode)</li>
1066
- <li style="margin: 8px 0;">The most comprehensive sampling toolkit in any model of this kind</li>
1067
- <li style="margin: 8px 0;">A new 44.1khz - 25hz codec that sounds crisp with as few artifacts as possible</li>
1068
- </ul>
1069
-
1070
- <hr style="border: none; border-top: 1px solid rgba(0,0,0,0.15); margin: 25px 0;">
1071
-
1072
- <h3 style="color: #000000; margin-top: 20px; margin-bottom: 15px; font-size: 20px;">Notes:</h3>
1073
- <ul style="color: #1a1a1a; font-weight: 500; line-height: 1.8; font-size: 14px;">
1074
- <li style="margin: 8px 0;"><strong>Stereo</strong> is a proof of concept. Use pan and width to control direction and depth, but width is approximate right now (until I come up with a better extraction algorithm).</li>
1075
- <li style="margin: 8px 0;"><strong>Speaker IDs</strong>: <code>speaker_id = 1</code> means random. Most training data (especially stereo) lacks speaker tags. Unseen speakers may be out-of-distribution, so quality can vary (stereo can still work).</li>
1076
- <li style="margin: 8px 0;"><strong>Dataset bias / limits</strong>: Left-ear panning (<code>pan = 1–4</code>) and shallower depth (<code>&lt; 5</code>) are overrepresented. Hover/rotation isn't supported here (but it is technically feasible).</li>
1077
- <li style="margin: 8px 0;"><strong>Emoji</strong> support is minimal but can help ground ASMR prompts (e.g., 😮‍💨 🌬️ 👂 🤭 🍭 💋). For NSFW, rely more on text (or text + emoji). Emoji-only prompts rarely work well.</li>
1078
- <li style="margin: 8px 0;">Emojis don't necessarily correlate with specific sounds, so their placement doesn't matter much; but they affect the overall vibe.</li>
1079
- <li style="margin: 8px 0;"><strong>Lots of knobs</strong>: You'll get great results most of the time, but the remaining cases may take some tuning.</li>
1080
- <li style="margin: 8px 0;"><strong><code>&lt;asmr&gt;</code> tag</strong>: Optional, but it does change the output — try with and without it.</li>
1081
- <li style="margin: 8px 0;"><strong>NSFW</strong>: Mono is generally easier to steer (more data). Prepending the input text with <code>♡♡</code> is recommended.</li>
1082
- <li style="margin: 8px 0;"><strong>Audio prompting</strong>: In base mode, keep prompts ≤ 6s, cleanly trimmed (no abrupt cuts). In Alchemy mode, this constraint doesn't matter as much (we use a different method than prefill).</li>
1083
- <li style="margin: 8px 0;"><strong>Length limit</strong>: Max output is 29.9s; long-form generation isn't supported in this demo.</li>
1084
- <li style="margin: 8px 0;"><strong>Playback</strong>: Headphones with a decent soundstage are recommended; otherwise spatial effects may feel weaker.</li>
1085
- <li style="margin: 8px 0;"><strong>Diversity tuning</strong>: For high-variance outputs (laughter, extreme emotions, aegi/chupa), raise <code>temperature</code> to <code>~0.8–1.0</code> and set <code>DRY = 0</code>.</li>
1086
- <li style="margin: 8px 0;">Turning off <code>DRY</code> can cause the model to collapse when the output is weakly correlated with the text.</li>
1087
- <li style="margin: 8px 0;">While the Takane model family is geared towards anime-style outputs, normal Japanese is also better supported with the Mirei variant (I have provided an example).</li>
1088
- </ul>
1089
-
1090
- <h3 style="color: #000000; margin-top: 30px; margin-bottom: 15px; font-size: 20px;">A Note about Prefilling vs RyuseiNet</h3>
1091
- <p style="color: #1a1a1a; font-weight: 500; line-height: 1.8; font-size: 14px;">
1092
- The default mode for audio prompting in any codec-based model is prefilling — where you extract audio tokens from your voice clip and glue them together with the transcription as your input.
1093
- The problem is it eats up your context, because the model treats your input as if it is a natural continuation of your prompt.
1094
- </p>
1095
- <p style="color: #1a1a1a; font-weight: 500; line-height: 1.8; font-size: 14px;">
1096
- For example, if your model was trained on mostly 30-second chunks, the length of your reference + the output should be less than 30 seconds. It's also recommended to keep a safety buffer. For example, if your output will be roughly 20 seconds, your audio prompt should be around 5–6 seconds to be in the safe zone.
1097
- </p>
1098
- <p style="color: #1a1a1a; font-weight: 500; line-height: 1.8; font-size: 14px;">
1099
- For RyuseiNet (Alchemy mode), we don't have this limitation, it has a better speaker similarity and more robust to your input text. But I am keeping both methods for reference.
1100
- </p>
1101
-
1102
- <div style="margin-top: 40px; padding-top: 20px; border-top: 1px solid rgba(0,0,0,0.1);">
1103
- <p style="color: #666; font-size: 14px; text-align: center;">
1104
-
1105
- </p>
1106
- </div>
1107
 
1108
- </div>
1109
- """)
1110
- gr.HTML("""
1111
- <div style="text-align: center; margin-top: 30px; padding: 20px; background-color: rgba(255, 255, 255, 0.35); border-radius: 10px; backdrop-filter: blur(5px);">
1112
- <p style="color: #1a1a1a; font-size: 17px; font-weight: 500;">
1113
- If you need help or have questions, feel free to contact me on
1114
- <a href="https://x.com/MystiqCaleid" target="_blank" style="color: #b45309; text-decoration: none; font-weight: 600;">X / Twitter</a>
1115
- or
1116
- <a href="https://discord.com/users/349236707167698944" target="_blank" style="color: #5865F2; text-decoration: none; font-weight: 600;">Discord (@soshyant)</a>
1117
- </p>
1118
- </div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1119
  """)
1120
- # def load_default_example():
1121
- # if len(example_list_stereo) >= 2:
1122
- # return load_stereo_example_fn(example_list_stereo[1][0])
1123
- # return gr.update(), gr.update(), gr.update(), None, ""
1124
 
1125
- def load_default_example():
1126
- if len(example_list_mono) >= 2:
1127
- return load_mono_example_fn(example_list_mono[-1][0])
1128
- return gr.update(), gr.update(), gr.update(), None, ""
1129
-
1130
- demo.load(
1131
- fn=load_default_example,
1132
- inputs=None,
1133
- outputs=[text_input, use_stereo_checkbox, speaker_id_slider, audio_output, status_output]
1134
- )
1135
 
1136
  if __name__ == "__main__":
1137
- demo.queue(api_open=False, max_size=15).launch()
1138
-
1139
-
 
1
  import gradio as gr
2
  from gradio_client import Client
3
  import os
 
4
  import numpy as np
5
  import scipy.io.wavfile as wavfile
 
6
 
7
+ client = Client(os.environ['src'])
8
+ # client = Client("http://localhost:7861/")
 
 
 
9
 
10
  css = """
11
  .gradio-container input::placeholder,
 
17
  padding: 2px 4px;
18
  border-radius: 3px;
19
  }
 
 
 
 
 
 
 
20
 
21
  .gr-checkbox label span,
22
  .gr-check-radio label span,
 
24
  .checkbox-container span {
25
  color: #ECF2F7 !important;
26
  }
 
 
 
 
 
27
 
28
  #advanced-accordion > button,
29
  #advanced-accordion > button span,
 
36
  color: #FFD700 !important;
37
  }
38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  body {
40
  background: none !important;
41
  }
 
51
  pointer-events: none;
52
  background: url('https://i.postimg.cc/1smD6GPf/gradio-theme-rin2.png') center center / cover no-repeat;
53
  }
 
54
  """
55
 
56
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  def run_generation_pipeline_client(
58
  raw_text,
59
+ voice_description,
60
+ cfg_text,
61
+ cfg_style,
62
+ description_adaln_scale,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  temperature,
64
  min_temp,
65
  max_temp,
66
  temp_exponent,
67
  top_k,
 
68
  min_p,
69
  dry_multiplier,
70
  max_tokens,
71
  seed,
72
  ):
73
  try:
 
 
 
 
 
 
 
 
 
 
 
74
  result = client.predict(
75
  raw_text,
76
+ voice_description,
77
+ cfg_text,
78
+ cfg_style,
79
+ description_adaln_scale,
 
 
80
  temperature,
81
  min_temp,
82
  max_temp,
83
  temp_exponent,
84
  top_k,
 
85
  min_p,
86
  dry_multiplier,
87
  max_tokens,
88
  seed,
89
+ "", # user_ip
90
+ api_name="/run_generation_pipeline"
91
  )
92
 
 
 
 
93
  if result is None:
94
  return None, "Status: No response from server"
95
 
 
110
  return None, "Status: Unexpected response format from server"
111
 
112
  except Exception as e:
 
 
113
  return None, f"Status: Connection error: {str(e)}"
114
 
115
 
 
 
 
 
 
 
 
116
  with gr.Blocks(theme="Respair/Shiki@10.1.0", css=css) as demo:
117
 
 
 
 
118
  gr.Markdown(
119
+ """
120
+ <div style="text-align: left;">
121
+ モデルの性能を発揮し、ハルシネーションや不自然な生成を防ぐには、学習データに沿った入力スタイルを守る必要があります。
122
+ 使用前に <code>Guide</code> タブをよくチェックしてください。
123
+ I hope you enjoy! <br>
124
+ もしSpaceが正しく読み込めない(Errorと表示される)場合は、何度かリロードしてみてください。<br>
125
+ </div>
126
+ """
 
 
127
  )
128
 
129
  with gr.Tabs():
130
 
131
+ # ==================== Speech Generation Tab ====================
132
  with gr.TabItem("Speech Generation"):
133
  with gr.Row():
134
  with gr.Column(scale=2):
135
  text_input = gr.Textbox(
136
+ label="Text",
137
  lines=5,
 
138
  placeholder="ここでテキストを入力してください...\n\n"
139
  )
140
 
141
+ voice_desc_input = gr.Textbox(
142
+ label="Voice Description",
143
+ lines=3,
144
+ placeholder="Describe the voice you want, e.g. 'a calm, warm female voice speaking softly'...",
145
  )
146
 
147
  with gr.Row(equal_height=False):
148
  with gr.Accordion("----------------------------------⭐ 🛠️ ⭐", open=False):
149
 
150
+ gr.Markdown('<h3 style="color: #FFD700;">Style / CFG Parameters</h3>')
 
 
 
 
151
 
152
+ cfg_text_slider = gr.Slider(
153
+ label="CFG Text", minimum=0.5, maximum=3.0, value=1.4, step=0.05,
 
 
 
 
154
  )
155
+ cfg_style_slider = gr.Slider(
156
+ label="CFG Style (値を上げると記述への忠実度が増しますが、不自然になる可能性があります)",
157
+ minimum=0.5, maximum=5.0, value=2.0, step=0.1,
158
+ )
159
+ desc_adaln_scale_slider = gr.Slider(
160
+ label="Description AdaLN Scale", minimum=0.0, maximum=3.0, value=1.0, step=0.1,
 
 
 
161
  )
162
 
163
+ gr.Markdown('<h3 style="color: #FFD700;">Sampling Parameters</h3>')
164
+
165
  temperature_slider = gr.Slider(
166
+ label="Temperature", minimum=0.0, maximum=2.0, value=0.4, step=0.05,
 
 
 
 
167
  )
 
168
  min_temp_slider = gr.Slider(
169
+ label="Min Temperature (adaptive)", minimum=0.0, maximum=2.0, value=0.8, step=0.05,
 
 
 
 
170
  )
 
171
  max_temp_slider = gr.Slider(
172
+ label="Max Temperature (adaptive)", minimum=0.0, maximum=2.0, value=1.0, step=0.05,
 
 
 
 
173
  )
174
+ temp_exponent_slider = gr.Slider(
175
+ label="Temperature Exponent", minimum=0.5, maximum=2.0, value=1.0, step=0.1,
 
 
 
 
 
176
  )
177
+ top_k_slider = gr.Slider(
178
+ label="Top K (0 = off)", minimum=0, maximum=200, value=0, step=5,
 
 
 
 
 
179
  )
 
180
  min_p_slider = gr.Slider(
181
+ label="Min P (0 = off)", minimum=0.0, maximum=1.0, value=0.0, step=0.01,
 
 
 
 
182
  )
183
 
184
+ gr.Markdown('<h3 style="color: #FFD700;">Repetition Control (DRY)</h3>')
 
 
 
 
 
 
185
 
186
  dry_multiplier_slider = gr.Slider(
187
+ label="DRY Multiplier (0 = off)", minimum=0.0, maximum=5.0, value=3.8, step=0.1,
 
 
 
 
188
  )
189
 
190
+ gr.Markdown('<h3 style="color: #FFD700;">Other</h3>')
 
 
 
 
 
 
191
 
192
+ max_tokens_slider = gr.Slider(
193
+ label="Max Tokens", minimum=100, maximum=1500, value=1024, step=10,
 
 
 
 
 
 
194
  )
195
+ seed_slider = gr.Slider(
196
+ label="Seed (-1 for random)", minimum=-1, maximum=2700000000, value=42, step=1,
 
 
 
 
 
197
  )
198
 
199
  with gr.Column(scale=1):
 
206
  interactive=False
207
  )
208
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
209
  generate_button.click(
210
  fn=run_generation_pipeline_client,
211
  inputs=[
212
  text_input,
213
+ voice_desc_input,
214
+ cfg_text_slider,
215
+ cfg_style_slider,
216
+ desc_adaln_scale_slider,
217
  temperature_slider,
218
  min_temp_slider,
219
  max_temp_slider,
220
+ temp_exponent_slider,
221
  top_k_slider,
 
222
  min_p_slider,
223
  dry_multiplier_slider,
224
  max_tokens_slider,
 
 
225
  seed_slider,
226
  ],
227
  outputs=[audio_output, status_output],
228
+ concurrency_limit=4,
229
  )
230
 
231
+ # ==================== Guide Tab ====================
232
+ with gr.TabItem("Guide"):
233
+ gr.HTML('<h1 style="text-align: center;">🪷 Takane - Mirei 「美嶺」 </h1>')
 
 
 
 
 
 
 
 
234
  with gr.Row():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
235
  with gr.Column(scale=1):
236
+ gr.HTML("""
237
+ <div style="background-color: rgba(255, 255, 255, 0.525); padding: 30px; border-radius: 12px; backdrop-filter: blur(5px); max-width: 100%; box-shadow: 0 4px 6px rgba(0,0,0,0.5);">
238
+ <h2 style="color: #000000; margin-bottom: 20px; font-size: 28px;">日本語</h2>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
239
 
240
+ <p style="color: #1a1a1a; font-weight: 500; line-height: 1.8; margin-bottom: 20px; font-size: 16px;">
 
 
 
 
 
 
 
 
 
 
 
 
241
  このモードでは、性別・口調・感情などをプロンプトで指示するだけで、好みの声を作成できます。
242
+ テキストを入力し、声の特徴を自然言語で記述すると、モデルがその記述に合った音声を合成します。
243
  </p>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
244
 
245
+ <h3 style="color: #000000; margin-top: 30px; margin-bottom: 15px; font-size: 20px;">使い方:</h3>
246
+ <ul style="color: #1a1a1a; font-weight: 500; line-height: 1.8; font-size: 15px;">
247
+ <li style="margin: 8px 0;">1. 読み上げたい日本語テキストを入力</li>
248
+ <li style="margin: 8px 0;">2. 声の特徴を記述(例:「かわいい元気な女の子の声」「落ち着いた低い男性ナレーター」)</li>
249
+ <li style="margin: 8px 0;">3. Generate をクリック!</li>
250
+ </ul>
251
+
252
+ <hr style="border: none; border-top: 1px solid rgba(0,0,0,0.15); margin: 25px 0;">
253
+
254
+ <h3 style="color: #000000; margin-top: 20px; margin-bottom: 15px; font-size: 20px;">注意事項:</h3>
255
+ <ul style="color: #1a1a1a; font-weight: 500; line-height: 1.8; font-size: 14px;">
256
+ <li style="margin: 8px 0;"><strong>Style Tags</strong>: テキストの先頭に <code>&lt;asmr&gt;</code> のようなスタイルタグを付けることで出力スタイルを制御できます。</li>
257
+ <li style="margin: 8px 0;"><strong>DRY Multiplier</strong>: 繰り返し防止のペナルティです。0にすると出力が崩壊する可能性があります。</li>
258
+ <li style="margin: 8px 0;"><strong>長さ制限</strong>: 最大入力400トークン、最大出力は約29.9秒です。</li>
259
+ </ul>
260
+ </div>
261
  """)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
262
 
263
+ with gr.Column(scale=1):
 
264
  gr.HTML("""
265
+ <div style="background-color: rgba(255, 255, 255, 0.525); padding: 30px; border-radius: 12px; backdrop-filter: blur(5px); max-width: 100%; box-shadow: 0 4px 6px rgba(0,0,0,0.5);">
266
+ <h2 style="color: #000000; margin-bottom: 20px; font-size: 28px;">English</h2>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
267
 
268
+ <p style="color: #1a1a1a; font-weight: 500; line-height: 1.8; margin-bottom: 20px; font-size: 16px;">
269
+ In this mode, you can create the voice you want by describing it with a prompt — gender, tone, emotion, characteristics, and more.
270
+ Enter the Japanese text you want spoken and describe the desired voice in natural language.
271
+ </p>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
272
 
273
+ <h3 style="color: #000000; margin-top: 30px; margin-bottom: 15px; font-size: 20px;">How to use:</h3>
274
+ <ul style="color: #1a1a1a; font-weight: 500; line-height: 1.8; font-size: 15px;">
275
+ <li style="margin: 8px 0;">1. Enter the Japanese text you want spoken</li>
276
+ <li style="margin: 8px 0;">2. Describe the voice (e.g. "a cute, energetic young girl voice", "a calm deep male narrator")</li>
277
+ <li style="margin: 8px 0;">3. Click Generate!</li>
278
+ </ul>
279
+
280
+ <hr style="border: none; border-top: 1px solid rgba(0,0,0,0.15); margin: 25px 0;">
281
+
282
+ <h3 style="color: #000000; margin-top: 20px; margin-bottom: 15px; font-size: 20px;">Parameter Guide:</h3>
283
+
284
+ <h4 style="color: #333; margin-top: 20px;">Style / CFG Parameters:</h4>
285
+ <ul style="color: #1a1a1a; font-weight: 500; line-height: 1.8; font-size: 14px;">
286
+ <li style="margin: 8px 0;"><strong>CFG Text:</strong> Classifier-free guidance strength for text conditioning</li>
287
+ <li style="margin: 8px 0;"><strong>CFG Style:</strong> Guidance strength for the voice description. Higher = more faithful but may sound unnatural.</li>
288
+ <li style="margin: 8px 0;"><strong>Description AdaLN Scale:</strong> Controls influence of the description embedding via adaptive layer norm</li>
289
+ </ul>
290
+
291
+ <h4 style="color: #333; margin-top: 20px;">Sampling Parameters:</h4>
292
+ <ul style="color: #1a1a1a; font-weight: 500; line-height: 1.8; font-size: 14px;">
293
+ <li style="margin: 8px 0;"><strong>Temperature:</strong> Controls randomness (lower = more deterministic)</li>
294
+ <li style="margin: 8px 0;"><strong>Min/Max Temperature:</strong> Adaptive temperature range based on entropy</li>
295
+ <li style="margin: 8px 0;"><strong>Temperature Exponent:</strong> Adaptive temperature mapping curve (1.0 = linear)</li>
296
+ <li style="margin: 8px 0;"><strong>Top K:</strong> Keeps only K most likely tokens (0 = disabled)</li>
297
+ <li style="margin: 8px 0;"><strong>Min P:</strong> Minimum probability threshold (0 = disabled)</li>
298
+ <li style="margin: 8px 0;"><strong>DRY Multiplier:</strong> Repetition penalty (0 = off, higher = less repetition). Turning it off can cause collapse.</li>
299
+ </ul>
300
+
301
+ <h4 style="color: #333; margin-top: 20px;">Notes:</h4>
302
+ <ul style="color: #1a1a1a; font-weight: 500; line-height: 1.8; font-size: 14px;">
303
+ <li style="margin: 8px 0;"><strong>Style tags</strong>: Start text with <code>&lt;asmr&gt;</code> etc. to control generation style.</li>
304
+ <li style="margin: 8px 0;">Maximum input: 400 tokens. Maximum output: ~29.9 seconds.</li>
305
+ </ul>
306
+
307
+ <div style="margin-top: 40px; padding-top: 20px; border-top: 1px solid rgba(0,0,0,0.1);">
308
+ <p style="color: #666; font-size: 14px; text-align: center;">
309
+ 🌸 Takane Kiwami — Description-Guided Japanese Text-to-Speech
310
+ </p>
311
+ </div>
312
+ </div>
313
  """)
 
 
 
 
314
 
315
+ gr.HTML("""
316
+ <div style="text-align: center; margin-top: 30px; padding: 20px; background-color: rgba(255, 255, 255, 0.35); border-radius: 10px; backdrop-filter: blur(5px);">
317
+ <p style="color: #1a1a1a; font-size: 17px; font-weight: 500;">
318
+ If you need help or have questions, feel free to contact me on
319
+ <a href="https://x.com/MystiqCaleid" target="_blank" style="color: #b45309; text-decoration: none; font-weight: 600;">X / Twitter</a>
320
+ or
321
+ <a href="https://discord.com/users/349236707167698944" target="_blank" style="color: #5865F2; text-decoration: none; font-weight: 600;">Discord (@soshyant)</a>
322
+ </p>
323
+ </div>
324
+ """)
325
 
326
  if __name__ == "__main__":
327
+ demo.queue(api_open=False, max_size=15).launch()