File size: 16,794 Bytes
b2ed3d3
 
 
 
 
 
 
d99da81
7fbf919
 
6b05521
 
 
 
b2ed3d3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6b05521
 
 
 
 
4cb44e3
b2ed3d3
 
 
 
4cb44e3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6b05521
b2ed3d3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6b05521
 
 
1505e24
6b05521
 
 
b2ed3d3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12369c4
 
 
 
b2ed3d3
 
930f74a
b2ed3d3
 
 
8116d64
b2ed3d3
 
 
ba697f2
b2ed3d3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1e7ed96
b2ed3d3
1e7ed96
b2ed3d3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4cb44e3
b2ed3d3
 
 
4cb44e3
b2ed3d3
 
 
 
 
 
 
 
adce806
b2ed3d3
 
4cb44e3
 
b2ed3d3
4cb44e3
 
 
 
 
b2ed3d3
 
 
 
 
3593633
b2ed3d3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b2300c3
b2ed3d3
 
 
 
 
 
 
812b614
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b2ed3d3
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
import gradio as gr
from gradio_client import Client
import os
import random
import numpy as np
import scipy.io.wavfile as wavfile




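# NOTE: construction of the remote backend client is currently disabled (the
# UI banner states the demo is closed). run_generation_pipeline_client() below
# still expects a module-level `client`; restore this block to re-enable it.
# try:
#     client = Client(os.environ['src'])
# except:
#     client = Client("http://localhost:7861/")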

# Custom stylesheet handed to gr.Blocks(css=...). It recolours placeholders,
# <code> spans and checkbox labels, styles the buttons inside the
# #voice-preset-container column, and replaces the page background with a
# fixed full-viewport image drawn behind the app (body::before, z-index -1).
# NOTE(review): the `#advanced-accordion` rules only apply to a component that
# is given elem_id="advanced-accordion"; no component in the layout visible in
# this file sets that elem_id — confirm whether that is intended.
css = """
.gradio-container input::placeholder,
.gradio-container textarea::placeholder {
    color: #333333 !important;
}
code {
    background-color: #ffde9f;
    padding: 2px 4px;
    border-radius: 3px;
}

.gr-checkbox label span,
.gr-check-radio label span,
[data-testid="checkbox"] label span,
.checkbox-container span {
    color: #ECF2F7 !important;
}

#advanced-accordion > button,
#advanced-accordion > button span,
#advanced-accordion > div > button,
#advanced-accordion > div > button span,
#advanced-accordion .label-wrap,
#advanced-accordion .label-wrap span,
#advanced-accordion > .open,
#advanced-accordion > .open span {
    color: #FFD700 !important;
}

#voice-preset-container .gallery button,
#voice-preset-container .gr-examples button,
#voice-preset-container .examples button,
#voice-preset-container button.sample {
    background-color: #c8b8d4 !important;
    border: 1px solid #b8a8c4 !important;
    color: #1a1a1a !important;
    font-weight: 500 !important;
    margin: 4px !important;
    padding: 10px 14px !important;
    border-radius: 6px !important;
    transition: background-color 0.2s ease !important;
}

#voice-preset-container .gallery button:hover,
#voice-preset-container .gr-examples button:hover,
#voice-preset-container .examples button:hover,
#voice-preset-container button.sample:hover {
    background-color: #baadc9 !important;
    border-color: #a89ab8 !important;
}

body {
    background: none !important;
}

body::before {
    content: "";
    position: fixed;
    top: 0;
    left: 0;
    width: 100%;
    height: 100%;
    z-index: -1;
    pointer-events: none;
    background: url('https://i.postimg.cc/1smD6GPf/gradio-theme-rin2.png') center center / cover no-repeat;
}

"""

# Preset label -> natural-language voice instruction. load_pregenerated_to_main()
# and load_default() look the instruction up by label and place it in the
# "Instruction" textbox. The trailing comment on each entry links the hosted
# sample that was generated from that instruction.
VOICE_EXAMPLES = {
    "甘えた女の子 / ゆっくり": "かわいくて高い声の女の子が、甘えながらゆっくりのんびりしゃべってる感じの音声がほしい。", # https://huggingface.co/spaces/Respair/Voice_Design/blob/main/samples/onnanoko_amai.wav
    "激怒する女性 / 感情爆発": "低くて激しい声の女性が、感情を抑えきれずに怒りを爆発させながら、早口でまくしたてるような声で読んでほしい。", # https://huggingface.co/spaces/Respair/Voice_Design/blob/main/samples/angry.wav
    "落ち着いた男性 / 呆れ気味": "落ち着いた低めの声の男性が、相手の言動に少し呆れつつも感情を表に出さず、静かで平坦なトーンで淡々と話してるような声で読んでほしい。", # https://huggingface.co/spaces/Respair/Voice_Design/blob/main/samples/guy_cool.wav
    "Calm man / mildly exasperated (EN)": "Read this in the voice of a calm, low-pitched man who sounds mildly exasperated but keeps his emotions in check, speaking in a flat, even tone without much expression.", # no pre-generated sample: this label has no entry in PREGENERATED_AUDIO
    "冷たい女性 / 憎しみ (1)": "低くて冷たい声の女性が、怒りを内に秘めながら憎しみのこもった口調で、淡々と早めに話してるような声で読んでほしい。", # https://huggingface.co/spaces/Respair/Voice_Design/blob/main/samples/woman_cold_frustrated_2.wav
    "冷たい女性 / 憎しみ (2)": "低くて冷たい声の女性が、怒りを内に秘めながら憎しみのこもった口調で、淡々と早めに話してるような声で読んでほしい。", # same instruction text as (1), different generated result --> https://huggingface.co/spaces/Respair/Voice_Design/blob/main/samples/woman_cold_frustrated.wav
}

# (label, instruction) pairs in dict insertion order. In the code visible here
# it is referenced only by the commented-out random-description button handler.
VOICE_PRESET_LIST = list(VOICE_EXAMPLES.items())

# Preset label -> local path of its pre-generated WAV (ship these files in the
# Space repo under samples/). Keys are the same labels used in VOICE_EXAMPLES,
# so one label resolves both the instruction text and the audio. The Examples
# tab is built from these keys, so a VOICE_EXAMPLES label without an entry here
# is not listed there.
PREGENERATED_AUDIO = {
    "甘えた女の子 / ゆっくり": "samples/onnanoko_amai.wav",
    "激怒する女性 / 感情爆発": "samples/angry.wav",
    "落ち着いた男性 / 呆れ気味": "samples/guy_cool.wav",
    "冷たい女性 / 憎しみ (1)": "samples/woman_cold_frustrated_2.wav",
    "冷たい女性 / 憎しみ (2)": "samples/woman_cold_frustrated.wav",
}

def load_pregenerated_to_main(label):
    """Load a preset's instruction text and pre-generated audio into the main tab.

    Click handler for the Examples tab.

    Args:
        label: Preset label, looked up in ``VOICE_EXAMPLES`` (instruction
            text) and ``PREGENERATED_AUDIO`` (WAV path).

    Returns:
        A 3-tuple lined up with the handler outputs
        ``[voice_desc_input, audio_output, status_output]``:
        a ``gr.update`` carrying the instruction text (empty string for an
        unknown label), ``(sample_rate, samples)`` or ``None`` when no audio
        could be loaded, and a human-readable status string.
    """
    desc = VOICE_EXAMPLES.get(label, "")
    path = PREGENERATED_AUDIO.get(label)
    desc_update = gr.update(value=desc)  # voice_desc_input

    if not path or not os.path.exists(path):
        return (
            desc_update,
            None,
            f"Status: No pre-generated audio found for: {label}",
        )

    try:
        sr, data = wavfile.read(path)
    except Exception as exc:
        # UI boundary: a corrupt, truncated or vanished sample file should be
        # reported in the status box (like the missing-file case above)
        # instead of raising out of the click handler.
        return (
            desc_update,
            None,
            f"Status: Could not read pre-generated sample for {label}: {exc}",
        )

    # A 2-D array with 1 or 2 rows and more columns than rows is treated as
    # channel-first and transposed. Presumably gr.Audio wants
    # (samples, channels) — confirm against the Gradio version in use.
    if (
        isinstance(data, np.ndarray)
        and data.ndim == 2
        and data.shape[0] in (1, 2)
        and data.shape[0] < data.shape[1]
    ):
        data = data.T

    return (
        desc_update,
        (sr, data),  # audio_output (MAIN TAB)
        f"Status: Loaded pre-generated sample: {label}",
    )


def run_generation_pipeline_client(
    raw_text,
    voice_description,
    cfg_text,
    cfg_style,
    min_temp,
    max_temp,
    top_k,
    min_p,
    dry_multiplier,
    seed,
):
    """Forward a generation request to the remote backend and adapt its reply.

    All arguments are passed positionally, unchanged, to the backend endpoint
    ``/run_generation_pipeline`` of the module-level ``client``.

    Args:
        raw_text: Text to synthesise.
        voice_description: Free-form voice instruction.
        cfg_text, cfg_style, min_temp, max_temp, top_k, min_p, dry_multiplier,
        seed: Values of the corresponding UI sliders.

    Returns:
        ``(audio, status)``. ``audio`` is ``(sample_rate, samples)`` when the
        backend returned usable audio and ``None`` otherwise; ``status`` is
        the backend's status message or a locally generated ``"Status: ..."``
        string describing the failure. The function never raises.
    """
    # The global `client` may not exist (its construction is optional at module
    # level). Look it up explicitly so a missing backend is reported as such
    # rather than surfacing as a NameError disguised as a "Connection error".
    backend = globals().get("client")
    if backend is None:
        return None, "Status: Generation backend is unavailable (no client configured)"

    try:
        result = backend.predict(
            raw_text,
            voice_description,
            cfg_text,
            cfg_style,
            min_temp,
            max_temp,
            top_k,
            min_p,
            dry_multiplier,
            seed,
            "",  # extra trailing endpoint argument; its meaning is not visible here
            api_name="/run_generation_pipeline"
        )

        if result is None:
            return None, "Status: No response from server"

        if not (isinstance(result, (list, tuple)) and len(result) == 2):
            return None, "Status: Unexpected response format from server"

        audio_result, status_msg = result
        if audio_result is None:
            return None, status_msg

        if isinstance(audio_result, str) and os.path.exists(audio_result):
            # The backend handed back a path to a WAV file.
            sr, data = wavfile.read(audio_result)
        elif isinstance(audio_result, (list, tuple)) and len(audio_result) >= 2:
            # The backend handed back (sample_rate, samples) directly.
            sr = audio_result[0]
            data = audio_result[1]
            if isinstance(data, (list, tuple)):
                data = np.array(data)
        else:
            return None, status_msg

        # A 2-D array with 1 or 2 rows and more columns than rows is treated
        # as channel-first and transposed.
        if (
            isinstance(data, np.ndarray)
            and data.ndim == 2
            and data.shape[0] in (1, 2)
            and data.shape[0] < data.shape[1]
        ):
            data = data.T

        return (sr, data), status_msg

    except Exception as e:
        # UI boundary: report any transport/decoding failure in the status box.
        return None, f"Status: Connection error: {str(e)}"


# Build the UI: a banner followed by three tabs — "Speech Generation" (inputs,
# sampling controls, audio player), "Examples" (pre-generated samples) and
# "Info" (model description in Japanese and English).
with gr.Blocks(theme="Respair/Shiki@10.1.0", css=css) as demo:
    gr.Markdown(
    """
    <div style="text-align: left;">
    Demo is closed until further notice; thank you for using it. Feel free to check the pre-generated samples at the <code>Examples</code> tab. <br>
    </div>
    """
    )
    with gr.Tabs():

        with gr.TabItem("Speech Generation"):
            with gr.Row():
                # Left column: text to speak, voice instruction, advanced
                # controls and the Generate button.
                with gr.Column(scale=2):
                    text_input = gr.Textbox(
                        label="Text",
                        lines=5,
                        max_length=125,
                        value="準備もできましたけど、いきなり本題に入ると分かりにくいかもしれないので、まずは今日やることを短く整理して、手順を一つずつ確認しながら進めていきますね。途中で気になるところがあったら、その都度止めて大丈夫です。",
                    )

                    with gr.Column(elem_id="voice-desc-wrap"):
                        voice_desc_input = gr.Textbox(
                        label="Instruction",
                        value="低くて激しい声の女性が、感情を抑えきれずに怒りを爆発させながら、早口でまくしたてるような声で読んでほしい。",
                        lines=2,
                    )
                    with gr.Row(equal_height=False):
                        # NOTE(review): the stylesheet defines rules for
                        # `#advanced-accordion`, but this Accordion is created
                        # without elem_id, so those rules match nothing —
                        # confirm whether elem_id="advanced-accordion" is intended.
                        with gr.Accordion("----------------------------------⭐ 🛠️ ⭐", open=False):
                    
                            seed_slider = gr.Slider( 
                                label="Seed (-1 for random)", minimum=-1, maximum=2700000000, value=2700000000, step=1
                            )
                            gr.Markdown('<h3 style="color: #FFD700;">Style / CFG Parameters</h3>')
                            cfg_text_slider = gr.Slider(
                                label="CFG Text", minimum=0.5, maximum=3.0, value=1.15, step=0.05,
                            )
                            cfg_style_slider = gr.Slider(
                                label="CFG Style",
                                minimum=0.5, maximum=3.0, value=1.2, step=0.1,
                            )
                            gr.Markdown('<h3 style="color: #FFD700;">Sampling Parameters</h3>')
                            min_temp_slider = gr.Slider(
                                label="Min Temperature (adaptive)", minimum=0.0, maximum=2.0, value=0.25, step=0.05,
                            )
                            max_temp_slider = gr.Slider(
                                label="Max Temperature (adaptive)", minimum=0.0, maximum=2.0, value=1.0, step=0.05,
                            )
                            top_k_slider = gr.Slider(
                                label="Top K (0 = off)", minimum=0, maximum=200, value=0, step=5,
                            )
                            min_p_slider = gr.Slider(
                                label="Min P (0 = off)", minimum=0.0, maximum=1.0, value=0.0, step=0.01,
                            )

                            gr.Markdown('<h3 style="color: #FFD700;">Repetition Control</h3>')

                            dry_multiplier_slider = gr.Slider(
                                label="DRY Multiplier (0 = off)", minimum=0.0, maximum=5.0, value=0.8, step=0.1,
                            )

                            # gr.Markdown('<h3 style="color: #FFD700;">Other</h3>')

     
                        with gr.Column(scale=1):
                            generate_button = gr.Button("🎤 Generate", variant="primary", size="lg")

                # Right column: status line and the audio player. Both are also
                # written to by the Examples tab and the page-load handler.
                with gr.Column(scale=1):
                    status_output = gr.Textbox(label="Status", interactive=False)
                    audio_output = gr.Audio(
                        label="Generated Speech",
                        interactive=False
                    )

            # random_desc_button.click(
            #     fn=lambda: random.choice(VOICE_PRESET_LIST)[1],
            #     inputs=[],
            #     outputs=[voice_desc_input],
            # )

            # The input order below must match the positional parameters of
            # run_generation_pipeline_client.
            # NOTE(review): that handler calls a module-level `client`, whose
            # construction is commented out in this file, so clicking Generate
            # currently only yields an error status.
            generate_button.click(
                fn=run_generation_pipeline_client,
                inputs=[
                    text_input,
                    voice_desc_input,
                    cfg_text_slider,
                    cfg_style_slider,
                    min_temp_slider,
                    max_temp_slider,
                    top_k_slider,
                    min_p_slider,
                    dry_multiplier_slider,
                    seed_slider,
                ],
                outputs=[audio_output, status_output],
                concurrency_limit=4,
            )

        with gr.TabItem("Examples"):
            gr.HTML("""
            <div style="background-color: rgba(255, 255, 255, 0.025); padding: 20px; border-radius: 12px; backdrop-filter: blur(10px); box-shadow: 0 4px 6px rgba(0,0,0,0.5); margin-top: 8px;">
                <p style="color: #1a1a1a; font-weight: 500; line-height: 1.6; font-size: 14px; text-align: center; margin: 0;">
                クリックするとメインタブの音声プレイヤーにプリジェネ音声がロードされます。 / Click a preset to load the pre-generated audio into the main tab player.
                </p>
            </div>
            """)

            with gr.Row():
                with gr.Column(scale=1, elem_id="voice-preset-container"):
                    gr.HTML("""
                    <div style="background-color: rgba(255, 255, 255, 0.55); padding: 8px 12px; border-radius: 8px; backdrop-filter: blur(10px); box-shadow: 0 2px 4px rgba(0,0,0,0.08); text-align: center; max-width: 220px; margin: 0 auto 12px auto;">
                        <h3 style="color: #000000; margin: 0; font-size: 16px;">Examples</h3>
                    </div>
                    """)
                    # Hidden textbox whose only job is to carry the clicked
                    # preset label into load_pregenerated_to_main.
                    example_label_holder = gr.Textbox(visible=False)

                    # One example row per label that has a pre-generated WAV;
                    # the handler writes to components on the main tab.
                    gr.Examples(
                        examples=[[label] for label in PREGENERATED_AUDIO.keys()],
                        inputs=[example_label_holder],
                        outputs=[voice_desc_input, audio_output, status_output],  # <-- MAIN TAB outputs
                        fn=load_pregenerated_to_main,
                        label="Click to load a pre-generated sample",
                        cache_examples=False,
                        run_on_click=True,
                        examples_per_page=10,
                    )

        with gr.TabItem("Info"):
            gr.HTML('<h1 style="text-align: center;">🌸 Takane - Voice Design 🎨 </h1>')
            
            gr.HTML("""
            <div style="background-color: rgba(255, 255, 255, 0.525); padding: 30px; border-radius: 12px; backdrop-filter: blur(5px); max-width: 100%; box-shadow: 0 4px 6px rgba(0,0,0,0.5);">
            <div style="display: flex; gap: 24px; flex-wrap: wrap; justify-content: center;">
                
                <div style="flex: 1; min-width: 280px;">
                <h3 style="color: #000000; margin: 0 0 12px 0; font-size: 20px; text-align: center;">日本語</h3>
                <p style="color: #1a1a1a; font-weight: 500; line-height: 1.8; font-size: 16px; margin: 0; text-align: center;">
                    本モデルのバックボーンは
                    <a href="https://huggingface.co/spaces/Respair/Takane" target="_blank" rel="noopener noreferrer"
                    style="color: #b45309; text-decoration: none; font-weight: 600;">
                    Takane
                    </a>
                    を改良したもので、ネイティブ 44.1kHz コーデックを備えた完全自回帰のエンコーダ・デコーダ型 Transformer です。<br><br>
                    <strong>CFG Style</strong> を上げると指示への追従が強くなりますが、上げすぎると過剰な条件付け(over-conditioning)が起きて音質が劣化する場合があります。
                </p>
                </div>

                <div style="flex: 1; min-width: 280px;">
                <h3 style="color: #000000; margin: 0 0 12px 0; font-size: 20px; text-align: center;">English</h3>
                <p style="color: #1a1a1a; font-weight: 500; line-height: 1.8; font-size: 16px; margin: 0; text-align: center;">
                    The backbone is a modified version of
                    <a href="https://huggingface.co/spaces/Respair/Takane" target="_blank" rel="noopener noreferrer"
                    style="color: #b45309; text-decoration: none; font-weight: 600;">
                    Takane
                    </a>,
                    a fully autoregressive encoder-decoder transformer with a native 44.1khz codec.<br><br>
                    Raise <strong>CFG Style</strong> if you want stronger adherence; pushing it too high can cause over-conditioning and degrade quality. <br><br>
                    <code>This model is only in Japanese</code>, if you enjoy anime, this is yours to play with.
                </p>
                </div>

            </div>
            </div>
            """)

    def load_default():
        """Fill the main tab with the default preset; registered via ``demo.load``.

        Looks up the hard-coded default label in ``VOICE_EXAMPLES`` and
        ``PREGENERATED_AUDIO``.

        Returns:
            A 3-tuple lined up with the outputs
            ``[voice_desc_input, audio_output, status_output]``:
            a ``gr.update`` with the instruction text, ``(sample_rate,
            samples)`` or ``None`` when the sample is missing or unreadable,
            and a ``gr.update`` with a status message.
        """
        label = "激怒する女性 / 感情爆発"
        desc = VOICE_EXAMPLES.get(label, "")
        path = PREGENERATED_AUDIO.get(label)
        desc_update = gr.update(value=desc)

        if not path or not os.path.exists(path):
            return desc_update, None, gr.update(value=f"Status: Default sample missing: {label}")

        try:
            sr, data = wavfile.read(path)
        except Exception as exc:
            # UI boundary: a corrupt or unreadable default sample must not
            # fail the load event; report it in the status box instead.
            return (
                desc_update,
                None,
                gr.update(value=f"Status: Could not read default sample for {label}: {exc}"),
            )

        # A 2-D array with 1 or 2 rows and more columns than rows is treated
        # as channel-first and transposed.
        if (
            isinstance(data, np.ndarray)
            and data.ndim == 2
            and data.shape[0] in (1, 2)
            and data.shape[0] < data.shape[1]
        ):
            data = data.T

        return desc_update, (sr, data), gr.update(value=f"Status: Loaded default sample: {label}")
    
    # Register load_default on the app's load event so the instruction box,
    # audio player and status line start out populated with the default preset.
    demo.load(fn=load_default, inputs=None, outputs=[voice_desc_input, audio_output, status_output])

if __name__ == "__main__":
    # Configure the request queue first, then start serving the app.
    queued_app = demo.queue(api_open=False, max_size=15)
    queued_app.launch()