File size: 8,582 Bytes
553380f
 
 
 
 
 
 
 
 
 
 
 
 
68baf64
553380f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
import gradio as gr
import torch
import torchaudio
import os
import tempfile
import spaces
from datetime import datetime
from omnivoice import OmniVoice

# ─── Model ───
# The checkpoint is loaded once, at import time. Half precision is requested
# only when a CUDA device is visible; otherwise the model runs in float32 on CPU.
print("モデルを読み込み中...")
if torch.cuda.is_available():
    device, dtype = "cuda", torch.float16
else:
    device, dtype = "cpu", torch.float32
model = OmniVoice.from_pretrained(
    "drbaph/OmniVoice-bf16",
    device_map=device,
    dtype=dtype,
)
print(f"モデル読み込み完了({device})")


def _build_instruct(gender, age, pitch, style):
    parts = []
    if gender and gender != "Auto":
        parts.append(gender.lower())
    if age and age != "Auto":
        parts.append(age.lower())
    if pitch and pitch != "Auto":
        parts.append(f"{pitch.lower()} pitch")
    if style and style != "Auto":
        parts.append(style.lower())
    return ", ".join(parts) if parts else None


# ─── Voice Design / Auto ───
@spaces.GPU
def generate_design(text, mode, language, gender, age, pitch, style,
                    speed, duration, num_step, guidance_scale, denoise, postprocess):
    """Generate speech for the "Voice Design" tab and save it as a WAV file.

    Args:
        text: Text to synthesize; empty or whitespace-only text is rejected.
        mode: UI mode. Only ``"Voice Design"`` makes the gender/age/pitch/style
            selections take effect (via ``_build_instruct``).
        language: Language name; omitted from the call when empty or ``"Auto"``.
        gender, age, pitch, style: Dropdown selections forwarded to
            ``_build_instruct`` in "Voice Design" mode.
        speed: Forwarded as ``speed`` only when ``duration`` is not positive.
        duration: When greater than 0 it is forwarded as ``duration`` and
            ``speed`` is not sent.
        num_step: Cast to ``int`` and forwarded as ``num_step``.
        guidance_scale: Forwarded unchanged.
        denoise: Forwarded unchanged.
        postprocess: When truthy, ``postprocess_output=True`` is sent.

    Returns:
        ``(wav_path, status_message)`` on success, or ``(None, message)`` when
        the input is rejected or generation/saving raises.
    """
    if not text or not text.strip():
        return None, "テキストを入力してください。"

    # NOTE(review): 24000 is used both for saving and for the duration shown in
    # the status line; presumably the model's output rate — confirm.
    sample_rate = 24000

    kwargs = dict(num_step=int(num_step), guidance_scale=guidance_scale, denoise=denoise)

    if language and language != "Auto":
        kwargs["language"] = language

    if mode == "Voice Design":
        instruct = _build_instruct(gender, age, pitch, style)
        if instruct:
            kwargs["instruct"] = instruct

    # A positive duration takes precedence over the speed slider.
    if duration and duration > 0:
        kwargs["duration"] = duration
    else:
        kwargs["speed"] = speed

    if postprocess:
        kwargs["postprocess_output"] = True

    try:
        audio = model.generate(text=text, **kwargs)
        waveform = audio[0]

        # Reserve the output path and close our own handle before the file is
        # written by name: reopening a still-open named temporary file is not
        # portable across platforms.
        fd, wav_path = tempfile.mkstemp(suffix=".wav")
        os.close(fd)
        try:
            torchaudio.save(wav_path, waveform, sample_rate)
            status = f"生成完了({waveform.shape[1] / sample_rate:.1f}秒)"
        except Exception:
            # Nothing will ever reference this path, so do not leave an
            # orphaned temp file behind when saving fails.
            if os.path.exists(wav_path):
                os.remove(wav_path)
            raise
        return wav_path, status
    except Exception as e:
        # UI boundary: report the failure in the status box instead of raising.
        return None, f"エラー: {e}"


# ─── Voice Clone ───
@spaces.GPU
def generate_clone(text, ref_audio, ref_text, language, speed, duration,
                   num_step, guidance_scale, denoise, postprocess):
    """Generate speech for the "Voice Clone" tab and save it as a WAV file.

    Args:
        text: Text to synthesize; empty or whitespace-only text is rejected.
        ref_audio: Reference audio (the UI supplies a file path); required.
        ref_text: Optional transcript of the reference audio. Blank values are
            passed to the model as ``None``.
        language: Language name; omitted from the call when empty or ``"Auto"``.
        speed: Forwarded as ``speed`` only when ``duration`` is not positive.
        duration: When greater than 0 it is forwarded as ``duration`` and
            ``speed`` is not sent.
        num_step: Cast to ``int`` and forwarded as ``num_step``.
        guidance_scale: Forwarded unchanged.
        denoise: Forwarded unchanged.
        postprocess: When truthy, ``postprocess_output=True`` is sent.

    Returns:
        ``(wav_path, status_message)`` on success, or ``(None, message)`` when
        the input is rejected or generation/saving raises.
    """
    if not text or not text.strip():
        return None, "テキストを入力してください。"
    if ref_audio is None:
        return None, "リファレンス音声をアップロードしてください。"

    # NOTE(review): 24000 is used both for saving and for the duration shown in
    # the status line; presumably the model's output rate — confirm.
    sample_rate = 24000

    kwargs = dict(num_step=int(num_step), guidance_scale=guidance_scale, denoise=denoise)

    if language and language != "Auto":
        kwargs["language"] = language

    # A positive duration takes precedence over the speed slider.
    if duration and duration > 0:
        kwargs["duration"] = duration
    else:
        kwargs["speed"] = speed

    if postprocess:
        kwargs["postprocess_output"] = True

    try:
        audio = model.generate(
            text=text,
            ref_audio=ref_audio,
            ref_text=ref_text if ref_text and ref_text.strip() else None,
            **kwargs,
        )
        waveform = audio[0]

        # Reserve the output path and close our own handle before the file is
        # written by name: reopening a still-open named temporary file is not
        # portable across platforms.
        fd, wav_path = tempfile.mkstemp(suffix=".wav")
        os.close(fd)
        try:
            torchaudio.save(wav_path, waveform, sample_rate)
            status = f"生成完了({waveform.shape[1] / sample_rate:.1f}秒)"
        except Exception:
            # Nothing will ever reference this path, so do not leave an
            # orphaned temp file behind when saving fails.
            if os.path.exists(wav_path):
                os.remove(wav_path)
            raise
        return wav_path, status
    except Exception as e:
        # UI boundary: report the failure in the status box instead of raising.
        return None, f"エラー: {e}"


# ─── UI ───
# Stylesheet for the page: centres the title/subtitle rendered by the gr.HTML
# headers below and hides the page footer.
CSS = """
.main-title { text-align: center; font-size: 1.8em; font-weight: 800; margin-bottom: 0; }
.subtitle { text-align: center; color: #888; font-size: 0.9em; margin-bottom: 1em; }
footer { display: none !important; }
"""

# CSS has to be handed to Gradio explicitly; without it the .main-title /
# .subtitle classes used below and the footer rule have no effect.
# NOTE(review): some newer Gradio releases take app-level options such as css
# on launch() instead — confirm against the Gradio version this app is pinned to.
with gr.Blocks(title="OmniVoice", css=CSS) as app:
    gr.HTML("<h1 class='main-title'>OmniVoice</h1>")
    gr.HTML("<p class='subtitle'>AI Voice Generator — Personal</p>")

    with gr.Tabs():
        # ── Voice Design / Auto ──
        with gr.Tab("Voice Design"):
            with gr.Row():
                # Left column: inputs.
                with gr.Column(scale=1):
                    d_text = gr.Textbox(label="読み上げテキスト", lines=4,
                                        placeholder="テキストを入力...")
                    d_mode = gr.Radio(["Auto", "Voice Design"], value="Auto", label="モード")
                    d_lang = gr.Dropdown(["Auto", "Japanese", "English", "Korean"],
                                         value="Auto", label="言語")

                    # Voice attribute pickers; hidden until "Voice Design" mode
                    # is selected (see d_mode.change below).
                    with gr.Group(visible=False) as d_voice_opts:
                        with gr.Row():
                            d_gender = gr.Dropdown(["Auto", "Female", "Male"],
                                                    value="Auto", label="性別")
                            d_age = gr.Dropdown(["Auto", "Child", "Young", "Middle-aged", "Elderly"],
                                                 value="Auto", label="年齢")
                        with gr.Row():
                            d_pitch = gr.Dropdown(
                                ["Auto", "Very low", "Low", "Moderate", "High", "Very high"],
                                value="Auto", label="ピッチ")
                            d_style = gr.Dropdown(["Auto", "Whisper"],
                                                   value="Auto", label="スタイル")

                    d_speed = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="速度")

                    with gr.Accordion("詳細設定", open=False):
                        d_duration = gr.Number(value=0, label="Duration(秒)",
                                               info="0で自動。設定するとSpeedは無視")
                        d_steps = gr.Slider(4, 64, value=32, step=1, label="Inference Steps")
                        d_cfg = gr.Slider(0.5, 5.0, value=2.0, step=0.1, label="Guidance Scale")
                        d_denoise = gr.Checkbox(value=True, label="Denoise")
                        d_postprocess = gr.Checkbox(value=True, label="Postprocess(無音除去)")

                    d_btn = gr.Button("音声を生成", variant="primary", size="lg")

                # Right column: outputs.
                with gr.Column(scale=1):
                    d_audio = gr.Audio(label="生成結果", type="filepath")
                    d_status = gr.Textbox(label="ステータス", interactive=False)

            # Show the voice attribute group only in "Voice Design" mode.
            d_mode.change(
                fn=lambda m: gr.update(visible=m == "Voice Design"),
                inputs=d_mode, outputs=d_voice_opts,
            )
            # Input order must match the generate_design parameter order.
            d_btn.click(
                fn=generate_design,
                inputs=[d_text, d_mode, d_lang, d_gender, d_age, d_pitch, d_style,
                        d_speed, d_duration, d_steps, d_cfg, d_denoise, d_postprocess],
                outputs=[d_audio, d_status],
            )

        # ── Voice Clone ──
        with gr.Tab("Voice Clone"):
            with gr.Row():
                # Left column: inputs.
                with gr.Column(scale=1):
                    c_text = gr.Textbox(label="読み上げテキスト", lines=4,
                                        placeholder="この声で読み上げたいテキスト...")
                    c_ref = gr.Audio(label="リファレンス音声(3〜15秒)", type="filepath")
                    c_ref_text = gr.Textbox(label="書き起こし(任意)", lines=2,
                                             placeholder="省略すると自動書き起こし")
                    c_lang = gr.Dropdown(["Auto", "Japanese", "English", "Korean"],
                                          value="Auto", label="言語")
                    c_speed = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="速度")

                    with gr.Accordion("詳細設定", open=False):
                        # Same hint as the Voice Design tab: generate_clone also
                        # ignores speed when a positive duration is given.
                        c_duration = gr.Number(value=0, label="Duration(秒)",
                                               info="0で自動。設定するとSpeedは無視")
                        c_steps = gr.Slider(4, 64, value=32, step=1, label="Inference Steps")
                        c_cfg = gr.Slider(0.5, 5.0, value=2.0, step=0.1, label="Guidance Scale")
                        c_denoise = gr.Checkbox(value=True, label="Denoise")
                        c_postprocess = gr.Checkbox(value=True, label="Postprocess(無音除去)")

                    c_btn = gr.Button("音声を生成", variant="primary", size="lg")

                # Right column: outputs.
                with gr.Column(scale=1):
                    c_audio = gr.Audio(label="生成結果", type="filepath")
                    c_status = gr.Textbox(label="ステータス", interactive=False)

            # Input order must match the generate_clone parameter order.
            c_btn.click(
                fn=generate_clone,
                inputs=[c_text, c_ref, c_ref_text, c_lang, c_speed,
                        c_duration, c_steps, c_cfg, c_denoise, c_postprocess],
                outputs=[c_audio, c_status],
            )

# Start the Gradio server only when this file is executed as a script, so that
# importing the module does not launch the UI.
if __name__ == "__main__":
    app.launch()