omnivoice-personal

Running

File size: 8,582 Bytes

import gradio as gr
import torch
import torchaudio
import os
import tempfile
import spaces
from datetime import datetime
from omnivoice import OmniVoice

# ─── Model ───
print("モデルを読み込み中...")
# Pick the execution device once; half precision is used on GPU and full
# precision on CPU.
if torch.cuda.is_available():
    device, dtype = "cuda", torch.float16
else:
    device, dtype = "cpu", torch.float32
# Load the pretrained checkpoint a single time at import so every request
# handler below shares the same `model` instance.
model = OmniVoice.from_pretrained(
    "drbaph/OmniVoice-bf16",
    device_map=device,
    dtype=dtype,
)
print(f"モデル読み込み完了（{device}）")


def _build_instruct(gender, age, pitch, style):
    parts = []
    if gender and gender != "Auto":
        parts.append(gender.lower())
    if age and age != "Auto":
        parts.append(age.lower())
    if pitch and pitch != "Auto":
        parts.append(f"{pitch.lower()} pitch")
    if style and style != "Auto":
        parts.append(style.lower())
    return ", ".join(parts) if parts else None


# ─── Voice Design / Auto ───
@spaces.GPU
def generate_design(text, mode, language, gender, age, pitch, style,
                    speed, duration, num_step, guidance_scale, denoise, postprocess):
    """Synthesize speech for the "Voice Design" tab and save it as a WAV file.

    Args:
        text: Text to read aloud; blank or whitespace-only text is rejected.
        mode: "Voice Design" enables the gender/age/pitch/style instruction;
            any other value leaves the voice unspecified.
        language: Language name, or "Auto"/empty to omit the argument.
        gender, age, pitch, style: Voice attributes, only consulted when
            ``mode == "Voice Design"``.
        speed: Speaking-rate value, used only when no positive ``duration``
            is given.
        duration: Target length in seconds; a positive value takes
            precedence over ``speed``.
        num_step: Number of inference steps (cast to int).
        guidance_scale: Guidance scale forwarded to the model as-is.
        denoise: Forwarded to the model as-is.
        postprocess: When truthy, ``postprocess_output=True`` is passed.

    Returns:
        ``(path, status)`` where ``path`` is the generated WAV file path, or
        None on validation failure or error, and ``status`` is the
        user-facing message shown in the status box.
    """
    if not text or not text.strip():
        return None, "テキストを入力してください。"

    kwargs = dict(num_step=int(num_step), guidance_scale=guidance_scale, denoise=denoise)

    if language and language != "Auto":
        kwargs["language"] = language

    if mode == "Voice Design":
        instruct = _build_instruct(gender, age, pitch, style)
        if instruct:
            kwargs["instruct"] = instruct

    # An explicit duration overrides the speed slider.
    if duration and duration > 0:
        kwargs["duration"] = duration
    else:
        kwargs["speed"] = speed

    if postprocess:
        kwargs["postprocess_output"] = True

    out_path = None
    try:
        audio = model.generate(text=text, **kwargs)
        # Reserve a unique path, then close the handle *before* torchaudio
        # writes to it: reopening a still-open NamedTemporaryFile by name is
        # platform-dependent (it fails on Windows).
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            out_path = f.name
        torchaudio.save(out_path, audio[0], 24000)
        # assumes audio[0] is (channels, samples) at 24 kHz — TODO confirm
        return out_path, f"生成完了（{audio[0].shape[1]/24000:.1f}秒）"
    except Exception as e:
        # UI boundary: report the failure in the status box instead of
        # raising, and do not leave an orphaned temp file behind
        # (delete=False means nothing else would clean it up).
        if out_path is not None:
            try:
                os.remove(out_path)
            except OSError:
                pass
        return None, f"エラー: {e}"


# ─── Voice Clone ───
@spaces.GPU
def generate_clone(text, ref_audio, ref_text, language, speed, duration,
                   num_step, guidance_scale, denoise, postprocess):
    """Synthesize speech in the voice of a reference clip and save it as WAV.

    Args:
        text: Text to read aloud; blank or whitespace-only text is rejected.
        ref_audio: Reference audio (the UI supplies a file path); required.
        ref_text: Optional transcript of the reference audio. Blank input is
            passed to the model as None.
        language: Language name, or "Auto"/empty to omit the argument.
        speed: Speaking-rate value, used only when no positive ``duration``
            is given.
        duration: Target length in seconds; a positive value takes
            precedence over ``speed``.
        num_step: Number of inference steps (cast to int).
        guidance_scale: Guidance scale forwarded to the model as-is.
        denoise: Forwarded to the model as-is.
        postprocess: When truthy, ``postprocess_output=True`` is passed.

    Returns:
        ``(path, status)`` where ``path`` is the generated WAV file path, or
        None on validation failure or error, and ``status`` is the
        user-facing message shown in the status box.
    """
    if not text or not text.strip():
        return None, "テキストを入力してください。"
    if ref_audio is None:
        return None, "リファレンス音声をアップロードしてください。"

    kwargs = dict(num_step=int(num_step), guidance_scale=guidance_scale, denoise=denoise)

    if language and language != "Auto":
        kwargs["language"] = language

    # An explicit duration overrides the speed slider.
    if duration and duration > 0:
        kwargs["duration"] = duration
    else:
        kwargs["speed"] = speed

    if postprocess:
        kwargs["postprocess_output"] = True

    out_path = None
    try:
        audio = model.generate(
            text=text,
            ref_audio=ref_audio,
            ref_text=ref_text if ref_text and ref_text.strip() else None,
            **kwargs,
        )
        # Reserve a unique path, then close the handle *before* torchaudio
        # writes to it: reopening a still-open NamedTemporaryFile by name is
        # platform-dependent (it fails on Windows).
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            out_path = f.name
        torchaudio.save(out_path, audio[0], 24000)
        # assumes audio[0] is (channels, samples) at 24 kHz — TODO confirm
        return out_path, f"生成完了（{audio[0].shape[1]/24000:.1f}秒）"
    except Exception as e:
        # UI boundary: report the failure in the status box instead of
        # raising, and do not leave an orphaned temp file behind
        # (delete=False means nothing else would clean it up).
        if out_path is not None:
            try:
                os.remove(out_path)
            except OSError:
                pass
        return None, f"エラー: {e}"


# ─── UI ───
# Custom stylesheet: centers and sizes the `main-title` / `subtitle` classes
# used by the header HTML below, and hides the page's <footer> element.
CSS = """
.main-title { text-align: center; font-size: 1.8em; font-weight: 800; margin-bottom: 0; }
.subtitle { text-align: center; color: #888; font-size: 0.9em; margin-bottom: 1em; }
footer { display: none !important; }
"""

# Build the two-tab UI. CSS is passed explicitly: without it the
# `main-title` / `subtitle` classes and the footer rule defined above are
# never applied.
# NOTE(review): Gradio 6 moved `css` to launch() and warns when it is given
# here — confirm against the installed Gradio version.
with gr.Blocks(title="OmniVoice", css=CSS) as app:
    gr.HTML("<h1 class='main-title'>OmniVoice</h1>")
    gr.HTML("<p class='subtitle'>AI Voice Generator — Personal</p>")

    with gr.Tabs():
        # ── Voice Design / Auto ──
        with gr.Tab("Voice Design"):
            with gr.Row():
                # Left column: inputs and generation settings.
                with gr.Column(scale=1):
                    d_text = gr.Textbox(
                        label="読み上げテキスト",
                        lines=4,
                        placeholder="テキストを入力...",
                    )
                    d_mode = gr.Radio(["Auto", "Voice Design"], value="Auto", label="モード")
                    d_lang = gr.Dropdown(
                        ["Auto", "Japanese", "English", "Korean"],
                        value="Auto",
                        label="言語",
                    )

                    # Voice attribute pickers; hidden until the mode radio
                    # is switched to "Voice Design" (see d_mode.change).
                    with gr.Group(visible=False) as d_voice_opts:
                        with gr.Row():
                            d_gender = gr.Dropdown(
                                ["Auto", "Female", "Male"],
                                value="Auto",
                                label="性別",
                            )
                            d_age = gr.Dropdown(
                                ["Auto", "Child", "Young", "Middle-aged", "Elderly"],
                                value="Auto",
                                label="年齢",
                            )
                        with gr.Row():
                            d_pitch = gr.Dropdown(
                                ["Auto", "Very low", "Low", "Moderate", "High", "Very high"],
                                value="Auto",
                                label="ピッチ",
                            )
                            d_style = gr.Dropdown(
                                ["Auto", "Whisper"],
                                value="Auto",
                                label="スタイル",
                            )

                    d_speed = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="速度")

                    with gr.Accordion("詳細設定", open=False):
                        d_duration = gr.Number(
                            value=0,
                            label="Duration（秒）",
                            info="0で自動。設定するとSpeedは無視",
                        )
                        d_steps = gr.Slider(4, 64, value=32, step=1, label="Inference Steps")
                        d_cfg = gr.Slider(0.5, 5.0, value=2.0, step=0.1, label="Guidance Scale")
                        d_denoise = gr.Checkbox(value=True, label="Denoise")
                        d_postprocess = gr.Checkbox(value=True, label="Postprocess（無音除去）")

                    d_btn = gr.Button("音声を生成", variant="primary", size="lg")

                # Right column: generated audio and status message.
                with gr.Column(scale=1):
                    d_audio = gr.Audio(label="生成結果", type="filepath")
                    d_status = gr.Textbox(label="ステータス", interactive=False)

            # Show the voice attribute group only in "Voice Design" mode.
            d_mode.change(
                fn=lambda m: gr.update(visible=m == "Voice Design"),
                inputs=d_mode,
                outputs=d_voice_opts,
            )
            # Input order must match generate_design's parameter order.
            d_btn.click(
                fn=generate_design,
                inputs=[d_text, d_mode, d_lang, d_gender, d_age, d_pitch, d_style,
                        d_speed, d_duration, d_steps, d_cfg, d_denoise, d_postprocess],
                outputs=[d_audio, d_status],
            )

        # ── Voice Clone ──
        with gr.Tab("Voice Clone"):
            with gr.Row():
                # Left column: inputs and generation settings.
                with gr.Column(scale=1):
                    c_text = gr.Textbox(
                        label="読み上げテキスト",
                        lines=4,
                        placeholder="この声で読み上げたいテキスト...",
                    )
                    c_ref = gr.Audio(label="リファレンス音声（3〜15秒）", type="filepath")
                    c_ref_text = gr.Textbox(
                        label="書き起こし（任意）",
                        lines=2,
                        placeholder="省略すると自動書き起こし",
                    )
                    c_lang = gr.Dropdown(
                        ["Auto", "Japanese", "English", "Korean"],
                        value="Auto",
                        label="言語",
                    )
                    c_speed = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="速度")

                    with gr.Accordion("詳細設定", open=False):
                        # Same help text as the Voice Design tab:
                        # generate_clone applies the identical
                        # duration-over-speed rule.
                        c_duration = gr.Number(
                            value=0,
                            label="Duration（秒）",
                            info="0で自動。設定するとSpeedは無視",
                        )
                        c_steps = gr.Slider(4, 64, value=32, step=1, label="Inference Steps")
                        c_cfg = gr.Slider(0.5, 5.0, value=2.0, step=0.1, label="Guidance Scale")
                        c_denoise = gr.Checkbox(value=True, label="Denoise")
                        c_postprocess = gr.Checkbox(value=True, label="Postprocess（無音除去）")

                    c_btn = gr.Button("音声を生成", variant="primary", size="lg")

                # Right column: generated audio and status message.
                with gr.Column(scale=1):
                    c_audio = gr.Audio(label="生成結果", type="filepath")
                    c_status = gr.Textbox(label="ステータス", interactive=False)

            # Input order must match generate_clone's parameter order.
            c_btn.click(
                fn=generate_clone,
                inputs=[c_text, c_ref, c_ref_text, c_lang, c_speed,
                        c_duration, c_steps, c_cfg, c_denoise, c_postprocess],
                outputs=[c_audio, c_status],
            )

# Start the Gradio app only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    app.launch()