# Page-header residue from the hosting site, kept for reference only:
# Spaces: Running | File size: 8,582 Bytes | 553380f 68baf64 553380f
import gradio as gr
import torch
import torchaudio
import os
import tempfile
import spaces
from datetime import datetime
from omnivoice import OmniVoice
# ─── Model ───
# Load the OmniVoice checkpoint once at import time so every request reuses it.
print("モデルを読み込み中...")

# Half precision on GPU, full precision on CPU.
# NOTE(review): the checkpoint name says "bf16" while the CUDA path requests
# float16 — confirm this is the intended dtype.
if torch.cuda.is_available():
    device, dtype = "cuda", torch.float16
else:
    device, dtype = "cpu", torch.float32

model = OmniVoice.from_pretrained(
    "drbaph/OmniVoice-bf16",
    device_map=device,
    dtype=dtype,
)
print(f"モデル読み込み完了({device})")
def _build_instruct(gender, age, pitch, style):
parts = []
if gender and gender != "Auto":
parts.append(gender.lower())
if age and age != "Auto":
parts.append(age.lower())
if pitch and pitch != "Auto":
parts.append(f"{pitch.lower()} pitch")
if style and style != "Auto":
parts.append(style.lower())
return ", ".join(parts) if parts else None
# ─── Voice Design / Auto ───
@spaces.GPU
def generate_design(text, mode, language, gender, age, pitch, style,
                    speed, duration, num_step, guidance_scale, denoise, postprocess):
    """Synthesize speech for ``text`` in "Auto" or "Voice Design" mode.

    Args:
        text: Text to read aloud; blank input is rejected with a message.
        mode: ``"Voice Design"`` enables the gender/age/pitch/style
            instruction; any other value generates without one.
        language: Language name, or ``"Auto"``/empty to omit the argument.
        gender, age, pitch, style: Voice attributes, used only in
            "Voice Design" mode.
        speed: Speaking-speed factor, used only when no duration is given.
        duration: Target length; a positive value replaces ``speed``.
        num_step: Inference step count (cast to ``int``).
        guidance_scale: Passed through to ``model.generate``.
        denoise: Passed through to ``model.generate``.
        postprocess: When truthy, sets ``postprocess_output=True``.

    Returns:
        ``(wav_path, status_message)`` on success, or ``(None, message)``
        when the input is blank or generation fails.
    """
    if not text or not text.strip():
        return None, "テキストを入力してください。"

    # Sample rate used both for writing the file and for the length readout.
    sample_rate = 24000

    kwargs = dict(num_step=int(num_step), guidance_scale=guidance_scale, denoise=denoise)
    if language and language != "Auto":
        kwargs["language"] = language
    if mode == "Voice Design":
        instruct = _build_instruct(gender, age, pitch, style)
        if instruct:
            kwargs["instruct"] = instruct
    # An explicit positive duration takes precedence over the speed factor.
    if duration and duration > 0:
        kwargs["duration"] = duration
    else:
        kwargs["speed"] = speed
    if postprocess:
        kwargs["postprocess_output"] = True

    wav_path = None
    try:
        audio = model.generate(text=text, **kwargs)
        # assumes audio[0] is a (channels, samples) tensor — TODO confirm
        waveform = audio[0]
        # Reserve a unique path, then close the handle before torchaudio
        # writes to it so the file is not open twice.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            wav_path = f.name
        torchaudio.save(wav_path, waveform, sample_rate)
        return wav_path, f"生成完了({waveform.shape[1]/sample_rate:.1f}秒)"
    except Exception as e:
        # UI boundary: report the failure in the status box instead of
        # raising, and do not leave an orphaned temp file behind.
        if wav_path is not None:
            try:
                os.remove(wav_path)
            except OSError:
                pass
        return None, f"エラー: {e}"
# ─── Voice Clone ───
@spaces.GPU
def generate_clone(text, ref_audio, ref_text, language, speed, duration,
                   num_step, guidance_scale, denoise, postprocess):
    """Synthesize speech for ``text`` using a reference recording's voice.

    Args:
        text: Text to read aloud; blank input is rejected with a message.
        ref_audio: Reference audio (the UI supplies a file path); required.
        ref_text: Optional transcript of the reference audio; blank or
            missing values are passed to the model as ``None``.
        language: Language name, or ``"Auto"``/empty to omit the argument.
        speed: Speaking-speed factor, used only when no duration is given.
        duration: Target length; a positive value replaces ``speed``.
        num_step: Inference step count (cast to ``int``).
        guidance_scale: Passed through to ``model.generate``.
        denoise: Passed through to ``model.generate``.
        postprocess: When truthy, sets ``postprocess_output=True``.

    Returns:
        ``(wav_path, status_message)`` on success, or ``(None, message)``
        when an input is missing or generation fails.
    """
    if not text or not text.strip():
        return None, "テキストを入力してください。"
    if ref_audio is None:
        return None, "リファレンス音声をアップロードしてください。"

    # Sample rate used both for writing the file and for the length readout.
    sample_rate = 24000

    kwargs = dict(num_step=int(num_step), guidance_scale=guidance_scale, denoise=denoise)
    if language and language != "Auto":
        kwargs["language"] = language
    # An explicit positive duration takes precedence over the speed factor.
    if duration and duration > 0:
        kwargs["duration"] = duration
    else:
        kwargs["speed"] = speed
    if postprocess:
        kwargs["postprocess_output"] = True

    # Treat a whitespace-only transcript the same as no transcript.
    transcript = ref_text if ref_text and ref_text.strip() else None

    wav_path = None
    try:
        audio = model.generate(
            text=text,
            ref_audio=ref_audio,
            ref_text=transcript,
            **kwargs,
        )
        # assumes audio[0] is a (channels, samples) tensor — TODO confirm
        waveform = audio[0]
        # Reserve a unique path, then close the handle before torchaudio
        # writes to it so the file is not open twice.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            wav_path = f.name
        torchaudio.save(wav_path, waveform, sample_rate)
        return wav_path, f"生成完了({waveform.shape[1]/sample_rate:.1f}秒)"
    except Exception as e:
        # UI boundary: report the failure in the status box instead of
        # raising, and do not leave an orphaned temp file behind.
        if wav_path is not None:
            try:
                os.remove(wav_path)
            except OSError:
                pass
        return None, f"エラー: {e}"
# ─── UI ───
# Custom stylesheet for the page: centers and sizes the `.main-title` and
# `.subtitle` elements (the class names used by the header HTML) and hides
# the page footer.
# NOTE(review): this string only takes effect if it is handed to Gradio as
# custom CSS — confirm it is actually passed where the app is built/launched.
CSS = """
.main-title { text-align: center; font-size: 1.8em; font-weight: 800; margin-bottom: 0; }
.subtitle { text-align: center; color: #888; font-size: 0.9em; margin-bottom: 1em; }
footer { display: none !important; }
"""
# Build the two-tab Gradio interface. The module-level CSS string is passed
# to Blocks so the title/subtitle styling and the footer hiding are applied;
# without it the stylesheet is defined but never used.
# NOTE(review): on Gradio 6+ `css` is expected on launch() instead of
# Blocks() — confirm against the installed Gradio version.
with gr.Blocks(title="OmniVoice", css=CSS) as app:
    gr.HTML("<h1 class='main-title'>OmniVoice</h1>")
    gr.HTML("<p class='subtitle'>AI Voice Generator — Personal</p>")

    with gr.Tabs():
        # ── Voice Design / Auto ──
        with gr.Tab("Voice Design"):
            with gr.Row():
                # Left column: inputs and generation settings.
                with gr.Column(scale=1):
                    d_text = gr.Textbox(label="読み上げテキスト", lines=4,
                                        placeholder="テキストを入力...")
                    d_mode = gr.Radio(["Auto", "Voice Design"], value="Auto", label="モード")
                    d_lang = gr.Dropdown(["Auto", "Japanese", "English", "Korean"],
                                         value="Auto", label="言語")
                    # Voice attributes stay hidden until "Voice Design" is selected.
                    with gr.Group(visible=False) as d_voice_opts:
                        with gr.Row():
                            d_gender = gr.Dropdown(["Auto", "Female", "Male"],
                                                   value="Auto", label="性別")
                            d_age = gr.Dropdown(["Auto", "Child", "Young", "Middle-aged", "Elderly"],
                                                value="Auto", label="年齢")
                        with gr.Row():
                            d_pitch = gr.Dropdown(
                                ["Auto", "Very low", "Low", "Moderate", "High", "Very high"],
                                value="Auto", label="ピッチ")
                            d_style = gr.Dropdown(["Auto", "Whisper"],
                                                  value="Auto", label="スタイル")
                    d_speed = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="速度")
                    with gr.Accordion("詳細設定", open=False):
                        d_duration = gr.Number(value=0, label="Duration(秒)",
                                               info="0で自動。設定するとSpeedは無視")
                        d_steps = gr.Slider(4, 64, value=32, step=1, label="Inference Steps")
                        d_cfg = gr.Slider(0.5, 5.0, value=2.0, step=0.1, label="Guidance Scale")
                        d_denoise = gr.Checkbox(value=True, label="Denoise")
                        d_postprocess = gr.Checkbox(value=True, label="Postprocess(無音除去)")
                    d_btn = gr.Button("音声を生成", variant="primary", size="lg")
                # Right column: generated audio and status message.
                with gr.Column(scale=1):
                    d_audio = gr.Audio(label="生成結果", type="filepath")
                    d_status = gr.Textbox(label="ステータス", interactive=False)

            # Show the voice-attribute group only in "Voice Design" mode.
            d_mode.change(
                fn=lambda m: gr.update(visible=m == "Voice Design"),
                inputs=d_mode, outputs=d_voice_opts,
            )
            # Input order must match generate_design's parameter order.
            d_btn.click(
                fn=generate_design,
                inputs=[d_text, d_mode, d_lang, d_gender, d_age, d_pitch, d_style,
                        d_speed, d_duration, d_steps, d_cfg, d_denoise, d_postprocess],
                outputs=[d_audio, d_status],
            )

        # ── Voice Clone ──
        with gr.Tab("Voice Clone"):
            with gr.Row():
                # Left column: text, reference audio and generation settings.
                with gr.Column(scale=1):
                    c_text = gr.Textbox(label="読み上げテキスト", lines=4,
                                        placeholder="この声で読み上げたいテキスト...")
                    c_ref = gr.Audio(label="リファレンス音声(3〜15秒)", type="filepath")
                    c_ref_text = gr.Textbox(label="書き起こし(任意)", lines=2,
                                            placeholder="省略すると自動書き起こし")
                    c_lang = gr.Dropdown(["Auto", "Japanese", "English", "Korean"],
                                         value="Auto", label="言語")
                    c_speed = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="速度")
                    with gr.Accordion("詳細設定", open=False):
                        c_duration = gr.Number(value=0, label="Duration(秒)")
                        c_steps = gr.Slider(4, 64, value=32, step=1, label="Inference Steps")
                        c_cfg = gr.Slider(0.5, 5.0, value=2.0, step=0.1, label="Guidance Scale")
                        c_denoise = gr.Checkbox(value=True, label="Denoise")
                        c_postprocess = gr.Checkbox(value=True, label="Postprocess(無音除去)")
                    c_btn = gr.Button("音声を生成", variant="primary", size="lg")
                # Right column: generated audio and status message.
                with gr.Column(scale=1):
                    c_audio = gr.Audio(label="生成結果", type="filepath")
                    c_status = gr.Textbox(label="ステータス", interactive=False)

            # Input order must match generate_clone's parameter order.
            c_btn.click(
                fn=generate_clone,
                inputs=[c_text, c_ref, c_ref_text, c_lang, c_speed,
                        c_duration, c_steps, c_cfg, c_denoise, c_postprocess],
                outputs=[c_audio, c_status],
            )
# Start the Gradio server only when this file is executed as a script.
if __name__ == "__main__":
    app.launch()