omnivoice-personal

Running

App Files Files Community

yoshinishii committed on Apr 3

Commit

553380f

verified ·

1 Parent(s): 59a4dfe

Upload folder using huggingface_hub

Browse files

Files changed (3) hide show

README.md +7 -7
app.py +197 -0
requirements.txt +4 -0

README.md CHANGED Viewed

@@ -1,12 +1,12 @@
 ---
-title: Omnivoice Personal
-emoji: 🏆
-colorFrom: yellow
-colorTo: pink
 sdk: gradio
-sdk_version: 6.11.0
 app_file: app.py
 pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: OmniVoice Personal
+emoji: 🎙️
+colorFrom: red
+colorTo: red
 sdk: gradio
+sdk_version: "6.10.0"
 app_file: app.py
 pinned: false
+license: apache-2.0
+suggested_hardware: zero-a10g
 ---

app.py ADDED Viewed

	@@ -0,0 +1,197 @@

+import gradio as gr
+import torch
+import torchaudio
+import os
+import tempfile
+import spaces
+from datetime import datetime
+from omnivoice import OmniVoice
+# ─── Model ───
+print("モデルを読み込み中...")
+device = "cuda" if torch.cuda.is_available() else "cpu"
+dtype = torch.float16 if device == "cuda" else torch.float32
+model = OmniVoice.from_pretrained("k2-fsa/OmniVoice", device_map=device, dtype=dtype)
+print(f"モデル読み込み完了（{device}）")
+def _build_instruct(gender, age, pitch, style):
+    parts = []
+    if gender and gender != "Auto":
+        parts.append(gender.lower())
+    if age and age != "Auto":
+        parts.append(age.lower())
+    if pitch and pitch != "Auto":
+        parts.append(f"{pitch.lower()} pitch")
+    if style and style != "Auto":
+        parts.append(style.lower())
+    return ", ".join(parts) if parts else None
+# ─── Voice Design / Auto ───
+@spaces.GPU
+def generate_design(text, mode, language, gender, age, pitch, style,
+                    speed, duration, num_step, guidance_scale, denoise, postprocess):
+    if not text or not text.strip():
+        return None, "テキストを入力してください。"
+    kwargs = dict(num_step=int(num_step), guidance_scale=guidance_scale, denoise=denoise)
+    if language and language != "Auto":
+        kwargs["language"] = language
+    if mode == "Voice Design":
+        instruct = _build_instruct(gender, age, pitch, style)
+        if instruct:
+            kwargs["instruct"] = instruct
+    if duration and duration > 0:
+        kwargs["duration"] = duration
+    else:
+        kwargs["speed"] = speed
+    if postprocess:
+        kwargs["postprocess_output"] = True
+    try:
+        audio = model.generate(text=text, **kwargs)
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
+            torchaudio.save(f.name, audio[0], 24000)
+            return f.name, f"生成完了（{audio[0].shape[1]/24000:.1f}秒）"
+    except Exception as e:
+        return None, f"エラー: {e}"
+# ─── Voice Clone ───
+@spaces.GPU
+def generate_clone(text, ref_audio, ref_text, language, speed, duration,
+                   num_step, guidance_scale, denoise, postprocess):
+    if not text or not text.strip():
+        return None, "テキストを入力してください。"
+    if ref_audio is None:
+        return None, "リファレンス音声をアップロードしてください。"
+    kwargs = dict(num_step=int(num_step), guidance_scale=guidance_scale, denoise=denoise)
+    if language and language != "Auto":
+        kwargs["language"] = language
+    if duration and duration > 0:
+        kwargs["duration"] = duration
+    else:
+        kwargs["speed"] = speed
+    if postprocess:
+        kwargs["postprocess_output"] = True
+    try:
+        audio = model.generate(
+            text=text,
+            ref_audio=ref_audio,
+            ref_text=ref_text if ref_text and ref_text.strip() else None,
+            **kwargs,
+        )
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
+            torchaudio.save(f.name, audio[0], 24000)
+            return f.name, f"生成完了（{audio[0].shape[1]/24000:.1f}秒）"
+    except Exception as e:
+        return None, f"エラー: {e}"
+# ─── UI ───
+# Custom stylesheet for the page: styles the elements carrying the
+# "main-title" and "subtitle" classes (used by the gr.HTML header below)
+# and hides the page footer.
+CSS = """
+.main-title { text-align: center; font-size: 1.8em; font-weight: 800; margin-bottom: 0; }
+.subtitle { text-align: center; color: #888; font-size: 0.9em; margin-bottom: 1em; }
+footer { display: none !important; }
+"""
+# Two-tab interface: "Voice Design" calls generate_design and "Voice Clone"
+# calls generate_clone. Each tab has inputs in the left column and the
+# generated audio plus a read-only status box in the right column.
+# Component labels are user-facing Japanese strings.
+with gr.Blocks(title="OmniVoice") as app:
+    # Page header; the class names match the selectors in CSS.
+    gr.HTML("<h1 class='main-title'>OmniVoice</h1>")
+    gr.HTML("<p class='subtitle'>AI Voice Generator — Personal</p>")
+    with gr.Tabs():
+        # ── Voice Design / Auto ──
+        with gr.Tab("Voice Design"):
+            with gr.Row():
+                with gr.Column(scale=1):
+                    # Text to synthesize.
+                    d_text = gr.Textbox(label="読み上げテキスト", lines=4,
+                                        placeholder="テキストを入力...")
+                    d_mode = gr.Radio(["Auto", "Voice Design"], value="Auto", label="モード")
+                    d_lang = gr.Dropdown(["Auto", "Japanese", "English", "Korean"],
+                                         value="Auto", label="言語")
+                    # Voice-attribute pickers (gender, age, pitch, style); hidden
+                    # initially and toggled by the d_mode.change handler below.
+                    with gr.Group(visible=False) as d_voice_opts:
+                        with gr.Row():
+                            d_gender = gr.Dropdown(["Auto", "Female", "Male"],
+                                                    value="Auto", label="性別")
+                            d_age = gr.Dropdown(["Auto", "Child", "Young", "Middle-aged", "Elderly"],
+                                                 value="Auto", label="年齢")
+                        with gr.Row():
+                            d_pitch = gr.Dropdown(
+                                ["Auto", "Very low", "Low", "Moderate", "High", "Very high"],
+                                value="Auto", label="ピッチ")
+                            d_style = gr.Dropdown(["Auto", "Whisper"],
+                                                   value="Auto", label="スタイル")
+                    d_speed = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="速度")
+                    # Advanced settings, collapsed by default.
+                    with gr.Accordion("詳細設定", open=False):
+                        # Info text says: 0 means automatic; when set, Speed is ignored.
+                        d_duration = gr.Number(value=0, label="Duration（秒）",
+                                               info="0で自動。設定するとSpeedは無視")
+                        d_steps = gr.Slider(4, 64, value=32, step=1, label="Inference Steps")
+                        d_cfg = gr.Slider(0.5, 5.0, value=2.0, step=0.1, label="Guidance Scale")
+                        d_denoise = gr.Checkbox(value=True, label="Denoise")
+                        d_postprocess = gr.Checkbox(value=True, label="Postprocess（無音除去）")
+                    d_btn = gr.Button("音声を生成", variant="primary", size="lg")
+                with gr.Column(scale=1):
+                    # Outputs: generated audio file and a read-only status line.
+                    d_audio = gr.Audio(label="生成結果", type="filepath")
+                    d_status = gr.Textbox(label="ステータス", interactive=False)
+            # Show the voice-attribute group only while "Voice Design" is selected.
+            d_mode.change(
+                fn=lambda m: gr.update(visible=m == "Voice Design"),
+                inputs=d_mode, outputs=d_voice_opts,
+            )
+            # The inputs list is positional: its order must match the
+            # parameter order of generate_design.
+            d_btn.click(
+                fn=generate_design,
+                inputs=[d_text, d_mode, d_lang, d_gender, d_age, d_pitch, d_style,
+                        d_speed, d_duration, d_steps, d_cfg, d_denoise, d_postprocess],
+                outputs=[d_audio, d_status],
+            )
+        # ── Voice Clone ──
+        with gr.Tab("Voice Clone"):
+            with gr.Row():
+                with gr.Column(scale=1):
+                    # Text to synthesize in the reference voice.
+                    c_text = gr.Textbox(label="読み上げテキスト", lines=4,
+                                        placeholder="この声で読み上げたいテキスト...")
+                    # Reference recording, passed to generate_clone as a file path
+                    # (the label asks for 3 to 15 seconds).
+                    c_ref = gr.Audio(label="リファレンス音声（3〜15秒）", type="filepath")
+                    # Optional transcript of the reference recording.
+                    c_ref_text = gr.Textbox(label="書き起こし（任意）", lines=2,
+                                             placeholder="省略すると自動書き起こし")
+                    c_lang = gr.Dropdown(["Auto", "Japanese", "English", "Korean"],
+                                          value="Auto", label="言語")
+                    c_speed = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="速度")
+                    # Advanced settings, collapsed by default.
+                    with gr.Accordion("詳細設定", open=False):
+                        c_duration = gr.Number(value=0, label="Duration（秒）")
+                        c_steps = gr.Slider(4, 64, value=32, step=1, label="Inference Steps")
+                        c_cfg = gr.Slider(0.5, 5.0, value=2.0, step=0.1, label="Guidance Scale")
+                        c_denoise = gr.Checkbox(value=True, label="Denoise")
+                        c_postprocess = gr.Checkbox(value=True, label="Postprocess（無音除去）")
+                    c_btn = gr.Button("音声を生成", variant="primary", size="lg")
+                with gr.Column(scale=1):
+                    # Outputs: generated audio file and a read-only status line.
+                    c_audio = gr.Audio(label="生成結果", type="filepath")
+                    c_status = gr.Textbox(label="ステータス", interactive=False)
+            # The inputs list is positional: its order must match the
+            # parameter order of generate_clone.
+            c_btn.click(
+                fn=generate_clone,
+                inputs=[c_text, c_ref, c_ref_text, c_lang, c_speed,
+                        c_duration, c_steps, c_cfg, c_denoise, c_postprocess],
+                outputs=[c_audio, c_status],
+            )
+if __name__ == "__main__":
+    app.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+omnivoice
+torch
+torchaudio
+gradio