yoshinishii committed on
Commit
553380f
·
verified ·
1 Parent(s): 59a4dfe

Upload folder using huggingface_hub

Browse files
Files changed (3) hide show
  1. README.md +7 -7
  2. app.py +197 -0
  3. requirements.txt +4 -0
README.md CHANGED
@@ -1,12 +1,12 @@
1
  ---
2
- title: Omnivoice Personal
3
- emoji: 🏆
4
- colorFrom: yellow
5
- colorTo: pink
6
  sdk: gradio
7
- sdk_version: 6.11.0
8
  app_file: app.py
9
  pinned: false
 
 
10
  ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: OmniVoice Personal
3
+ emoji: 🎙️
4
+ colorFrom: red
5
+ colorTo: red
6
  sdk: gradio
7
+ sdk_version: "6.10.0"
8
  app_file: app.py
9
  pinned: false
10
+ license: apache-2.0
11
+ suggested_hardware: zero-a10g
12
  ---
 
 
app.py ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ import torchaudio
4
+ import os
5
+ import tempfile
6
+ import spaces
7
+ from datetime import datetime
8
+ from omnivoice import OmniVoice
9
+
10
# ─── Model ───
# The model is loaded once at import time and shared by every request handler.
print("モデルを読み込み中...")
if torch.cuda.is_available():
    # Half precision is only selected when a CUDA device is reported.
    device, dtype = "cuda", torch.float16
else:
    device, dtype = "cpu", torch.float32
model = OmniVoice.from_pretrained("k2-fsa/OmniVoice", device_map=device, dtype=dtype)
print(f"モデル読み込み完了({device})")
16
+
17
+
18
+ def _build_instruct(gender, age, pitch, style):
19
+ parts = []
20
+ if gender and gender != "Auto":
21
+ parts.append(gender.lower())
22
+ if age and age != "Auto":
23
+ parts.append(age.lower())
24
+ if pitch and pitch != "Auto":
25
+ parts.append(f"{pitch.lower()} pitch")
26
+ if style and style != "Auto":
27
+ parts.append(style.lower())
28
+ return ", ".join(parts) if parts else None
29
+
30
+
31
+ # ─── Voice Design / Auto ───
32
@spaces.GPU
def generate_design(text, mode, language, gender, age, pitch, style,
                    speed, duration, num_step, guidance_scale, denoise, postprocess):
    """Synthesize speech for the "Voice Design" tab and save it as a WAV file.

    Args:
        text: Text to read aloud; empty or whitespace-only text is rejected.
        mode: "Auto" or "Voice Design". Only in "Voice Design" mode are the
            gender/age/pitch/style selections turned into an ``instruct``
            string via ``_build_instruct``.
        language: Language name, or "Auto"/empty to omit the ``language`` kwarg.
        gender, age, pitch, style: Voice attribute selections ("Auto" = unset).
        speed: Speed value, forwarded only when no positive duration is given.
        duration: Target duration in seconds; a positive value takes
            precedence over ``speed``. 0/None falls back to ``speed``.
        num_step: Number of inference steps (cast to int).
        guidance_scale: Guidance scale forwarded to the model.
        denoise: Forwarded to the model as ``denoise``.
        postprocess: When truthy, ``postprocess_output=True`` is forwarded.

    Returns:
        ``(wav_path, status_message)`` on success, ``(None, message)`` when
        the input is rejected or generation fails.
    """
    import traceback

    if not text or not text.strip():
        return None, "テキストを入力してください。"

    sample_rate = 24000  # rate used both for saving and for the reported length

    kwargs = dict(num_step=int(num_step), guidance_scale=guidance_scale, denoise=denoise)

    if language and language != "Auto":
        kwargs["language"] = language

    if mode == "Voice Design":
        instruct = _build_instruct(gender, age, pitch, style)
        if instruct:
            kwargs["instruct"] = instruct

    # An explicit positive duration overrides the speed setting.
    if duration and duration > 0:
        kwargs["duration"] = duration
    else:
        kwargs["speed"] = speed

    if postprocess:
        kwargs["postprocess_output"] = True

    out_path = None
    try:
        audio = model.generate(text=text, **kwargs)
        # assumes audio[0] is a (channels, samples) tensor — shape[1] is used
        # as the sample count below; TODO confirm against the model docs.
        waveform = audio[0]
        # Create the temp file and close its handle *before* saving: writing by
        # name to a still-open NamedTemporaryFile is not portable.
        fd, out_path = tempfile.mkstemp(suffix=".wav")
        os.close(fd)
        torchaudio.save(out_path, waveform, sample_rate)
        status = f"生成完了({waveform.shape[1]/sample_rate:.1f}秒)"
        return out_path, status
    except Exception as e:
        # UI boundary: report the error in the status box, but keep the
        # traceback in the logs and do not leave a half-written file behind.
        traceback.print_exc()
        if out_path is not None and os.path.exists(out_path):
            os.remove(out_path)
        return None, f"エラー: {e}"
63
+
64
+
65
+ # ─── Voice Clone ───
66
@spaces.GPU
def generate_clone(text, ref_audio, ref_text, language, speed, duration,
                   num_step, guidance_scale, denoise, postprocess):
    """Synthesize speech in the voice of a reference recording and save it as WAV.

    Args:
        text: Text to read aloud; empty or whitespace-only text is rejected.
        ref_audio: Reference audio forwarded to the model as ``ref_audio``;
            ``None`` is rejected.
        ref_text: Optional transcript of the reference audio; blank values are
            passed to the model as ``None``.
        language: Language name, or "Auto"/empty to omit the ``language`` kwarg.
        speed: Speed value, forwarded only when no positive duration is given.
        duration: Target duration in seconds; a positive value takes
            precedence over ``speed``. 0/None falls back to ``speed``.
        num_step: Number of inference steps (cast to int).
        guidance_scale: Guidance scale forwarded to the model.
        denoise: Forwarded to the model as ``denoise``.
        postprocess: When truthy, ``postprocess_output=True`` is forwarded.

    Returns:
        ``(wav_path, status_message)`` on success, ``(None, message)`` when
        the input is rejected or generation fails.
    """
    import traceback

    if not text or not text.strip():
        return None, "テキストを入力してください。"
    if ref_audio is None:
        return None, "リファレンス音声をアップロードしてください。"

    sample_rate = 24000  # rate used both for saving and for the reported length

    kwargs = dict(num_step=int(num_step), guidance_scale=guidance_scale, denoise=denoise)

    if language and language != "Auto":
        kwargs["language"] = language

    # An explicit positive duration overrides the speed setting.
    if duration and duration > 0:
        kwargs["duration"] = duration
    else:
        kwargs["speed"] = speed

    if postprocess:
        kwargs["postprocess_output"] = True

    out_path = None
    try:
        audio = model.generate(
            text=text,
            ref_audio=ref_audio,
            ref_text=ref_text if ref_text and ref_text.strip() else None,
            **kwargs,
        )
        # assumes audio[0] is a (channels, samples) tensor — shape[1] is used
        # as the sample count below; TODO confirm against the model docs.
        waveform = audio[0]
        # Create the temp file and close its handle *before* saving: writing by
        # name to a still-open NamedTemporaryFile is not portable.
        fd, out_path = tempfile.mkstemp(suffix=".wav")
        os.close(fd)
        torchaudio.save(out_path, waveform, sample_rate)
        status = f"生成完了({waveform.shape[1]/sample_rate:.1f}秒)"
        return out_path, status
    except Exception as e:
        # UI boundary: report the error in the status box, but keep the
        # traceback in the logs and do not leave a half-written file behind.
        traceback.print_exc()
        if out_path is not None and os.path.exists(out_path):
            os.remove(out_path)
        return None, f"エラー: {e}"
99
+
100
+
101
# ─── UI ───
# Page-level stylesheet for the title/subtitle rendered through gr.HTML below;
# it also hides the default footer. It is applied in launch() at the bottom.
CSS = """
.main-title { text-align: center; font-size: 1.8em; font-weight: 800; margin-bottom: 0; }
.subtitle { text-align: center; color: #888; font-size: 0.9em; margin-bottom: 1em; }
footer { display: none !important; }
"""

with gr.Blocks(title="OmniVoice") as app:
    gr.HTML("<h1 class='main-title'>OmniVoice</h1>")
    gr.HTML("<p class='subtitle'>AI Voice Generator — Personal</p>")

    with gr.Tabs():
        # ── Voice Design / Auto ──
        with gr.Tab("Voice Design"):
            with gr.Row():
                # Left column: inputs and generation settings.
                with gr.Column(scale=1):
                    d_text = gr.Textbox(label="読み上げテキスト", lines=4,
                                        placeholder="テキストを入力...")
                    d_mode = gr.Radio(["Auto", "Voice Design"], value="Auto", label="モード")
                    d_lang = gr.Dropdown(["Auto", "Japanese", "English", "Korean"],
                                         value="Auto", label="言語")

                    # Voice attribute selectors; hidden until "Voice Design"
                    # is chosen (see d_mode.change below).
                    with gr.Group(visible=False) as d_voice_opts:
                        with gr.Row():
                            d_gender = gr.Dropdown(["Auto", "Female", "Male"],
                                                   value="Auto", label="性別")
                            d_age = gr.Dropdown(["Auto", "Child", "Young", "Middle-aged", "Elderly"],
                                                value="Auto", label="年齢")
                        with gr.Row():
                            d_pitch = gr.Dropdown(
                                ["Auto", "Very low", "Low", "Moderate", "High", "Very high"],
                                value="Auto", label="ピッチ")
                            d_style = gr.Dropdown(["Auto", "Whisper"],
                                                  value="Auto", label="スタイル")

                    d_speed = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="速度")

                    with gr.Accordion("詳細設定", open=False):
                        d_duration = gr.Number(value=0, label="Duration(秒)",
                                               info="0で自動。設定するとSpeedは無視")
                        d_steps = gr.Slider(4, 64, value=32, step=1, label="Inference Steps")
                        d_cfg = gr.Slider(0.5, 5.0, value=2.0, step=0.1, label="Guidance Scale")
                        d_denoise = gr.Checkbox(value=True, label="Denoise")
                        d_postprocess = gr.Checkbox(value=True, label="Postprocess(無音除去)")

                    d_btn = gr.Button("音声を生成", variant="primary", size="lg")

                # Right column: generated audio and status message.
                with gr.Column(scale=1):
                    d_audio = gr.Audio(label="生成結果", type="filepath")
                    d_status = gr.Textbox(label="ステータス", interactive=False)

            # Show the voice attribute group only in "Voice Design" mode.
            d_mode.change(
                fn=lambda m: gr.update(visible=m == "Voice Design"),
                inputs=d_mode, outputs=d_voice_opts,
            )
            # Input order must match the generate_design() parameter order.
            d_btn.click(
                fn=generate_design,
                inputs=[d_text, d_mode, d_lang, d_gender, d_age, d_pitch, d_style,
                        d_speed, d_duration, d_steps, d_cfg, d_denoise, d_postprocess],
                outputs=[d_audio, d_status],
            )

        # ── Voice Clone ──
        with gr.Tab("Voice Clone"):
            with gr.Row():
                # Left column: target text, reference audio and settings.
                with gr.Column(scale=1):
                    c_text = gr.Textbox(label="読み上げテキスト", lines=4,
                                        placeholder="この声で読み上げたいテキスト...")
                    c_ref = gr.Audio(label="リファレンス音声(3〜15秒)", type="filepath")
                    c_ref_text = gr.Textbox(label="書き起こし(任意)", lines=2,
                                            placeholder="省略すると自動書き起こし")
                    c_lang = gr.Dropdown(["Auto", "Japanese", "English", "Korean"],
                                         value="Auto", label="言語")
                    c_speed = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="速度")

                    with gr.Accordion("詳細設定", open=False):
                        c_duration = gr.Number(value=0, label="Duration(秒)")
                        c_steps = gr.Slider(4, 64, value=32, step=1, label="Inference Steps")
                        c_cfg = gr.Slider(0.5, 5.0, value=2.0, step=0.1, label="Guidance Scale")
                        c_denoise = gr.Checkbox(value=True, label="Denoise")
                        c_postprocess = gr.Checkbox(value=True, label="Postprocess(無音除去)")

                    c_btn = gr.Button("音声を生成", variant="primary", size="lg")

                # Right column: generated audio and status message.
                with gr.Column(scale=1):
                    c_audio = gr.Audio(label="生成結果", type="filepath")
                    c_status = gr.Textbox(label="ステータス", interactive=False)

            # Input order must match the generate_clone() parameter order.
            c_btn.click(
                fn=generate_clone,
                inputs=[c_text, c_ref, c_ref_text, c_lang, c_speed,
                        c_duration, c_steps, c_cfg, c_denoise, c_postprocess],
                outputs=[c_audio, c_status],
            )

if __name__ == "__main__":
    # CSS was previously defined but never handed to Gradio, so the title
    # styling and footer hiding had no effect. Gradio 6 takes app-level CSS
    # on launch() rather than on gr.Blocks().
    app.launch(css=CSS)
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ omnivoice
2
+ torch
3
+ torchaudio
4
+ gradio