"""
Local dubbing software - a text-to-speech application based on Edge TTS.
"""
import gradio as gr
import asyncio
import os
from pydub import AudioSegment
from pydub.playback import play
import tempfile
from api import tts_api
class TTSApp:
    """Gradio user interface around the ``tts_api`` text-to-speech helpers.

    The UI has four tabs: single-text synthesis, batch synthesis (one clip per
    non-empty input line, delivered as a zip archive), audio projects (several
    segments joined with optional leading silences) and a read-only table of
    the available voices.

    Attributes:
        app: The ``gr.Blocks`` application built by :meth:`create_interface`.
    """

    # Voice preselected in every voice dropdown.
    _DEFAULT_VOICE = "zh-CN-XiaoxiaoNeural"
    # Labels of the backend radio buttons; handlers compare against _API_HF.
    _API_LOCAL = "Edge TTS (本地)"
    _API_HF = "Hugging Face API"
    # Language-filter labels mapped to the code passed to get_available_voices.
    _LANGUAGE_CODES = {"中文": "zh", "英文": "en", "日文": "ja", "韩文": "ko"}
    # Voice-name fragments shown as female in the voice table; everything else
    # is shown as male (a display heuristic only).
    _FEMALE_HINTS = ("xiaoxiao", "xiaoyi", "nanami", "sarah", "jenny", "aria")
    # Characters replaced when a project name is turned into a file name.
    _UNSAFE_FILENAME_CHARS = '\\/:*?"<>|'

    def __init__(self):
        """Build the interface and store it on ``self.app``."""
        self.app = self.create_interface()

    # ------------------------------------------------------------------
    # Small widget factories (the same controls appear on several tabs)
    # ------------------------------------------------------------------
    @staticmethod
    def _rate_slider():
        """Return a speaking-rate slider (-50..50, step 1, default 0)."""
        return gr.Slider(minimum=-50, maximum=50, value=0, step=1, label="⏩ 语速调整 (%)")

    @staticmethod
    def _pitch_slider():
        """Return a pitch slider (-50..50, step 1, default 0)."""
        return gr.Slider(minimum=-50, maximum=50, value=0, step=1, label="🎵 音调调整 (Hz)")

    @staticmethod
    def _api_radio():
        """Return the backend selector, defaulting to the local Edge TTS."""
        return gr.Radio(
            choices=[TTSApp._API_LOCAL, TTSApp._API_HF],
            value=TTSApp._API_LOCAL,
            label="🌐 API选择",
        )

    # ------------------------------------------------------------------
    # Pure helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _voice_table_rows(voices):
        """Return ``[name, locale, gender]`` rows for the voice table.

        The locale is the first two dash-separated parts of the voice name
        (the whole name when it has fewer parts).
        """
        rows = []
        for voice in voices:
            locale = "-".join(voice.split("-")[:2])
            lowered = voice.lower()
            is_female = any(hint in lowered for hint in TTSApp._FEMALE_HINTS)
            rows.append([voice, locale, "女声" if is_female else "男声"])
        return rows

    @staticmethod
    def _exclude_known_languages(voices):
        """Return the voices whose language prefix has no dedicated filter."""
        known = set(TTSApp._LANGUAGE_CODES.values())
        return [v for v in voices if v.split("-")[0].lower() not in known]

    @staticmethod
    def _safe_file_stem(name):
        """Turn a user-supplied project name into a safe file-name stem.

        Path separators and other reserved characters become ``_`` so the
        exported file cannot land outside its temporary directory.
        """
        cleaned = "".join(
            "_" if ch in TTSApp._UNSAFE_FILENAME_CHARS else ch for ch in name.strip()
        )
        return cleaned.strip(". ") or "audio_project"

    @staticmethod
    def _remove_quietly(path):
        """Delete ``path`` if possible; a failed cleanup is not an error."""
        try:
            os.remove(path)
        except OSError:
            pass

    # ------------------------------------------------------------------
    # Event handlers
    # ------------------------------------------------------------------
    @staticmethod
    def _update_voice_list(language):
        """Return a voice-dropdown update for the chosen language filter.

        "其他" lists the voices not covered by the zh/en/ja/ko filters; any
        other unrecognised label lists every voice.
        """
        code = TTSApp._LANGUAGE_CODES.get(language)
        if code is not None:
            voices = tts_api.get_available_voices(code)
        else:
            voices = tts_api.get_available_voices()
            if language == "其他":
                voices = TTSApp._exclude_known_languages(voices)
        # With no match, select nothing rather than a value missing from choices.
        return gr.Dropdown(choices=voices, value=voices[0] if voices else None)

    @staticmethod
    async def _generate_speech(text, voice, rate, pitch, format_type, api_type):
        """Synthesize ``text`` into a temporary audio file.

        Returns:
            ``(audio_path, status_message)``; ``audio_path`` is ``None`` when
            nothing was generated.
        """
        if not text or not text.strip():
            return None, "请输入要转换的文本"

        # The output format decides the file extension.
        extension = ".mp3" if format_type == "MP3" else ".wav"
        # Only reserve a unique path here; the handle is closed right away and
        # the path is handed to the backend.
        with tempfile.NamedTemporaryFile(delete=False, suffix=extension) as temp_file:
            output_path = temp_file.name

        try:
            if api_type == TTSApp._API_HF:
                synthesize = tts_api.text_to_speech_hf
            else:
                synthesize = tts_api.text_to_speech
            result = await synthesize(text, voice, rate, pitch, output_path, format_type.lower())
        except Exception as e:
            TTSApp._remove_quietly(output_path)
            return None, f"生成语音时出错: {str(e)}"

        if result:
            return result, "语音生成成功"
        # Do not leave the reserved placeholder behind on failure.
        TTSApp._remove_quietly(output_path)
        return None, "语音生成失败"

    @staticmethod
    def _play_audio(audio_path):
        """Play ``audio_path`` through pydub and return a status message.

        NOTE: not attached to any control in the interface.
        """
        if audio_path and os.path.exists(audio_path):
            try:
                audio = AudioSegment.from_file(audio_path)
                play(audio)
                return "音频播放成功"
            except Exception as e:
                return f"播放失败: {str(e)}"
        return "没有可播放的音频文件"

    @staticmethod
    def _get_voice_info(voice):
        """Return the details ``tts_api`` reports for ``voice``.

        Failures are returned as a one-entry dict so the JSON panel shows them.
        """
        try:
            info = asyncio.run(tts_api.get_voice_info(voice))
            return info or {"错误": "未找到语音信息"}
        except Exception as e:
            return {"错误": str(e)}

    @staticmethod
    async def _batch_generate(texts, voice, rate, pitch, api_type):
        """Generate one clip per non-empty line of ``texts`` and zip them.

        Returns:
            ``(zip_path, status_message)``; ``zip_path`` is ``None`` when no
            clip could be generated.
        """
        if not texts or not texts.strip():
            return None, "请输入要转换的文本"

        # One paragraph per line; blank lines are ignored.
        text_list = [line.strip() for line in texts.split("\n") if line.strip()]
        if not text_list:
            return None, "没有有效的文本段落"

        try:
            if api_type == TTSApp._API_HF:
                audio_files = []
                for text in text_list:
                    audio_files.append(
                        await tts_api.text_to_speech_hf(text, voice, rate, pitch, output_format="mp3")
                    )
            else:
                audio_files = await tts_api.batch_text_to_speech(text_list, voice, rate, pitch)

            # Keep the 1-based position of each clip so archive names match
            # the input line order even when some lines failed.
            generated = [
                (index, path)
                for index, path in enumerate(audio_files or [], start=1)
                if path
            ]
            if not generated:
                return None, "批量生成失败: 没有生成任何音频文件"

            import zipfile

            # Reserve the archive path and close the handle first: reopening a
            # still-open NamedTemporaryFile by name fails on Windows.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".zip") as zip_file:
                zip_path = zip_file.name
            with zipfile.ZipFile(zip_path, "w") as archive:
                for index, path in generated:
                    archive.write(path, f"audio_{index}.mp3")
            return zip_path, f"成功生成 {len(generated)} 个音频文件"
        except Exception as e:
            return None, f"批量生成失败: {str(e)}"

    @staticmethod
    async def _create_audio_project(name, segments, voice, rate, pitch, api_type):
        """Render ``segments`` into a single audio file for project ``name``.

        Each segment is a dict with ``"text"`` and an optional ``"delay"``
        (milliseconds of silence inserted before that segment).

        Returns:
            ``(audio_path, status_message)``; ``audio_path`` is ``None`` on
            failure.
        """
        if not name or not name.strip():
            return None, "请输入项目名称"

        try:
            if api_type != TTSApp._API_HF:
                # The local backend builds the whole project itself.
                project_file = await tts_api.create_audio_project(
                    name, segments, voice, rate, pitch
                )
                if project_file:
                    return project_file, f"项目 '{name}' 创建成功"
                return None, "项目创建失败"

            # Hugging Face path: synthesize the segments one by one, then merge.
            pending = []
            for index, segment in enumerate(segments or []):
                if not isinstance(segment, dict):
                    continue
                text = segment.get("text") or ""
                if not text.strip():
                    continue
                # A missing or null delay means "no silence".
                pending.append((index, text, segment.get("delay") or 0))
            if not pending:
                return None, "项目创建失败"

            temp_dir = tempfile.mkdtemp()
            segment_files = []
            for index, text, delay in pending:
                segment_file = os.path.join(temp_dir, f"segment_{index}.mp3")
                result = await tts_api.text_to_speech_hf(
                    text, voice, rate, pitch, segment_file, "mp3"
                )
                if result:
                    segment_files.append((result, delay))
            if not segment_files:
                return None, "项目创建失败"

            combined_audio = AudioSegment.empty()
            for audio_file, delay in segment_files:
                if delay > 0:
                    # Silence gap before the segment.
                    combined_audio += AudioSegment.silent(duration=delay)
                combined_audio += AudioSegment.from_file(audio_file, format="mp3")

            output_path = os.path.join(temp_dir, f"{TTSApp._safe_file_stem(name)}.mp3")
            combined_audio.export(output_path, format="mp3")

            # Remove the per-segment files, but never the export itself (a
            # project named like a segment file would share its path).
            final_path = os.path.abspath(output_path)
            for audio_file, _ in segment_files:
                if os.path.abspath(audio_file) != final_path:
                    TTSApp._remove_quietly(audio_file)
            return output_path, f"项目 '{name}' 创建成功"
        except Exception as e:
            return None, f"创建项目时出错: {str(e)}"

    # ------------------------------------------------------------------
    # Interface
    # ------------------------------------------------------------------
    def create_interface(self):
        """Build the Gradio Blocks UI, attach its event listeners, return it."""
        # Fetched once; all voice dropdowns and the voice table share it.
        voices = tts_api.get_available_voices()

        with gr.Blocks(title="本地配音软件") as app:  # the theme argument was removed
            gr.Markdown("# <center> 🎙️ 本地配音软件 </center>")
            gr.Markdown("基于Edge TTS和Hugging Face Spaces的文本转语音工具,支持多语言和多种语音")

            with gr.Tab("文本配音"):
                with gr.Row():
                    with gr.Column(scale=2):
                        text_input = gr.TextArea(
                            label="📝 输入文本",
                            placeholder="在此输入您要转换为语音的文本...",
                            lines=12,
                        )
                        with gr.Row():
                            voice_selection = gr.Dropdown(
                                choices=voices,
                                value=self._DEFAULT_VOICE,
                                label="🗣️ 选择语音",
                                multiselect=False,
                            )
                            language_filter = gr.Dropdown(
                                choices=["全部", "中文", "英文", "日文", "韩文", "其他"],
                                value="全部",
                                label="🌐 语言筛选",
                            )
                        with gr.Row():
                            rate_slider = self._rate_slider()
                            pitch_slider = self._pitch_slider()
                        with gr.Row():
                            api_selection = self._api_radio()
                        with gr.Row():
                            generate_btn = gr.Button("🔊 生成语音", variant="primary", scale=1)
                            # NOTE: no event listener is attached to this button.
                            gr.Button("📦 批量生成", variant="secondary", scale=1)
                    with gr.Column(scale=1):
                        audio_output = gr.Audio(label="🎧 生成的语音", type="filepath")
                        status_output = gr.Textbox(label="📊 状态信息", interactive=False)
                        with gr.Group():
                            gr.Markdown("### 📁 输出选项")
                            output_format = gr.Radio(
                                choices=["MP3", "WAV"],
                                value="MP3",
                                label="输出格式",
                            )
                        with gr.Group():
                            gr.Markdown("### 📚 语音预览")
                            voice_info_btn = gr.Button("🔍 查看语音信息")
                            voice_info_output = gr.JSON(label="语音详情")

            with gr.Tab("批量处理"):
                with gr.Row():
                    batch_text_input = gr.TextArea(
                        label="📝 批量文本输入(每行一段)",
                        placeholder="每行输入一段文本,将为每段文本生成对应的语音",
                        lines=10,
                    )
                with gr.Row():
                    batch_voice_selection = gr.Dropdown(
                        choices=voices,
                        value=self._DEFAULT_VOICE,
                        label="🗣️ 选择语音",
                    )
                    batch_rate_slider = self._rate_slider()
                    batch_pitch_slider = self._pitch_slider()
                with gr.Row():
                    batch_api_selection = self._api_radio()
                batch_generate_btn2 = gr.Button("📦 生成批量语音", variant="primary")
                batch_output = gr.File(label="📥 下载批量生成的音频", interactive=False)

            with gr.Tab("音频项目"):
                with gr.Row():
                    with gr.Column():
                        project_name = gr.Textbox(
                            label="📋 项目名称",
                            placeholder="输入项目名称",
                            value="my_audio_project",
                        )
                        segments_input = gr.JSON(
                            label="📝 音频片段",
                            value=[
                                {"text": "第一段文本", "delay": 0},
                                {"text": "第二段文本", "delay": 1000},
                            ],
                        )
                        with gr.Row():
                            # NOTE: neither button has an event listener.
                            gr.Button("➕ 添加片段")
                            gr.Button("➖ 删除片段")
                        project_voice_selection = gr.Dropdown(
                            choices=voices,
                            value=self._DEFAULT_VOICE,
                            label="🗣️ 选择语音",
                        )
                        with gr.Row():
                            project_rate_slider = self._rate_slider()
                            project_pitch_slider = self._pitch_slider()
                        with gr.Row():
                            project_api_selection = self._api_radio()
                        create_project_btn = gr.Button("🎬 创建音频项目", variant="primary")
                        project_output = gr.Audio(label="🎧 项目音频输出", type="filepath")

            with gr.Tab("语音库"):
                with gr.Row():
                    gr.Dataframe(
                        headers=["语音名称", "语言", "性别"],
                        datatype=["str", "str", "str"],
                        value=self._voice_table_rows(voices),
                        label="可用语音列表",
                        interactive=False,
                    )

            # ---- event bindings ----
            language_filter.change(
                fn=self._update_voice_list,
                inputs=language_filter,
                outputs=voice_selection,
            )

            # The async handlers are driven to completion with asyncio.run
            # inside plain callables.
            generate_btn.click(
                fn=lambda text, voice, rate, pitch, fmt, api: asyncio.run(
                    self._generate_speech(text, voice, rate, pitch, fmt, api)
                ),
                inputs=[
                    text_input,
                    voice_selection,
                    rate_slider,
                    pitch_slider,
                    output_format,
                    api_selection,
                ],
                outputs=[audio_output, status_output],
            )

            # The lookup needs the selected voice; the button is only the
            # trigger and must not be used as the input.
            voice_info_btn.click(
                fn=self._get_voice_info,
                inputs=voice_selection,
                outputs=voice_info_output,
            )
            # Refresh the details whenever another voice is selected.
            voice_selection.change(
                fn=self._get_voice_info,
                inputs=voice_selection,
                outputs=voice_info_output,
            )

            # Batch processing.
            batch_generate_btn2.click(
                fn=lambda texts, voice, rate, pitch, api: asyncio.run(
                    self._batch_generate(texts, voice, rate, pitch, api)
                ),
                inputs=[
                    batch_text_input,
                    batch_voice_selection,
                    batch_rate_slider,
                    batch_pitch_slider,
                    batch_api_selection,
                ],
                outputs=[batch_output, status_output],
            )

            # Audio projects.
            create_project_btn.click(
                fn=lambda name, segments, voice, rate, pitch, api: asyncio.run(
                    self._create_audio_project(name, segments, voice, rate, pitch, api)
                ),
                inputs=[
                    project_name,
                    segments_input,
                    project_voice_selection,
                    project_rate_slider,
                    project_pitch_slider,
                    project_api_selection,
                ],
                outputs=[project_output, status_output],
            )

        return app

    def run(self, share=False, server_name="127.0.0.1", server_port=7860):
        """Launch the application.

        Args:
            share: Passed to ``launch`` as ``share``.
            server_name: Host to bind to; defaults to the loopback address.
            server_port: Port to listen on.
        """
        self.app.launch(server_name=server_name, server_port=server_port, share=share)
# Script entry point: build the interface and serve it with the default
# launch settings.
if __name__ == "__main__":
    app = TTSApp()
    app.run()