""" 本地配音软件 - 基于Edge TTS的文本转语音应用 """ import gradio as gr import asyncio import os from pydub import AudioSegment from pydub.playback import play import tempfile from api import tts_api class TTSApp: def __init__(self): self.app = self.create_interface() def create_interface(self): """创建Gradio界面""" with gr.Blocks(title="本地配音软件") as app: # 移除了theme参数 gr.Markdown("#
🎙️ 本地配音软件
") gr.Markdown("基于Edge TTS和Hugging Face Spaces的文本转语音工具,支持多语言和多种语音") with gr.Tab("文本配音"): with gr.Row(): with gr.Column(scale=2): text_input = gr.TextArea( label="📝 输入文本", placeholder="在此输入您要转换为语音的文本...", lines=12 ) with gr.Row(): voice_selection = gr.Dropdown( choices=tts_api.get_available_voices(), value="zh-CN-XiaoxiaoNeural", label="🗣️ 选择语音", multiselect=False ) language_filter = gr.Dropdown( choices=["全部", "中文", "英文", "日文", "韩文", "其他"], value="全部", label="🌐 语言筛选" ) with gr.Row(): rate_slider = gr.Slider( minimum=-50, maximum=50, value=0, step=1, label="⏩ 语速调整 (%)" ) pitch_slider = gr.Slider( minimum=-50, maximum=50, value=0, step=1, label="🎵 音调调整 (Hz)" ) with gr.Row(): api_selection = gr.Radio( choices=["Edge TTS (本地)", "Hugging Face API"], value="Edge TTS (本地)", label="🌐 API选择" ) with gr.Row(): generate_btn = gr.Button("🔊 生成语音", variant="primary", scale=1) batch_generate_btn = gr.Button("📦 批量生成", variant="secondary", scale=1) with gr.Column(scale=1): audio_output = gr.Audio(label="🎧 生成的语音", type="filepath") status_output = gr.Textbox(label="📊 状态信息", interactive=False) with gr.Group(): gr.Markdown("### 📁 输出选项") output_format = gr.Radio( choices=["MP3", "WAV"], value="MP3", label="输出格式" ) with gr.Group(): gr.Markdown("### 📚 语音预览") voice_info_btn = gr.Button("🔍 查看语音信息") voice_info_output = gr.JSON(label="语音详情") with gr.Tab("批量处理"): with gr.Row(): batch_text_input = gr.TextArea( label="📝 批量文本输入(每行一段)", placeholder="每行输入一段文本,将为每段文本生成对应的语音", lines=10 ) with gr.Row(): batch_voice_selection = gr.Dropdown( choices=tts_api.get_available_voices(), value="zh-CN-XiaoxiaoNeural", label="🗣️ 选择语音" ) batch_rate_slider = gr.Slider( minimum=-50, maximum=50, value=0, step=1, label="⏩ 语速调整 (%)" ) batch_pitch_slider = gr.Slider( minimum=-50, maximum=50, value=0, step=1, label="🎵 音调调整 (Hz)" ) with gr.Row(): batch_api_selection = gr.Radio( choices=["Edge TTS (本地)", "Hugging Face API"], value="Edge TTS (本地)", label="🌐 API选择" ) batch_generate_btn2 = gr.Button("📦 生成批量语音", variant="primary") batch_output = gr.File(label="📥 下载批量生成的音频", interactive=False) with gr.Tab("音频项目"): with gr.Row(): with gr.Column(): project_name = gr.Textbox( label="📋 项目名称", placeholder="输入项目名称", value="my_audio_project" ) segments_input = gr.JSON( label="📝 音频片段", value=[{"text": "第一段文本", "delay": 0}, {"text": "第二段文本", "delay": 1000}] ) with gr.Row(): add_segment_btn = gr.Button("➕ 添加片段") remove_segment_btn = gr.Button("➖ 删除片段") project_voice_selection = gr.Dropdown( choices=tts_api.get_available_voices(), value="zh-CN-XiaoxiaoNeural", label="🗣️ 选择语音" ) with gr.Row(): project_rate_slider = gr.Slider( minimum=-50, maximum=50, value=0, step=1, label="⏩ 语速调整 (%)" ) project_pitch_slider = gr.Slider( minimum=-50, maximum=50, value=0, step=1, label="🎵 音调调整 (Hz)" ) with gr.Row(): project_api_selection = gr.Radio( choices=["Edge TTS (本地)", "Hugging Face API"], value="Edge TTS (本地)", label="🌐 API选择" ) create_project_btn = gr.Button("🎬 创建音频项目", variant="primary") project_output = gr.Audio(label="🎧 项目音频输出", type="filepath") with gr.Tab("语音库"): with gr.Row(): voice_table = gr.Dataframe( headers=["语音名称", "语言", "性别"], datatype=["str", "str", "str"], value=[[v, v.split('-')[0]+'-'+v.split('-')[1], "女声" if any(x in v.lower() for x in ['xiaoxiao', 'xiaoyi', 'nanami', 'sarah', 'jenny', 'aria']) else "男声"] for v in tts_api.get_available_voices()], label="可用语音列表", interactive=False ) # 绑定事件 def update_voice_list(language): if language == "全部": voices = tts_api.get_available_voices() elif language == "中文": voices = tts_api.get_available_voices('zh') elif language == "英文": voices = tts_api.get_available_voices('en') elif language == "日文": voices = tts_api.get_available_voices('ja') elif language == "韩文": voices = tts_api.get_available_voices('ko') else: voices = tts_api.get_available_voices() return gr.Dropdown(choices=voices, value=voices[0] if voices else "zh-CN-XiaoxiaoNeural") language_filter.change( fn=update_voice_list, inputs=language_filter, outputs=voice_selection ) async def generate_speech_async(text, voice, rate, pitch, format_type, api_type): if not text.strip(): return None, "请输入要转换的文本" # 根据选择的格式确定文件扩展名 ext = ".mp3" if format_type == "MP3" else ".wav" with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as temp_file: output_path = temp_file.name try: if api_type == "Hugging Face API": # 使用Hugging Face API result = await tts_api.text_to_speech_hf(text, voice, rate, pitch, output_path, format_type.lower()) else: # 使用本地Edge TTS result = await tts_api.text_to_speech(text, voice, rate, pitch, output_path, format_type.lower()) if result: return result, "语音生成成功" else: return None, "语音生成失败" except Exception as e: return None, f"生成语音时出错: {str(e)}" generate_btn.click( fn=lambda text, voice, rate, pitch, fmt, api: asyncio.run( generate_speech_async(text, voice, rate, pitch, fmt, api) ), inputs=[text_input, voice_selection, rate_slider, pitch_slider, output_format, api_selection], outputs=[audio_output, status_output] ) def play_audio(audio_path): if audio_path and os.path.exists(audio_path): try: audio = AudioSegment.from_file(audio_path) play(audio) return "音频播放成功" except Exception as e: return f"播放失败: {str(e)}" return "没有可播放的音频文件" def get_voice_info(voice): import asyncio try: info = asyncio.run(tts_api.get_voice_info(voice)) return info or {"错误": "未找到语音信息"} except Exception as e: return {"错误": str(e)} voice_info_btn.click( fn=get_voice_info, inputs=voice_info_btn, # 实际上我们需要传递voice_selection的值,这里先简化 outputs=voice_info_output ) # 为voice_selection添加change事件来更新语音信息 voice_selection.change( fn=get_voice_info, inputs=voice_selection, outputs=voice_info_output ) # 批量处理功能 async def batch_generate(texts, voice, rate, pitch, api_type): if not texts.strip(): return None, "请输入要转换的文本" # 按行分割文本 text_list = [t.strip() for t in texts.split('\n') if t.strip()] if not text_list: return None, "没有有效的文本段落" try: # 根据API类型选择处理方式 if api_type == "Hugging Face API": audio_files = [] for text in text_list: if text.strip(): audio_file = await tts_api.text_to_speech_hf(text, voice, rate, pitch, output_format="mp3") audio_files.append(audio_file) else: audio_files.append(None) else: audio_files = await tts_api.batch_text_to_speech(text_list, voice, rate, pitch) # 将音频文件打包成zip import zipfile with tempfile.NamedTemporaryFile(delete=False, suffix='.zip') as zip_file: with zipfile.ZipFile(zip_file.name, 'w') as zf: for i, audio_file in enumerate(audio_files): if audio_file: zf.write(audio_file, f"audio_{i+1}.mp3") return zip_file.name, f"成功生成 {len([f for f in audio_files if f])} 个音频文件" except Exception as e: return None, f"批量生成失败: {str(e)}" batch_generate_btn2.click( fn=lambda texts, voice, rate, pitch, api: asyncio.run( batch_generate(texts, voice, rate, pitch, api) ), inputs=[batch_text_input, batch_voice_selection, batch_rate_slider, batch_pitch_slider, batch_api_selection], outputs=[batch_output, status_output] ) # 音频项目功能 async def create_audio_project(name, segments, voice, rate, pitch, api_type): if not name.strip(): return None, "请输入项目名称" try: # 根据API类型选择处理方式 if api_type == "Hugging Face API": # 对于项目,我们逐个生成片段然后合并 temp_dir = tempfile.mkdtemp() segment_files = [] for i, segment in enumerate(segments): text = segment.get("text", "") if not text.strip(): continue delay = segment.get("delay", 0) # 延迟时间(毫秒) # 使用Hugging Face API生成音频片段 segment_file = os.path.join(temp_dir, f"segment_{i}.mp3") result = await tts_api.text_to_speech_hf(text, voice, rate, pitch, segment_file, "mp3") if result: segment_files.append((result, delay)) else: # 使用本地API创建项目 project_file = await tts_api.create_audio_project( name, segments, voice, rate, pitch ) if project_file: return project_file, f"项目 '{name}' 创建成功" else: return None, "项目创建失败" return None, "项目创建失败" # 合并音频片段(如果使用Hugging Face API) if api_type == "Hugging Face API" and segment_files: from pydub import AudioSegment combined_audio = AudioSegment.empty() for audio_file, delay in segment_files: if delay > 0: # 添加静音间隔 silence = AudioSegment.silent(duration=delay) combined_audio += silence # 添加音频片段 segment_audio = AudioSegment.from_file(audio_file, format="mp3") combined_audio += segment_audio # 生成最终输出文件 output_path = os.path.join(temp_dir, f"{name}.mp3") combined_audio.export(output_path, format="mp3") # 清理临时片段文件 for audio_file, _ in segment_files: try: os.remove(audio_file) except: pass return output_path, f"项目 '{name}' 创建成功" else: return None, "项目创建失败" except Exception as e: return None, f"创建项目时出错: {str(e)}" create_project_btn.click( fn=lambda name, segments, voice, rate, pitch, api: asyncio.run( create_audio_project(name, segments, voice, rate, pitch, api) ), inputs=[project_name, segments_input, project_voice_selection, project_rate_slider, project_pitch_slider, project_api_selection], outputs=[project_output, status_output] ) return app def run(self, share=False): """启动应用""" self.app.launch(server_name="127.0.0.1", server_port=7860, share=share) if __name__ == "__main__": app = TTSApp() app.run()