Spaces:

chaore
/

egtts

Sleeping

File size: 19,702 Bytes

6ad2a4c

"""

本地配音软件 - 基于Edge TTS的文本转语音应用

"""
import gradio as gr
import asyncio
import os
from pydub import AudioSegment
from pydub.playback import play
import tempfile
from api import tts_api

class TTSApp:
    """Gradio front end for the text-to-speech helpers exposed by ``tts_api``.

    The interface has four tabs: single-text synthesis, batch synthesis
    (one clip per input line, delivered as a zip), an "audio project" that
    joins several segments with optional silence between them, and a
    read-only table of the available voices.

    Attributes:
        app: The ``gr.Blocks`` object built by :meth:`create_interface`.
    """

    # Maps the language-filter labels to the code passed to
    # ``tts_api.get_available_voices``.
    _LANGUAGE_CODES = {"中文": "zh", "英文": "en", "日文": "ja", "韩文": "ko"}

    # Name fragments used to label a voice as female in the voice table;
    # every other voice is labelled male.
    _FEMALE_VOICE_HINTS = ('xiaoxiao', 'xiaoyi', 'nanami', 'sarah', 'jenny', 'aria')

    def __init__(self):
        self.app = self.create_interface()

    @staticmethod
    def _language_code(language):
        """Return the voice-list code for a filter label, or None if it has none."""
        return TTSApp._LANGUAGE_CODES.get(language)

    @staticmethod
    def _other_language_voices(voices):
        """Return the voices whose name does not start with a listed language code."""
        known_prefixes = tuple(f"{code}-" for code in TTSApp._LANGUAGE_CODES.values())
        return [voice for voice in voices if not voice.lower().startswith(known_prefixes)]

    @staticmethod
    def _voice_table_rows(voices):
        """Build ``[name, locale, gender]`` rows for the voice table.

        The locale is the first two hyphen-separated parts of the name; a
        name with fewer parts is used as-is instead of raising IndexError.
        """
        rows = []
        for voice in voices:
            locale = "-".join(voice.split("-")[:2])
            lowered = voice.lower()
            is_female = any(hint in lowered for hint in TTSApp._FEMALE_VOICE_HINTS)
            rows.append([voice, locale, "女声" if is_female else "男声"])
        return rows

    @staticmethod
    def _safe_filename(name):
        """Return ``name`` with path separators and reserved characters replaced.

        The project name is typed by the user and ends up in a file path, so
        it must not be able to point outside the output directory.
        """
        cleaned = "".join(
            "_" if ch in '\\/:*?"<>|' or ord(ch) < 32 else ch for ch in name
        )
        return cleaned.strip(" .") or "audio_project"

    @staticmethod
    def _remove_quietly(path):
        """Delete ``path`` as best-effort cleanup, ignoring filesystem errors."""
        try:
            os.remove(path)
        except OSError:
            # Cleanup only: a file that is already gone or locked is not an error.
            pass

    @staticmethod
    def _voice_dropdown(voices, **extra):
        """Create the voice dropdown used on every synthesis tab."""
        return gr.Dropdown(
            choices=voices,
            value="zh-CN-XiaoxiaoNeural",
            label="🗣️ 选择语音",
            **extra
        )

    @staticmethod
    def _rate_slider():
        """Create the speaking-rate slider (-50 to 50, step 1)."""
        return gr.Slider(minimum=-50, maximum=50, value=0, step=1, label="⏩ 语速调整 (%)")

    @staticmethod
    def _pitch_slider():
        """Create the pitch slider (-50 to 50, step 1)."""
        return gr.Slider(minimum=-50, maximum=50, value=0, step=1, label="🎵 音调调整 (Hz)")

    @staticmethod
    def _api_radio():
        """Create the radio that chooses between the two synthesis back ends."""
        return gr.Radio(
            choices=["Edge TTS (本地)", "Hugging Face API"],
            value="Edge TTS (本地)",
            label="🌐 API选择"
        )

    def create_interface(self):
        """Build the Gradio Blocks layout and wire its event handlers.

        Returns:
            The assembled ``gr.Blocks`` application (not yet launched).
        """
        # Fetch the voice list once; every dropdown and the voice table share it.
        all_voices = list(tts_api.get_available_voices())

        with gr.Blocks(title="本地配音软件") as app:  # theme argument removed
            gr.Markdown("# <center> 🎙️ 本地配音软件 </center>")
            gr.Markdown("基于Edge TTS和Hugging Face Spaces的文本转语音工具，支持多语言和多种语音")

            with gr.Tab("文本配音"):
                with gr.Row():
                    with gr.Column(scale=2):
                        text_input = gr.TextArea(
                            label="📝 输入文本",
                            placeholder="在此输入您要转换为语音的文本...",
                            lines=12
                        )

                        with gr.Row():
                            voice_selection = self._voice_dropdown(all_voices, multiselect=False)

                            language_filter = gr.Dropdown(
                                choices=["全部", "中文", "英文", "日文", "韩文", "其他"],
                                value="全部",
                                label="🌐 语言筛选"
                            )

                        with gr.Row():
                            rate_slider = self._rate_slider()
                            pitch_slider = self._pitch_slider()

                        with gr.Row():
                            api_selection = self._api_radio()

                        with gr.Row():
                            generate_btn = gr.Button("🔊 生成语音", variant="primary", scale=1)
                            # NOTE(review): this button has no click handler in this file.
                            batch_generate_btn = gr.Button("📦 批量生成", variant="secondary", scale=1)

                    with gr.Column(scale=1):
                        audio_output = gr.Audio(label="🎧 生成的语音", type="filepath")
                        status_output = gr.Textbox(label="📊 状态信息", interactive=False)

                        with gr.Group():
                            gr.Markdown("### 📁 输出选项")
                            output_format = gr.Radio(
                                choices=["MP3", "WAV"],
                                value="MP3",
                                label="输出格式"
                            )

                        with gr.Group():
                            gr.Markdown("### 📚 语音预览")
                            voice_info_btn = gr.Button("🔍 查看语音信息")
                            voice_info_output = gr.JSON(label="语音详情")

            with gr.Tab("批量处理"):
                with gr.Row():
                    batch_text_input = gr.TextArea(
                        label="📝 批量文本输入（每行一段）",
                        placeholder="每行输入一段文本，将为每段文本生成对应的语音",
                        lines=10
                    )

                with gr.Row():
                    batch_voice_selection = self._voice_dropdown(all_voices)
                    batch_rate_slider = self._rate_slider()
                    batch_pitch_slider = self._pitch_slider()

                with gr.Row():
                    batch_api_selection = self._api_radio()

                batch_generate_btn2 = gr.Button("📦 生成批量语音", variant="primary")
                batch_output = gr.File(label="📥 下载批量生成的音频", interactive=False)

            with gr.Tab("音频项目"):
                with gr.Row():
                    with gr.Column():
                        project_name = gr.Textbox(
                            label="📋 项目名称",
                            placeholder="输入项目名称",
                            value="my_audio_project"
                        )

                        segments_input = gr.JSON(
                            label="📝 音频片段",
                            value=[{"text": "第一段文本", "delay": 0}, {"text": "第二段文本", "delay": 1000}]
                        )

                        with gr.Row():
                            # NOTE(review): neither segment button has a handler in this file.
                            add_segment_btn = gr.Button("➕ 添加片段")
                            remove_segment_btn = gr.Button("➖ 删除片段")

                        project_voice_selection = self._voice_dropdown(all_voices)

                        with gr.Row():
                            project_rate_slider = self._rate_slider()
                            project_pitch_slider = self._pitch_slider()

                        with gr.Row():
                            project_api_selection = self._api_radio()

                        create_project_btn = gr.Button("🎬 创建音频项目", variant="primary")
                        project_output = gr.Audio(label="🎧 项目音频输出", type="filepath")

            with gr.Tab("语音库"):
                with gr.Row():
                    voice_table = gr.Dataframe(
                        headers=["语音名称", "语言", "性别"],
                        datatype=["str", "str", "str"],
                        value=self._voice_table_rows(all_voices),
                        label="可用语音列表",
                        interactive=False
                    )

            # ---- Event wiring ----
            def update_voice_list(language):
                """Refresh the voice dropdown for the chosen language filter."""
                code = self._language_code(language)
                if code:
                    voices = tts_api.get_available_voices(code)
                elif language == "其他":
                    # "Other" means everything outside the four listed languages;
                    # assumes voice names start with their language code.
                    voices = self._other_language_voices(tts_api.get_available_voices())
                else:
                    voices = tts_api.get_available_voices()

                return gr.Dropdown(choices=voices, value=voices[0] if voices else "zh-CN-XiaoxiaoNeural")

            language_filter.change(
                fn=update_voice_list,
                inputs=language_filter,
                outputs=voice_selection
            )

            async def generate_speech_async(text, voice, rate, pitch, format_type, api_type):
                """Synthesize one clip and return ``(audio_path_or_None, status_message)``."""
                if not text or not text.strip():
                    return None, "请输入要转换的文本"

                # The chosen output format decides the file extension.
                ext = ".mp3" if format_type == "MP3" else ".wav"

                # Reserve a unique path; the handle is closed so the back end can write to it.
                with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as temp_file:
                    output_path = temp_file.name

                try:
                    if api_type == "Hugging Face API":
                        result = await tts_api.text_to_speech_hf(text, voice, rate, pitch, output_path, format_type.lower())
                    else:
                        result = await tts_api.text_to_speech(text, voice, rate, pitch, output_path, format_type.lower())
                except Exception as e:
                    # UI boundary: report the failure in the status box instead of crashing.
                    self._remove_quietly(output_path)
                    return None, f"生成语音时出错: {str(e)}"

                if result:
                    return result, "语音生成成功"

                # Nothing was produced, so do not leave the placeholder file behind.
                self._remove_quietly(output_path)
                return None, "语音生成失败"

            # Each handler drives its coroutine to completion with asyncio.run.
            generate_btn.click(
                fn=lambda text, voice, rate, pitch, fmt, api: asyncio.run(
                    generate_speech_async(text, voice, rate, pitch, fmt, api)
                ),
                inputs=[text_input, voice_selection, rate_slider, pitch_slider, output_format, api_selection],
                outputs=[audio_output, status_output]
            )

            # NOTE(review): play_audio is defined but not bound to any component here.
            def play_audio(audio_path):
                """Play ``audio_path`` through pydub and return a status message."""
                if audio_path and os.path.exists(audio_path):
                    try:
                        audio = AudioSegment.from_file(audio_path)
                        play(audio)
                        return "音频播放成功"
                    except Exception as e:
                        return f"播放失败: {str(e)}"
                return "没有可播放的音频文件"

            def get_voice_info(voice):
                """Return the details of ``voice`` as a dict, or an error dict."""
                if not voice:
                    return {"错误": "未找到语音信息"}
                try:
                    info = asyncio.run(tts_api.get_voice_info(voice))
                    return info or {"错误": "未找到语音信息"}
                except Exception as e:
                    return {"错误": str(e)}

            # The button looks up the voice currently selected in the dropdown
            # (previously it passed its own label as the voice name).
            voice_info_btn.click(
                fn=get_voice_info,
                inputs=voice_selection,
                outputs=voice_info_output
            )

            # Changing the selected voice refreshes the details as well.
            voice_selection.change(
                fn=get_voice_info,
                inputs=voice_selection,
                outputs=voice_info_output
            )

            # ---- Batch processing ----
            async def batch_generate(texts, voice, rate, pitch, api_type):
                """Synthesize one clip per non-empty line and zip the results.

                Returns ``(zip_path_or_None, status_message)``.
                """
                if not texts or not texts.strip():
                    return None, "请输入要转换的文本"

                # One clip per non-blank line.
                text_list = [t.strip() for t in texts.split('\n') if t.strip()]
                if not text_list:
                    return None, "没有有效的文本段落"

                try:
                    if api_type == "Hugging Face API":
                        audio_files = []
                        for text in text_list:
                            audio_file = await tts_api.text_to_speech_hf(text, voice, rate, pitch, output_format="mp3")
                            audio_files.append(audio_file)
                    else:
                        audio_files = await tts_api.batch_text_to_speech(text_list, voice, rate, pitch)

                    import zipfile

                    # Reserve the archive path and close the handle before zipfile
                    # opens it; a second open on the same path fails on Windows.
                    with tempfile.NamedTemporaryFile(delete=False, suffix='.zip') as zip_file:
                        zip_path = zip_file.name

                    produced = 0
                    with zipfile.ZipFile(zip_path, 'w') as zf:
                        for index, audio_file in enumerate(audio_files, start=1):
                            if audio_file:
                                zf.write(audio_file, f"audio_{index}.mp3")
                                produced += 1

                    return zip_path, f"成功生成 {produced} 个音频文件"
                except Exception as e:
                    return None, f"批量生成失败: {str(e)}"

            batch_generate_btn2.click(
                fn=lambda texts, voice, rate, pitch, api: asyncio.run(
                    batch_generate(texts, voice, rate, pitch, api)
                ),
                inputs=[batch_text_input, batch_voice_selection, batch_rate_slider, batch_pitch_slider, batch_api_selection],
                outputs=[batch_output, status_output]
            )

            # ---- Audio project ----
            async def create_audio_project(name, segments, voice, rate, pitch, api_type):
                """Render all segments into one file.

                Returns ``(audio_path_or_None, status_message)``.
                """
                if not name or not name.strip():
                    return None, "请输入项目名称"

                try:
                    if api_type != "Hugging Face API":
                        # The local back end builds the whole project itself.
                        project_file = await tts_api.create_audio_project(
                            name, segments, voice, rate, pitch
                        )
                        if project_file:
                            return project_file, f"项目 '{name}' 创建成功"
                        return None, "项目创建失败"

                    # Hugging Face back end: synthesize each segment, then join them here.
                    temp_dir = tempfile.mkdtemp()
                    segment_files = []

                    for i, segment in enumerate(segments or []):
                        text = segment.get("text", "")
                        if not text.strip():
                            continue

                        delay = segment.get("delay", 0)  # silence before the segment, in ms

                        segment_file = os.path.join(temp_dir, f"segment_{i}.mp3")
                        result = await tts_api.text_to_speech_hf(text, voice, rate, pitch, segment_file, "mp3")

                        if result:
                            segment_files.append((result, delay))

                    if not segment_files:
                        try:
                            os.rmdir(temp_dir)  # only succeeds when nothing was written
                        except OSError:
                            pass
                        return None, "项目创建失败"

                    combined_audio = AudioSegment.empty()
                    for audio_file, delay in segment_files:
                        if delay > 0:
                            # Insert the requested gap before this segment.
                            combined_audio += AudioSegment.silent(duration=delay)
                        combined_audio += AudioSegment.from_file(audio_file, format="mp3")

                    # The project name is user input; sanitize it before it becomes a path.
                    output_path = os.path.join(temp_dir, f"{self._safe_filename(name)}.mp3")
                    combined_audio.export(output_path, format="mp3")

                    # The per-segment files are no longer needed once merged.
                    for audio_file, _ in segment_files:
                        self._remove_quietly(audio_file)

                    return output_path, f"项目 '{name}' 创建成功"

                except Exception as e:
                    return None, f"创建项目时出错: {str(e)}"

            create_project_btn.click(
                fn=lambda name, segments, voice, rate, pitch, api: asyncio.run(
                    create_audio_project(name, segments, voice, rate, pitch, api)
                ),
                inputs=[project_name, segments_input, project_voice_selection, project_rate_slider, project_pitch_slider, project_api_selection],
                outputs=[project_output, status_output]
            )

        return app

    def run(self, share=False, server_name="127.0.0.1", server_port=7860):
        """Launch the Gradio app.

        Args:
            share: Passed through to ``launch`` as its ``share`` argument.
            server_name: Host/interface to bind to; defaults to loopback.
            server_port: TCP port to listen on.
        """
        self.app.launch(server_name=server_name, server_port=server_port, share=share)

if __name__ == "__main__":
    # Script entry point: build the interface, then serve it with the
    # default launch settings.
    tts_app = TTSApp()
    tts_app.run()