File size: 19,702 Bytes
6ad2a4c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
"""

本地配音软件 - 基于Edge TTS的文本转语音应用

"""
import gradio as gr
import asyncio
import os
from pydub import AudioSegment
from pydub.playback import play
import tempfile
from api import tts_api

class TTSApp:
    """Gradio user interface for the ``tts_api`` text-to-speech backend.

    The interface has four tabs: single-text synthesis, batch synthesis
    (one clip per non-empty input line, delivered as a zip archive), an
    "audio project" that concatenates several clips with optional silence
    in between, and a read-only table of the available voices.
    """

    # Voice pre-selected in every voice dropdown.
    DEFAULT_VOICE = "zh-CN-XiaoxiaoNeural"
    # Radio labels identifying the two synthesis backends.
    EDGE_API = "Edge TTS (本地)"
    HF_API = "Hugging Face API"
    # Language-filter label -> prefix handed to tts_api.get_available_voices.
    _LANGUAGE_CODES = {"中文": "zh", "英文": "en", "日文": "ja", "韩文": "ko"}
    # A voice whose lower-cased id contains one of these is listed as female
    # in the voice table; every other voice is listed as male.
    _FEMALE_VOICE_HINTS = ("xiaoxiao", "xiaoyi", "nanami", "sarah", "jenny", "aria")

    def __init__(self):
        """Build the Gradio Blocks app and keep it on ``self.app``."""
        self.app = self.create_interface()

    # ------------------------------------------------------------------
    # Pure helpers (no Gradio / tts_api access)
    # ------------------------------------------------------------------
    @staticmethod
    def _voice_table_rows(voices):
        """Return ``[voice, locale, gender]`` rows for the voice table.

        The locale is the first two dash-separated parts of the voice id.
        Ids with fewer than two parts are used unchanged instead of raising
        ``IndexError``.
        """
        rows = []
        for voice in voices:
            locale = "-".join(voice.split("-")[:2])
            lowered = voice.lower()
            is_female = any(hint in lowered for hint in TTSApp._FEMALE_VOICE_HINTS)
            rows.append([voice, locale, "女声" if is_female else "男声"])
        return rows

    @staticmethod
    def _language_code(language):
        """Map a language-filter label to a voice prefix, or ``None`` for "no filter"."""
        return TTSApp._LANGUAGE_CODES.get(language)

    @staticmethod
    def _split_batch_lines(texts):
        """Split batch input into stripped, non-empty lines (``None`` -> ``[]``)."""
        if not texts:
            return []
        return [line.strip() for line in texts.split("\n") if line.strip()]

    @staticmethod
    def _segment_delay_ms(segment):
        """Return the non-negative ``delay`` of a segment dict, or 0 if missing/invalid."""
        raw = segment.get("delay", 0)
        if not isinstance(raw, (int, float)):
            try:
                raw = float(raw)
            except (TypeError, ValueError):
                return 0
        return raw if raw > 0 else 0

    @staticmethod
    def _safe_file_stem(name):
        """Turn a user-supplied project name into a single safe file-name stem.

        Path separators and other special characters become ``_`` so the
        name cannot point outside the directory it is joined to.
        """
        cleaned = "".join(
            ch if (ch.isalnum() or ch in "-_. ") else "_"
            for ch in (name or "").strip()
        )
        return cleaned or "project"

    @staticmethod
    def _remove_quietly(path):
        """Delete ``path`` on a best-effort basis, ignoring filesystem errors."""
        try:
            os.remove(path)
        except OSError:
            # Cleanup only; a file that is already gone or locked is not fatal.
            pass

    # ------------------------------------------------------------------
    # Widget factories shared by the three synthesis tabs
    # ------------------------------------------------------------------
    @staticmethod
    def _voice_dropdown(voices, **extra):
        """Create a voice dropdown pre-set to the default voice."""
        return gr.Dropdown(
            choices=voices,
            value=TTSApp.DEFAULT_VOICE,
            label="🗣️ 选择语音",
            **extra,
        )

    @staticmethod
    def _rate_slider():
        """Create the speech-rate slider (-50..50, step 1)."""
        return gr.Slider(minimum=-50, maximum=50, value=0, step=1, label="⏩ 语速调整 (%)")

    @staticmethod
    def _pitch_slider():
        """Create the pitch slider (-50..50, step 1)."""
        return gr.Slider(minimum=-50, maximum=50, value=0, step=1, label="🎵 音调调整 (Hz)")

    @staticmethod
    def _api_radio():
        """Create the backend selector, defaulting to the Edge TTS entry."""
        return gr.Radio(
            choices=[TTSApp.EDGE_API, TTSApp.HF_API],
            value=TTSApp.EDGE_API,
            label="🌐 API选择",
        )

    def create_interface(self):
        """Create the Gradio interface and bind its event handlers.

        Returns:
            The assembled ``gr.Blocks`` application (not yet launched).
        """
        # Fetch the voice catalogue once and reuse it for every widget.
        all_voices = list(tts_api.get_available_voices())

        # NOTE: no theme argument is passed to gr.Blocks.
        with gr.Blocks(title="本地配音软件") as app:
            gr.Markdown("# <center> 🎙️ 本地配音软件 </center>")
            gr.Markdown("基于Edge TTS和Hugging Face Spaces的文本转语音工具,支持多语言和多种语音")

            with gr.Tab("文本配音"):
                with gr.Row():
                    with gr.Column(scale=2):
                        text_input = gr.TextArea(
                            label="📝 输入文本",
                            placeholder="在此输入您要转换为语音的文本...",
                            lines=12
                        )

                        with gr.Row():
                            voice_selection = self._voice_dropdown(all_voices, multiselect=False)

                            language_filter = gr.Dropdown(
                                choices=["全部", "中文", "英文", "日文", "韩文", "其他"],
                                value="全部",
                                label="🌐 语言筛选"
                            )

                        with gr.Row():
                            rate_slider = self._rate_slider()
                            pitch_slider = self._pitch_slider()

                        with gr.Row():
                            api_selection = self._api_radio()

                        with gr.Row():
                            generate_btn = gr.Button("🔊 生成语音", variant="primary", scale=1)
                            # NOTE(review): this button has no click handler.
                            batch_generate_btn = gr.Button("📦 批量生成", variant="secondary", scale=1)

                    with gr.Column(scale=1):
                        audio_output = gr.Audio(label="🎧 生成的语音", type="filepath")
                        status_output = gr.Textbox(label="📊 状态信息", interactive=False)

                        with gr.Group():
                            gr.Markdown("### 📁 输出选项")
                            output_format = gr.Radio(
                                choices=["MP3", "WAV"],
                                value="MP3",
                                label="输出格式"
                            )

                        with gr.Group():
                            gr.Markdown("### 📚 语音预览")
                            voice_info_btn = gr.Button("🔍 查看语音信息")
                            voice_info_output = gr.JSON(label="语音详情")

            with gr.Tab("批量处理"):
                with gr.Row():
                    batch_text_input = gr.TextArea(
                        label="📝 批量文本输入(每行一段)",
                        placeholder="每行输入一段文本,将为每段文本生成对应的语音",
                        lines=10
                    )

                with gr.Row():
                    batch_voice_selection = self._voice_dropdown(all_voices)
                    batch_rate_slider = self._rate_slider()
                    batch_pitch_slider = self._pitch_slider()

                with gr.Row():
                    batch_api_selection = self._api_radio()

                batch_generate_btn2 = gr.Button("📦 生成批量语音", variant="primary")
                batch_output = gr.File(label="📥 下载批量生成的音频", interactive=False)

            with gr.Tab("音频项目"):
                with gr.Row():
                    with gr.Column():
                        project_name = gr.Textbox(
                            label="📋 项目名称",
                            placeholder="输入项目名称",
                            value="my_audio_project"
                        )

                        segments_input = gr.JSON(
                            label="📝 音频片段",
                            value=[{"text": "第一段文本", "delay": 0}, {"text": "第二段文本", "delay": 1000}]
                        )

                        with gr.Row():
                            # NOTE(review): neither segment button has a click handler.
                            add_segment_btn = gr.Button("➕ 添加片段")
                            remove_segment_btn = gr.Button("➖ 删除片段")

                        project_voice_selection = self._voice_dropdown(all_voices)

                        with gr.Row():
                            project_rate_slider = self._rate_slider()
                            project_pitch_slider = self._pitch_slider()

                        with gr.Row():
                            project_api_selection = self._api_radio()

                        create_project_btn = gr.Button("🎬 创建音频项目", variant="primary")
                        project_output = gr.Audio(label="🎧 项目音频输出", type="filepath")

            with gr.Tab("语音库"):
                with gr.Row():
                    voice_table = gr.Dataframe(
                        headers=["语音名称", "语言", "性别"],
                        datatype=["str", "str", "str"],
                        value=self._voice_table_rows(all_voices),
                        label="可用语音列表",
                        interactive=False
                    )

            # ---------------- Event bindings ----------------
            def update_voice_list(language):
                """Refresh the voice dropdown for the chosen language filter.

                Labels without a known prefix ("全部", "其他", ...) list every voice.
                """
                code = self._language_code(language)
                if code is None:
                    voices = tts_api.get_available_voices()
                else:
                    voices = tts_api.get_available_voices(code)

                return gr.Dropdown(choices=voices, value=voices[0] if voices else self.DEFAULT_VOICE)

            language_filter.change(
                fn=update_voice_list,
                inputs=language_filter,
                outputs=voice_selection
            )

            async def generate_speech_async(text, voice, rate, pitch, format_type, api_type):
                """Synthesize one clip; return ``(audio_path_or_None, status_message)``."""
                if not text or not text.strip():
                    return None, "请输入要转换的文本"

                # The file extension follows the selected output format.
                ext = ".mp3" if format_type == "MP3" else ".wav"

                # Reserve a path; the handle is closed so the backend can write to it.
                with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as temp_file:
                    output_path = temp_file.name

                try:
                    if api_type == self.HF_API:
                        result = await tts_api.text_to_speech_hf(text, voice, rate, pitch, output_path, format_type.lower())
                    else:
                        result = await tts_api.text_to_speech(text, voice, rate, pitch, output_path, format_type.lower())
                except Exception as e:
                    # Do not leave the reserved placeholder file behind.
                    self._remove_quietly(output_path)
                    return None, f"生成语音时出错: {str(e)}"

                if result:
                    return result, "语音生成成功"
                self._remove_quietly(output_path)
                return None, "语音生成失败"

            generate_btn.click(
                fn=lambda text, voice, rate, pitch, fmt, api: asyncio.run(
                    generate_speech_async(text, voice, rate, pitch, fmt, api)
                ),
                inputs=[text_input, voice_selection, rate_slider, pitch_slider, output_format, api_selection],
                outputs=[audio_output, status_output]
            )

            # NOTE(review): play_audio is not bound to any UI event.
            def play_audio(audio_path):
                """Play an audio file through pydub and return a status string."""
                if audio_path and os.path.exists(audio_path):
                    try:
                        audio = AudioSegment.from_file(audio_path)
                        play(audio)
                        return "音频播放成功"
                    except Exception as e:
                        return f"播放失败: {str(e)}"
                return "没有可播放的音频文件"

            def get_voice_info(voice):
                """Return the backend's details for ``voice`` or an error dict."""
                try:
                    info = asyncio.run(tts_api.get_voice_info(voice))
                    return info or {"错误": "未找到语音信息"}
                except Exception as e:
                    return {"错误": str(e)}

            # The handler needs the selected voice, so the dropdown (not the
            # button itself) is the input component.
            voice_info_btn.click(
                fn=get_voice_info,
                inputs=voice_selection,
                outputs=voice_info_output
            )

            # Also refresh the details whenever another voice is selected.
            voice_selection.change(
                fn=get_voice_info,
                inputs=voice_selection,
                outputs=voice_info_output
            )

            async def batch_generate(texts, voice, rate, pitch, api_type):
                """Synthesize one clip per line and zip them; return ``(zip_path_or_None, status)``."""
                if not texts or not texts.strip():
                    return None, "请输入要转换的文本"

                text_list = self._split_batch_lines(texts)
                if not text_list:
                    return None, "没有有效的文本段落"

                try:
                    if api_type == self.HF_API:
                        audio_files = []
                        for text in text_list:
                            audio_file = await tts_api.text_to_speech_hf(text, voice, rate, pitch, output_format="mp3")
                            audio_files.append(audio_file)
                    else:
                        audio_files = await tts_api.batch_text_to_speech(text_list, voice, rate, pitch)
                    audio_files = audio_files or []

                    import zipfile
                    # Close the placeholder before zipfile reopens it by name;
                    # an open NamedTemporaryFile cannot be reopened on Windows.
                    with tempfile.NamedTemporaryFile(delete=False, suffix='.zip') as placeholder:
                        zip_path = placeholder.name
                    with zipfile.ZipFile(zip_path, 'w') as archive:
                        for i, audio_file in enumerate(audio_files):
                            if audio_file:
                                archive.write(audio_file, f"audio_{i+1}.mp3")

                    generated = sum(1 for f in audio_files if f)
                    return zip_path, f"成功生成 {generated} 个音频文件"
                except Exception as e:
                    return None, f"批量生成失败: {str(e)}"

            batch_generate_btn2.click(
                fn=lambda texts, voice, rate, pitch, api: asyncio.run(
                    batch_generate(texts, voice, rate, pitch, api)
                ),
                inputs=[batch_text_input, batch_voice_selection, batch_rate_slider, batch_pitch_slider, batch_api_selection],
                outputs=[batch_output, status_output]
            )

            async def create_audio_project(name, segments, voice, rate, pitch, api_type):
                """Build a multi-segment audio file; return ``(audio_path_or_None, status)``.

                With the Hugging Face backend each segment is synthesized
                separately and concatenated here, inserting ``delay``
                milliseconds of silence before a segment. Otherwise the work
                is delegated to ``tts_api.create_audio_project``.
                """
                if not name or not name.strip():
                    return None, "请输入项目名称"

                try:
                    if api_type != self.HF_API:
                        project_file = await tts_api.create_audio_project(
                            name, segments, voice, rate, pitch
                        )
                        if project_file:
                            return project_file, f"项目 '{name}' 创建成功"
                        return None, "项目创建失败"

                    import shutil
                    temp_dir = tempfile.mkdtemp()
                    succeeded = False
                    try:
                        segment_files = []
                        for i, segment in enumerate(segments or []):
                            if not isinstance(segment, dict):
                                continue
                            text = segment.get("text", "")
                            if not isinstance(text, str) or not text.strip():
                                continue

                            delay = self._segment_delay_ms(segment)  # milliseconds
                            segment_file = os.path.join(temp_dir, f"segment_{i}.mp3")
                            result = await tts_api.text_to_speech_hf(text, voice, rate, pitch, segment_file, "mp3")
                            if result:
                                segment_files.append((result, delay))

                        if not segment_files:
                            return None, "项目创建失败"

                        combined_audio = AudioSegment.empty()
                        for audio_file, delay in segment_files:
                            if delay > 0:
                                combined_audio += AudioSegment.silent(duration=delay)
                            combined_audio += AudioSegment.from_file(audio_file, format="mp3")

                        # The name comes from the UI, so it is sanitized before
                        # being used as a file name.
                        output_path = os.path.join(temp_dir, f"{self._safe_file_stem(name)}.mp3")
                        combined_audio.export(output_path, format="mp3")

                        # Remove the per-segment clips, but never the final output.
                        for audio_file, _ in segment_files:
                            if audio_file != output_path:
                                self._remove_quietly(audio_file)

                        succeeded = True
                        return output_path, f"项目 '{name}' 创建成功"
                    finally:
                        if not succeeded:
                            # No output to serve: drop the working directory.
                            shutil.rmtree(temp_dir, ignore_errors=True)

                except Exception as e:
                    return None, f"创建项目时出错: {str(e)}"

            create_project_btn.click(
                fn=lambda name, segments, voice, rate, pitch, api: asyncio.run(
                    create_audio_project(name, segments, voice, rate, pitch, api)
                ),
                inputs=[project_name, segments_input, project_voice_selection, project_rate_slider, project_pitch_slider, project_api_selection],
                outputs=[project_output, status_output]
            )

        return app

    def run(self, share=False, server_name="127.0.0.1", server_port=7860):
        """Launch the app.

        Args:
            share: Forwarded to ``launch(share=...)``.
            server_name: Host/interface to bind; defaults to localhost.
            server_port: TCP port to listen on.
        """
        self.app.launch(server_name=server_name, server_port=server_port, share=share)

if __name__ == "__main__":
    # Script entry point: build the interface and serve it with the
    # default launch settings.
    TTSApp().run()