"""
本地配音软件 - 基于Edge TTS的文本转语音应用
"""
import gradio as gr
import asyncio
import os
from pydub import AudioSegment
from pydub.playback import play
import tempfile
from api import tts_api
class TTSApp:
    """Gradio front-end for the local dubbing (TTS) tool.

    Wraps `tts_api` (Edge TTS locally, or a Hugging Face Space remotely)
    behind four tabs: single-text synthesis, batch synthesis, multi-segment
    audio projects, and a voice catalogue.
    """

    def __init__(self):
        # Build the Blocks UI once; `run()` launches it.
        self.app = self.create_interface()

    def create_interface(self):
        """Create and return the Gradio Blocks interface."""
        with gr.Blocks(title="本地配音软件") as app:  # theme parameter intentionally omitted
            # NOTE: heading must stay on one line — a newline after "#"
            # makes Markdown render it as plain text instead of an H1.
            gr.Markdown("# 🎙️ 本地配音软件")
            gr.Markdown("基于Edge TTS和Hugging Face Spaces的文本转语音工具,支持多语言和多种语音")

            with gr.Tab("文本配音"):
                with gr.Row():
                    with gr.Column(scale=2):
                        text_input = gr.TextArea(
                            label="📝 输入文本",
                            placeholder="在此输入您要转换为语音的文本...",
                            lines=12
                        )
                        with gr.Row():
                            voice_selection = gr.Dropdown(
                                choices=tts_api.get_available_voices(),
                                value="zh-CN-XiaoxiaoNeural",
                                label="🗣️ 选择语音",
                                multiselect=False
                            )
                            language_filter = gr.Dropdown(
                                choices=["全部", "中文", "英文", "日文", "韩文", "其他"],
                                value="全部",
                                label="🌐 语言筛选"
                            )
                        with gr.Row():
                            rate_slider = gr.Slider(
                                minimum=-50,
                                maximum=50,
                                value=0,
                                step=1,
                                label="⏩ 语速调整 (%)"
                            )
                            pitch_slider = gr.Slider(
                                minimum=-50,
                                maximum=50,
                                value=0,
                                step=1,
                                label="🎵 音调调整 (Hz)"
                            )
                        with gr.Row():
                            api_selection = gr.Radio(
                                choices=["Edge TTS (本地)", "Hugging Face API"],
                                value="Edge TTS (本地)",
                                label="🌐 API选择"
                            )
                        with gr.Row():
                            generate_btn = gr.Button("🔊 生成语音", variant="primary", scale=1)
                            batch_generate_btn = gr.Button("📦 批量生成", variant="secondary", scale=1)
                    with gr.Column(scale=1):
                        audio_output = gr.Audio(label="🎧 生成的语音", type="filepath")
                        status_output = gr.Textbox(label="📊 状态信息", interactive=False)
                        with gr.Group():
                            gr.Markdown("### 📁 输出选项")
                            output_format = gr.Radio(
                                choices=["MP3", "WAV"],
                                value="MP3",
                                label="输出格式"
                            )
                        with gr.Group():
                            gr.Markdown("### 📚 语音预览")
                            voice_info_btn = gr.Button("🔍 查看语音信息")
                            voice_info_output = gr.JSON(label="语音详情")

            with gr.Tab("批量处理"):
                with gr.Row():
                    batch_text_input = gr.TextArea(
                        label="📝 批量文本输入(每行一段)",
                        placeholder="每行输入一段文本,将为每段文本生成对应的语音",
                        lines=10
                    )
                with gr.Row():
                    batch_voice_selection = gr.Dropdown(
                        choices=tts_api.get_available_voices(),
                        value="zh-CN-XiaoxiaoNeural",
                        label="🗣️ 选择语音"
                    )
                    batch_rate_slider = gr.Slider(
                        minimum=-50,
                        maximum=50,
                        value=0,
                        step=1,
                        label="⏩ 语速调整 (%)"
                    )
                    batch_pitch_slider = gr.Slider(
                        minimum=-50,
                        maximum=50,
                        value=0,
                        step=1,
                        label="🎵 音调调整 (Hz)"
                    )
                with gr.Row():
                    batch_api_selection = gr.Radio(
                        choices=["Edge TTS (本地)", "Hugging Face API"],
                        value="Edge TTS (本地)",
                        label="🌐 API选择"
                    )
                batch_generate_btn2 = gr.Button("📦 生成批量语音", variant="primary")
                batch_output = gr.File(label="📥 下载批量生成的音频", interactive=False)

            with gr.Tab("音频项目"):
                with gr.Row():
                    with gr.Column():
                        project_name = gr.Textbox(
                            label="📋 项目名称",
                            placeholder="输入项目名称",
                            value="my_audio_project"
                        )
                        segments_input = gr.JSON(
                            label="📝 音频片段",
                            value=[{"text": "第一段文本", "delay": 0}, {"text": "第二段文本", "delay": 1000}]
                        )
                        with gr.Row():
                            add_segment_btn = gr.Button("➕ 添加片段")
                            remove_segment_btn = gr.Button("➖ 删除片段")
                        project_voice_selection = gr.Dropdown(
                            choices=tts_api.get_available_voices(),
                            value="zh-CN-XiaoxiaoNeural",
                            label="🗣️ 选择语音"
                        )
                        with gr.Row():
                            project_rate_slider = gr.Slider(
                                minimum=-50,
                                maximum=50,
                                value=0,
                                step=1,
                                label="⏩ 语速调整 (%)"
                            )
                            project_pitch_slider = gr.Slider(
                                minimum=-50,
                                maximum=50,
                                value=0,
                                step=1,
                                label="🎵 音调调整 (Hz)"
                            )
                        with gr.Row():
                            project_api_selection = gr.Radio(
                                choices=["Edge TTS (本地)", "Hugging Face API"],
                                value="Edge TTS (本地)",
                                label="🌐 API选择"
                            )
                        create_project_btn = gr.Button("🎬 创建音频项目", variant="primary")
                        project_output = gr.Audio(label="🎧 项目音频输出", type="filepath")

            with gr.Tab("语音库"):
                with gr.Row():
                    voice_table = gr.Dataframe(
                        headers=["语音名称", "语言", "性别"],
                        datatype=["str", "str", "str"],
                        # '-'.join(...[:2]) extracts the "ll-CC" locale prefix without
                        # raising IndexError on unexpectedly-shaped voice names.
                        # Gender is a heuristic keyed on well-known voice names.
                        value=[
                            [
                                v,
                                '-'.join(v.split('-')[:2]),
                                "女声" if any(x in v.lower() for x in ['xiaoxiao', 'xiaoyi', 'nanami', 'sarah', 'jenny', 'aria']) else "男声",
                            ]
                            for v in tts_api.get_available_voices()
                        ],
                        label="可用语音列表",
                        interactive=False
                    )

            # ---- event wiring ------------------------------------------------

            def update_voice_list(language):
                """Refresh the voice dropdown to match the selected language filter."""
                prefixes = {"中文": 'zh', "英文": 'en', "日文": 'ja', "韩文": 'ko'}
                if language in prefixes:
                    voices = tts_api.get_available_voices(prefixes[language])
                else:
                    # "全部" / "其他" / anything unexpected: show everything.
                    voices = tts_api.get_available_voices()
                return gr.Dropdown(choices=voices, value=voices[0] if voices else "zh-CN-XiaoxiaoNeural")

            language_filter.change(
                fn=update_voice_list,
                inputs=language_filter,
                outputs=voice_selection
            )

            async def generate_speech_async(text, voice, rate, pitch, format_type, api_type):
                """Synthesize `text` to a temp audio file.

                Returns (file_path, status_message); file_path is None on failure.
                """
                if not text.strip():
                    return None, "请输入要转换的文本"
                # Pick the file extension from the chosen output format.
                ext = ".mp3" if format_type == "MP3" else ".wav"
                with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as temp_file:
                    output_path = temp_file.name
                try:
                    if api_type == "Hugging Face API":
                        result = await tts_api.text_to_speech_hf(text, voice, rate, pitch, output_path, format_type.lower())
                    else:
                        result = await tts_api.text_to_speech(text, voice, rate, pitch, output_path, format_type.lower())
                    if result:
                        return result, "语音生成成功"
                    return None, "语音生成失败"
                except Exception as e:
                    return None, f"生成语音时出错: {str(e)}"
                finally:
                    # Don't leak the temp file when synthesis failed.
                    if not os.path.getsize(output_path) and os.path.exists(output_path):
                        try:
                            os.remove(output_path)
                        except OSError:
                            pass

            generate_btn.click(
                fn=lambda text, voice, rate, pitch, fmt, api: asyncio.run(
                    generate_speech_async(text, voice, rate, pitch, fmt, api)
                ),
                inputs=[text_input, voice_selection, rate_slider, pitch_slider, output_format, api_selection],
                outputs=[audio_output, status_output]
            )

            def play_audio(audio_path):
                """Play a local audio file via pydub. Currently not wired to any event."""
                if audio_path and os.path.exists(audio_path):
                    try:
                        audio = AudioSegment.from_file(audio_path)
                        play(audio)
                        return "音频播放成功"
                    except Exception as e:
                        return f"播放失败: {str(e)}"
                return "没有可播放的音频文件"

            def get_voice_info(voice):
                """Look up metadata for `voice`; returns a dict suitable for gr.JSON."""
                try:
                    info = asyncio.run(tts_api.get_voice_info(voice))
                    return info or {"错误": "未找到语音信息"}
                except Exception as e:
                    return {"错误": str(e)}

            # BUGFIX: the input must be the currently selected voice, not the
            # Button component itself.
            voice_info_btn.click(
                fn=get_voice_info,
                inputs=voice_selection,
                outputs=voice_info_output
            )
            # Also refresh the info panel whenever the selection changes.
            voice_selection.change(
                fn=get_voice_info,
                inputs=voice_selection,
                outputs=voice_info_output
            )

            # ---- batch processing --------------------------------------------

            async def batch_generate(texts, voice, rate, pitch, api_type):
                """Synthesize one audio file per non-empty line and zip them up.

                Returns (zip_path, status_message); zip_path is None on failure.
                """
                if not texts.strip():
                    return None, "请输入要转换的文本"
                # Split input into one segment per non-blank line.
                text_list = [t.strip() for t in texts.split('\n') if t.strip()]
                if not text_list:
                    return None, "没有有效的文本段落"
                try:
                    if api_type == "Hugging Face API":
                        audio_files = []
                        for text in text_list:
                            if text.strip():
                                audio_file = await tts_api.text_to_speech_hf(text, voice, rate, pitch, output_format="mp3")
                                audio_files.append(audio_file)
                            else:
                                audio_files.append(None)
                    else:
                        audio_files = await tts_api.batch_text_to_speech(text_list, voice, rate, pitch)
                    # Bundle the generated files into a single downloadable zip.
                    import zipfile
                    with tempfile.NamedTemporaryFile(delete=False, suffix='.zip') as zip_file:
                        with zipfile.ZipFile(zip_file.name, 'w') as zf:
                            for i, audio_file in enumerate(audio_files):
                                if audio_file:
                                    zf.write(audio_file, f"audio_{i+1}.mp3")
                        return zip_file.name, f"成功生成 {len([f for f in audio_files if f])} 个音频文件"
                except Exception as e:
                    return None, f"批量生成失败: {str(e)}"

            batch_generate_btn2.click(
                fn=lambda texts, voice, rate, pitch, api: asyncio.run(
                    batch_generate(texts, voice, rate, pitch, api)
                ),
                inputs=[batch_text_input, batch_voice_selection, batch_rate_slider, batch_pitch_slider, batch_api_selection],
                outputs=[batch_output, status_output]
            )

            # ---- audio projects ----------------------------------------------

            async def create_audio_project(name, segments, voice, rate, pitch, api_type):
                """Build a single audio file from a list of {text, delay} segments.

                Local Edge TTS delegates to tts_api.create_audio_project; the
                Hugging Face path synthesizes each segment and concatenates them
                (inserting `delay` ms of silence before each delayed segment).
                Returns (file_path, status_message); file_path is None on failure.
                """
                if not name.strip():
                    return None, "请输入项目名称"
                try:
                    if api_type != "Hugging Face API":
                        # Local Edge TTS: the API builds the whole project itself.
                        project_file = await tts_api.create_audio_project(
                            name, segments, voice, rate, pitch
                        )
                        if project_file:
                            return project_file, f"项目 '{name}' 创建成功"
                        return None, "项目创建失败"

                    # Hugging Face path: synthesize segments one by one.
                    temp_dir = tempfile.mkdtemp()
                    segment_files = []
                    for i, segment in enumerate(segments):
                        text = segment.get("text", "")
                        if not text.strip():
                            continue
                        delay = segment.get("delay", 0)  # leading silence, in milliseconds
                        segment_file = os.path.join(temp_dir, f"segment_{i}.mp3")
                        result = await tts_api.text_to_speech_hf(text, voice, rate, pitch, segment_file, "mp3")
                        if result:
                            segment_files.append((result, delay))

                    # BUGFIX: the original returned "项目创建失败" unconditionally
                    # here, making this merge step unreachable for the HF path.
                    if not segment_files:
                        return None, "项目创建失败"

                    combined_audio = AudioSegment.empty()
                    for audio_file, delay in segment_files:
                        if delay > 0:
                            # Insert the requested pause before this segment.
                            combined_audio += AudioSegment.silent(duration=delay)
                        combined_audio += AudioSegment.from_file(audio_file, format="mp3")

                    output_path = os.path.join(temp_dir, f"{name}.mp3")
                    combined_audio.export(output_path, format="mp3")

                    # Best-effort cleanup of the intermediate segment files.
                    for audio_file, _ in segment_files:
                        try:
                            os.remove(audio_file)
                        except OSError:
                            pass
                    return output_path, f"项目 '{name}' 创建成功"
                except Exception as e:
                    return None, f"创建项目时出错: {str(e)}"

            create_project_btn.click(
                fn=lambda name, segments, voice, rate, pitch, api: asyncio.run(
                    create_audio_project(name, segments, voice, rate, pitch, api)
                ),
                inputs=[project_name, segments_input, project_voice_selection, project_rate_slider, project_pitch_slider, project_api_selection],
                outputs=[project_output, status_output]
            )

        return app

    def run(self, share=False):
        """Launch the Gradio app on 127.0.0.1:7860; `share` enables a public link."""
        self.app.launch(server_name="127.0.0.1", server_port=7860, share=share)
if __name__ == "__main__":
    # Script entry point: build the UI and start the local server.
    TTSApp().run()