tts

Sleeping

App Files Files Community

tts / app.py

geopromini

Upload app.py

792e819 verified 24 days ago

raw

history blame contribute delete

10.1 kB

	import spaces
	import os
	os.environ['SPACES_ZERO_GPU'] = '1'

	import gradio as gr
	import soundfile as sf
	import tempfile
	import torch
	import librosa # Thêm thư viện xử lý âm thanh
	from vieneu_tts import VieNeuTTS
	import time

	# --- 1. SETUP MODEL ---
	# Run both the backbone and the codec on the GPU when torch can see one,
	# otherwise fall back to the CPU.
	device = "cuda" if torch.cuda.is_available() else "cpu"

	try:
	    tts = VieNeuTTS(
	        backbone_repo="pnnbao-ump/VieNeu-TTS",
	        backbone_device=device,
	        codec_repo="neuphonic/neucodec",
	        codec_device=device,
	    )
	except Exception as e:
	    # Deliberate best-effort: keep the UI bootable even when the model
	    # cannot be constructed. The failure is reported instead of being
	    # swallowed, because the stand-in below only produces random noise.
	    print(
	        f"[WARN] Could not load VieNeuTTS ({e!r}); "
	        "falling back to MockTTS, which returns random noise.",
	        flush=True,
	    )

	    class MockTTS:
	        """Stand-in exposing the two methods the app calls on the real engine."""

	        def encode_reference(self, path):
	            """Ignore the reference audio and return no codes."""
	            return None

	        def infer(self, text, ref, ref_text):
	            """Sleep briefly, then return 48000 uniform-noise samples."""
	            time.sleep(1.2)
	            import numpy as np
	            return np.random.uniform(-0.1, 0.1, 24000*2)

	    tts = MockTTS()

	# --- 2. DATA (sample voice list, kept unchanged) ---
	# Each row is (display name, file stem, audio extension). The transcript
	# for a voice always sits next to its audio as "<stem>.txt".
	_SAMPLE_DIR = "./sample"
	_VOICE_FILES = (
	    ("Tuyên (nam miền Bắc)", "Tuyên (nam miền Bắc)", "wav"),
	    ("Thiện Tâm", "thientam", "mp3"),
	    ("Vĩnh (nam miền Nam)", "Vĩnh (nam miền Nam)", "wav"),
	    ("Bình (nam miền Bắc)", "Bình (nam miền Bắc)", "wav"),
	    ("Nguyên (nam miền Nam)", "Nguyên (nam miền Nam)", "wav"),
	    ("Sơn (nam miền Nam)", "Sơn (nam miền Nam)", "wav"),
	    ("Đoan (nữ miền Nam)", "Đoan (nữ miền Nam)", "wav"),
	    ("Ngọc (nữ miền Bắc)", "Ngọc (nữ miền Bắc)", "wav"),
	    ("Ly (nữ miền Bắc)", "Ly (nữ miền Bắc)", "wav"),
	    ("Dung (nữ miền Nam)", "Dung (nữ miền Nam)", "wav"),
	)

	# Maps display name -> {"audio": path, "text": path}; insertion order is the
	# order in which the voices are listed in the dropdown.
	VOICE_SAMPLES = {
	    label: {
	        "audio": f"{_SAMPLE_DIR}/{stem}.{ext}",
	        "text": f"{_SAMPLE_DIR}/{stem}.txt",
	    }
	    for label, stem, ext in _VOICE_FILES
	}

	# --- 3. HELPER FUNCTIONS ---
	def load_reference_info(voice_choice):
	    """Return the reference audio path and transcript for a preset voice.

	    Args:
	        voice_choice: Key into ``VOICE_SAMPLES`` (the dropdown label).

	    Returns:
	        ``(audio_path, ref_text)`` for a known voice. ``ref_text`` is ``""``
	        when the transcript file is missing or unreadable, so the audio
	        path is still returned. ``(None, "")`` for an unknown voice.
	    """
	    sample = VOICE_SAMPLES.get(voice_choice)
	    if sample is None:
	        return None, ""

	    # Read the transcript optimistically; a missing file must not leave
	    # ref_text unbound or hide the audio path.
	    try:
	        with open(sample["text"], "r", encoding="utf-8") as f:
	            ref_text = f.read()
	    except OSError:
	        ref_text = ""
	    return sample["audio"], ref_text

	@spaces.GPU(duration=120)
	def synthesize_speech(text, voice_choice, custom_audio, custom_text, mode_tab, pause_level, speed_value):
	    """Synthesize ``text`` with a preset or cloned voice and save it as a WAV file.

	    Args:
	        text: Text to speak.
	        voice_choice: Key into ``VOICE_SAMPLES``; used unless ``mode_tab`` is
	            ``"custom_mode"``.
	        custom_audio: File path of the user-supplied reference audio.
	        custom_text: Transcript of ``custom_audio``.
	        mode_tab: ``"custom_mode"`` to clone from the custom inputs; any other
	            value selects the preset voice.
	        pause_level: ``"Trung bình"`` or ``"Dài"`` pads commas and periods with
	            extra punctuation; any other value leaves the text unchanged.
	        speed_value: Playback rate; ``None`` is treated as ``1.0``.

	    Returns:
	        ``(output_path, status_message)``; ``output_path`` is ``None`` when
	        validation fails or any step raises.
	    """
	    sample_rate = 24000  # rate the output file is written at
	    try:
	        if not text or not text.strip():
	            return None, "⚠️ Vui lòng nhập nội dung!"

	        # 3.1. Pause level: repeat punctuation marks (presumably so the model
	        # produces longer pauses - TODO confirm against the model's behaviour).
	        processed_text = text
	        if pause_level == "Trung bình":
	            processed_text = processed_text.replace(",", ", , ").replace(".", ". . ")
	        elif pause_level == "Dài":
	            processed_text = processed_text.replace(",", ", , , ").replace(".", ". . . . ")

	        # NOTE(review): the 400-character cap is applied after the padding
	        # above, and the UI counter advertises 250 - confirm the intended limit.
	        processed_text = processed_text[:400]

	        # 3.2. Resolve the reference audio and its transcript.
	        if mode_tab == "custom_mode":
	            if custom_audio is None or not custom_text:
	                return None, "⚠️ Thiếu Audio mẫu hoặc Text mẫu."
	            ref_audio_path = custom_audio
	            ref_text_raw = custom_text
	        else:
	            sample = VOICE_SAMPLES.get(voice_choice)
	            if sample is None:
	                # A cleared dropdown or stale value would otherwise surface
	                # as a bare KeyError message.
	                return None, "⚠️ Vui lòng chọn giọng đọc mẫu!"
	            ref_audio_path = sample["audio"]
	            with open(sample["text"], "r", encoding="utf-8") as f:
	                ref_text_raw = f.read()

	        # 3.3. Inference (timed together with the optional speed change).
	        start_time = time.time()
	        ref_codes = tts.encode_reference(ref_audio_path)
	        wav = tts.infer(processed_text, ref_codes, ref_text_raw)

	        # 3.4. Speed: librosa time-stretch changes duration while keeping pitch.
	        rate = 1.0 if speed_value is None else float(speed_value)
	        if rate != 1.0:
	            wav = librosa.effects.time_stretch(wav, rate=rate)

	        process_time = time.time() - start_time

	        # Reserve a unique path, close the handle, then let soundfile write to
	        # it; delete=False keeps the file so the UI can serve it afterwards.
	        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
	            output_path = tmp_file.name
	        sf.write(output_path, wav, sample_rate)

	        return output_path, f"⚡ Xử lý: {process_time:.2f}s | Tốc độ: {rate}x"
	    except Exception as e:
	        # UI boundary: report the failure in the status area instead of raising.
	        return None, f"❌ Lỗi: {e}"

	# --- 4. THEME & CSS ---
	# Colour/border overrides applied on top of the stock Default theme.
	_theme_overrides = dict(
	    body_background_fill="#020617",
	    block_background_fill="#0f172a",
	    block_border_width="1px",
	    input_background_fill="#1e293b",
	    input_border_color="#334155",
	    button_primary_background_fill="linear-gradient(135deg, #4f46e5 0%, #7c3aed 100%)",
	)

	# Base theme: indigo/blue/slate hues with the Inter web font, falling back
	# to a generic sans-serif.
	_base_theme = gr.themes.Default(
	    primary_hue="indigo",
	    secondary_hue="blue",
	    neutral_hue="slate",
	    font=[gr.themes.GoogleFont('Inter'), 'sans-serif'],
	)
	theme = _base_theme.set(**_theme_overrides)

	# Custom stylesheet passed to Gradio at launch. It styles the elem_classes
	# used in the layout ("main-wrap", "st-card", "result-card"), the native
	# <audio> element, and the "footer" div emitted at the bottom of the page.
	css = """
	.main-wrap { max-width: 1200px !important; margin: auto !important; padding: 20px !important; }
	.st-card {
	border-radius: 16px !important;
	border: 1px solid rgba(255,255,255,0.1) !important;
	box-shadow: 0 4px 20px rgba(0,0,0,0.5) !important;
	padding: 15px;
	}
	.result-card {
	background: linear-gradient(180deg, rgba(15, 23, 42, 0.8) 0%, rgba(30, 41, 59, 0.8) 100%) !important;
	border: 1px solid rgba(99, 102, 241, 0.2) !important;
	margin-top: 15px;
	}
	audio { filter: invert(90%) hue-rotate(180deg) brightness(1.5); width: 100%; border-radius: 8px; }
	.footer { text-align: center; margin-top: 40px; color: #475569; font-size: 0.8rem; font-weight: 500; }
	"""

	# --- 5. UI CONSTRUCTION ---
	# NOTE(review): the nesting below is reconstructed from context because the
	# original indentation was lost - confirm the layout against the running app.
	with gr.Blocks(title="AI Voice Studio") as demo:

	    with gr.Column(elem_classes="main-wrap"):
	        with gr.Row(equal_height=True):
	            # LEFT COLUMN: text to synthesize plus a live character counter.
	            with gr.Column(scale=1):
	                with gr.Group(elem_classes="st-card"):
	                    text_input = gr.Textbox(
	                        label="VĂN BẢN CẦN CHUYỂN ĐỔI",
	                        placeholder="Nhập nội dung vào đây...",
	                        lines=20,  # taller box to balance the controls on the right
	                        show_label=True,
	                    )
	                    char_count = gr.HTML("<div style='text-align: right; color: #6366f1; font-size: 0.85rem; font-weight: bold; padding: 5px;'>0 / 250</div>")

	            # RIGHT COLUMN: voice selection, audio controls, button, result.
	            with gr.Column(scale=1):
	                with gr.Tabs() as tabs:
	                    # Each tab is bound to a name so its select event can be
	                    # wired directly instead of through tabs.children[i].
	                    with gr.TabItem("👤 Nghệ sĩ đọc", id="preset_mode") as preset_tab:
	                        voice_select = gr.Dropdown(
	                            choices=list(VOICE_SAMPLES.keys()),
	                            value="Tuyên (nam miền Bắc)",
	                            label="Lựa chọn giọng đọc mẫu",
	                        )
	                        with gr.Accordion("Nghe thử giọng mẫu", open=False):
	                            ref_audio_preview = gr.Audio(interactive=False, show_label=False)
	                            ref_text_preview = gr.Markdown("...")

	                    with gr.TabItem("🎙️ Nhân bản (Clone)", id="custom_mode") as custom_tab:
	                        custom_audio = gr.Audio(label="Audio gốc", type="filepath")
	                        custom_text = gr.Textbox(
	                            label="NỘI DUNG AUDIO MẪU",
	                            placeholder="Nhập lời thoại của audio mẫu...",
	                            lines=4,
	                            show_label=True
	                        )

	                # --- AUDIO ADJUSTMENT AREA ---
	                with gr.Row():
	                    pause_level = gr.Radio(
	                        choices=["Mặc định", "Trung bình", "Dài"],
	                        value="Mặc định",
	                        label="Độ ngắt nghỉ",
	                        scale=1
	                    )
	                    speed_select = gr.Dropdown(
	                        choices=[0.8, 0.9, 1.0, 1.1, 1.2, 1.5],
	                        value=1.0,
	                        label="Tốc độ đọc",
	                        scale=1
	                    )

	                # Tracks which tab is active so the click handler knows
	                # whether to use the preset voice or the custom clone inputs.
	                current_mode = gr.State(value="preset_mode")

	                gr.Markdown("<br>")
	                btn_generate = gr.Button("BẮT ĐẦU TỔNG HỢP", variant="primary", size="lg")

	                with gr.Group(elem_classes="st-card result-card"):
	                    audio_output = gr.Audio(label="AUDIO KẾT QUẢ", interactive=False, autoplay=True)
	                    status_output = gr.Markdown("<p style='text-align: center; color: #818cf8; font-weight: 500;'>✨ Sẵn sàng thực hiện</p>")

	        gr.HTML("<div class='footer'>ENGINE BY VIENEU-TTS • PROFESSIONAL AI SOLUTIONS 2025</div>")

	    # --- LOGIC ---
	    def update_count(text):
	        """Return the counter HTML for ``text``; the colour changes past 250 characters."""
	        # Treat a missing value as empty text so len() cannot fail.
	        count = len(text or "")
	        color = "#6366f1" if count <= 250 else "#f43f5e"
	        return f"<div style='text-align: right; color: {color}; font-size: 0.85rem; font-weight: bold; padding: 5px;'>{count} / 250</div>"

	    text_input.change(update_count, text_input, char_count)

	    def update_ref_preview(voice):
	        """Return the preview audio path and quoted transcript for ``voice``."""
	        audio, text = load_reference_info(voice)
	        return audio, f"Nội dung mẫu: \"{text}\""

	    # Refresh the preview when the voice changes and once on page load.
	    voice_select.change(update_ref_preview, voice_select, [ref_audio_preview, ref_text_preview])
	    demo.load(update_ref_preview, voice_select, [ref_audio_preview, ref_text_preview])

	    # Record the active tab in current_mode.
	    preset_tab.select(fn=lambda: "preset_mode", outputs=current_mode)
	    custom_tab.select(fn=lambda: "custom_mode", outputs=current_mode)

	    btn_generate.click(
	        fn=synthesize_speech,
	        inputs=[text_input, voice_select, custom_audio, custom_text, current_mode, pause_level, speed_select],
	        outputs=[audio_output, status_output]
	    )

	if __name__ == "__main__":
	    # Enable the request queue, then serve on all interfaces at port 7860.
	    # NOTE(review): theme/css are passed to launch() here - confirm the
	    # installed Gradio version accepts them on launch().
	    app = demo.queue()
	    app.launch(
	        theme=theme,
	        css=css,
	        server_name="0.0.0.0",
	        server_port=7860,
	    )