# Hugging Face Space source listing (Space status: Sleeping; file size: 10,120 bytes).
import spaces
import os
os.environ['SPACES_ZERO_GPU'] = '1'
import gradio as gr
import soundfile as sf
import tempfile
import torch
import librosa # Thêm thư viện xử lý âm thanh
from vieneu_tts import VieNeuTTS
import time
# --- 1. SETUP MODEL ---
# Run both the backbone and the codec on the GPU when one is visible.
device = "cuda" if torch.cuda.is_available() else "cpu"

try:
    tts = VieNeuTTS(
        backbone_repo="pnnbao-ump/VieNeu-TTS",
        backbone_device=device,
        codec_repo="neuphonic/neucodec",
        codec_device=device,
    )
except Exception as e:  # top-level boundary: the UI must still start
    # Report why the real model is unavailable; otherwise the app would
    # silently serve random noise with no hint of the cause.
    print(f"[WARN] VieNeuTTS could not be loaded, using MockTTS instead: {e!r}")

    class MockTTS:
        """Stand-in engine exposing the two methods the app calls on ``tts``."""

        def encode_reference(self, path):
            """Ignore *path* and return ``None`` in place of reference codes."""
            return None

        def infer(self, text, ref, ref_text):
            """Sleep briefly, then return 24000 * 2 samples of uniform noise."""
            time.sleep(1.2)
            import numpy as np

            return np.random.uniform(-0.1, 0.1, 24000 * 2)

    tts = MockTTS()
# --- 2. DATA (sample voice list kept as-is) ---
# One row per preset voice: (display label, file stem, audio extension).
# A stem of None means the files on disk are named after the label itself.
_PRESET_VOICES = (
    ("Tuyên (nam miền Bắc)", None, ".wav"),
    ("Thiện Tâm", "thientam", ".mp3"),
    ("Vĩnh (nam miền Nam)", None, ".wav"),
    ("Bình (nam miền Bắc)", None, ".wav"),
    ("Nguyên (nam miền Nam)", None, ".wav"),
    ("Sơn (nam miền Nam)", None, ".wav"),
    ("Đoan (nữ miền Nam)", None, ".wav"),
    ("Ngọc (nữ miền Bắc)", None, ".wav"),
    ("Ly (nữ miền Bắc)", None, ".wav"),
    ("Dung (nữ miền Nam)", None, ".wav"),
)

# Maps each display label to its reference audio file and transcript file.
VOICE_SAMPLES = {
    label: {
        "audio": f"./sample/{stem or label}{audio_ext}",
        "text": f"./sample/{stem or label}.txt",
    }
    for label, stem, audio_ext in _PRESET_VOICES
}
# --- 3. HELPER FUNCTIONS ---
def load_reference_info(voice_choice):
    """Return ``(audio_path, transcript)`` for a preset voice.

    Args:
        voice_choice: Key into ``VOICE_SAMPLES``.

    Returns:
        A 2-tuple. ``audio_path`` is the sample's audio file, or ``None`` when
        the voice is unknown or the file is not on disk. ``transcript`` is the
        content of the sample's text file, or ``""`` when the voice is unknown
        or the file cannot be read.
    """
    sample = VOICE_SAMPLES.get(voice_choice)
    if sample is None:
        return None, ""

    # Never hand a non-existent path to the audio preview component.
    audio_path = sample["audio"] if os.path.isfile(sample["audio"]) else None

    # EAFP: open directly instead of exists()-then-open, so a file that
    # disappears or is unreadable degrades to an empty transcript.
    try:
        with open(sample["text"], "r", encoding="utf-8") as f:
            ref_text = f.read()
    except OSError:
        ref_text = ""

    return audio_path, ref_text
@spaces.GPU(duration=120)
def synthesize_speech(text, voice_choice, custom_audio, custom_text, mode_tab, pause_level, speed_value):
    """Synthesize *text* with a preset or cloned voice and write it to a WAV file.

    Args:
        text: Text to speak.
        voice_choice: Key into ``VOICE_SAMPLES``; used unless ``mode_tab`` is
            ``"custom_mode"``.
        custom_audio: Path of the user-supplied reference audio (custom mode).
        custom_text: Transcript of ``custom_audio`` (custom mode).
        mode_tab: ``"custom_mode"`` selects the user reference; any other
            value selects the preset voice.
        pause_level: ``"Trung bình"`` or ``"Dài"`` repeat commas and periods
            in the text; any other value leaves the text unchanged.
        speed_value: Time-stretch rate. Values that are missing, non-numeric
            or not positive are treated as ``1.0`` (no stretching).

    Returns:
        ``(output_path, status_message)``. ``output_path`` is ``None`` when
        validation fails or an error occurs; the message then explains why.
    """
    try:
        if not text or not text.strip():
            return None, "⚠️ Vui lòng nhập nội dung!"

        # The dropdown may deliver None (cleared) or a string; normalise once
        # up front so a bad value cannot fail after the expensive inference.
        try:
            speed = float(speed_value)
        except (TypeError, ValueError):
            speed = 1.0
        if not speed > 0:
            speed = 1.0

        # 3.1. Pause level: repeat punctuation marks in the text.
        processed_text = text
        if pause_level == "Trung bình":
            processed_text = processed_text.replace(",", ", , ").replace(".", ". . ")
        elif pause_level == "Dài":
            processed_text = processed_text.replace(",", ", , , ").replace(".", ". . . . ")
        # Hard cap on what is sent to the model.
        processed_text = processed_text[:400]

        # 3.2. Resolve the reference audio and its transcript.
        if mode_tab == "custom_mode":
            if custom_audio is None or not (custom_text or "").strip():
                return None, "⚠️ Thiếu Audio mẫu hoặc Text mẫu."
            ref_audio_path = custom_audio
            ref_text_raw = custom_text
        else:
            sample = VOICE_SAMPLES.get(voice_choice)
            if sample is None:
                # Explicit message instead of a raw KeyError for an unknown
                # or cleared dropdown value.
                return None, "⚠️ Giọng đọc mẫu không hợp lệ."
            ref_audio_path = sample["audio"]
            with open(sample["text"], "r", encoding="utf-8") as f:
                ref_text_raw = f.read()

        # 3.3. Run inference.
        start_time = time.time()
        ref_codes = tts.encode_reference(ref_audio_path)
        wav = tts.infer(processed_text, ref_codes, ref_text_raw)

        # 3.4. Adjust speed with librosa (time stretch keeps the pitch).
        if speed != 1.0:
            wav = librosa.effects.time_stretch(wav, rate=speed)
        process_time = time.time() - start_time

        # Reserve a path, close the handle, then write: writing to a file
        # that is still open elsewhere fails on some platforms. The file is
        # kept (delete=False) because the returned path is read by the UI.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
            output_path = tmp_file.name
        sf.write(output_path, wav, 24000)

        return output_path, f"⚡ Xử lý: {process_time:.2f}s | Tốc độ: {speed}x"
    except Exception as e:
        # UI boundary: show the error to the user, but also leave a full
        # traceback in the server log instead of swallowing it.
        import traceback

        traceback.print_exc()
        return None, f"❌ Lỗi: {str(e)}"
# --- 4. THEME & CSS ---
# Colour overrides applied on top of the stock Default theme.
_THEME_OVERRIDES = {
    "body_background_fill": "#020617",
    "block_background_fill": "#0f172a",
    "block_border_width": "1px",
    "input_background_fill": "#1e293b",
    "input_border_color": "#334155",
    "button_primary_background_fill": "linear-gradient(135deg, #4f46e5 0%, #7c3aed 100%)",
}

theme = gr.themes.Default(
    primary_hue="indigo",
    secondary_hue="blue",
    neutral_hue="slate",
    font=[gr.themes.GoogleFont("Inter"), "sans-serif"],
).set(**_THEME_OVERRIDES)
# Custom stylesheet handed to launch(css=...). It styles the classes the UI
# assigns through elem_classes (main-wrap, st-card, result-card), the footer
# div, and every <audio> element.
css = """
.main-wrap { max-width: 1200px !important; margin: auto !important; padding: 20px !important; }
.st-card {
border-radius: 16px !important;
border: 1px solid rgba(255,255,255,0.1) !important;
box-shadow: 0 4px 20px rgba(0,0,0,0.5) !important;
padding: 15px;
}
.result-card {
background: linear-gradient(180deg, rgba(15, 23, 42, 0.8) 0%, rgba(30, 41, 59, 0.8) 100%) !important;
border: 1px solid rgba(99, 102, 241, 0.2) !important;
margin-top: 15px;
}
audio { filter: invert(90%) hue-rotate(180deg) brightness(1.5); width: 100%; border-radius: 8px; }
.footer { text-align: center; margin-top: 40px; color: #475569; font-size: 0.8rem; font-weight: 500; }
"""
# --- 5. UI CONSTRUCTION ---
# NOTE(review): the container nesting below was reconstructed from a listing
# without indentation; confirm the layout against the running app.
with gr.Blocks(title="AI Voice Studio") as demo:
    with gr.Column(elem_classes="main-wrap"):
        with gr.Row(equal_height=True):
            # LEFT COLUMN: text to synthesize plus a character counter.
            with gr.Column(scale=1):
                with gr.Group(elem_classes="st-card"):
                    text_input = gr.Textbox(
                        label="VĂN BẢN CẦN CHUYỂN ĐỔI",
                        placeholder="Nhập nội dung vào đây...",
                        lines=20,  # taller, to balance against the new controls
                        show_label=True,
                    )
                    char_count = gr.HTML("<div style='text-align: right; color: #6366f1; font-size: 0.85rem; font-weight: bold; padding: 5px;'>0 / 250</div>")

            # RIGHT COLUMN: voice selection, audio controls, and result.
            with gr.Column(scale=1):
                with gr.Tabs() as tabs:
                    # The tabs are bound to names so their select events can
                    # be wired directly rather than via tabs.children[i].
                    with gr.TabItem("👤 Nghệ sĩ đọc", id="preset_mode") as preset_tab:
                        voice_select = gr.Dropdown(
                            choices=list(VOICE_SAMPLES.keys()),
                            value="Tuyên (nam miền Bắc)",
                            label="Lựa chọn giọng đọc mẫu",
                        )
                        with gr.Accordion("Nghe thử giọng mẫu", open=False):
                            ref_audio_preview = gr.Audio(interactive=False, show_label=False)
                            ref_text_preview = gr.Markdown("...")
                    with gr.TabItem("🎙️ Nhân bản (Clone)", id="custom_mode") as custom_tab:
                        custom_audio = gr.Audio(label="Audio gốc", type="filepath")
                        custom_text = gr.Textbox(
                            label="NỘI DUNG AUDIO MẪU",
                            placeholder="Nhập lời thoại của audio mẫu...",
                            lines=4,
                            show_label=True,
                        )

                # --- AUDIO ADJUSTMENT AREA ---
                with gr.Row():
                    pause_level = gr.Radio(
                        choices=["Mặc định", "Trung bình", "Dài"],
                        value="Mặc định",
                        label="Độ ngắt nghỉ",
                        scale=1,
                    )
                    speed_select = gr.Dropdown(
                        choices=[0.8, 0.9, 1.0, 1.1, 1.2, 1.5],
                        value=1.0,
                        label="Tốc độ đọc",
                        scale=1,
                    )

                # Tracks which tab is active; passed to synthesize_speech as mode_tab.
                current_mode = gr.State(value="preset_mode")
                gr.Markdown("<br>")
                btn_generate = gr.Button("BẮT ĐẦU TỔNG HỢP", variant="primary", size="lg")

                with gr.Group(elem_classes="st-card result-card"):
                    audio_output = gr.Audio(label="AUDIO KẾT QUẢ", interactive=False, autoplay=True)
                    status_output = gr.Markdown("<p style='text-align: center; color: #818cf8; font-weight: 500;'>✨ Sẵn sàng thực hiện</p>")

        gr.HTML("<div class='footer'>ENGINE BY VIENEU-TTS • PROFESSIONAL AI SOLUTIONS 2025</div>")

    # --- LOGIC ---
    def update_count(text):
        """Render the "<n> / 250" counter; it turns red once n exceeds 250."""
        # A missing value counts as empty instead of raising on len(None).
        l = len(text or "")
        color = "#6366f1" if l <= 250 else "#f43f5e"
        return f"<div style='text-align: right; color: {color}; font-size: 0.85rem; font-weight: bold; padding: 5px;'>{l} / 250</div>"

    text_input.change(update_count, text_input, char_count)

    def update_ref_preview(voice):
        """Return the preview audio path and Markdown transcript for *voice*."""
        audio, text = load_reference_info(voice)
        return audio, f"**Nội dung mẫu:** *\"{text}\"*"

    # Refresh the preview when the voice changes and once on page load.
    voice_select.change(update_ref_preview, voice_select, [ref_audio_preview, ref_text_preview])
    demo.load(update_ref_preview, voice_select, [ref_audio_preview, ref_text_preview])

    # Selecting a tab records its id in current_mode.
    preset_tab.select(fn=lambda: "preset_mode", outputs=current_mode)
    custom_tab.select(fn=lambda: "custom_mode", outputs=current_mode)

    btn_generate.click(
        fn=synthesize_speech,
        inputs=[text_input, voice_select, custom_audio, custom_text, current_mode, pause_level, speed_select],
        outputs=[audio_output, status_output],
    )
if __name__ == "__main__":
    # Enable the request queue, then serve on all interfaces at port 7860.
    demo.queue().launch(theme=theme, css=css, server_name="0.0.0.0", server_port=7860)