Spaces:
Runtime error
Runtime error
| import gradio as gr | |
| import subprocess | |
| import os | |
| import sys | |
| import soundfile as sf | |
| import numpy as np | |
| import torch | |
| import traceback | |
| import spaces | |
# The model code and weights live in a separate Hugging Face repository that is
# cloned next to this script the first time the app starts.
repo_url = "https://huggingface.co/dangtr0408/StyleTTS2-lite-vi"
repo_dir = "StyleTTS2-lite-vi"
if not os.path.exists(repo_dir):
    # check=True makes a failed clone raise CalledProcessError right here,
    # instead of surfacing later as an unrelated-looking import error.
    subprocess.run(["git", "clone", repo_url, repo_dir], check=True)

# The `inference` module is imported from the cloned repository, so the
# repository directory has to be on sys.path before the import below.
sys.path.append(os.path.abspath(repo_dir))
from inference import StyleTTS2

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Paths to the config and checkpoint inside the cloned repository.
config_path = os.path.join(repo_dir, "Models", "config.yaml")
models_path = os.path.join(repo_dir, "Models", "model.pth")
model = StyleTTS2(config_path, models_path).eval().to(device)

# Reference voices and prompts used by the "Examples" section of the UI.
voice_path = os.path.join(repo_dir, "reference_audio")
eg_voices = [os.path.join(voice_path, "vn_1.wav"), os.path.join(voice_path, "vn_2.wav")]
eg_texts = [
    "Chỉ với khoảng 90 triệu tham số, [en-us]{StyleTTS2-lite} có thể dễ dàng tạo giọng nói với tốc độ cao.",
    "[id_1] Với [en-us]{StyleTTS2-lite} bạn có thể sử dụng [en-us]{language tag} để mô hình chắc chắn đọc bằng tiếng Anh, [id_2]cũng như sử dụng [en-us]{speaker tag} để chuyển đổi nhanh giữa các giọng đọc.",
]
| # Core inference function | |
def main(reference_paths, text_prompt, denoise, avg_style, stabilize):
    """Synthesize ``text_prompt`` using the uploaded reference audios as voices.

    Every reference path is registered as a speaker named ``id_1``, ``id_2``,
    ... in the order given, with language ``"vi"`` and speed ``1.0``.

    Args:
        reference_paths: Paths of the reference audio files. May be ``None``
            or empty when nothing has been uploaded yet.
        text_prompt: Text to synthesize.
        denoise: Forwarded to ``model.get_styles``.
        avg_style: Forwarded to ``model.get_styles``.
        stabilize: Forwarded to ``model.generate``.

    Returns:
        A ``(audio_path, status)`` tuple. On success ``audio_path`` is
        ``"output.wav"``. On failure it is ``None`` and ``status`` holds either
        a validation hint or the formatted traceback of the error.
    """
    # Validate up front so the user gets a readable hint rather than a traceback.
    if not reference_paths:
        return None, "Please upload at least one reference audio."
    if not text_prompt or not text_prompt.strip():
        return None, "Please enter a text prompt."

    try:
        speakers = {
            f"id_{index}": {"path": path, "lang": "vi", "speed": 1.0}
            for index, path in enumerate(reference_paths, 1)
        }

        with torch.no_grad():
            styles = model.get_styles(speakers, denoise, avg_style)
            # NOTE(review): 18 and "[id_1]" are passed positionally; "[id_1]"
            # is presumably the default speaker tag -- confirm against
            # StyleTTS2.generate in the cloned repository.
            r = model.generate(text_prompt, styles, stabilize, 18, "[id_1]")

        # Peak-normalize, but leave an all-zero (silent) waveform untouched:
        # dividing by a zero peak would fill the output with NaNs.
        peak = np.abs(r).max()
        if peak > 0:
            r = r / peak

        sf.write("output.wav", r, samplerate=24000)
        return "output.wav", "Audio generated successfully!"
    except Exception:
        # UI boundary: show the full traceback in the status box instead of
        # letting the request fail.
        return None, traceback.format_exc()
def on_file_upload(file_list):
    """De-duplicate uploaded reference files and describe their speaker ids.

    Files are keyed by base name. When the same base name appears more than
    once, the name keeps its first position and the last path seen is used.

    Args:
        file_list: Paths of the uploaded files, or a falsy value when there
            are none.

    Returns:
        ``(paths, summary)``: the de-duplicated paths and a text listing that
        maps ``[id_N]`` to each file name, or ``(None, "No file uploaded yet.")``
        when ``file_list`` is empty.
    """
    if not file_list:
        return None, "No file uploaded yet."

    # A dict keeps first-insertion order per key while later values overwrite
    # earlier ones, which yields exactly the de-duplication described above.
    path_by_name = {os.path.basename(path): path for path in file_list}

    listing = "\n".join(
        f"[id_{position}]: {name}"
        for position, name in enumerate(path_by_name, start=1)
    )
    return list(path_by_name.values()), f"Current reference audios:\n{listing}"
def gen_example(reference_paths, text_prompt):
    """Run ``main`` for an example row with fixed synthesis settings.

    Uses denoise ``0.6`` with style averaging and stabilization both enabled.

    Returns:
        ``(audio_path, reference_paths, status)``. The reference paths are
        passed back unchanged alongside the result of ``main``.
    """
    audio_path, message = main(
        reference_paths,
        text_prompt,
        denoise=0.6,
        avg_style=True,
        stabilize=True,
    )
    return audio_path, reference_paths, message
| # Gradio UI | |
# Build the demo page: header and usage notes, the prompt and option widgets,
# the reference-audio upload with the Generate button, the output player, a
# status box, and clickable examples.
# NOTE(review): the nesting of `gen_button` and `status` below was
# reconstructed from statement order -- confirm against the intended layout.
with gr.Blocks() as demo:
    gr.HTML("<h1 style='text-align: center;'>StyleTTS2‑Lite Demo</h1>")
    gr.Markdown(
        "Download the local inference package from Hugging Face: "
        "[StyleTTS2‑Lite (Vietnamese)]"
        "(https://huggingface.co/dangtr0408/StyleTTS2-lite-vi/)."
    )
    gr.Markdown(
        "Annotate any non‑Vietnamese words with the appropriate language tag, e.g., [en-us]{ } for English. For more information, see "
        "[eSpeakNG docs]"
        "(https://github.com/espeak-ng/espeak-ng/blob/master/docs/languages.md)"
    )

    # First row: text prompt on the left, synthesis options on the right.
    with gr.Row(equal_height=True):
        with gr.Column(scale=1):
            text_prompt = gr.Textbox(label="Text Prompt", placeholder="Enter your text here...", lines=4)
        with gr.Column(scale=1):
            avg_style = gr.Checkbox(label="Use Average Styles", value=True)
            stabilize = gr.Checkbox(label="Stabilize Speaking Speed", value=True)
            denoise = gr.Slider(0.0, 1.0, step=0.1, value=0.6, label="Denoise Strength")

    # Second row: reference upload and Generate button, then the audio output.
    with gr.Row(equal_height=True):
        with gr.Column(scale=1):
            reference_audios = gr.File(label="Reference Audios", file_types=[".wav", ".mp3"], file_count="multiple", height=150)
            gen_button = gr.Button("Generate")
        with gr.Column(scale=1):
            synthesized_audio = gr.Audio(label="Generate Audio", type="filepath")

    # Read-only box showing the messages returned by on_file_upload and main.
    status = gr.Textbox(label="Status", interactive=False, lines=3)

    # When the file list changes, on_file_upload writes the de-duplicated list
    # back into the same component and puts the id listing in the status box.
    reference_audios.change(
        on_file_upload,
        inputs=[reference_audios],
        outputs=[reference_audios, status]
    )

    # The Generate button runs main with the current widget values.
    gen_button.click(
        fn=main,
        inputs=[
            reference_audios,
            text_prompt,
            denoise,
            avg_style,
            stabilize
        ],
        outputs=[synthesized_audio, status]
    )

    # Two examples: one reference voice with the first prompt, and both
    # reference voices with the second prompt. Results are not cached
    # (cache_examples=False) and run_on_click=True is set for gen_example.
    gr.Examples(
        examples=[[[eg_voices[0]], eg_texts[0]], [eg_voices, eg_texts[1]]],
        inputs=[reference_audios, text_prompt],
        outputs=[synthesized_audio, reference_audios, status],
        fn=gen_example,
        cache_examples=False,
        label="Examples",
        run_on_click=True
    )

demo.launch()