| | """MaskGCT - HuggingFace Space Demo |
| | |
| | A Gradio interface for MaskGCT text-to-speech synthesis. |
| | """ |
| |
|
| | import os |
| | from pathlib import Path |
| |
|
| | import gradio as gr |
| | import numpy as np |
| |
|
| | from ttsdb_maskgct import MaskGCT |
| |
|
| |
|
| | |
# Model checkpoint to load; override via the MODEL_ID environment variable.
MODEL_ID = os.environ.get("MODEL_ID", "ttsds/maskgct")

# Load the model once at import time so every Gradio request reuses it.
# (Original had a duplicated `model = model = ...` assignment.)
model = MaskGCT(model_id=MODEL_ID)
| |
|
def synthesize(
    text: str,
    reference_audio: str,
    reference_text: str,
    language: str,
) -> tuple[int, np.ndarray]:
    """Run MaskGCT voice-cloned synthesis for the Gradio UI.

    `reference_audio` must be a filepath (the Audio input uses
    `type="filepath"`). Returns a (sample_rate, waveform) pair, the
    format expected by a Gradio Audio output component.
    """
    # Validate every user-supplied input up front so failures surface as
    # friendly gr.Error messages instead of stack traces.
    if not (text and text.strip()):
        raise gr.Error("Please enter some text to synthesize.")
    if not (reference_audio and os.path.exists(reference_audio)):
        raise gr.Error("Please upload a reference audio file.")
    if not (reference_text and reference_text.strip()):
        raise gr.Error("Please enter the transcript of the reference audio.")

    waveform, sample_rate = model.synthesize(
        text=text,
        reference_audio=reference_audio,
        text_reference=reference_text,
        language=language,
    )
    return sample_rate, waveform
| |
|
# Expose the bundled example assets to Gradio's static file server.
gr.set_static_paths(paths=["examples"])

with gr.Blocks(title="MaskGCT TTS") as demo:
    gr.Markdown(
        """
        # MaskGCT Text-to-Speech

        MaskGCT is a zero-shot text-to-speech model using a masked generative codec transformer by [Amphion](https://github.com/open-mmlab/Amphion).

        > **Note:** This demo is not affiliated with or endorsed by the original authors.
        > It is provided for research and educational purposes only.

        **Links:** [Code](https://github.com/open-mmlab/Amphion) | [Paper (arXiv)](https://arxiv.org/abs/2409.00750) | [Paper (IEEE)](https://ieeexplore.ieee.org/abstract/document/10832255) | [Weights](https://huggingface.co/amphion/MaskGCT)
        """
    )

    with gr.Row():
        with gr.Column():
            # Inputs: cloning reference (audio + transcript), target text,
            # and the synthesis language.
            reference_audio = gr.Audio(
                label="Reference Audio",
                type="filepath",
            )
            reference_text = gr.Textbox(
                label="Reference Transcript",
                placeholder="Enter what is being said in the reference audio...",
                lines=2,
            )
            text_input = gr.Textbox(
                label="Text to Synthesize",
                placeholder="Enter the text you want to convert to speech...",
                lines=3,
            )
            language = gr.Dropdown(
                label="Language",
                choices=[
                    ("English", "eng"),
                    ("Chinese (中文)", "zho"),
                    ("Korean (한국어)", "kor"),
                    ("Japanese (日本語)", "jpn"),
                    ("French (Français)", "fra"),
                    ("German (Deutsch)", "deu"),
                ],
                value="eng",
            )
            submit_btn = gr.Button("Synthesize", variant="primary")

        with gr.Column():
            output_audio = gr.Audio(
                label="Synthesized Audio",
                type="numpy",
            )

    # Build the examples table at startup, keeping only bundled assets that
    # actually exist so a missing file does not break the demo. Each entry is
    # (relative audio path, reference transcript, text to synthesize, language).
    _example_specs = [
        (
            "examples/ref_eng.mp3",
            "Were the leaders in this luckless change, though our own Baskerville, who was at work some years before them, went much on the same lines.",
            "With tenure, Suzie'd have all the more leisure for yachting, but her publications are no good.",
            "eng",
        ),
        (
            "examples/ref_zho.wav",
            "對,這就是我,萬人敬仰的太乙真人。雖然有點嬰兒肥,但也掩不住我,逼人的帥氣。",
            "视野无限广,窗外有蓝天。",
            "zho",
        ),
    ]
    _runtime_examples = []
    for _rel, _ref_text, _gen_text, _lang in _example_specs:
        _src = Path(__file__).parent / Path(_rel)
        if _src.exists():
            _runtime_examples.append([str(_src.absolute()), _ref_text, _gen_text, _lang])

    gr.Examples(
        examples=_runtime_examples,
        inputs=[reference_audio, reference_text, text_input, language],
    )

    submit_btn.click(
        fn=synthesize,
        inputs=[text_input, reference_audio, reference_text, language],
        outputs=[output_audio],
    )

    gr.Markdown(
        """
        ## Model Information

        | Property | Value |
        |----------|-------|
        | **Architecture** | Non-Autoregressive Masked Transformer |
        | **Sample Rate** | 24000 Hz |
        | **Parameters** | 1010M |
        | **Languages** | English, Chinese, Korean, Japanese, French, German |
        | **Release Date** | 2024-10-17 |

        ## Citations

        If you use this model, please cite the original work:

        ```bibtex
        @article{wang2024maskgct,
          title={MaskGCT: Zero-Shot Text-to-Speech with Masked Generative Codec Transformer},
          author={Wang, Yuancheng and Zhan, Haoyue and Liu, Liwei and Zeng, Ruihong and Guo, Haotian and Zheng, Jiachen and Zhang, Qiang and Zhang, Xueyao and Zhang, Shunsi and Wu, Zhizheng},
          journal={arXiv preprint arXiv:2409.00750},
          year={2024}
        }
        ```

        ```bibtex
        @inproceedings{amphion,
          author={Zhang, Xueyao and Xue, Liumeng and Gu, Yicheng and Wang, Yuancheng and Li, Jiaqi and He, Haorui and Wang, Chaoren and Song, Ting and Chen, Xi and Fang, Zihao and Chen, Haopeng and Zhang, Junan and Tang, Tze Ying and Zou, Lexiao and Wang, Mingxuan and Han, Jun and Chen, Kai and Li, Haizhou and Wu, Zhizheng},
          title={Amphion: An Open-Source Audio, Music and Speech Generation Toolkit},
          booktitle={{IEEE} Spoken Language Technology Workshop, {SLT} 2024},
          year={2024}
        }
        ```
        """
    )
| |
|
| |
|
if __name__ == "__main__":
    # Bind to all interfaces so the app is reachable inside a container.
    # The serving port defaults to 7860 but can be overridden via $PORT.
    server_port = int(os.environ.get("PORT", "7860"))
    demo.launch(
        server_name="0.0.0.0",
        server_port=server_port,
        share=False,
        allowed_paths=["examples"],
    )
| |
|