| | """F5-TTS - HuggingFace Space Demo |
| | |
| | Generated at 2026-02-02T08:59:59Z from templates/space/app.py.j2. |
| | |
| | A Gradio interface for F5-TTS text-to-speech synthesis. |
| | """ |
| |
|
| | import os |
| | from pathlib import Path |
| |
|
| | import gradio as gr |
| | import numpy as np |
| |
|
| | from ttsdb_f5_tts import F5TTS |
| |
|
| | from ttsdb_core.config import ModelConfig |
| |
|
| |
|
| |
|
| | |
# Model repository and default variant; both overridable via environment
# variables so the same app file works for multiple Space deployments.
MODEL_ID = os.getenv("MODEL_ID", "ttsds/f5-tts")
DEFAULT_VARIANT = os.getenv("MODEL_VARIANT", "base")

# Variants offered in the UI dropdown.
AVAILABLE_VARIANTS = ["base", "v1"]

# Lazily-initialized singleton; populated by get_model().
model = None
| |
|
| |
|
def get_model(variant: str = DEFAULT_VARIANT):
    """Return the cached model, (re)loading it when a different variant is requested.

    Args:
        variant: Model variant identifier (e.g. "base"). Falsy values are
            normalized to ``None`` when constructing the model.

    Returns:
        The loaded ``F5TTS`` instance.
    """
    global model

    # Use an explicit `is not None` check: truthiness of a model object is
    # not guaranteed (a custom __bool__/__len__ could make it falsy).
    current_variant = getattr(model, "variant", None) if model is not None else None
    if model is None or current_variant != variant:
        print(f"Loading model variant: {variant}")
        model = F5TTS(model_id=MODEL_ID, variant=variant or None)
        # Record which variant we loaded so the cache check above works even
        # if F5TTS itself does not expose a `variant` attribute; without this
        # the comparison always failed and every call reloaded the model.
        model.variant = variant

    return model
| |
|
| |
|
| | |
| | get_model() |
| |
|
def synthesize(
    text: str,
    reference_audio: str,
    reference_text: str,
    language: str,
    variant: str = DEFAULT_VARIANT,
) -> tuple[int, np.ndarray]:
    """Synthesize speech from text with a reference voice.

    `reference_audio` must be a filepath (the Audio input uses
    `type="filepath"`). Returns the `(sample_rate, samples)` pair that
    Gradio's numpy Audio output expects.
    """
    # Validate the three required user-supplied inputs up front.
    if not (text and text.strip()):
        raise gr.Error("Please enter some text to synthesize.")
    if not (reference_audio and os.path.exists(reference_audio)):
        raise gr.Error("Please upload a reference audio file.")
    if not (reference_text and reference_text.strip()):
        raise gr.Error("Please enter the transcript of the reference audio.")

    tts = get_model(variant)
    audio, sr = tts.synthesize(
        text=text,
        reference_audio=reference_audio,
        text_reference=reference_text,
        language=language,
    )
    return (sr, audio)
| |
|
| | gr.set_static_paths(paths=["examples"]) |
| |
|
| | |
# UI definition. The layout, example wiring, and event hookup all live in
# this single Blocks context; `demo` is launched from the __main__ guard.
with gr.Blocks(title="F5-TTS TTS") as demo:
    # Header: model description, non-affiliation disclaimer, external links.
    gr.Markdown(
        """
        # F5-TTS Text-to-Speech

        Non-Autoregressive Flow Matching (DiT) text-to-speech model by [Yushen Chen](https://github.com/SWivid).

        > **Note:** This demo is not affiliated with or endorsed by the original authors.
        > It is provided for research and educational purposes only.

        **Links:** [Code](https://github.com/SWivid/F5-TTS) | [Paper](https://aclanthology.org/2025.acl-long.313/) | [Weights](https://huggingface.co/SWivid/F5-TTS)
        """
    )

    with gr.Row():
        # Left column: all user inputs.
        with gr.Column():
            reference_audio = gr.Audio(
                label="Reference Audio",
                # synthesize() expects a filepath, not a numpy array.
                type="filepath",
            )
            reference_text = gr.Textbox(
                label="Reference Transcript",
                placeholder="Enter what is being said in the reference audio...",
                lines=2,
            )
            text_input = gr.Textbox(
                label="Text to Synthesize",
                placeholder="Enter the text you want to convert to speech...",
                lines=3,
            )
            # Dropdown values are ISO 639-3 codes passed straight to the model.
            language = gr.Dropdown(
                label="Language",
                choices=[
                    ("English", "eng"),
                    ("Chinese (中文)", "zho"),
                ],
                value="eng",
            )

            variant = gr.Dropdown(
                label="Model Variant",
                choices=AVAILABLE_VARIANTS,
                # Fall back to the first variant if DEFAULT_VARIANT is empty.
                value=DEFAULT_VARIANT or AVAILABLE_VARIANTS[0],
                info="Select model variant/version",
            )

            submit_btn = gr.Button("Synthesize", variant="primary")

        # Right column: synthesized output.
        with gr.Column():
            output_audio = gr.Audio(
                label="Synthesized Audio",
                type="numpy",
            )

    # Pre-filled examples; each row is only added when its bundled reference
    # audio file actually exists next to this script.
    _runtime_examples = []

    _rel = Path("examples/ref_eng.wav")
    _src = Path(__file__).parent / _rel
    if _src.exists():
        # Row order matches the gr.Examples `inputs` list below:
        # [reference_audio, reference_text, text_input, language, variant]
        _runtime_examples.append([_src, "This is Learning English from the News, our podcast about the news headlines.", "With tenure, Suzie'd have all the more leisure for yachting, but her publications are no good.", "eng", DEFAULT_VARIANT or AVAILABLE_VARIANTS[0]])

    _rel = Path("examples/ref_zho.wav")
    _src = Path(__file__).parent / _rel
    if _src.exists():
        _runtime_examples.append([_src, "中国企业的第一个大语言模型是文心一言,百度推出了ERNIE 5.0,千帆的4.0,所以一个由中国公司构建的人工智能的基础设施系统。", "在大學實驗室裡,教授寫了一篇滑稽論文,卻把咖啡灑在筆記上。", "zho", DEFAULT_VARIANT or AVAILABLE_VARIANTS[0]])

    gr.Examples(
        examples=_runtime_examples,
        inputs=[reference_audio, reference_text, text_input, language, variant],
    )

    # Note the argument order here matches synthesize()'s signature
    # (text first), which differs from the gr.Examples input order above.
    submit_btn.click(
        fn=synthesize,
        inputs=[text_input, reference_audio, reference_text, language, variant],
        outputs=[output_audio],
    )

    # Footer: static model card info and citation block.
    gr.Markdown(
        """
        ## Model Information

        | Property | Value |
        |----------|-------|
        | **Architecture** | Non-Autoregressive, Flow Matching, Diffusion Transformer |
        | **Sample Rate** | 24000 Hz |
        | **Parameters** | 335M |
        | **Languages** | English, Chinese |
        | **Release Date** | 2024-10-30 |

        ## Citations

        If you use this model, please cite the original work:

        ```bibtex
        @inproceedings{f5-tts,
            title = "F5-{TTS}: A Fairytaler that Fakes Fluent and Faithful Speech with Flow Matching",
            author = "Chen, Yushen and
              Niu, Zhikang and
              Ma, Ziyang and
              Deng, Keqi and
              Wang, Chunhui and
              JianZhao, JianZhao and
              Yu, Kai and
              Chen, Xie",
            editor = "Che, Wanxiang and
              Nabende, Joyce and
              Shutova, Ekaterina and
              Pilehvar, Mohammad Taher",
            booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
            month = jul,
            year = "2025",
            address = "Vienna, Austria",
            publisher = "Association for Computational Linguistics",
            url = "https://aclanthology.org/2025.acl-long.313/",
            doi = "10.18653/v1/2025.acl-long.313",
            pages = "6255--6271",
            ISBN = "979-8-89176-251-0",
            abstract = "This paper introduces F5-TTS, a fully non-autoregressive text-to-speech system based on flow matching with Diffusion Transformer (DiT). Without requiring complex designs such as duration model, text encoder, and phoneme alignment, the text input is simply padded with filler tokens to the same length as input speech, and then the denoising is performed for speech generation, which was originally proved feasible by E2 TTS. However, the original design of E2 TTS makes it hard to follow due to its slow convergence and low robustness. To address these issues, we first model the input with ConvNeXt to refine the text representation, making it easy to align with the speech. We further propose an inference-time Sway Sampling strategy, which significantly improves our model{'}s performance and efficiency. This sampling strategy for flow step can be easily applied to existing flow matching based models without retraining. Our design allows faster training and achieves an inference RTF of 0.15, which is greatly improved compared to state-of-the-art diffusion-based TTS models. Trained on a public 100K hours multilingual dataset, our F5-TTS exhibits highly natural and expressive zero-shot ability, seamless code-switching capability, and speed control efficiency. We have released all codes and checkpoints to promote community development, at https://SWivid.github.io/F5-TTS/."
        }
        ```
        """
    )
| |
|
| |
|
if __name__ == "__main__":
    # Bind to all interfaces so the hosting platform's proxy can reach us;
    # the port is injected via $PORT (HF Spaces convention), defaulting to 7860.
    server_port = int(os.getenv("PORT", "7860"))
    demo.launch(
        server_name="0.0.0.0",
        server_port=server_port,
        share=False,
        allowed_paths=["examples"],
    )
| |
|