# Hugging Face Space page capture (Space status at capture time: Sleeping)
import os
import shlex
import subprocess
import tempfile
from pathlib import Path

import gradio as gr
import spaces
import torch
import torchaudio
from huggingface_hub import hf_hub_download
# Global cache for model files
model_files_cache = {}


def download_model_files():
    """Download the model files once and return the cached local paths.

    The vocabulary, checkpoint and config files are fetched from the
    ``IbrahimSalah/Arabic-F5-TTS-v2`` repository with ``hf_hub_download``.
    The cache is only updated after *all* downloads have succeeded, so a
    failure part-way through never leaves a half-filled cache behind; the
    next call simply retries.

    Returns:
        The module-level ``model_files_cache`` dict with the keys
        ``"vocab_file"``, ``"ckpt_file"`` and ``"config_file"`` mapped to the
        paths returned by ``hf_hub_download``.

    Raises:
        Whatever ``hf_hub_download`` raises; the cache is left untouched in
        that case.
    """
    # Cache key -> file name inside the model repository (download order
    # follows the insertion order of this dict).
    required_files = {
        "vocab_file": "vocab.txt",
        "ckpt_file": "model_547500_8_18.pt",
        "config_file": "F5TTS_Base_8_18.yaml",
    }

    # Checking for every required key (instead of "cache is non-empty") means
    # an incomplete cache is repaired rather than handed to the caller.
    if any(key not in model_files_cache for key in required_files):
        print("Downloading model files...")
        downloaded = {
            key: hf_hub_download(
                repo_id="IbrahimSalah/Arabic-F5-TTS-v2",
                filename=filename,
            )
            for key, filename in required_files.items()
        }
        # Publish all paths in one step, only after every download succeeded.
        model_files_cache.update(downloaded)
        print("Model files downloaded!")

    return model_files_cache
def generate_speech(
    text: str,
    reference_audio,
    reference_transcript: str,
    nfe_step: int = 32,
    cfg_strength: float = 1.8,
    speed: float = 1.0,
    progress=gr.Progress()
):
    """Synthesize speech by running the F5-TTS inference CLI in a subprocess.

    Args:
        text: Text to synthesize; passed to the CLI as ``--gen_text``.
        reference_audio: Reference audio handed to ``--ref_audio`` (the UI
            supplies a file path); ``None`` when nothing was uploaded.
        reference_transcript: Transcript of the reference audio; passed as
            ``--ref_text``.
        nfe_step: Value for the CLI's ``--nfe_step`` option.
        cfg_strength: Value for the CLI's ``--cfg_strength`` option.
        speed: Value for the CLI's ``--speed`` option.
        progress: Gradio progress callback used to report the current stage.

    Returns:
        ``(output_path, status)`` on success, where ``output_path`` is the
        generated ``.wav`` file, or ``(None, message)`` when validation, the
        CLI run or the output checks fail. Errors are reported through the
        returned message rather than raised.
    """
    output_path = None
    keep_output = False
    try:
        # Validate inputs. A missing value (None) is rejected exactly like
        # blank text instead of failing on ``None.strip()``.
        if not (text or "").strip():
            return None, "โ Please enter text to synthesize."
        if reference_audio is None:
            return None, "โ Please upload a reference audio file."
        if not (reference_transcript or "").strip():
            return None, "โ Please enter the reference transcript."

        # Download model files
        progress(0.1, desc="Loading model files...")
        files = download_model_files()

        # Reserve a unique output path. Only the name is needed: the CLI
        # writes the audio itself, so the handle is closed right away.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
            output_path = tmp_file.name

        # Build CLI command
        progress(0.3, desc="Generating audio...")
        cmd = [
            "python", "-m", "f5_tts.infer.infer_cli",
            "--model_cfg", files["config_file"],
            "--output_file", output_path,
            "--model", "F5TTS_Base",
            "--ckpt_file", files["ckpt_file"],
            "--vocab_file", files["vocab_file"],
            "--ref_audio", reference_audio,
            # Coerce to int in case the UI delivers a float such as 32.0.
            "--nfe_step", str(int(nfe_step)),
            "--cfg_strength", str(cfg_strength),
            "--speed", str(speed),
            "--ref_text", reference_transcript,
            "--gen_text", text,
        ]
        # Quote each argument so the logged command is unambiguous even
        # though the texts contain spaces.
        print("Running command: " + " ".join(shlex.quote(str(arg)) for arg in cmd))

        # Run the CLI command (argument list, no shell involved)
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            timeout=300  # 5 minute timeout
        )

        # Print outputs for debugging
        if result.stdout:
            print("STDOUT:", result.stdout)
        if result.stderr:
            print("STDERR:", result.stderr)

        # Check for errors
        if result.returncode != 0:
            error_msg = f"โ CLI failed with return code {result.returncode}\n"
            error_msg += f"STDERR: {result.stderr}\n"
            error_msg += f"STDOUT: {result.stdout}"
            return None, error_msg

        # Check if output file was created
        if not os.path.exists(output_path):
            return None, "โ Output file not created. Check logs above."
        if os.path.getsize(output_path) == 0:
            return None, "โ Output file is empty."

        # Get audio duration; failing to read it back is not fatal because
        # the file itself was produced.
        try:
            audio, sample_rate = torchaudio.load(output_path)
            duration = audio.shape[-1] / sample_rate
            status = f"โ Generated {duration:.2f}s audio"
        except Exception as e:
            status = f"โ Audio generated (duration unknown: {str(e)})"

        progress(1.0, desc="Complete!")
        keep_output = True
        return output_path, status
    except subprocess.TimeoutExpired:
        return None, "โ Generation timed out (>5 minutes)"
    except Exception as e:
        # UI boundary: report the failure in the status box instead of raising.
        import traceback
        error_msg = f"โ Error: {str(e)}\n{traceback.format_exc()}"
        print(error_msg)
        return None, error_msg
    finally:
        # The temp file was created with delete=False; remove it on every
        # path that does not hand it back to the caller.
        if output_path is not None and not keep_output:
            try:
                os.remove(output_path)
            except OSError:
                # Best-effort cleanup: the file may already be gone.
                pass
# Default examples: values pre-filled into the interface widgets and reused
# by the bundled example rows.
# NOTE(review): the two text literals look mis-encoded in this copy (the UI
# asks for Arabic text) — verify them against the upstream file.

# Reference audio file, given as a relative path.
DEFAULT_REFERENCE_AUDIO = "reference.wav"

# Transcript that accompanies the default reference audio.
DEFAULT_REFERENCE_TEXT = "ููุง ููู ูุฑูู ููููู ู ุฅููููุง ููุฃูุณูุชูููุจููู ุนูุฏููุฉู ุฑูุณูุงุฆูููุ ุชูุชูุถูู ูููู ุฃูุณูุฆูููุฉู ู ูููุญููุฉู."

# Text synthesized by default.
DEFAULT_TEXT = "ุชูุณูุงููู ู ุงูุชูููููููููุงุชู ุงููุญูุฏููุซูุฉู ููู ุชูุณูููููู ุญูููุงุฉู ุงููุฅูููุณูุงููุ ููุฐููููู ู ููู ุฎูููุงูู ุชูุทููููุฑู ุฃูููุธูู ูุฉู ุฐููููููุฉู ุชูุนูุชูู ูุฏู ุนูููู ุงูุฐููููุงุกู ุงููุงุตูุทูููุงุนูููู."
# Create Gradio interface
# NOTE(review): the nesting of the layout below was rebuilt from the statement
# order because the original indentation is not recoverable here — confirm it
# against the deployed Space.
with gr.Blocks(title="Arabic F5-TTS", theme=gr.themes.Soft()) as demo:
    # Markdown bodies are kept flush-left so that no leading whitespace is
    # introduced into the rendered text.
    gr.Markdown("""
# ๐๏ธ Arabic Text-to-Speech | F5-TTS Model
High-quality Arabic TTS with voice cloning. **Diacritized text (ุชุดููู) required.**
**Model:** [IbrahimSalah/Arabic-F5-TTS-v2](https://huggingface.co/IbrahimSalah/Arabic-F5-TTS-v2)
""")

    with gr.Row():
        # Input side: text, reference voice, tuning sliders and the trigger.
        with gr.Column(scale=1):
            text_input = gr.Textbox(
                label="๐ Text to Synthesize (Arabic with Tashkeel)",
                placeholder="ุฃูุฏูุฎููู ููุตููุง ุนูุฑูุจููููุง ู ูุดููููููุง ููููุง...",
                lines=6,
                value=DEFAULT_TEXT,
            )

            with gr.Row():
                with gr.Column():
                    gr.Markdown("**๐ต Reference Audio**")
                    reference_audio = gr.Audio(
                        label="",
                        type="filepath",
                        value=DEFAULT_REFERENCE_AUDIO,
                    )
                with gr.Column():
                    reference_transcript = gr.Textbox(
                        label="๐ Reference Transcript (with Tashkeel)",
                        placeholder="ุงููุต ุงูู ูุงุจู ููุตูุช ุงูู ุฑุฌุนู...",
                        lines=4,
                        value=DEFAULT_REFERENCE_TEXT,
                    )

            with gr.Accordion("โ๏ธ Advanced Settings", open=False):
                with gr.Row():
                    nfe_step = gr.Slider(16, 64, value=32, step=1, label="NFE Steps")
                    cfg_strength = gr.Slider(0.0, 3.0, value=1.8, step=0.1, label="CFG Strength")
                with gr.Row():
                    speed = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Speed")

            generate_btn = gr.Button("๐ค Generate Speech", variant="primary", size="lg")

        # Output side: generated audio, status message and usage notes.
        with gr.Column(scale=1):
            output_audio = gr.Audio(label="๐ Generated Speech", type="filepath")
            status_text = gr.Textbox(label="Status", interactive=False, lines=2)
            gr.Markdown("""
### โน๏ธ Requirements
- **Diacritized text is required** (ุชุดููู/ุชุดููู)
- Reference audio: 5-30 seconds, clear speech
- Use AI (ChatGPT/Claude) or [online tools](https://tahadz.com/mishkal) to add diacritics
### ๐ Resources
- [Model Card](https://huggingface.co/IbrahimSalah/Arabic-F5-TTS-v2)
- [Spark TTS](https://huggingface.co/IbrahimSalah/Arabic-TTS-Spark)
- [Report Issues](https://huggingface.co/IbrahimSalah/Arabic-F5-TTS-v2/discussions)
""")

    # One shared list keeps the example columns and the click handler's
    # arguments in the same order as generate_speech's parameters.
    synthesis_inputs = [
        text_input,
        reference_audio,
        reference_transcript,
        nfe_step,
        cfg_strength,
        speed,
    ]

    # Examples: every row reuses the default reference voice and settings and
    # only varies the text to synthesize.
    with gr.Accordion("๐ Examples", open=False):
        gr.Examples(
            examples=[
                [sample_text, DEFAULT_REFERENCE_AUDIO, DEFAULT_REFERENCE_TEXT, 32, 1.8, 1.0]
                for sample_text in (
                    DEFAULT_TEXT,
                    "ุงูุณููููุงู ู ุนูููููููู ู ููุฑูุญูู ูุฉู ุงูููููู ููุจูุฑูููุงุชูููุ ูููููู ุญูุงูููู ุงููููููู ูุ",
                    "ุงูุฐููููุงุกู ุงููุงุตูุทูููุงุนูููู ููุบููููุฑู ุงููุนูุงููู ู ุจูุณูุฑูุนูุฉู ููุจููุฑูุฉู.",
                )
            ],
            inputs=synthesis_inputs,
        )

    # Wire the button: generate_speech returns (audio path, status message).
    generate_btn.click(
        fn=generate_speech,
        inputs=synthesis_inputs,
        outputs=[output_audio, status_text],
    )
if __name__ == "__main__":
    # Enable the request queue (at most 20 waiting requests), then start the
    # app; only runs when this file is executed as a script.
    demo.queue(max_size=20).launch()