# Arabic-F5-T / app.py  (author: ibrahimabdelaal, commit b19aabf)
# Use subprocess with better error handling and timeout.
import os
import shlex
import subprocess
import sys
import tempfile
from pathlib import Path

import gradio as gr
import spaces
import torch
import torchaudio
from huggingface_hub import hf_hub_download
# Global cache for model files: populated on first call, reused afterwards so
# the Hub is hit only once per process.
model_files_cache = {}


def download_model_files():
    """Download vocab/checkpoint/config from the Hub once and return cached paths."""
    if model_files_cache:
        # Already downloaded in this process — reuse the cached paths.
        return model_files_cache

    print("Downloading model files...")
    repo = "IbrahimSalah/Arabic-F5-TTS-v2"
    for cache_key, remote_name in (
        ("vocab_file", "vocab.txt"),
        ("ckpt_file", "model_547500_8_18.pt"),
        ("config_file", "F5TTS_Base_8_18.yaml"),
    ):
        model_files_cache[cache_key] = hf_hub_download(
            repo_id=repo,
            filename=remote_name,
        )
    print("Model files downloaded!")
    return model_files_cache
def _discard_tmp(path):
    """Best-effort removal of the temp output file on failure paths."""
    if path:
        try:
            os.unlink(path)
        except OSError:
            pass


@spaces.GPU(duration=120)
def generate_speech(
    text: str,
    reference_audio,
    reference_transcript: str,
    nfe_step: int = 32,
    cfg_strength: float = 1.8,
    speed: float = 1.0,
    progress=gr.Progress()
):
    """Generate speech by running the F5-TTS inference CLI in a subprocess.

    Args:
        text: Diacritized Arabic text to synthesize.
        reference_audio: Filepath of the reference clip (gr.Audio, type="filepath").
        reference_transcript: Diacritized transcript matching the reference clip.
        nfe_step: Number of denoising steps forwarded to the CLI.
        cfg_strength: Classifier-free guidance strength forwarded to the CLI.
        speed: Speed multiplier forwarded to the CLI.
        progress: Gradio progress tracker (injected by Gradio).

    Returns:
        (output_wav_path, status_message) on success;
        (None, error_message) on any validation or generation failure.
    """
    output_path = None
    try:
        # Validate inputs before any expensive work.
        if not text.strip():
            return None, "โŒ Please enter text to synthesize."
        if reference_audio is None:
            return None, "โŒ Please upload a reference audio file."
        if not reference_transcript.strip():
            return None, "โŒ Please enter the reference transcript."

        # Download model files (cached after the first call).
        progress(0.1, desc="Loading model files...")
        files = download_model_files()

        # Reserve a unique output path for the CLI to overwrite with binary
        # WAV data. mkstemp avoids opening the file in text mode (the old
        # NamedTemporaryFile(mode='w') did); the file must persist so Gradio
        # can serve it, hence no delete-on-close.
        fd, output_path = tempfile.mkstemp(suffix=".wav")
        os.close(fd)

        # Build CLI command — mirrors the known-working Colab invocation.
        progress(0.3, desc="Generating audio...")
        cmd = [
            # sys.executable guarantees the same interpreter/venv as this
            # process; a bare "python" might resolve to a different one.
            sys.executable, "-m", "f5_tts.infer.infer_cli",
            "--model_cfg", files["config_file"],
            "--output_file", output_path,
            "--model", "F5TTS_Base",
            "--ckpt_file", files["ckpt_file"],
            "--vocab_file", files["vocab_file"],
            "--ref_audio", reference_audio,
            "--nfe_step", str(nfe_step),
            "--cfg_strength", str(cfg_strength),
            "--speed", str(speed),
            "--ref_text", reference_transcript,
            "--gen_text", text
        ]
        # shlex.join quotes arguments containing spaces (the Arabic texts
        # always do), so the logged command is copy-pasteable.
        print(f"Running command: {shlex.join(cmd)}")

        # Run the CLI command.
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            timeout=300  # 5 minute timeout
        )

        # Surface subprocess output for debugging in the Space logs.
        if result.stdout:
            print("STDOUT:", result.stdout)
        if result.stderr:
            print("STDERR:", result.stderr)

        # Check for errors.
        if result.returncode != 0:
            error_msg = f"โŒ CLI failed with return code {result.returncode}\n"
            error_msg += f"STDERR: {result.stderr}\n"
            error_msg += f"STDOUT: {result.stdout}"
            _discard_tmp(output_path)
            return None, error_msg

        # Check that the output file was actually produced.
        if not os.path.exists(output_path):
            return None, "โŒ Output file not created. Check logs above."
        if os.path.getsize(output_path) == 0:
            _discard_tmp(output_path)
            return None, "โŒ Output file is empty."

        # Report duration when readable; a decode failure here is non-fatal.
        try:
            audio, sample_rate = torchaudio.load(output_path)
            duration = audio.shape[-1] / sample_rate
            status = f"โœ… Generated {duration:.2f}s audio"
        except Exception as e:
            status = f"โœ… Audio generated (duration unknown: {str(e)})"

        progress(1.0, desc="Complete!")
        return output_path, status
    except subprocess.TimeoutExpired:
        _discard_tmp(output_path)
        return None, "โŒ Generation timed out (>5 minutes)"
    except Exception as e:
        import traceback
        error_msg = f"โŒ Error: {str(e)}\n{traceback.format_exc()}"
        print(error_msg)
        _discard_tmp(output_path)
        return None, error_msg
# Default examples pre-filled in the UI.
# Diacritized transcript matching the bundled reference clip below.
DEFAULT_REFERENCE_TEXT = "ู„ูŽุง ูŠูŽู…ูุฑูู‘ ูŠูŽูˆู’ู…ูŒ ุฅูู„ูŽู‘ุง ูˆูŽุฃูŽุณู’ุชูŽู‚ู’ุจูู„ู ุนูุฏูŽู‘ุฉูŽ ุฑูŽุณูŽุงุฆูู„ูŽุŒ ุชูŽุชูŽุถูŽู…ูŽู‘ู†ู ุฃูŽุณู’ุฆูู„ูŽุฉู‹ ู…ูู„ูุญูŽู‘ุฉู’."
# Default diacritized Arabic text shown in the synthesis textbox.
DEFAULT_TEXT = "ุชูุณูŽุงู‡ูู…ู ุงู„ุชูู‘ู‚ู’ู†ููŠูŽู‘ุงุชู ุงู„ู’ุญูŽุฏููŠุซูŽุฉู ูููŠ ุชูŽุณู’ู‡ููŠู„ู ุญูŽูŠูŽุงุฉู ุงู„ู’ุฅูู†ู’ุณูŽุงู†ูุŒ ูˆูŽุฐูŽู„ููƒูŽ ู…ูู†ู’ ุฎูู„ูŽุงู„ู ุชูŽุทู’ูˆููŠุฑู ุฃูŽู†ู’ุธูู…ูŽุฉู ุฐูŽูƒููŠูŽู‘ุฉู ุชูŽุนู’ุชูŽู…ูุฏู ุนูŽู„ูŽู‰ ุงู„ุฐูŽู‘ูƒูŽุงุกู ุงู„ูุงุตู’ุทูู†ูŽุงุนููŠูู‘."
# Reference audio file expected to ship alongside this app — TODO confirm it
# exists in the Space repo root.
DEFAULT_REFERENCE_AUDIO = "reference.wav"
# Create Gradio interface. Component creation order matters: each component
# must exist before it is referenced in gr.Examples and the click wiring below.
with gr.Blocks(title="Arabic F5-TTS", theme=gr.themes.Soft()) as demo:
    # Header / model attribution.
    gr.Markdown("""
    # ๐ŸŽ™๏ธ Arabic Text-to-Speech | F5-TTS Model
    High-quality Arabic TTS with voice cloning. **Diacritized text (ุชุดูƒูŠู„) required.**
    **Model:** [IbrahimSalah/Arabic-F5-TTS-v2](https://huggingface.co/IbrahimSalah/Arabic-F5-TTS-v2)
    """)
    with gr.Row():
        # Left column: all generation inputs.
        with gr.Column(scale=1):
            text_input = gr.Textbox(
                label="๐Ÿ“ Text to Synthesize (Arabic with Tashkeel)",
                placeholder="ุฃูŽุฏู’ุฎูู„ู’ ู†ูŽุตู‹ู‘ุง ุนูŽุฑูŽุจููŠู‹ู‘ุง ู…ูุดูŽูƒูŽู‘ู„ู‹ุง ู‡ูู†ูŽุง...",
                lines=6,
                value=DEFAULT_TEXT
            )
            with gr.Row():
                with gr.Column():
                    gr.Markdown("**๐ŸŽต Reference Audio**")
                    # type="filepath" so generate_speech receives a path it can
                    # hand directly to the CLI subprocess.
                    reference_audio = gr.Audio(
                        label="",
                        type="filepath",
                        value=DEFAULT_REFERENCE_AUDIO
                    )
                with gr.Column():
                    reference_transcript = gr.Textbox(
                        label="๐Ÿ“„ Reference Transcript (with Tashkeel)",
                        placeholder="ุงู„ู†ุต ุงู„ู…ู‚ุงุจู„ ู„ู„ุตูˆุช ุงู„ู…ุฑุฌุนูŠ...",
                        lines=4,
                        value=DEFAULT_REFERENCE_TEXT
                    )
            # Slider defaults mirror generate_speech's parameter defaults.
            with gr.Accordion("โš™๏ธ Advanced Settings", open=False):
                with gr.Row():
                    nfe_step = gr.Slider(16, 64, value=32, step=1, label="NFE Steps")
                    cfg_strength = gr.Slider(0.0, 3.0, value=1.8, step=0.1, label="CFG Strength")
                with gr.Row():
                    speed = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Speed")
            generate_btn = gr.Button("๐ŸŽค Generate Speech", variant="primary", size="lg")
        # Right column: outputs and help text.
        with gr.Column(scale=1):
            output_audio = gr.Audio(label="๐Ÿ”Š Generated Speech", type="filepath")
            status_text = gr.Textbox(label="Status", interactive=False, lines=2)
            gr.Markdown("""
            ### โ„น๏ธ Requirements
            - **Diacritized text is required** (ุชุดูƒูŠู„/ุชุดูƒูŠู„)
            - Reference audio: 5-30 seconds, clear speech
            - Use AI (ChatGPT/Claude) or [online tools](https://tahadz.com/mishkal) to add diacritics
            ### ๐Ÿ”— Resources
            - [Model Card](https://huggingface.co/IbrahimSalah/Arabic-F5-TTS-v2)
            - [Spark TTS](https://huggingface.co/IbrahimSalah/Arabic-TTS-Spark)
            - [Report Issues](https://huggingface.co/IbrahimSalah/Arabic-F5-TTS-v2/discussions)
            """)
    # Examples: each row matches the click handler's input order below.
    with gr.Accordion("๐Ÿ“š Examples", open=False):
        gr.Examples(
            examples=[
                [DEFAULT_TEXT, DEFAULT_REFERENCE_AUDIO, DEFAULT_REFERENCE_TEXT, 32, 1.8, 1.0],
                ["ุงู„ุณูŽู‘ู„ูŽุงู…ู ุนูŽู„ูŽูŠู’ูƒูู…ู’ ูˆูŽุฑูŽุญู’ู…ูŽุฉู ุงู„ู„ูŽู‘ู‡ู ูˆูŽุจูŽุฑูŽูƒูŽุงุชูู‡ูุŒ ูƒูŽูŠู’ููŽ ุญูŽุงู„ููƒูŽ ุงู„ู’ูŠูŽูˆู’ู…ูŽุŸ", DEFAULT_REFERENCE_AUDIO, DEFAULT_REFERENCE_TEXT, 32, 1.8, 1.0],
                ["ุงู„ุฐูŽู‘ูƒูŽุงุกู ุงู„ูุงุตู’ุทูู†ูŽุงุนููŠูู‘ ูŠูุบูŽูŠูู‘ุฑู ุงู„ู’ุนูŽุงู„ูŽู…ูŽ ุจูุณูุฑู’ุนูŽุฉู ูƒูŽุจููŠุฑูŽุฉู.", DEFAULT_REFERENCE_AUDIO, DEFAULT_REFERENCE_TEXT, 32, 1.8, 1.0]
            ],
            inputs=[text_input, reference_audio, reference_transcript, nfe_step, cfg_strength, speed]
        )
    # Wire the button to the generation function.
    generate_btn.click(
        fn=generate_speech,
        inputs=[text_input, reference_audio, reference_transcript, nfe_step, cfg_strength, speed],
        outputs=[output_audio, status_text]
    )
if __name__ == "__main__":
    # Bound the pending-request queue at 20, then start the server.
    # Blocks.queue returns the Blocks instance, so the calls chain.
    demo.queue(max_size=20).launch()