ibrahimabdelaal committed on
Commit
b19aabf
·
1 Parent(s): f66f843

Use subprocess with better error handling and timeout

Browse files
Files changed (1) hide show
  1. app.py +80 -65
app.py CHANGED
@@ -4,44 +4,32 @@ import torchaudio
4
  import spaces
5
  import os
6
  import tempfile
 
 
7
  from pathlib import Path
8
  from huggingface_hub import hf_hub_download
9
 
10
- # Import F5-TTS - use the same approach as working Colab
11
- from f5_tts.infer.utils_infer import infer_process, load_model
12
- from f5_tts.model import DiT
13
 
14
- # Global cache for models
15
- model_cache = {}
16
-
17
- def load_f5_model():
18
- """Load F5-TTS model (cached) - exactly like Colab."""
19
- if "model" not in model_cache:
20
- print("Loading F5-TTS model...")
21
-
22
- # Download model files
23
- vocab_file = hf_hub_download(repo_id="IbrahimSalah/Arabic-F5-TTS-v2", filename="vocab.txt")
24
- ckpt_file = hf_hub_download(repo_id="IbrahimSalah/Arabic-F5-TTS-v2", filename="model_547500_8_18.pt")
25
- config_file = hf_hub_download(repo_id="IbrahimSalah/Arabic-F5-TTS-v2", filename="F5TTS_Base_8_18.yaml")
26
-
27
- device = "cuda" if torch.cuda.is_available() else "cpu"
28
-
29
- # Load model - pass config_file as string path (the function will handle it)
30
- model, vocab_char_map, vocab_size = load_model(
31
- model_cls=DiT,
32
- model_cfg=config_file, # Pass path, load_model will load it internally
33
- ckpt_path=ckpt_file,
34
- vocab_file=vocab_file,
35
- device=device
36
  )
37
-
38
- model_cache["model"] = model
39
- model_cache["vocab_char_map"] = vocab_char_map
40
- model_cache["vocab_size"] = vocab_size
41
- model_cache["device"] = device
42
- print("Model loaded successfully!")
43
-
44
- return model_cache["model"], model_cache["vocab_char_map"], model_cache["vocab_size"], model_cache["device"]
 
 
45
 
46
 
47
  @spaces.GPU(duration=120)
@@ -54,12 +42,8 @@ def generate_speech(
54
  speed: float = 1.0,
55
  progress=gr.Progress()
56
  ):
57
- """Generate speech using F5-TTS - same as Colab."""
58
  try:
59
- # Load model
60
- progress(0.1, desc="Loading model...")
61
- model, vocab_char_map, vocab_size, device = load_f5_model()
62
-
63
  # Validate inputs
64
  if not text.strip():
65
  return None, "❌ Please enter text to synthesize."
@@ -70,44 +54,75 @@ def generate_speech(
70
  if not reference_transcript.strip():
71
  return None, "❌ Please enter the reference transcript."
72
 
73
- # Generate audio
74
- progress(0.3, desc="Generating audio...")
 
75
 
76
  # Create temporary output file
77
- with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
78
  output_path = tmp_file.name
79
 
80
- # Run inference - exactly like Colab
81
- audio, sample_rate, _ = infer_process(
82
- ref_audio=reference_audio,
83
- ref_text=reference_transcript,
84
- gen_text=text,
85
- model_obj=model,
86
- vocoder=None,
87
- mel_spec_type="vocos",
88
- show_info=print,
89
- progress=gr.Progress(),
90
- target_rms=0.1,
91
- cross_fade_duration=0.15,
92
- nfe_step=nfe_step,
93
- cfg_strength=cfg_strength,
94
- sway_sampling_coef=-1.0,
95
- speed=speed,
96
- fix_duration=None,
97
- device=device,
98
- vocab_char_map=vocab_char_map,
 
 
 
 
 
 
 
99
  )
100
 
101
- # Save audio
102
- progress(0.9, desc="Saving audio...")
103
- torchaudio.save(output_path, audio, sample_rate)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
 
105
- duration = audio.shape[-1] / sample_rate
106
- status = f"✅ Generated {duration:.2f}s audio"
 
 
 
 
 
107
 
108
  progress(1.0, desc="Complete!")
109
  return output_path, status
110
 
 
 
111
  except Exception as e:
112
  import traceback
113
  error_msg = f"❌ Error: {str(e)}\n{traceback.format_exc()}"
 
4
  import spaces
5
  import os
6
  import tempfile
7
+ import subprocess
8
+ import shlex
9
  from pathlib import Path
10
  from huggingface_hub import hf_hub_download
11
 
12
# Global cache for downloaded model file paths. Keys are
# "vocab_file" / "ckpt_file" / "config_file"; populated lazily by
# download_model_files() and reused across GPU invocations.
model_files_cache = {}
 
14
 
15
def download_model_files():
    """Download the F5-TTS model files once and cache their local paths.

    Returns:
        dict: Mapping with keys "vocab_file", "ckpt_file", and
        "config_file", each the local filesystem path returned by
        ``hf_hub_download``.
    """
    if not model_files_cache:
        print("Downloading model files...")
        repo_id = "IbrahimSalah/Arabic-F5-TTS-v2"
        # Download into a local dict first so a failure partway through
        # (e.g. a network error on the second file) does not leave the
        # shared cache partially populated — a partial cache is truthy,
        # which would skip re-download on the next call and surface as a
        # KeyError in the caller instead.
        files = {
            "vocab_file": hf_hub_download(
                repo_id=repo_id,
                filename="vocab.txt",
            ),
            "ckpt_file": hf_hub_download(
                repo_id=repo_id,
                filename="model_547500_8_18.pt",
            ),
            "config_file": hf_hub_download(
                repo_id=repo_id,
                filename="F5TTS_Base_8_18.yaml",
            ),
        }
        # Commit to the module-level cache only after all downloads succeed.
        model_files_cache.update(files)
        print("Model files downloaded!")
    return model_files_cache
33
 
34
 
35
  @spaces.GPU(duration=120)
 
42
  speed: float = 1.0,
43
  progress=gr.Progress()
44
  ):
45
+ """Generate speech using F5-TTS CLI - exactly like working Colab."""
46
  try:
 
 
 
 
47
  # Validate inputs
48
  if not text.strip():
49
  return None, "❌ Please enter text to synthesize."
 
54
  if not reference_transcript.strip():
55
  return None, "❌ Please enter the reference transcript."
56
 
57
+ # Download model files
58
+ progress(0.1, desc="Loading model files...")
59
+ files = download_model_files()
60
 
61
  # Create temporary output file
62
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".wav", mode='w') as tmp_file:
63
  output_path = tmp_file.name
64
 
65
+ # Build CLI command - EXACTLY like working Colab
66
+ progress(0.3, desc="Generating audio...")
67
+
68
+ cmd = [
69
+ "python", "-m", "f5_tts.infer.infer_cli",
70
+ "--model_cfg", files["config_file"],
71
+ "--output_file", output_path,
72
+ "--model", "F5TTS_Base",
73
+ "--ckpt_file", files["ckpt_file"],
74
+ "--vocab_file", files["vocab_file"],
75
+ "--ref_audio", reference_audio,
76
+ "--nfe_step", str(nfe_step),
77
+ "--cfg_strength", str(cfg_strength),
78
+ "--speed", str(speed),
79
+ "--ref_text", reference_transcript,
80
+ "--gen_text", text
81
+ ]
82
+
83
+ print(f"Running command: {' '.join(cmd)}")
84
+
85
+ # Run the CLI command
86
+ result = subprocess.run(
87
+ cmd,
88
+ capture_output=True,
89
+ text=True,
90
+ timeout=300 # 5 minute timeout
91
  )
92
 
93
+ # Print outputs for debugging
94
+ if result.stdout:
95
+ print("STDOUT:", result.stdout)
96
+ if result.stderr:
97
+ print("STDERR:", result.stderr)
98
+
99
+ # Check for errors
100
+ if result.returncode != 0:
101
+ error_msg = f"❌ CLI failed with return code {result.returncode}\n"
102
+ error_msg += f"STDERR: {result.stderr}\n"
103
+ error_msg += f"STDOUT: {result.stdout}"
104
+ return None, error_msg
105
+
106
+ # Check if output file was created
107
+ if not os.path.exists(output_path):
108
+ return None, f"❌ Output file not created. Check logs above."
109
+
110
+ if os.path.getsize(output_path) == 0:
111
+ return None, "❌ Output file is empty."
112
 
113
+ # Get audio duration
114
+ try:
115
+ audio, sample_rate = torchaudio.load(output_path)
116
+ duration = audio.shape[-1] / sample_rate
117
+ status = f"✅ Generated {duration:.2f}s audio"
118
+ except Exception as e:
119
+ status = f"✅ Audio generated (duration unknown: {str(e)})"
120
 
121
  progress(1.0, desc="Complete!")
122
  return output_path, status
123
 
124
+ except subprocess.TimeoutExpired:
125
+ return None, "❌ Generation timed out (>5 minutes)"
126
  except Exception as e:
127
  import traceback
128
  error_msg = f"❌ Error: {str(e)}\n{traceback.format_exc()}"