# Arabic-F5-T / app.py  (author: ibrahimabdelaal, commit b19aabf)
# Use subprocess with better error handling and timeout.
import os
import shlex
import subprocess
import sys
import tempfile
from pathlib import Path

import gradio as gr
import spaces
import torch
import torchaudio
from huggingface_hub import hf_hub_download
# Global cache for model files: populated on first call, reused afterwards so
# the Hub is hit only once per process.
model_files_cache = {}


def download_model_files():
    """Download vocab/checkpoint/config from the Hub once and return cached paths."""
    if model_files_cache:
        # Already downloaded in this process — reuse the cached paths.
        return model_files_cache

    print("Downloading model files...")
    repo = "IbrahimSalah/Arabic-F5-TTS-v2"
    for cache_key, remote_name in (
        ("vocab_file", "vocab.txt"),
        ("ckpt_file", "model_547500_8_18.pt"),
        ("config_file", "F5TTS_Base_8_18.yaml"),
    ):
        model_files_cache[cache_key] = hf_hub_download(
            repo_id=repo,
            filename=remote_name,
        )
    print("Model files downloaded!")
    return model_files_cache
def _discard_tmp(path):
    """Best-effort removal of the temp output file on failure paths."""
    if path:
        try:
            os.unlink(path)
        except OSError:
            pass


@spaces.GPU(duration=120)
def generate_speech(
    text: str,
    reference_audio,
    reference_transcript: str,
    nfe_step: int = 32,
    cfg_strength: float = 1.8,
    speed: float = 1.0,
    progress=gr.Progress()
):
    """Generate speech by running the F5-TTS inference CLI in a subprocess.

    Args:
        text: Diacritized Arabic text to synthesize.
        reference_audio: Filepath of the reference clip (gr.Audio, type="filepath").
        reference_transcript: Diacritized transcript matching the reference clip.
        nfe_step: Number of denoising steps forwarded to the CLI.
        cfg_strength: Classifier-free guidance strength forwarded to the CLI.
        speed: Speed multiplier forwarded to the CLI.
        progress: Gradio progress tracker (injected by Gradio).

    Returns:
        (output_wav_path, status_message) on success;
        (None, error_message) on any validation or generation failure.
    """
    output_path = None
    try:
        # Validate inputs before any expensive work.
        if not text.strip():
            return None, "โŒ Please enter text to synthesize."
        if reference_audio is None:
            return None, "โŒ Please upload a reference audio file."
        if not reference_transcript.strip():
            return None, "โŒ Please enter the reference transcript."

        # Download model files (cached after the first call).
        progress(0.1, desc="Loading model files...")
        files = download_model_files()

        # Reserve a unique output path for the CLI to overwrite with binary
        # WAV data. mkstemp avoids opening the file in text mode (the old
        # NamedTemporaryFile(mode='w') did); the file must persist so Gradio
        # can serve it, hence no delete-on-close.
        fd, output_path = tempfile.mkstemp(suffix=".wav")
        os.close(fd)

        # Build CLI command — mirrors the known-working Colab invocation.
        progress(0.3, desc="Generating audio...")
        cmd = [
            # sys.executable guarantees the same interpreter/venv as this
            # process; a bare "python" might resolve to a different one.
            sys.executable, "-m", "f5_tts.infer.infer_cli",
            "--model_cfg", files["config_file"],
            "--output_file", output_path,
            "--model", "F5TTS_Base",
            "--ckpt_file", files["ckpt_file"],
            "--vocab_file", files["vocab_file"],
            "--ref_audio", reference_audio,
            "--nfe_step", str(nfe_step),
            "--cfg_strength", str(cfg_strength),
            "--speed", str(speed),
            "--ref_text", reference_transcript,
            "--gen_text", text
        ]
        # shlex.join quotes arguments containing spaces (the Arabic texts
        # always do), so the logged command is copy-pasteable.
        print(f"Running command: {shlex.join(cmd)}")

        # Run the CLI command.
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            timeout=300  # 5 minute timeout
        )

        # Surface subprocess output for debugging in the Space logs.
        if result.stdout:
            print("STDOUT:", result.stdout)
        if result.stderr:
            print("STDERR:", result.stderr)

        # Check for errors.
        if result.returncode != 0:
            error_msg = f"โŒ CLI failed with return code {result.returncode}\n"
            error_msg += f"STDERR: {result.stderr}\n"
            error_msg += f"STDOUT: {result.stdout}"
            _discard_tmp(output_path)
            return None, error_msg

        # Check that the output file was actually produced.
        if not os.path.exists(output_path):
            return None, "โŒ Output file not created. Check logs above."
        if os.path.getsize(output_path) == 0:
            _discard_tmp(output_path)
            return None, "โŒ Output file is empty."

        # Report duration when readable; a decode failure here is non-fatal.
        try:
            audio, sample_rate = torchaudio.load(output_path)
            duration = audio.shape[-1] / sample_rate
            status = f"โœ… Generated {duration:.2f}s audio"
        except Exception as e:
            status = f"โœ… Audio generated (duration unknown: {str(e)})"

        progress(1.0, desc="Complete!")
        return output_path, status
    except subprocess.TimeoutExpired:
        _discard_tmp(output_path)
        return None, "โŒ Generation timed out (>5 minutes)"
    except Exception as e:
        import traceback
        error_msg = f"โŒ Error: {str(e)}\n{traceback.format_exc()}"
        print(error_msg)
        _discard_tmp(output_path)
        return None, error_msg
# Default examples pre-filled in the UI.
# Diacritized transcript matching the bundled reference clip below.
DEFAULT_REFERENCE_TEXT = "ู„ูŽุง ูŠูŽู…ูุฑูู‘ ูŠูŽูˆู’ู…ูŒ ุฅูู„ูŽู‘ุง ูˆูŽุฃูŽุณู’ุชูŽู‚ู’ุจูู„ู ุนูุฏูŽู‘ุฉูŽ ุฑูŽุณูŽุงุฆูู„ูŽุŒ ุชูŽุชูŽุถูŽู…ูŽู‘ู†ู ุฃูŽุณู’ุฆูู„ูŽุฉู‹ ู…ูู„ูุญูŽู‘ุฉู’."
# Default diacritized Arabic text shown in the synthesis textbox.
DEFAULT_TEXT = "ุชูุณูŽุงู‡ูู…ู ุงู„ุชูู‘ู‚ู’ู†ููŠูŽู‘ุงุชู ุงู„ู’ุญูŽุฏููŠุซูŽุฉู ูููŠ ุชูŽุณู’ู‡ููŠู„ู ุญูŽูŠูŽุงุฉู ุงู„ู’ุฅูู†ู’ุณูŽุงู†ูุŒ ูˆูŽุฐูŽู„ููƒูŽ ู…ูู†ู’ ุฎูู„ูŽุงู„ู ุชูŽุทู’ูˆููŠุฑู ุฃูŽู†ู’ุธูู…ูŽุฉู ุฐูŽูƒููŠูŽู‘ุฉู ุชูŽุนู’ุชูŽู…ูุฏู ุนูŽู„ูŽู‰ ุงู„ุฐูŽู‘ูƒูŽุงุกู ุงู„ูุงุตู’ุทูู†ูŽุงุนููŠูู‘."
# Reference audio file expected to ship alongside this app — TODO confirm it
# exists in the Space repo root.
DEFAULT_REFERENCE_AUDIO = "reference.wav"
# Create Gradio interface. Component creation order matters: each component
# must exist before it is referenced in gr.Examples and the click wiring below.
with gr.Blocks(title="Arabic F5-TTS", theme=gr.themes.Soft()) as demo:
    # Header / model attribution.
    gr.Markdown("""
    # ๐ŸŽ™๏ธ Arabic Text-to-Speech | F5-TTS Model
    High-quality Arabic TTS with voice cloning. **Diacritized text (ุชุดูƒูŠู„) required.**
    **Model:** [IbrahimSalah/Arabic-F5-TTS-v2](https://huggingface.co/IbrahimSalah/Arabic-F5-TTS-v2)
    """)
    with gr.Row():
        # Left column: all generation inputs.
        with gr.Column(scale=1):
            text_input = gr.Textbox(
                label="๐Ÿ“ Text to Synthesize (Arabic with Tashkeel)",
                placeholder="ุฃูŽุฏู’ุฎูู„ู’ ู†ูŽุตู‹ู‘ุง ุนูŽุฑูŽุจููŠู‹ู‘ุง ู…ูุดูŽูƒูŽู‘ู„ู‹ุง ู‡ูู†ูŽุง...",
                lines=6,
                value=DEFAULT_TEXT
            )
            with gr.Row():
                with gr.Column():
                    gr.Markdown("**๐ŸŽต Reference Audio**")
                    # type="filepath" so generate_speech receives a path it can
                    # hand directly to the CLI subprocess.
                    reference_audio = gr.Audio(
                        label="",
                        type="filepath",
                        value=DEFAULT_REFERENCE_AUDIO
                    )
                with gr.Column():
                    reference_transcript = gr.Textbox(
                        label="๐Ÿ“„ Reference Transcript (with Tashkeel)",
                        placeholder="ุงู„ู†ุต ุงู„ู…ู‚ุงุจู„ ู„ู„ุตูˆุช ุงู„ู…ุฑุฌุนูŠ...",
                        lines=4,
                        value=DEFAULT_REFERENCE_TEXT
                    )
            # Slider defaults mirror generate_speech's parameter defaults.
            with gr.Accordion("โš™๏ธ Advanced Settings", open=False):
                with gr.Row():
                    nfe_step = gr.Slider(16, 64, value=32, step=1, label="NFE Steps")
                    cfg_strength = gr.Slider(0.0, 3.0, value=1.8, step=0.1, label="CFG Strength")
                with gr.Row():
                    speed = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Speed")
            generate_btn = gr.Button("๐ŸŽค Generate Speech", variant="primary", size="lg")
        # Right column: outputs and help text.
        with gr.Column(scale=1):
            output_audio = gr.Audio(label="๐Ÿ”Š Generated Speech", type="filepath")
            status_text = gr.Textbox(label="Status", interactive=False, lines=2)
            gr.Markdown("""
            ### โ„น๏ธ Requirements
            - **Diacritized text is required** (ุชุดูƒูŠู„/ุชุดูƒูŠู„)
            - Reference audio: 5-30 seconds, clear speech
            - Use AI (ChatGPT/Claude) or [online tools](https://tahadz.com/mishkal) to add diacritics
            ### ๐Ÿ”— Resources
            - [Model Card](https://huggingface.co/IbrahimSalah/Arabic-F5-TTS-v2)
            - [Spark TTS](https://huggingface.co/IbrahimSalah/Arabic-TTS-Spark)
            - [Report Issues](https://huggingface.co/IbrahimSalah/Arabic-F5-TTS-v2/discussions)
            """)
    # Examples: each row matches the click handler's input order below.
    with gr.Accordion("๐Ÿ“š Examples", open=False):
        gr.Examples(
            examples=[
                [DEFAULT_TEXT, DEFAULT_REFERENCE_AUDIO, DEFAULT_REFERENCE_TEXT, 32, 1.8, 1.0],
                ["ุงู„ุณูŽู‘ู„ูŽุงู…ู ุนูŽู„ูŽูŠู’ูƒูู…ู’ ูˆูŽุฑูŽุญู’ู…ูŽุฉู ุงู„ู„ูŽู‘ู‡ู ูˆูŽุจูŽุฑูŽูƒูŽุงุชูู‡ูุŒ ูƒูŽูŠู’ููŽ ุญูŽุงู„ููƒูŽ ุงู„ู’ูŠูŽูˆู’ู…ูŽุŸ", DEFAULT_REFERENCE_AUDIO, DEFAULT_REFERENCE_TEXT, 32, 1.8, 1.0],
                ["ุงู„ุฐูŽู‘ูƒูŽุงุกู ุงู„ูุงุตู’ุทูู†ูŽุงุนููŠูู‘ ูŠูุบูŽูŠูู‘ุฑู ุงู„ู’ุนูŽุงู„ูŽู…ูŽ ุจูุณูุฑู’ุนูŽุฉู ูƒูŽุจููŠุฑูŽุฉู.", DEFAULT_REFERENCE_AUDIO, DEFAULT_REFERENCE_TEXT, 32, 1.8, 1.0]
            ],
            inputs=[text_input, reference_audio, reference_transcript, nfe_step, cfg_strength, speed]
        )
    # Wire the button to the generation function.
    generate_btn.click(
        fn=generate_speech,
        inputs=[text_input, reference_audio, reference_transcript, nfe_step, cfg_strength, speed],
        outputs=[output_audio, status_text]
    )
if __name__ == "__main__":
    # Bound the pending-request queue at 20, then start the server.
    # Blocks.queue returns the Blocks instance, so the calls chain.
    demo.queue(max_size=20).launch()