import os
import tempfile

import gradio as gr
import torch
import torchaudio
from TTS.api import TTS
|
|
| |
| os.environ["COQUI_TOS_AGREED"] = "1" |
| os.environ["CUDA_VISIBLE_DEVICES"] = "" |
|
|
| |
| |
| print("جاري تحميل نموذج XTTS-v2 (قد يستغرق بضع دقائق عند أول تشغيل للسبيس)...") |
| device = "cpu" |
| tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device) |
|
|
| |
| model = tts.synthesizer.tts_model |
|
|
| |
| |
| |
| def clone_voice(text, language, ref_audio): |
| try: |
| |
| gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=[ref_audio]) |
| |
| |
| out = model.inference( |
| text, |
| language, |
| gpt_cond_latent, |
| speaker_embedding, |
| temperature=0.7, |
| length_penalty=1.0, |
| repetition_penalty=2.0 |
| ) |
| |
| output_path = "output_cloned.wav" |
| torchaudio.save(output_path, torch.tensor(out["wav"]).unsqueeze(0), 24000) |
| return output_path |
| except Exception as e: |
| return f"حدث خطأ أثناء المعالجة: {str(e)}" |
|
|
| |
| |
| |
| def blend_voices(text, language, ref_audio_1, ref_audio_2, blend_ratio=0.5): |
| try: |
| |
| gpt_cond_1, speaker_emb_1 = model.get_conditioning_latents(audio_path=[ref_audio_1]) |
| gpt_cond_2, speaker_emb_2 = model.get_conditioning_latents(audio_path=[ref_audio_2]) |
| |
| |
| blended_gpt_cond = (gpt_cond_1 * blend_ratio) + (gpt_cond_2 * (1.0 - blend_ratio)) |
| blended_speaker_emb = (speaker_emb_1 * blend_ratio) + (speaker_emb_2 * (1.0 - blend_ratio)) |
| |
| |
| out = model.inference( |
| text, |
| language, |
| blended_gpt_cond, |
| blended_speaker_emb, |
| temperature=0.75, |
| length_penalty=1.0, |
| repetition_penalty=2.0 |
| ) |
| |
| output_path = "output_blended.wav" |
| torchaudio.save(output_path, torch.tensor(out["wav"]).unsqueeze(0), 24000) |
| return output_path |
| except Exception as e: |
| return f"حدث خطأ أثناء المعالجة: {str(e)}" |
|
|
| |
| |
| |
| with gr.Blocks(title="AI Voice Studio (CPU Edition)") as demo: |
| gr.Markdown("# 🎙️ نظام الذكاء الاصطناعي الصوتي (Hugging Face CPU)") |
| gr.Markdown("⚠️ **ملاحظة هامة:** بما أن النظام يعمل على معالج عادي (CPU)، يُرجى كتابة **نصوص قصيرة** (جملة أو جملتين) لتجنب انقطاع الاتصال (Timeout) أثناء التوليد.") |
| |
| with gr.Tab("استنساخ الصوت"): |
| with gr.Row(): |
| with gr.Column(): |
| text_input_1 = gr.Textbox(label="النص", lines=3, placeholder="اكتب جملة قصيرة هنا...") |
| lang_1 = gr.Dropdown(choices=["en", "ar", "fr", "es", "de"], value="ar", label="اللغة") |
| ref_audio_1 = gr.Audio(type="filepath", label="عينة الصوت المرجعية (3 إلى 5 ثوانٍ)") |
| clone_btn = gr.Button("🎙️ توليد الصوت المستنسخ", variant="primary") |
| with gr.Column(): |
| audio_output_1 = gr.Audio(label="النتيجة") |
| |
| clone_btn.click(clone_voice, inputs=[text_input_1, lang_1, ref_audio_1], outputs=audio_output_1) |
|
|
| with gr.Tab("دمج البصمات الصوتية"): |
| with gr.Row(): |
| with gr.Column(): |
| text_input_2 = gr.Textbox(label="النص", lines=3, placeholder="اكتب جملة قصيرة هنا...") |
| lang_2 = gr.Dropdown(choices=["en", "ar", "fr", "es", "de"], value="ar", label="اللغة") |
| ref_1 = gr.Audio(type="filepath", label="الصوت الأول") |
| ref_2 = gr.Audio(type="filepath", label="الصوت الثاني") |
| ratio = gr.Slider(minimum=0.0, maximum=1.0, value=0.5, step=0.1, label="نسبة الدمج (0 = الثاني، 1 = الأول)") |
| blend_btn = gr.Button("🧬 دمج وتوليد صوت جديد", variant="primary") |
| with gr.Column(): |
| audio_output_2 = gr.Audio(label="النتيجة المدمجة") |
| |
| blend_btn.click(blend_voices, inputs=[text_input_2, lang_2, ref_1, ref_2, ratio], outputs=audio_output_2) |
|
|
| |
| if __name__ == "__main__": |
| demo.queue().launch() |
|
|