import os
import tempfile

import gradio as gr
import torch
import torchaudio
from TTS.api import TTS
|
|
| |
| os.environ["COQUI_TOS_AGREED"] = "1" |
| os.environ["CUDA_VISIBLE_DEVICES"] = "" |
|
|
| |
| |
| print("جاري تحميل نموذج XTTS-v2 (قد يستغرق بضع دقائق عند أول تشغيل للسبيس)...") |
| device = "cpu" |
| tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device) |
|
|
| |
| model = tts.synthesizer.tts_model |
|
|
| |
| |
| |
| def clone_voice(text, language, ref_audio): |
| try: |
| |
| gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=[ref_audio]) |
| |
| |
| out = model.inference( |
| text, |
| language, |
| gpt_cond_latent, |
| speaker_embedding, |
| temperature=0.7, |
| length_penalty=1.0, |
| repetition_penalty=2.0 |
| ) |
| |
| output_path = "output_cloned.wav" |
| torchaudio.save(output_path, torch.tensor(out["wav"]).unsqueeze(0), 24000) |
| return output_path |
| except Exception as e: |
| return f"حدث خطأ أثناء المعالجة: {str(e)}" |
|
|
| |
| |
| |
| def blend_voices(text, language, ref_audio_1, ref_audio_2, blend_ratio=0.5): |
| try: |
| |
| gpt_cond_1, speaker_emb_1 = model.get_conditioning_latents(audio_path=[ref_audio_1]) |
| gpt_cond_2, speaker_emb_2 = model.get_conditioning_latents(audio_path=[ref_audio_2]) |
| |
| |
| blended_gpt_cond = (gpt_cond_1 * blend_ratio) + (gpt_cond_2 * (1.0 - blend_ratio)) |
| blended_speaker_emb = (speaker_emb_1 * blend_ratio) + (speaker_emb_2 * (1.0 - blend_ratio)) |
| |
| |
| out = model.inference( |
| text, |
| language, |
| blended_gpt_cond, |
| blended_speaker_emb, |
| temperature=0.75, |
| length_penalty=1.0, |
| repetition_penalty=2.0 |
| ) |
| |
| output_path = "output_blended.wav" |
| torchaudio.save(output_path, torch.tensor(out["wav"]).unsqueeze(0), 24000) |
| return output_path |
| except Exception as e: |
| return f"حدث خطأ أثناء المعالجة: {str(e)}" |
|
|
| |
| |
| |
| with gr.Blocks(title="AI Voice Studio (CPU Edition)") as demo: |
| gr.Markdown("# 🎙️ نظام الذكاء الاصطناعي الصوتي (Hugging Face CPU)") |
| gr.Markdown("⚠️ **ملاحظة هامة:** بما أن النظام يعمل على معالج عادي (CPU)، يُرجى كتابة **نصوص قصيرة** (جملة أو جملتين) لتجنب انقطاع الاتصال (Timeout) أثناء التوليد.") |
| |
| with gr.Tab("استنساخ الصوت"): |
| with gr.Row(): |
| with gr.Column(): |
| text_input_1 = gr.Textbox(label="النص", lines=3, placeholder="اكتب جملة قصيرة هنا...") |
| lang_1 = gr.Dropdown(choices=["en", "ar", "fr", "es", "de"], value="ar", label="اللغة") |
| ref_audio_1 = gr.Audio(type="filepath", label="عينة الصوت المرجعية (3 إلى 5 ثوانٍ)") |
| clone_btn = gr.Button("🎙️ توليد الصوت المستنسخ", variant="primary") |
| with gr.Column(): |
| audio_output_1 = gr.Audio(label="النتيجة") |
| |
| clone_btn.click(clone_voice, inputs=[text_input_1, lang_1, ref_audio_1], outputs=audio_output_1) |
|
|
| with gr.Tab("دمج البصمات الصوتية"): |
| with gr.Row(): |
| with gr.Column(): |
| text_input_2 = gr.Textbox(label="النص", lines=3, placeholder="اكتب جملة قصيرة هنا...") |
| lang_2 = gr.Dropdown(choices=["en", "ar", "fr", "es", "de"], value="ar", label="اللغة") |
| ref_1 = gr.Audio(type="filepath", label="الصوت الأول") |
| ref_2 = gr.Audio(type="filepath", label="الصوت الثاني") |
| ratio = gr.Slider(minimum=0.0, maximum=1.0, value=0.5, step=0.1, label="نسبة الدمج (0 = الثاني، 1 = الأول)") |
| blend_btn = gr.Button("🧬 دمج وتوليد صوت جديد", variant="primary") |
| with gr.Column(): |
| audio_output_2 = gr.Audio(label="النتيجة المدمجة") |
| |
| blend_btn.click(blend_voices, inputs=[text_input_2, lang_2, ref_1, ref_2, ratio], outputs=audio_output_2) |
|
|
| |
| if __name__ == "__main__": |
| demo.queue().launch() |
|
|