Mustafa-albakkar committed on
Commit
95f614f
·
verified ·
1 Parent(s): dfc939f

Upload 4 files

Browse files
Files changed (4) hide show
  1. app.py +577 -0
  2. kokoro_engine.py +207 -0
  3. packages.txt +1 -0
  4. requirements.txt +9 -0
app.py ADDED
@@ -0,0 +1,577 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # === FILE: app.py (Video Agent - Kokoro TTS with Voice Rotation) ===
2
+ # Updated version - Fixed for Hugging Face Spaces
3
+
4
+ import os
5
+ import io
6
+ import json
7
+ import base64
8
+ import logging
9
+ from typing import Optional, Dict, Any, Tuple, List
10
+ from datetime import datetime
11
+ import tempfile
12
+
13
+ import gradio as gr
14
+ import numpy as np
15
+ from PIL import Image
16
+
17
+ # استيراد مكتبات معالجة الصوت والفيديو
18
+ from kokoro_engine import KokoroEngine
19
+
20
+ KOKORO_AVAILABLE = True
21
+
22
+ try:
23
+ from moviepy.editor import ImageClip, AudioFileClip
24
+ MOVIEPY_AVAILABLE = True
25
+ except ImportError:
26
+ MOVIEPY_AVAILABLE = False
27
+ logging.warning("⚠️ MoviePy not available. Install with: pip install moviepy")
28
+
29
# ---------------- Logging Setup ----------------
# force=True replaces any handlers already attached to the root logger. The
# optional MoviePy import fallback earlier in this file calls
# logging.warning() before this point; that call implicitly configures the
# root logger with defaults, which would otherwise turn this basicConfig()
# call into a no-op and drop the INFO level and the custom format.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    force=True,
)
log = logging.getLogger("video_agent")

# ---------------- Environment Variables ----------------
# Directory where generated videos are written.
VIDEO_HISTORY_DIR = os.getenv("VIDEO_HISTORY_DIR", "video_history")
# Maximum number of entries kept in the in-memory history list.
MAX_HISTORY_COUNT = int(os.getenv("MAX_HISTORY_COUNT", "10"))
# JSON file that stores the voice rotation index between runs.
VOICE_STATE_FILE = os.getenv("VOICE_STATE_FILE", "voice_rotation_state.json")

# ---------------- Kokoro Voices List ----------------
# Rotation order follows the order of this list.
KOKORO_VOICES = [
    # British Female
    "bf_alice",
    "bf_emma",
    "bf_isabella",
    "bf_lily",
    # American Female
    "af_alloy",
    "af_aoede",
    "af_bella",
    "af_heart",
    "af_jessica",
    "af_kore",
    "af_nicole",
    "af_nova",
    "af_river",
    "af_sarah",
    "af_sky",
    # British Male
    "bm_daniel",
    "bm_fable",
    "bm_george",
    "bm_lewis",
    # American Male
    "am_adam",
    "am_echo",
    "am_eric",
    "am_fenrir",
    "am_liam",
    "am_michael",
    "am_onyx",
    "am_puck",
]

# ---------------- Initialization Check ----------------
IS_SERVICE_READY = True
75
+
76
+ # ---------------- Voice Rotation Manager ----------------
77
class VoiceRotationManager:
    """Round-robin selector over a list of voice names, persisted to a JSON file.

    The position in the rotation is stored under the ``current_voice_index``
    key of ``state_file`` so a new manager created with the same file resumes
    from the saved position.
    """

    # Same named logger the rest of this module logs through.
    _log = logging.getLogger("video_agent")

    def __init__(self, state_file: str, voices: List[str]):
        """Create a manager and restore any previously saved position.

        Args:
            state_file: Path of the JSON file holding the rotation index.
            voices: Voice names to cycle through, in rotation order.
        """
        self.state_file = state_file
        self.voices = voices
        self.current_index = 0
        self.load_state()

    def _is_valid_index(self, index) -> bool:
        """Return True when ``index`` is a real int inside ``self.voices``."""
        # bool is a subclass of int; a persisted true/false is not an index.
        if isinstance(index, bool) or not isinstance(index, int):
            return False
        return 0 <= index < len(self.voices)

    def load_state(self):
        """Load the rotation index from ``state_file``.

        A missing file leaves the index untouched. An unreadable file, or a
        stored index that is not an in-range integer, resets the index to 0.
        """
        if not os.path.exists(self.state_file):
            self._log.info("No voice rotation state file found, starting from index 0")
            return

        try:
            with open(self.state_file, "r", encoding="utf-8") as f:
                data = json.load(f)
            index = data.get("current_voice_index", 0)
        except Exception as e:
            self._log.warning(f"Could not load voice rotation state: {e}")
            self.current_index = 0
            return

        # Reject negative, out-of-range and non-integer values: a negative
        # index would silently select from the end of the list and a
        # non-integer would crash the next lookup.
        if not self._is_valid_index(index):
            self._log.warning(f"Ignoring invalid voice rotation index: {index!r}")
            index = 0

        self.current_index = index
        self._log.info(f"Voice rotation state loaded: index={self.current_index}")

    def save_state(self):
        """Persist the current index to ``state_file`` (best effort).

        The state is written to a sibling temporary file and then moved into
        place, so a crash mid-write cannot leave a truncated state file.
        Failures are logged, not raised.
        """
        tmp_path = f"{self.state_file}.tmp"
        try:
            with open(tmp_path, "w", encoding="utf-8") as f:
                json.dump({"current_voice_index": self.current_index}, f)
            os.replace(tmp_path, self.state_file)
            self._log.info(f"Voice rotation state saved: index={self.current_index}")
        except Exception as e:
            self._log.error(f"Failed to save voice rotation state: {e}")
            try:
                os.remove(tmp_path)
            except OSError:
                # Nothing to clean up (or cleanup itself failed); the save
                # failure has already been logged above.
                pass

    def get_next_voice(self) -> str:
        """Return the current voice, then advance and persist the rotation."""
        voice = self.voices[self.current_index]
        self._log.info(
            f"Selected voice: {voice} (index: {self.current_index}/{len(self.voices)-1})"
        )

        # Wrap around to the first voice after the last one.
        self.current_index = (self.current_index + 1) % len(self.voices)
        self.save_state()

        return voice

    def get_current_voice(self) -> str:
        """Return the voice at the current index without advancing."""
        return self.voices[self.current_index]

    def reset(self):
        """Move the rotation back to the first voice and persist that."""
        self.current_index = 0
        self.save_state()
        self._log.info("Voice rotation reset to index 0")
132
+
133
+ # ---------------- Video Agent Class ----------------
134
class VideoAgent:
    """Turn a quote plus a still image into a narrated MP4 video.

    Speech is synthesized with the Kokoro engine, written to a temporary WAV
    file, and combined with the image through MoviePy. Generated videos are
    stored in ``VIDEO_HISTORY_DIR`` and the most recent ones are tracked in
    ``self.history``.
    """

    # Author / culture values that mean "no real attribution was given".
    _PLACEHOLDER_AUTHORS = frozenset(
        {"unknown", "unknown speaker", "manual post", "custom", ""}
    )
    _PLACEHOLDER_CULTURES = frozenset(
        {"diverse", "diverse culture", "custom", "unknown", ""}
    )

    def __init__(self):
        """Prepare the history directory, voice rotation and the TTS engine.

        A TTS initialization failure is logged and leaves ``self.tts`` as
        ``None`` with ``self.kokoro_available`` False; it is not raised.
        """
        self.log = logging.getLogger("video_agent")
        self.history = []
        self._setup_history_dir()

        # Voice rotation manager.
        self.voice_manager = VoiceRotationManager(VOICE_STATE_FILE, KOKORO_VOICES)

        # Kokoro TTS engine.
        self.tts = None
        self.kokoro_available = False

        self.log.info("Initializing Kokoro TTS (ONNX - NeuML model)...")

        try:
            self.tts = KokoroEngine()
            self.kokoro_available = True
            self.log.info("✅ Kokoro-ONNX initialized successfully (NeuML model)")
        except Exception as e:
            self.tts = None
            self.kokoro_available = False
            self.log.error("❌ Kokoro TTS initialization failed")
            self.log.error(str(e))
            import traceback
            traceback.print_exc()

    def _setup_history_dir(self):
        """Create the video history directory if it does not exist yet."""
        already_there = os.path.isdir(VIDEO_HISTORY_DIR)
        # exist_ok avoids failing when the directory appears between the
        # check above and the creation.
        os.makedirs(VIDEO_HISTORY_DIR, exist_ok=True)
        if not already_there:
            self.log.info(f"Created video history directory: {VIDEO_HISTORY_DIR}")

    @staticmethod
    def _safe_filename_fragment(text: str, max_length: int = 30) -> str:
        """Reduce free text to a short fragment that is safe inside a filename.

        Every run of characters other than word characters and ``-`` becomes a
        single underscore, so path separators, dots and shell-special
        characters cannot reach the filesystem path.
        """
        import re

        cleaned = re.sub(r"[^\w\-]+", "_", text or "")
        return cleaned[:max_length].strip("_") or "quote"

    def build_speech_text(self, quote: str, author: str, culture: str) -> str:
        """Build the sentence to be spoken from the available attribution.

        Uses the author when it is a real name, otherwise the culture when it
        is a real value, otherwise a generic introduction. Surrounding
        whitespace and quotation marks are removed from the quote.
        """
        quote_clean = quote.strip().strip('"\'')
        author_clean = (author or "").strip()
        culture_clean = (culture or "").strip()

        if author_clean.lower() not in self._PLACEHOLDER_AUTHORS:
            return f"{author_clean} once said: {quote_clean}"
        if culture_clean.lower() not in self._PLACEHOLDER_CULTURES:
            return f"In {culture_clean} culture, they say: {quote_clean}"
        return f"There is an old saying: {quote_clean}"

    def generate_audio(self, text: str, output_path: str, voice: Optional[str] = None, speed: float = 1.0) -> Tuple[bool, str]:
        """Synthesize ``text`` with Kokoro TTS and write it as a 16-bit WAV file.

        Args:
            text: Text to speak.
            output_path: Destination WAV path.
            voice: Voice name; ``None`` or an unknown name takes the next
                voice from the rotation.
            speed: Speed value forwarded to the engine.

        Returns:
            ``(True, voice_used)`` on success, ``(False, "")`` on any failure.
        """
        if not self.kokoro_available or self.tts is None:
            self.log.error("Kokoro TTS not available.")
            return False, ""

        try:
            if voice is None:
                voice = self.voice_manager.get_next_voice()
            elif voice not in KOKORO_VOICES:
                self.log.warning(f"Voice '{voice}' not found, using next voice from rotation")
                voice = self.voice_manager.get_next_voice()

            self.log.info(f"Generating audio with voice: {voice}")
            self.log.info(f"Text: {text[:50]}...")

            self.tts.set_voice(voice)
            audio_data = self.tts.synthesize(text, speed=speed)

            import scipy.io.wavfile as wavfile
            # Prefer the engine's own sample rate; 24000 is the value this
            # method has always written.
            sample_rate = getattr(self.tts, "sample_rate", 24000)

            # Scale to the int16 range and clip so out-of-range samples
            # cannot wrap around.
            audio_int16 = np.clip(audio_data * 32767, -32768, 32767).astype(np.int16)
            wavfile.write(output_path, sample_rate, audio_int16)

            self.log.info(f"✅ Audio generated successfully: {output_path}")
            return True, voice

        except Exception as e:
            self.log.error(f"Audio generation failed: {e}")
            import traceback
            traceback.print_exc()
            return False, ""

    def create_video(self, image_bytes: bytes, audio_path: str, output_path: str) -> bool:
        """Render a video showing one image for the duration of an audio file.

        Args:
            image_bytes: Encoded image data readable by PIL.
            audio_path: Path of the audio track.
            output_path: Destination video path.

        Returns:
            True on success, False on any failure (the error is logged).
        """
        if not MOVIEPY_AVAILABLE:
            self.log.error("MoviePy not available.")
            return False

        import uuid

        audio = None
        video = None
        # Unique scratch file per call: a fixed name in the working directory
        # would be shared by overlapping requests.
        temp_audiofile = os.path.join(
            tempfile.gettempdir(), f"temp-audio-{uuid.uuid4().hex}.m4a"
        )

        try:
            with Image.open(io.BytesIO(image_bytes)) as image:
                # Normalize palette / greyscale / alpha images to 3-channel RGB.
                img_array = np.array(image.convert("RGB"))

            if not os.path.exists(audio_path):
                raise FileNotFoundError(f"Audio file not found: {audio_path}")

            audio = AudioFileClip(audio_path)
            audio_duration = audio.duration

            self.log.info(f"Creating video with duration: {audio_duration:.2f}s")

            video = ImageClip(img_array, duration=audio_duration).set_audio(audio)
            video.write_videofile(
                output_path,
                fps=24,
                codec='libx264',
                audio_codec='aac',
                temp_audiofile=temp_audiofile,
                remove_temp=True,
                verbose=False,
                logger=None
            )

            self.log.info(f"✅ Video created successfully: {output_path}")
            return True

        except Exception as e:
            self.log.error(f"Video creation failed: {e}")
            import traceback
            traceback.print_exc()
            return False

        finally:
            # Release clip resources on both the success and failure paths.
            for clip in (audio, video):
                if clip is None:
                    continue
                try:
                    clip.close()
                except Exception as close_error:
                    self.log.warning(f"Could not close clip: {close_error}")

    def process_request(
        self,
        image_base64: str,
        quote: str,
        author: str,
        culture: str,
        voice: Optional[str] = None,
        speed: float = 1.0
    ) -> Dict[str, Any]:
        """Run the full pipeline: decode image, synthesize speech, render video.

        Args:
            image_base64: Base64-encoded image data.
            quote: Quote text to speak.
            author: Attributed author (placeholders are ignored).
            culture: Attributed culture (placeholders are ignored).
            voice: Optional voice name; ``None`` uses the rotation.
            speed: Speech speed forwarded to the TTS engine.

        Returns:
            Dict with ``video_base64``, ``video_path``, ``speech_text``,
            ``voice_used``, ``duration``, ``status`` and ``message``.

        Raises:
            RuntimeError: If TTS is unavailable or any pipeline step fails.
        """
        if not self.kokoro_available or self.tts is None:
            raise RuntimeError("Kokoro TTS not available")

        import uuid

        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        safe_quote = self._safe_filename_fragment(quote)
        # The timestamp has one-second resolution; the suffix keeps requests
        # in the same second from sharing audio or video paths.
        unique = uuid.uuid4().hex[:8]

        temp_dir = tempfile.gettempdir()
        audio_path = os.path.join(temp_dir, f"audio_{timestamp}_{unique}.wav")
        video_filename = f"{timestamp}_{safe_quote}_{unique}.mp4"
        video_path = os.path.join(VIDEO_HISTORY_DIR, video_filename)

        try:
            image_bytes = base64.b64decode(image_base64)
            speech_text = self.build_speech_text(quote, author, culture)

            success, voice_used = self.generate_audio(speech_text, audio_path, voice, speed)
            if not success:
                raise RuntimeError("Failed to generate audio.")

            if not self.create_video(image_bytes, audio_path, video_path):
                raise RuntimeError("Failed to create video.")

            with open(video_path, "rb") as f:
                video_base64 = base64.b64encode(f.read()).decode('utf-8')

            duration = self._get_video_duration(video_path)

            entry = {
                "timestamp": timestamp,
                "quote": quote,
                "author": author,
                "culture": culture,
                "voice": voice_used,
                "video_path": video_path,
                "speech_text": speech_text,
                "duration": duration
            }

            # Newest first, capped at MAX_HISTORY_COUNT entries.
            self.history.insert(0, entry)
            self.history = self.history[:MAX_HISTORY_COUNT]

            self.log.info(f"✅ Video processing completed: {video_filename}")

            return {
                "video_base64": video_base64,
                "video_path": video_path,
                "speech_text": speech_text,
                "voice_used": voice_used,
                # Returned so callers can display it without re-reading the file.
                "duration": duration,
                "status": "success",
                "message": f"Video generated successfully with voice: {voice_used}"
            }

        except Exception as e:
            self.log.error(f"Video processing failed: {e}")
            raise RuntimeError(f"Video generation failed: {str(e)}") from e

        finally:
            # The intermediate WAV is never needed after this call.
            if os.path.exists(audio_path):
                os.remove(audio_path)

    def _get_video_duration(self, video_path: str) -> float:
        """Return the video's duration in seconds, or 0.0 if it cannot be read."""
        if not MOVIEPY_AVAILABLE:
            return 0.0
        try:
            from moviepy.editor import VideoFileClip
            clip = VideoFileClip(video_path)
            try:
                return clip.duration
            finally:
                clip.close()
        except Exception as e:
            self.log.warning(f"Could not get video duration: {e}")
        return 0.0

    def get_history(self) -> List[Dict[str, Any]]:
        """Return the list of recent video entries, newest first."""
        return self.history

    def get_voice_rotation_info(self) -> Dict[str, Any]:
        """Return the rotation's current voice, index, voice count and voice list."""
        return {
            "current_voice": self.voice_manager.get_current_voice(),
            "current_index": self.voice_manager.current_index,
            "total_voices": len(KOKORO_VOICES),
            "all_voices": KOKORO_VOICES
        }
358
+
359
+ # ---------------- Global Agent Instance ----------------
360
agent = VideoAgent()

# Startup banner shown at the top of the UI and quoted in "service not ready"
# responses. Three outcomes: service disabled, service up without a working
# TTS engine, or fully ready (in which case the upcoming voice is reported).
if not IS_SERVICE_READY:
    STATUS_MESSAGE = "❌ Service not ready"
elif agent.tts is None:
    STATUS_MESSAGE = "⚠️ Video Agent ready but Kokoro-ONNX failed to initialize."
else:
    current_voice = agent.voice_manager.get_current_voice()
    STATUS_MESSAGE = f"✅ Video Agent ready with Kokoro-ONNX. Next voice: {current_voice}"
370
+
371
+ # ---------------- Gradio Functions ----------------
372
+
373
def gradio_generate_video(
    image_input,
    quote: str,
    author: str,
    culture: str,
    voice_override: str,
    speed: float
) -> Tuple[Optional[str], str, str]:
    """Gradio handler for the "Generate" button.

    Args:
        image_input: Either a file path (str) or an image object exposing
            ``convert`` / ``save`` (the UI passes a PIL image).
        quote: Quote text to speak; must contain non-whitespace characters.
        author: Optional author; empty becomes "Unknown".
        culture: Optional culture; empty becomes "Diverse".
        voice_override: Voice name, or "Auto" / empty to use the rotation.
        speed: Speech speed forwarded to the agent.

    Returns:
        ``(video_path, status_message, voice_rotation_message)``. On
        validation or processing failure the path is ``None`` and the status
        message describes the problem.
    """
    if not IS_SERVICE_READY:
        return None, f"❌ Service not ready: {STATUS_MESSAGE}", ""

    # Explicit None / empty-path test rather than truthiness, which is not
    # defined consistently across image object types.
    if image_input is None or (isinstance(image_input, str) and not image_input):
        return None, "❌ Please provide an image.", ""

    if not quote or not quote.strip():
        return None, "❌ Please provide a quote.", ""

    try:
        if isinstance(image_input, str):
            with open(image_input, "rb") as f:
                image_bytes = f.read()
        else:
            buffered = io.BytesIO()
            # JPEG cannot store alpha or palette modes, so images such as
            # transparent PNG uploads must be converted to RGB before saving.
            image_input.convert("RGB").save(buffered, format="JPEG")
            image_bytes = buffered.getvalue()

        image_base64 = base64.b64encode(image_bytes).decode('utf-8')
        voice = None if voice_override == "Auto" or not voice_override else voice_override

        result = agent.process_request(
            image_base64=image_base64,
            quote=quote,
            author=author or "Unknown",
            culture=culture or "Diverse",
            voice=voice,
            speed=speed
        )

        video_path = result["video_path"]
        voice_used = result["voice_used"]

        # The result dict may not carry a duration; the newest history entry
        # (inserted by the request that just finished) records it.
        duration = result.get("duration")
        if duration is None:
            history = agent.get_history()
            duration = history[0].get("duration", 0) if history else 0

        voice_info = agent.get_voice_rotation_info()
        next_voice = voice_info["current_voice"]
        current_index = voice_info["current_index"]
        total_voices = voice_info["total_voices"]

        status_msg = (
            f"✅ {result['message']}\n\n"
            f"**Speech:** {result['speech_text']}\n"
            f"**Duration:** {duration:.2f}s"
        )

        voice_rotation_msg = (
            f"**Voice Used:** {voice_used}\n"
            f"**Next Voice:** {next_voice}\n"
            f"**Progress:** {current_index}/{total_voices}"
        )

        return video_path, status_msg, voice_rotation_msg

    except Exception as e:
        log.error(f"Video generation failed: {e}")
        import traceback
        traceback.print_exc()
        return None, f"❌ Error: {str(e)}", ""
439
+
440
def gradio_api_endpoint(
    image_base64: str,
    quote: str,
    author: str,
    culture: str,
    voice: Optional[str] = None,
    speed: float = 1.0
) -> Dict[str, Any]:
    """API entry point: forward a base64 image and quote to the agent.

    Raises:
        RuntimeError: If the service is flagged as not ready, or if the
            agent's request processing raises it.
    """
    if not IS_SERVICE_READY:
        raise RuntimeError(f"Service not ready: {STATUS_MESSAGE}")

    # Only the first 50 characters of the quote are logged.
    preview = quote[:50]
    log.info(f"API request received for quote: {preview}...")

    request = {
        "image_base64": image_base64,
        "quote": quote,
        "author": author,
        "culture": culture,
        "voice": voice,
        "speed": speed,
    }
    return agent.process_request(**request)
462
+
463
def format_history_for_gallery() -> List[Tuple[str, str]]:
    """Build ``(video_path, caption)`` pairs for the history gallery.

    Entries without a video path, or whose file no longer exists on disk,
    are left out.
    """
    gallery_items = []
    for record in agent.get_history():
        path = record.get("video_path")
        if not path or not os.path.exists(path):
            continue

        # Caption: truncated quote and author, then voice and duration.
        headline = f'"{record["quote"][:50]}..." - {record["author"]}\n'
        details = f'Voice: {record.get("voice", "N/A")} | Duration: {record.get("duration", 0):.1f}s'
        gallery_items.append((path, headline + details))
    return gallery_items
474
+
475
def gradio_refresh_history():
    """Return the current gallery items for the History tab's refresh button."""
    gallery_items = format_history_for_gallery()
    return gallery_items
477
+
478
def gradio_reset_voice_rotation():
    """Reset the voice rotation and report which voice comes next."""
    agent.voice_manager.reset()
    next_voice = agent.get_voice_rotation_info()["current_voice"]
    return f"✅ Voice rotation reset! Next voice: {next_voice}"
482
+
483
def gradio_get_voice_info():
    """Describe the rotation state: current voice, index and total voice count."""
    info = agent.get_voice_rotation_info()
    parts = (
        f"**Current Voice:** {info['current_voice']}\n",
        f"**Index:** {info['current_index']}/{info['total_voices']}\n",
        f"**Total Voices:** {len(info['all_voices'])}",
    )
    return "".join(parts)
490
+
491
+ # ---------------- Gradio Interface ----------------
492
+
493
# Gradio UI: four tabs (generate, history, voice management, API test).
with gr.Blocks(title="Video Agent - Kokoro TTS") as demo:
    gr.Markdown("# 🎬 Video Agent - Kokoro-ONNX with Voice Rotation")
    gr.Markdown(f"**Status:** {STATUS_MESSAGE}")

    if IS_SERVICE_READY:
        gr.Markdown(f"**Voices:** {len(KOKORO_VOICES)} available | **Rotation:** Automatic | **Engine:** Kokoro-ONNX (stable)")

    gr.Markdown("---")

    with gr.Tab("Generate Video"):
        with gr.Row():
            with gr.Column(scale=1):
                image_input = gr.Image(label="Upload Image", type="pil")
                quote_input = gr.Textbox(label="Quote", lines=3)
                author_input = gr.Textbox(label="Author")
                culture_input = gr.Textbox(label="Culture")

                # "Auto" is mapped to None by the handler, which then takes
                # the next voice from the rotation.
                voice_dropdown = gr.Dropdown(
                    choices=["Auto"] + KOKORO_VOICES,
                    value="Auto",
                    label="Voice (Auto = rotation)"
                )

                speed_slider = gr.Slider(
                    minimum=0.5,
                    maximum=2.0,
                    value=1.0,
                    step=0.1,
                    label="Speech Speed"
                )

                generate_btn = gr.Button("🎬 Generate", variant="primary")
                status_output = gr.Textbox(label="Status", lines=4)
                voice_rotation_output = gr.Textbox(label="Voice Info", lines=3)

            with gr.Column(scale=1):
                video_output = gr.Video(label="Generated Video")

        generate_btn.click(
            fn=gradio_generate_video,
            inputs=[image_input, quote_input, author_input, culture_input, voice_dropdown, speed_slider],
            outputs=[video_output, status_output, voice_rotation_output]
        )

    with gr.Tab("History"):
        refresh_btn = gr.Button("🔄 Refresh")
        history_gallery = gr.Gallery(label="Recent Videos", columns=2)
        refresh_btn.click(fn=gradio_refresh_history, outputs=[history_gallery])

    with gr.Tab("Voice Management"):
        with gr.Row():
            voice_info_btn = gr.Button("📊 Get Info")
            reset_btn = gr.Button("🔄 Reset Rotation")

        voice_mgmt_output = gr.Textbox(label="Voice Info", lines=5)

        voice_info_btn.click(fn=gradio_get_voice_info, outputs=[voice_mgmt_output])
        reset_btn.click(fn=gradio_reset_voice_rotation, outputs=[voice_mgmt_output])

        gr.Markdown(f"### {len(KOKORO_VOICES)} Voices Available")
        # Group sizes are derived from KOKORO_VOICES so the headings cannot
        # drift out of sync when voices are added or removed.
        for heading, prefix in (
            ("British Female", "bf_"),
            ("American Female", "af_"),
            ("British Male", "bm_"),
            ("American Male", "am_"),
        ):
            group = [v for v in KOKORO_VOICES if v.startswith(prefix)]
            gr.Markdown(f"**{heading} ({len(group)}):** " + ", ".join(group))

    with gr.Tab("API"):
        api_image = gr.Textbox(label="Image (Base64)", lines=2)
        api_quote = gr.Textbox(label="Quote")
        api_author = gr.Textbox(label="Author")
        api_culture = gr.Textbox(label="Culture")
        api_voice = gr.Dropdown(choices=["None"] + KOKORO_VOICES, value="None")
        api_speed = gr.Slider(minimum=0.5, maximum=2.0, value=1.0, step=0.1, label="Speed")
        api_output = gr.JSON(label="Response")

        # The dropdown's "None" string is translated to a real None so the
        # endpoint falls back to the voice rotation.
        gr.Button("Test API").click(
            fn=lambda img, q, a, c, v, s: gradio_api_endpoint(img, q, a, c, None if v == "None" else v, s),
            inputs=[api_image, api_quote, api_author, api_culture, api_voice, api_speed],
            outputs=[api_output],
            api_name="generate_video"
        )
573
+
574
if __name__ == "__main__":
    # Port comes from the PORT environment variable, defaulting to 7860.
    PORT = int(os.environ.get("PORT", "7860"))
    voice_count = len(KOKORO_VOICES)
    log.info(f"Starting Video Agent with Kokoro-ONNX ({voice_count} voices)...")
    # Bind on all interfaces.
    demo.launch(server_name="0.0.0.0", server_port=PORT)
kokoro_engine.py ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Kokoro TTS Engine using ONNX
3
+ Simple, stable, works on Python 3.9+
4
+ Uses NeuML/kokoro-base-onnx model
5
+ """
6
+ import os
7
+ import logging
8
+ import json
9
+ import numpy as np
10
+ from huggingface_hub import hf_hub_download
11
+
12
log = logging.getLogger("kokoro_engine")


class KokoroEngine:
    """Kokoro text-to-speech engine running on ONNX Runtime.

    On construction, ``model.onnx`` and ``voices.json`` are downloaded from
    the ``NeuML/kokoro-base-onnx`` Hugging Face repository (cached under
    ``./models``), an ONNX Runtime CPU session is created, and a tokenizer is
    loaded. Voice styles are looked up in ``voices.json`` by the part of the
    voice name before the first underscore (``bf_alice`` -> ``bf``).
    """

    # Longest token sequence passed to the model; longer input is truncated.
    MAX_TOKENS = 510
    # Style key used when the selected voice's prefix is not in voices.json.
    FALLBACK_VOICE_PREFIX = "af"

    def __init__(self, voice: str = "af_alloy", sample_rate: int = 24000):
        """Download the model files and prepare the session and tokenizer.

        Args:
            voice: Default voice name.
            sample_rate: Sample rate used when reporting audio duration.

        Raises:
            Exception: Any download, session or tokenizer setup error is
                logged and re-raised.
        """
        self.sample_rate = sample_rate
        self.voice = voice
        # Initialized here so the attribute exists before set_voice() is called.
        self.voice_prefix = self._prefix_of(voice)
        self.session = None
        self.voices_data = None
        self.tokenizer = None

        try:
            self._download_and_setup()
            log.info("✅ Kokoro-ONNX initialized successfully")

        except Exception as e:
            log.error(f"❌ Failed to initialize Kokoro-ONNX: {e}")
            raise

    @staticmethod
    def _prefix_of(voice: str) -> str:
        """Return the part of ``voice`` before the first underscore."""
        return voice.split('_', 1)[0]

    def _download_and_setup(self):
        """Download the model and voices file, then build session and tokenizer."""
        try:
            model_path = hf_hub_download(
                repo_id="NeuML/kokoro-base-onnx",
                filename="model.onnx",
                cache_dir="./models"
            )
            log.info(f"Model downloaded: {model_path}")

            voices_json_path = hf_hub_download(
                repo_id="NeuML/kokoro-base-onnx",
                filename="voices.json",
                cache_dir="./models"
            )
            log.info(f"Voices file downloaded: {voices_json_path}")

            with open(voices_json_path, 'r', encoding='utf-8') as f:
                self.voices_data = json.load(f)

            import onnxruntime
            self.session = onnxruntime.InferenceSession(
                model_path,
                providers=['CPUExecutionProvider']
            )

            self._setup_tokenizer()

        except Exception as e:
            log.error(f"Failed to download/setup model: {e}")
            raise

    def _setup_tokenizer(self):
        """Load a tokenizer: ``ttstokenizer`` first, then ``misaki`` as fallback.

        Raises:
            RuntimeError: If neither package can be imported.
        """
        try:
            from ttstokenizer import IPATokenizer
            self.tokenizer = IPATokenizer()
            log.info("Using ttstokenizer for tokenization")
        except ImportError:
            # NOTE(review): confirm that misaki actually exposes
            # `misaki.g2p.grapheme_to_phoneme` and that its output is a
            # sequence of integer token ids.
            try:
                from misaki.g2p import grapheme_to_phoneme
                self.tokenizer = grapheme_to_phoneme
                log.info("Using misaki for tokenization")
            except ImportError:
                log.error("No tokenizer available. Install ttstokenizer or misaki.")
                raise RuntimeError("Tokenizer not available")

    def _text_to_tokens(self, text: str) -> list:
        """Convert ``text`` to a plain Python list of integer token ids.

        The tokenizer's result is normalized to a list of ints so that later
        list concatenation behaves as concatenation even when the tokenizer
        returns a NumPy array.
        """
        try:
            if callable(self.tokenizer):
                tokens = self.tokenizer(text)
            else:
                # Fallback: raw UTF-8 byte values.
                tokens = text.encode('utf-8')

            return [int(token) for token in tokens]
        except Exception as e:
            log.error(f"Tokenization failed: {e}")
            raise

    def set_voice(self, voice: str):
        """Select ``voice`` if its prefix exists in the loaded voices data.

        An unknown voice is logged and the current voice is kept.

        Args:
            voice: New voice name, e.g. ``bf_alice``.
        """
        voice_prefix = self._prefix_of(voice)

        if voice_prefix in self.voices_data:
            self.voice = voice
            self.voice_prefix = voice_prefix
            log.debug(f"Voice changed to: {voice} (prefix: {voice_prefix})")
        else:
            log.warning(f"Voice {voice} not found, keeping current voice")

    def synthesize(self, text: str, speed: float = 1.0) -> np.ndarray:
        """Synthesize speech for ``text``.

        Args:
            text: Text to speak.
            speed: Speed value passed to the model (1.0 = normal).

        Returns:
            Flattened float32 array of audio samples.

        Raises:
            RuntimeError: If the ONNX session was never created.
            Exception: Tokenization or inference errors are logged and
                re-raised.
        """
        if self.session is None:
            raise RuntimeError("Kokoro engine not initialized")

        try:
            tokens = self._text_to_tokens(text)

            if len(tokens) > self.MAX_TOKENS:
                log.warning(f"Text too long ({len(tokens)} tokens), truncating to {self.MAX_TOKENS}")
                tokens = tokens[:self.MAX_TOKENS]

            # Surround the sequence with pad token 0. Unpacking builds a real
            # list; `[0] + tokens` on a NumPy array would add element-wise
            # instead of concatenating.
            tokens_input = np.array([[0, *tokens, 0]], dtype=np.int64)

            voice_prefix = self._prefix_of(self.voice)
            if voice_prefix not in self.voices_data:
                voice_prefix = self.FALLBACK_VOICE_PREFIX

            # The style row is selected by token count; clamp so a maximal
            # sequence cannot index past the end of the style table.
            voice_array = np.array(self.voices_data[voice_prefix], dtype=np.float32)
            style_index = min(len(tokens), len(voice_array) - 1)
            style = voice_array[style_index].reshape(1, -1)

            speed_array = np.array([speed], dtype=np.float32)

            outputs = self.session.run(
                None,
                {
                    "tokens": tokens_input,
                    "style": style,
                    "speed": speed_array
                }
            )

            audio = outputs[0].flatten()

            log.info(f"✅ Audio generated: {len(audio)} samples, duration: {len(audio)/self.sample_rate:.2f}s")

            return audio.astype(np.float32)

        except Exception as e:
            log.error(f"❌ Synthesis failed: {e}")
            import traceback
            traceback.print_exc()
            raise

    def get_available_voices(self):
        """Return the list of voice names this engine advertises."""
        return [
            # British Female
            "bf_alice", "bf_emma", "bf_isabella", "bf_lily",
            # American Female
            "af_alloy", "af_aoede", "af_bella", "af_heart",
            "af_jessica", "af_kore", "af_nicole", "af_nova",
            "af_river", "af_sarah", "af_sky",
            # British Male
            "bm_daniel", "bm_fable", "bm_george", "bm_lewis",
            # American Male
            "am_adam", "am_echo", "am_eric", "am_fenrir",
            "am_liam", "am_michael", "am_onyx", "am_puck"
        ]
packages.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ ffmpeg
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ gradio>=4.0.0
2
+ numpy>=1.24.0
3
+ scipy>=1.11.0
4
+ onnxruntime>=1.16.0
5
+ soundfile>=0.12.1
6
+ moviepy==1.0.3
7
+ Pillow>=10.0.0
8
+ huggingface_hub
9
+ ttstokenizer>=1.0.0