Spaces:

Opera8
/

Geminidub1

Sleeping

App Files Files Community

Opera8 committed 23 days ago

Commit

f302f9a

verified ·

1 Parent(s): d3df3a6

Update app.py

Browse files

Files changed (1) hide show

app.py +75 -86

app.py CHANGED Viewed

@@ -5,7 +5,9 @@ import json
 import tempfile
 import re
 import shutil
-import base64
 from pydub import AudioSegment
 import yt_dlp
 from google import genai
@@ -16,6 +18,7 @@ from google.genai import types
 # ==========================================
 def download_youtube_video(url, output_path):
     ydl_opts = {
         'format': 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best',
         'outtmpl': output_path,
@@ -27,6 +30,7 @@ def download_youtube_video(url, output_path):
     return output_path
 def extract_audio_from_video(video_path, audio_path):
     command = [
         'ffmpeg', '-i', video_path,
         '-vn', '-acodec', 'mp3', '-ar', '16000', '-ac', '1',
@@ -36,6 +40,7 @@ def extract_audio_from_video(video_path, audio_path):
     return audio_path
 def get_video_duration(video_path):
     result = subprocess.run([
         'ffprobe', '-v', 'error', '-show_entries', 'format=duration',
         '-of', 'default=noprint_wrappers=1:nokey=1', video_path
@@ -43,6 +48,7 @@ def get_video_duration(video_path):
     return float(result.stdout.strip())
 def adjust_audio_speed_ffmpeg(input_wav, output_wav, target_duration):
     try:
         audio = AudioSegment.from_file(input_wav)
         original_duration = len(audio) / 1000.0
@@ -83,7 +89,50 @@ def adjust_audio_speed_ffmpeg(input_wav, output_wav, target_duration):
         shutil.copy(input_wav, output_wav)
 # ==========================================
-# هسته اصلی پردازش جیمینای
 # ==========================================
 def process_dubbing(api_key, video_file, youtube_url, target_lang, progress=gr.Progress()):
@@ -96,11 +145,7 @@ def process_dubbing(api_key, video_file, youtube_url, target_lang, progress=gr.P
     temp_dir = tempfile.mkdtemp()
     try:
-        # 🔴 رفع خطای 404: استفاده اجباری از نسخه v1alpha برای پشتیبانی از مدل‌های Preview
-        client = genai.Client(
-            api_key=api_key,
-            http_options={'api_version': 'v1alpha'}
-        )
         video_path = os.path.join(temp_dir, "input_video.mp4")
         audio_path = os.path.join(temp_dir, "extracted_audio.mp3")
@@ -120,7 +165,7 @@ def process_dubbing(api_key, video_file, youtube_url, target_lang, progress=gr.P
         progress(0.1, desc="در حال استخراج صدای ویدیو...")
         extract_audio_from_video(video_path, audio_path)
-        # 3. ارسال به Gemini برای استخراج و ترجمه
         progress(0.2, desc=f"در حال پردازش متن و ترجمه به {target_lang}...")
         gemini_audio_file = client.files.upload(file=audio_path)
@@ -151,16 +196,16 @@ def process_dubbing(api_key, video_file, youtube_url, target_lang, progress=gr.P
             if not subtitles:
                 raise ValueError("لیست زیرنویس خالی است.")
         except Exception as e:
-            raise gr.Error(f"خطا در تجزیه خروجی مدل (JSON نامعتبر). \nجزئیات: {str(e)}\nخروجی: {response_text[:100]}")
-        # 4. تولید صدا با Gemini Native Audio برای هر سگمنت
         final_audio_track = AudioSegment.silent(duration=int(video_duration * 1000))
         total_subs = len(subtitles)
         successful_segments = 0
         errors_log = []
         for i, sub in enumerate(subtitles):
-            progress(0.3 + (0.5 * (i / total_subs)), desc=f"تولید صدا و سینک بخش {i+1} از {total_subs}...")
             text = sub.get('text', '')
             start_time = float(sub.get('start', 0))
@@ -170,86 +215,30 @@ def process_dubbing(api_key, video_file, youtube_url, target_lang, progress=gr.P
             if not text or target_duration <= 0:
                 continue
-            tts_prompt = f"Speak the following text naturally, fluently, and with human-like emotion in {target_lang}. Text: {text}"
-            audio_response = None
             try:
-                # 🔴 ابتدا تلاش برای استفاده از مدل ویژه مدنظر شما
-                audio_response = client.models.generate_content(
-                    model='gemini-2.5-flash-native-audio-preview-12-2025',
-                    contents=tts_prompt,
-                    config=types.GenerateContentConfig(
-                        response_modalities=["AUDIO"]
-                    )
-                )
-            except Exception as model_error:
-                # 🔴 سیستم پشتیبان (Fallback): اگر مدل ویژه در دسترس نبود از مدل استاندارد فلش استفاده کن
-                if "404" in str(model_error) or "NOT_FOUND" in str(model_error):
-                    try:
-                        audio_response = client.models.generate_content(
-                            model='gemini-2.5-flash',
-                            contents=tts_prompt,
-                            config=types.GenerateContentConfig(
-                                response_modalities=["AUDIO"]
-                            )
-                        )
-                    except Exception as fallback_error:
-                        errors_log.append(f"Segment {i+1} Fallback Exception: {str(fallback_error)}")
-                        continue
-                else:
-                    errors_log.append(f"Segment {i+1} Exception: {str(model_error)}")
-                    continue
-            if not audio_response:
-                continue
-            try:
-                audio_bytes = None
-                mime_type = None
-                # خواندن بایت‌های خام صدا از پاسخ جیمینای
-                if hasattr(audio_response, 'candidates') and audio_response.candidates:
-                    for part in audio_response.candidates[0].content.parts:
-                        if hasattr(part, 'inline_data') and part.inline_data:
-                            audio_bytes = part.inline_data.data
-                            mime_type = part.inline_data.mime_type
-                            break
-                if audio_bytes:
-                    raw_audio_path = os.path.join(temp_dir, f"raw_tts_{i}.bin")
-                    wav_audio_path = os.path.join(temp_dir, f"raw_tts_{i}.wav")
-                    adjusted_audio_path = os.path.join(temp_dir, f"adj_tts_{i}.wav")
-                    if isinstance(audio_bytes, str):
-                        audio_bytes = base64.b64decode(audio_bytes)
-                    with open(raw_audio_path, "wb") as f:
-                        f.write(audio_bytes)
-                    # تبدیل دیتای خام به فایل WAV استاندارد
-                    if mime_type and ("pcm" in mime_type.lower() or "raw" in mime_type.lower()):
-                        subprocess.run(['ffmpeg', '-y', '-f', 's16le', '-ar', '24000', '-ac', '1', '-i', raw_audio_path, wav_audio_path], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
-                    else:
-                        subprocess.run(['ffmpeg', '-y', '-i', raw_audio_path, wav_audio_path], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
-                    # سینک کردن صدا (Lip-Sync)
-                    if os.path.exists(wav_audio_path):
-                        adjust_audio_speed_ffmpeg(wav_audio_path, adjusted_audio_path, target_duration)
-                        segment_audio = AudioSegment.from_file(adjusted_audio_path)
-                        position_ms = int(start_time * 1000)
-                        final_audio_track = final_audio_track.overlay(segment_audio, position=position_ms)
-                        successful_segments += 1
-                    else:
-                        errors_log.append(f"Segment {i+1}: Failed to create WAV file.")
                 else:
-                    errors_log.append(f"Segment {i+1}: No audio data in response.")
             except Exception as e:
-                errors_log.append(f"Segment {i+1} Processing Error: {str(e)}")
                 continue
         if successful_segments == 0:
-            raise gr.Error(f"شکست کامل در تولید صدا!\nلاگ خطاها:\n" + "\n".join(errors_log[:5]))
         elif errors_log:
             gr.Warning(f"فقط {successful_segments} بخش از {total_subs} بخش با موفقیت صداگذاری شد.")
@@ -279,16 +268,16 @@ def process_dubbing(api_key, video_file, youtube_url, target_lang, progress=gr.P
 # رابط کاربری (Gradio UI)
 # ==========================================
-with gr.Blocks(title="AI Native Dubbing Studio (Gemini 2.5)") as app:
     gr.Markdown("""
-    # 🎙️ استودیو دوبله خودکار با موتور Gemini 2.5 Native
-    این برنامه از مدل **Gemini 2.5 Flash** در محیط `v1alpha` گوگل برای تولید صدای طبیعی و همگام‌سازی استفاده می‌کند.
     """)
     with gr.Row():
         with gr.Column(scale=1):
             api_key_input = gr.Textbox(
-                label="🔑 کلید API هوش مصنوعی (Google AI Studio Key)",
                 placeholder="AIzaSy...",
                 type="password"
             )

 import tempfile
 import re
 import shutil
+import asyncio
+import websockets
+import requests
 from pydub import AudioSegment
 import yt_dlp
 from google import genai
 # ==========================================
 def download_youtube_video(url, output_path):
+    """دانلود ویدیو از یوتیوب"""
     ydl_opts = {
         'format': 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best',
         'outtmpl': output_path,
     return output_path
 def extract_audio_from_video(video_path, audio_path):
+    """استخراج صدای اصلی از ویدیو"""
     command = [
         'ffmpeg', '-i', video_path,
         '-vn', '-acodec', 'mp3', '-ar', '16000', '-ac', '1',
     return audio_path
 def get_video_duration(video_path):
+    """دریافت زمان کل ویدیو"""
     result = subprocess.run([
         'ffprobe', '-v', 'error', '-show_entries', 'format=duration',
         '-of', 'default=noprint_wrappers=1:nokey=1', video_path
     return float(result.stdout.strip())
 def adjust_audio_speed_ffmpeg(input_wav, output_wav, target_duration):
+    """تنظیم دقیق سرعت فایل صوتی (Lip-sync)"""
     try:
         audio = AudioSegment.from_file(input_wav)
         original_duration = len(audio) / 1000.0
         shutil.copy(input_wav, output_wav)
 # ==========================================
+# ارتباط با اسپیس خارجی TTS (opera8-ttslive)
+# ==========================================
+def get_tts_from_external_space(text, output_path, timeout=120):
+    """Synthesize ``text`` on the external TTS Space and save the audio file.
+
+    Opens a WebSocket to the ``opera8-ttslive`` Space, sends ``text``, waits
+    for a JSON ``STREAM_ENDED`` message, then downloads the file that message
+    points to over HTTP and writes it to ``output_path``.
+
+    Args:
+        text: Text to send to the TTS Space.
+        output_path: Local path the downloaded audio is written to.
+        timeout: Seconds to wait for each WebSocket message and for the HTTP
+            download before giving up, so a stalled server cannot hang the
+            caller forever.
+
+    Returns:
+        True once the file has been written.
+
+    Raises:
+        RuntimeError: The Space sent an ``ERROR`` event, or ``STREAM_ENDED``
+            arrived without a ``url`` field.
+        asyncio.TimeoutError: No message arrived within ``timeout`` seconds.
+        requests.RequestException: The HTTP download failed or timed out.
+
+    Connection errors raised by ``websockets`` propagate unchanged.
+    """
+    base_url = "https://opera8-ttslive.hf.space"
+    ws_uri = "wss://opera8-ttslive.hf.space/ws"
+    def _download(relative_url):
+        # Plain blocking HTTP GET; executed off the event loop (see below).
+        download_response = requests.get(base_url + relative_url, timeout=timeout)
+        download_response.raise_for_status()
+        with open(output_path, "wb") as f:
+            f.write(download_response.content)
+    async def _fetch():
+        async with websockets.connect(ws_uri, ping_interval=None) as websocket:
+            # Send the text to the TTS Space.
+            await websocket.send(text)
+            while True:
+                response = await asyncio.wait_for(websocket.recv(), timeout=timeout)
+                # Only text (JSON) control messages matter here; binary
+                # frames (streamed audio) are ignored.
+                if not isinstance(response, str):
+                    continue
+                try:
+                    data = json.loads(response)
+                except json.JSONDecodeError:
+                    continue  # Not JSON: skip it.
+                if not isinstance(data, dict):
+                    continue  # JSON, but not an event object: skip it.
+                event = data.get("event")
+                if event == "STREAM_ENDED":
+                    relative_url = data.get("url")
+                    if not relative_url:
+                        raise RuntimeError(
+                            "TTS Space sent STREAM_ENDED without a download URL."
+                        )
+                    # The file is complete; fetch it while the socket is
+                    # still open, in a worker thread so the blocking
+                    # request does not stall the event loop.
+                    loop = asyncio.get_running_loop()
+                    await loop.run_in_executor(None, _download, relative_url)
+                    return True
+                if event == "ERROR":
+                    raise RuntimeError(
+                        data.get("message") or "TTS Space reported an unknown error."
+                    )
+    # Run the coroutine from Gradio's synchronous handler. asyncio.run creates
+    # a fresh loop and tears it down cleanly instead of leaving a closed loop
+    # registered as the thread's current loop.
+    return asyncio.run(_fetch())
+# ==========================================
+# هسته اصلی پردازش دوبله
 # ==========================================
 def process_dubbing(api_key, video_file, youtube_url, target_lang, progress=gr.Progress()):
     temp_dir = tempfile.mkdtemp()
     try:
+        client = genai.Client(api_key=api_key)
         video_path = os.path.join(temp_dir, "input_video.mp4")
         audio_path = os.path.join(temp_dir, "extracted_audio.mp3")
         progress(0.1, desc="در حال استخراج صدای ویدیو...")
         extract_audio_from_video(video_path, audio_path)
+        # 3. استخراج متن و ترجمه با جمینای
         progress(0.2, desc=f"در حال پردازش متن و ترجمه به {target_lang}...")
         gemini_audio_file = client.files.upload(file=audio_path)
             if not subtitles:
                 raise ValueError("لیست زیرنویس خالی است.")
         except Exception as e:
+            raise gr.Error(f"خطا در تجزیه خروجی مدل. \nجزئیات: {str(e)}\nخروجی خام:\n{response_text[:200]}")
+        # 4. تولید صدا با اتصال به اسپیس opera8-ttslive
         final_audio_track = AudioSegment.silent(duration=int(video_duration * 1000))
         total_subs = len(subtitles)
         successful_segments = 0
         errors_log = []
         for i, sub in enumerate(subtitles):
+            progress(0.3 + (0.5 * (i / total_subs)), desc=f"صداگذاری بخش {i+1}/{total_subs} (ارتباط با سرور TTS)...")
             text = sub.get('text', '')
             start_time = float(sub.get('start', 0))
             if not text or target_duration <= 0:
                 continue
             try:
+                raw_audio_path = os.path.join(temp_dir, f"raw_tts_{i}.wav")
+                adjusted_audio_path = os.path.join(temp_dir, f"adj_tts_{i}.wav")
+                # درخواست ساخت فایل صوتی از اسپیس دوم
+                success = get_tts_from_external_space(text, raw_audio_path)
+                if success and os.path.exists(raw_audio_path):
+                    # فایل دانلود شده استاندارد است، مستقیما تنظیم سرعت می‌کنیم
+                    adjust_audio_speed_ffmpeg(raw_audio_path, adjusted_audio_path, target_duration)
+                    segment_audio = AudioSegment.from_file(adjusted_audio_path)
+                    position_ms = int(start_time * 1000)
+                    final_audio_track = final_audio_track.overlay(segment_audio, position=position_ms)
+                    successful_segments += 1
                 else:
+                    errors_log.append(f"Segment {i+1}: Failed to download from TTS Space.")
             except Exception as e:
+                errors_log.append(f"Segment {i+1} Exception: {str(e)}")
                 continue
         if successful_segments == 0:
+            raise gr.Error(f"شکست در تولید صدا! ارتباط با سرور TTS ناموفق بود.\nلاگ خطاها:\n" + "\n".join(errors_log[:5]))
         elif errors_log:
             gr.Warning(f"فقط {successful_segments} بخش از {total_subs} بخش با موفقیت صداگذاری شد.")
 # رابط کاربری (Gradio UI)
 # ==========================================
+with gr.Blocks(title="AI Native Dubbing Studio") as app:
     gr.Markdown("""
+    # 🎙️ استودیو دوبله خودکار با موتور Gemini 2.5
+    این برنامه ترجمه و زمان‌بندی را انجام داده و برای تولید صدای طبیعی، به صورت خودکار به سرور اختصاصی TTS شما متصل می‌شود.
     """)
     with gr.Row():
         with gr.Column(scale=1):
             api_key_input = gr.Textbox(
+                label="🔑 کلید API جمینای (فقط برای مرحله ترجمه و تشخیص متن)",
                 placeholder="AIzaSy...",
                 type="password"
             )