Spaces:

Opera8
/

Geminidub1

Sleeping

App Files Files Community

Opera8 committed on Feb 23

Commit

74f3ef8

verified ·

1 Parent(s): b46aadf

Update app.py

Browse files

Files changed (1) hide show

app.py +106 -70

app.py CHANGED Viewed

@@ -6,6 +6,7 @@ import tempfile
 import shutil
 import asyncio
 import requests
 from pydub import AudioSegment
 import yt_dlp
 from google import genai
@@ -61,47 +62,53 @@ def download_youtube_video(url, output_path):
     return output_path
 def extract_audio_from_video(video_path, audio_path):
-    subprocess.run(['ffmpeg', '-i', video_path, '-vn', '-acodec', 'mp3', '-ar', '24000', '-ac', '1', '-b:a', '128k', '-y', audio_path], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=True)
     return audio_path
 def get_video_duration(video_path):
     result = subprocess.run(['ffprobe', '-v', 'error', '-show_entries', 'format=duration', '-of', 'default=noprint_wrappers=1:nokey=1', video_path], capture_output=True, text=True, check=True)
     return float(result.stdout.strip())
-def remove_silence_from_edges(audio_segment, silence_thresh=-45.0, chunk_size=10):
-    if len(audio_segment) < 100: return audio_segment
-    trim_ms = 0
-    while trim_ms < len(audio_segment) and audio_segment[trim_ms:trim_ms+chunk_size].dBFS < silence_thresh: trim_ms += chunk_size
-    start_trim = trim_ms
-    trim_ms = 0
-    while trim_ms < len(audio_segment) and audio_segment[len(audio_segment)-trim_ms-chunk_size:len(audio_segment)-trim_ms].dBFS < silence_thresh: trim_ms += chunk_size
-    end_trim = len(audio_segment) - trim_ms
-    return audio_segment[start_trim:end_trim]
 def adjust_audio_speed_ffmpeg(input_wav, output_wav, target_duration):
     try:
         audio = AudioSegment.from_file(input_wav)
-        trimmed_audio = remove_silence_from_edges(audio)
-        if len(trimmed_audio) < 50: trimmed_audio = audio
-        temp_trimmed = input_wav.replace(".wav", "_trimmed.wav")
-        trimmed_audio.export(temp_trimmed, format="wav")
-        orig_dur = len(trimmed_audio) / 1000.0
         if orig_dur <= 0.1 or target_duration <= 0.1:
             shutil.copy(temp_trimmed, output_wav); return
         speed_factor = orig_dur / target_duration
         if speed_factor > 2.0: speed_factor = 2.0
         if speed_factor < 0.6: speed_factor = 0.6
         atempo_filters = []
         current = speed_factor
         while current > 2.0: atempo_filters.append("atempo=2.0"); current /= 2.0
         while current < 0.5: atempo_filters.append("atempo=0.5"); current /= 0.5
         if current != 1.0: atempo_filters.append(f"atempo={current}")
         if not atempo_filters: shutil.copy(temp_trimmed, output_wav); return
         subprocess.run(['ffmpeg', '-y', '-i', temp_trimmed, '-filter:a', ",".join(atempo_filters), output_wav], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=True)
     except: shutil.copy(input_wav, output_wav)
@@ -132,7 +139,7 @@ async def generate_audio_via_podcast_api(text, speaker_name, output_path):
     return await asyncio.to_thread(_sync_request)
-# پردازش تک‌تک قطعات زیرنویس جهت اجرای همزمان (Concurrency)
 async def process_single_subtitle(i, sub, temp_dir, sem):
     async with sem:
         assigned_speaker = sub.get('speaker_id', 'Charon')
@@ -164,7 +171,7 @@ async def process_single_subtitle(i, sub, temp_dir, sem):
 # ==========================================
 # هسته اصلی: کارگردان هوشمند
 # ==========================================
-async def process_dubbing(api_key, video_file, youtube_url, target_lang, keep_original_audio, original_audio_volume, progress=gr.Progress()):
     if not api_key: raise gr.Error("کلید API جمینای الزامی است.")
     if not video_file and not youtube_url: raise gr.Error("ویدیو الزامی است.")
@@ -172,90 +179,122 @@ async def process_dubbing(api_key, video_file, youtube_url, target_lang, keep_or
     try:
         client = genai.Client(http_options={"api_version": "v1beta"}, api_key=api_key)
         video_path = os.path.join(temp_dir, "input.mp4")
         audio_path = os.path.join(temp_dir, "source.mp3")
         final_path = os.path.join(temp_dir, "output.mp4")
         progress(0.05, desc="دریافت ویدیو...")
-        if youtube_url:
-            download_youtube_video(youtube_url, video_path)
         else:
             s_path = video_file.name if hasattr(video_file, 'name') else str(video_file)
             shutil.copy(s_path, video_path)
         duration = get_video_duration(video_path)
         progress(0.1, desc="استخراج صدا...")
         extract_audio_from_video(video_path, audio_path)
-        progress(0.2, desc="تحلیل ویدیو، تشخیص گویندگان و انتخاب دوبلور مناسب...")
         gemini_file = client.files.upload(file=audio_path)
         prompt = f"""
         ROLE: You are an expert Dubbing Director using AI Voice Actors.
         {CAST_PROMPT}
         TASK:
-        1. Listen to the audio and identify different speakers (Male/Female, Tone, Emotion).
-        2. For EACH sentence, select the BEST MATCH from the 'AVAILABLE VOICE ACTORS' list above.
         3. Translate the text EXACTLY to {target_lang}.
         4. Output JSON Array.
         JSON FORMAT:
         [
-          {{"start": 0.0, "end": 3.5, "speaker_id": "Charon", "original_speaker_desc": "Male, Deep voice", "text": "ترجمه دقیق متن به زبان مقصد"}},
-          {{"start": 3.6, "end": 6.0, "speaker_id": "Zephyr", "original_speaker_desc": "Female, Soft voice", "text": "پاسخ زن در ویدیو"}}
         ]
         """
         transcription = client.models.generate_content(
             model='gemini-2.5-flash',
             contents=[gemini_file, prompt],
             config=types.GenerateContentConfig(response_mime_type="application/json")
         )
         try: client.files.delete(name=gemini_file.name)
         except: pass
         json_clean = transcription.text.strip().replace("```json", "").replace("```", "")
         subtitles = json.loads(json_clean)
         if not subtitles: raise ValueError("زیرنویس خالی است")
-        # --- بخش جدید: ایجاد صدای پس‌زمینه بر اساس انتخاب کاربر ---
-        if keep_original_audio:
-            progress(0.25, desc="آماده‌سازی صدای پس‌زمینه...")
-            background_audio = AudioSegment.from_file(audio_path)
-            # تبدیل مقیاس 0-1 به دسی‌بل (dB) برای کاهش صدا
-            # کاهش 50 دسی‌بلی تقریباً معادل سکوت است.
-            volume_reduction_db = (1.0 - original_audio_volume) * 50
-            final_track = background_audio - volume_reduction_db
-        else:
-            # ایجاد یک فایل صوتی کاملاً ساکت با طول ویدیو
-            final_track = AudioSegment.silent(duration=int(duration * 1000))
         total = len(subtitles)
         ok_cnt = 0
         sem = asyncio.Semaphore(20)
         tasks = [process_single_subtitle(i, sub, temp_dir, sem) for i, sub in enumerate(subtitles)]
         completed = 0
         for coro in asyncio.as_completed(tasks):
             res = await coro
             completed += 1
-            progress(0.3 + (0.6 * (completed / total)), desc=f"تولید صداها ({completed} از {total})...")
             if res is not None:
                 seg = AudioSegment.from_file(res["adj_p"])
-                final_track = final_track.overlay(seg, position=int(res["start"] * 1000))
                 ok_cnt += 1
         if ok_cnt == 0: raise gr.Error("خطا: صدایی تولید نشد.")
-        progress(0.95, desc="میکس نهایی...")
-        final_audio_p = os.path.join(temp_dir, "final_mix.wav")
-        final_track.export(final_audio_p, format="wav")
         cmd = ['ffmpeg', '-y', '-i', video_path, '-i', final_audio_p, '-c:v', 'copy', '-c:a', 'aac', '-map', '0:v:0', '-map', '1:a:0', final_path]
         subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=True)
         return final_path, json.dumps(subtitles, ensure_ascii=False, indent=2)
     except Exception as e:
@@ -267,31 +306,28 @@ async def process_dubbing(api_key, video_file, youtube_url, target_lang, keep_or
 with gr.Blocks(title="AI Smart Director Dubbing", theme=gr.themes.Soft()) as app:
     gr.Markdown("""
     # 🎬 استودیو دوبله هوشمند (AI Director)
-    **قابلیت ویژه:** تشخیص خودکار گوینده‌های ویدیو (زن/مرد) و انتخاب بهترین صدا از بین ۳۰ گوینده حرفه‌ای توسط هوش مصنوعی.
     """)
     with gr.Row():
         with gr.Column():
-            api_key = gr.Textbox(label="کلید API جمینای", type="password")
             vid = gr.Video(label="فایل ویدیو")
             url = gr.Textbox(label="لینک یوتیوب")
             lang = gr.Dropdown(["Persian (فارسی)", "English", "Arabic"], value="Persian (فارسی)", label="زبان مقصد")
-            with gr.Accordion("تنظیمات پیشرفته صدا", open=False):
-                keep_audio_checkbox = gr.Checkbox(label="حفظ صدای پس‌زمینه (موسیقی و افکت‌ها)", value=True)
-                audio_volume_slider = gr.Slider(minimum=0.0, maximum=1.0, step=0.05, value=0.1, label="میزان بلندی صدای پس‌زمینه", info="اگر گزینه بالا غیرفعال باشد، این اسلایدر تاثیری ندارد.")
             btn = gr.Button("🚀 شروع دوبله هوشمند", variant="primary")
         with gr.Column():
             out_vid = gr.Video(label="ویدیو خروجی")
-            out_log = gr.Code(label="گزارش کستینگ (چه کسی چه گفت؟)", language="json")
-    btn.click(
-        process_dubbing,
-        [api_key, vid, url, lang, keep_audio_checkbox, audio_volume_slider],
-        [out_vid, out_log]
-    )
 if __name__ == "__main__":
     app.launch(ssr_mode=False)

 import shutil
 import asyncio
 import requests
+import math
 from pydub import AudioSegment
 import yt_dlp
 from google import genai
     return output_path
 def extract_audio_from_video(video_path, audio_path):
     """Extract the audio track of a video into an MP3 file using ffmpeg.

     The video stream is dropped (``-vn``) and the audio is encoded as
     44.1 kHz, 2-channel MP3 at 192 kbit/s. An existing output file is
     overwritten (``-y``). ffmpeg's console output is discarded.

     Args:
         video_path: Path of the input video file.
         audio_path: Path the MP3 file is written to.

     Returns:
         ``audio_path``, unchanged.

     Raises:
         subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
     """
     ffmpeg_cmd = [
         'ffmpeg',
         '-i', video_path,
         '-vn',
         '-acodec', 'mp3',
         '-ar', '44100',
         '-ac', '2',
         '-b:a', '192k',
         '-y', audio_path,
     ]
     subprocess.run(
         ffmpeg_cmd,
         stdout=subprocess.DEVNULL,
         stderr=subprocess.DEVNULL,
         check=True,
     )
     return audio_path
 def get_video_duration(video_path):
     """Return the duration of a media file in seconds, as reported by ffprobe.

     Args:
         video_path: Path of the media file to inspect.

     Returns:
         The ``format=duration`` value printed by ffprobe, parsed as a float.

     Raises:
         subprocess.CalledProcessError: If ffprobe exits with a non-zero status.
         ValueError: If ffprobe's output cannot be parsed as a float.
     """
     probe_cmd = [
         'ffprobe',
         '-v', 'error',
         '-show_entries', 'format=duration',
         # Print the bare value only (no section wrapper, no "duration=" key).
         '-of', 'default=noprint_wrappers=1:nokey=1',
         video_path,
     ]
     probe = subprocess.run(probe_cmd, capture_output=True, text=True, check=True)
     raw_seconds = probe.stdout.strip()
     return float(raw_seconds)
 def _trim_edge_silence(snd, silence_thresh=-45.0, chunk=10):
     """Strip leading and trailing silence from a pydub audio segment.

     The segment is scanned in ``chunk``-millisecond steps from each end; a
     step counts as silent while its ``dBFS`` is below ``silence_thresh``.

     Args:
         snd: The audio segment to trim (indexed in milliseconds).
         silence_thresh: Loudness (dBFS) below which a step is treated as silence.
         chunk: Step size in milliseconds.

     Returns:
         The slice of ``snd`` between the first and last non-silent steps
         (possibly empty when everything is below the threshold).
     """
     start_trim = 0
     while start_trim < len(snd) and snd[start_trim:start_trim + chunk].dBFS < silence_thresh:
         start_trim += chunk
     end_trim = 0
     while end_trim < len(snd) and snd[len(snd) - end_trim - chunk:len(snd) - end_trim].dBFS < silence_thresh:
         end_trim += chunk
     return snd[start_trim:len(snd) - end_trim]


 def adjust_audio_speed_ffmpeg(input_wav, output_wav, target_duration):
     """Time-stretch an audio clip so it approximately fits ``target_duration``.

     The clip is loaded, edge silence is trimmed (only for clips longer than
     100 ms), and the trimmed audio is written next to the input as
     ``<input stem>_trimmed.wav``. ffmpeg's ``atempo`` filter then changes the
     tempo by ``trimmed_duration / target_duration``, clamped to [0.6, 2.0].

     Fallbacks:
       * trimmed clip shorter than 50 ms -> the untouched input is copied;
       * trimmed or target duration <= 0.1 s, or a speed factor of exactly
         1.0 -> the trimmed clip is copied unchanged;
       * any error along the way -> the untouched input is copied.

     Args:
         input_wav: Path of the source audio file.
         output_wav: Path the adjusted audio is written to.
         target_duration: Desired clip length in seconds.

     Returns:
         None. The result is the file written at ``output_wav``.
     """
     import os  # local import keeps this block self-contained

     try:
         audio = AudioSegment.from_file(input_wav)

         # Remove surplus silence at both ends.
         if len(audio) > 100:
             audio = _trim_edge_silence(audio)
         if len(audio) < 50:
             shutil.copy(input_wav, output_wav)
             return

         # Derive the temp name from the extension only. str.replace(".wav", ...)
         # would rewrite every ".wav" in the path and, for inputs without that
         # suffix, yield the input path itself (overwriting the source file).
         stem, _ext = os.path.splitext(input_wav)
         temp_trimmed = stem + "_trimmed.wav"
         audio.export(temp_trimmed, format="wav")

         orig_dur = len(audio) / 1000.0
         if orig_dur <= 0.1 or target_duration <= 0.1:
             shutil.copy(temp_trimmed, output_wav)
             return

         # Clamp the speed change so the voice does not sound overly robotic.
         # The clamped range fits in a single atempo filter, so no chaining of
         # multiple atempo stages is needed.
         speed_factor = min(max(orig_dur / target_duration, 0.6), 2.0)
         if speed_factor == 1.0:
             shutil.copy(temp_trimmed, output_wav)
             return

         subprocess.run(
             ['ffmpeg', '-y', '-i', temp_trimmed, '-filter:a', f"atempo={speed_factor}", output_wav],
             stdout=subprocess.DEVNULL,
             stderr=subprocess.DEVNULL,
             check=True,
         )
     except Exception:
         # Best effort: fall back to the unmodified input. Unlike a bare
         # ``except:``, this lets KeyboardInterrupt/SystemExit propagate.
         shutil.copy(input_wav, output_wav)
     return await asyncio.to_thread(_sync_request)
+# پردازش تک‌تک قطعات زیرنویس
 async def process_single_subtitle(i, sub, temp_dir, sem):
     async with sem:
         assigned_speaker = sub.get('speaker_id', 'Charon')
 # ==========================================
 # هسته اصلی: کارگردان هوشمند
 # ==========================================
+async def process_dubbing(api_key, video_file, youtube_url, target_lang, keep_bg_audio, bg_audio_volume, progress=gr.Progress()):
     if not api_key: raise gr.Error("کلید API جمینای الزامی است.")
     if not video_file and not youtube_url: raise gr.Error("ویدیو الزامی است.")
     try:
         client = genai.Client(http_options={"api_version": "v1beta"}, api_key=api_key)
         video_path = os.path.join(temp_dir, "input.mp4")
         audio_path = os.path.join(temp_dir, "source.mp3")
         final_path = os.path.join(temp_dir, "output.mp4")
         progress(0.05, desc="دریافت ویدیو...")
+        if youtube_url: download_youtube_video(youtube_url, video_path)
         else:
             s_path = video_file.name if hasattr(video_file, 'name') else str(video_file)
             shutil.copy(s_path, video_path)
         duration = get_video_duration(video_path)
         progress(0.1, desc="استخراج صدا...")
         extract_audio_from_video(video_path, audio_path)
+        # --- تحلیل هوش مصنوعی ---
+        progress(0.2, desc="تحلیل ویدیو و کستینگ...")
         gemini_file = client.files.upload(file=audio_path)
         prompt = f"""
         ROLE: You are an expert Dubbing Director using AI Voice Actors.
         {CAST_PROMPT}
         TASK:
+        1. Listen to the audio and identify different speakers.
+        2. Assign the BEST MATCH from 'AVAILABLE VOICE ACTORS'.
         3. Translate the text EXACTLY to {target_lang}.
         4. Output JSON Array.
         JSON FORMAT:
         [
+          {{
+            "start": 0.0,
+            "end": 3.5,
+            "speaker_id": "Charon",
+            "original_speaker_desc": "Male, Deep voice",
+            "text": "ترجمه متن"
+          }}
         ]
         """
         transcription = client.models.generate_content(
             model='gemini-2.5-flash',
             contents=[gemini_file, prompt],
             config=types.GenerateContentConfig(response_mime_type="application/json")
         )
         try: client.files.delete(name=gemini_file.name)
         except: pass
         json_clean = transcription.text.strip().replace("```json", "").replace("```", "")
         subtitles = json.loads(json_clean)
         if not subtitles: raise ValueError("زیرنویس خالی است")
+        # --- تولید صدا ---
+        # ایجاد ترک خالی برای صداهای دوبله (Voices Only)
+        voice_track = AudioSegment.silent(duration=int(duration * 1000))
         total = len(subtitles)
         ok_cnt = 0
         sem = asyncio.Semaphore(20)
         tasks = [process_single_subtitle(i, sub, temp_dir, sem) for i, sub in enumerate(subtitles)]
         completed = 0
         for coro in asyncio.as_completed(tasks):
             res = await coro
             completed += 1
+            progress(0.3 + (0.5 * (completed / total)), desc=f"تولید صداها ({completed} از {total})...")
             if res is not None:
                 seg = AudioSegment.from_file(res["adj_p"])
+                # قرار دادن صدای دوبله روی ترک خالی
+                voice_track = voice_track.overlay(seg, position=int(res["start"] * 1000))
                 ok_cnt += 1
         if ok_cnt == 0: raise gr.Error("خطا: صدایی تولید نشد.")
+        # --- میکس صدای پس‌زمینه (Duck / Mix) ---
+        progress(0.90, desc="میکس صدای پس‌زمینه...")
+        original_audio = AudioSegment.from_file(audio_path)
+        # اطمینان از هم‌اندازه بودن ترک‌ها (گاهی ffmpeg دقیق نیست)
+        if len(voice_track) > len(original_audio):
+            original_audio = original_audio + AudioSegment.silent(duration=len(voice_track)-len(original_audio))
+        else:
+            original_audio = original_audio[:len(voice_track)]
+        if keep_bg_audio:
+            if bg_audio_volume <= 0.001:
+                # اگر ولوم صفر بود، سکوت مطلق
+                bg_music = AudioSegment.silent(duration=len(original_audio))
+            else:
+                # تبدیل درصد ولوم (0 تا 1) به دسی‌بل
+                # فرمول: 20 * log10(ratio)
+                # مثال: 0.1 (10%) حدودا -20dB می‌شود
+                gain_db = 20 * math.log10(max(bg_audio_volume, 0.0001))
+                bg_music = original_audio + gain_db
+            # ترکیب صدای دوبله روی صدای پس‌زمینه
+            final_mix = bg_music.overlay(voice_track)
+        else:
+            # فقط صدای دوبله
+            final_mix = voice_track
+        final_audio_p = os.path.join(temp_dir, "final_mix.wav")
+        final_mix.export(final_audio_p, format="wav")
+        # --- چسباندن صدا به تصویر ---
+        progress(0.95, desc="رندر نهایی ویدیو...")
         cmd = ['ffmpeg', '-y', '-i', video_path, '-i', final_audio_p, '-c:v', 'copy', '-c:a', 'aac', '-map', '0:v:0', '-map', '1:a:0', final_path]
         subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=True)
         return final_path, json.dumps(subtitles, ensure_ascii=False, indent=2)
     except Exception as e:
 # Gradio UI: the left column collects the inputs (API key, video source,
 # target language, background-audio settings); the right column shows the
 # output video and the JSON casting report.
 with gr.Blocks(title="AI Smart Director Dubbing", theme=gr.themes.Soft()) as app:
     gr.Markdown("""
     # 🎬 استودیو دوبله هوشمند (AI Director)
     **قابلیت‌ها:** کستینگ خودکار گوینده‌ها، تنظیم صدای پس‌زمینه و سرعت بالا با پردازش موازی.
     """)
     with gr.Row():
         with gr.Column():
             api_key = gr.Textbox(
                 label="کلید API جمینای (Gemini API Key)",
                 type="password",
             )
             vid = gr.Video(label="فایل ویدیو")
             url = gr.Textbox(label="لینک یوتیوب")
             lang = gr.Dropdown(
                 ["Persian (فارسی)", "English", "Arabic"],
                 value="Persian (فارسی)",
                 label="زبان مقصد",
             )
             # --- Audio settings ---
             gr.Markdown("### 🎚️ تنظیمات صدا")
             keep_bg = gr.Checkbox(
                 label="حفظ صدای پس‌زمینه (Original Audio)",
                 value=True,
             )
             bg_vol = gr.Slider(
                 minimum=0.0,
                 maximum=1.0,
                 value=0.15,
                 step=0.01,
                 label="میزان صدای پس‌زمینه (0 = سکوت، 1 = صدای اصلی)",
             )
             btn = gr.Button("🚀 شروع دوبله هوشمند", variant="primary")
         with gr.Column():
             out_vid = gr.Video(label="ویدیو خروجی")
             out_log = gr.Code(label="گزارش کستینگ (JSON)", language="json")
     # Input order must match the positional parameters of process_dubbing.
     btn.click(
         fn=process_dubbing,
         inputs=[api_key, vid, url, lang, keep_bg, bg_vol],
         outputs=[out_vid, out_log],
     )
 if __name__ == "__main__":
     app.launch(ssr_mode=False)