Spaces:

Opera8
/

Doble

Paused

App Files Community

Opera8 committed on Nov 21, 2025

Commit

715b219

verified ·

1 Parent(s): 48db11a

Update app.py

Browse files

Files changed (1) hide show

app.py +35 -262

app.py CHANGED Viewed

@@ -1,262 +1,35 @@
-import os
-import sys
-import subprocess
-# --- FFmpeg Setup (Replaces packages.txt) ---
-try:
-    import imageio_ffmpeg
-    ffmpeg_path = imageio_ffmpeg.get_ffmpeg_exe()
-    ffmpeg_dir = os.path.dirname(ffmpeg_path)
-    # Add ffmpeg binary directory to system PATH so os.system("ffmpeg") works
-    os.environ["PATH"] += os.pathsep + ffmpeg_dir
-    # Ensure it's executable
-    subprocess.run(["chmod", "+x", ffmpeg_path])
-    print(f"✅ FFmpeg configured at: {ffmpeg_path}")
-except ImportError:
-    print("⚠️ imageio-ffmpeg not found. Please add it to requirements.txt")
-# --- Main Imports ---
-import gradio as gr
-import torch
-import spaces  # Required for ZeroGPU
-from soni_translate.logging_setup import logger, set_logging_level, configure_logging_libs
-configure_logging_libs()
-import whisperx
-from soni_translate.preprocessor import audio_video_preprocessor, audio_preprocessor
-from soni_translate.postprocessor import media_out, get_no_ext_filename, sound_separate, get_subtitle_speaker
-from soni_translate.speech_segmentation import transcribe_speech, align_speech, diarize_speech, ASR_MODEL_OPTIONS, find_whisper_models, diarization_models, COMPUTE_TYPE_CPU, COMPUTE_TYPE_GPU
-from soni_translate.translate_segments import translate_text, TRANSLATION_PROCESS_OPTIONS
-from soni_translate.text_to_speech import audio_segmentation_to_voice, edge_tts_voices_list, coqui_xtts_voices_list, piper_tts_voices_list
-from soni_translate.audio_segments import create_translated_audio, accelerate_segments
-from soni_translate.language_configuration import LANGUAGES, LANGUAGES_LIST
-from soni_translate.utils import remove_files, get_link_list, get_valid_files, is_audio_file, is_subtitle_file
-from soni_translate.text_multiformat_processor import process_subtitles, srt_file_to_segments, break_aling_segments
-from soni_translate.languages_gui import language_data
-import hashlib
-import json
-import copy
-from pydub import AudioSegment
-# Check for API key from Hugging Face Secrets
-if "GOOGLE_API_KEY" in os.environ:
-    print("✅ Google API Key found in secrets.")
-else:
-    print("⚠️ Google API Key not found. Please set it in the Space secrets.")
-if "OPENAI_API_KEY" in os.environ:
-    print("✅ OpenAI API Key found in secrets.")
-else:
-    print("⚠️ OpenAI API Key not found. Please set it in the Space secrets if you use OpenAI models.")
-# Create necessary directories
-directories = ["downloads", "logs", "weights", "clean_song_output", "_XTTS_", "audio", "outputs"]
-for directory in directories:
-    if not os.path.exists(directory):
-        os.makedirs(directory)
-class SoniTranslate:
-    def __init__(self):
-        # Device detection moved inside the function for ZeroGPU compatibility
-        self.result_diarize = None
-        self.align_language = None
-        self.result_source_lang = None
-        self.tts_info = self._get_tts_info()
-    def _get_tts_info(self):
-        # Simplified for this example
-        class TTS_Info:
-            def tts_list(self):
-                try:
-                    return edge_tts_voices_list()
-                except Exception as e:
-                    logger.warning(f"Could not get Edge-TTS voices: {e}")
-                    return ["en-US-JennyNeural-Female"] # fallback
-        return TTS_Info()
-    # --- ZeroGPU Decorator ---
-    # duration=300 means 5 minutes max per request. Adjust if needed.
-    @spaces.GPU(duration=300)
-    def multilingual_media_conversion(
-        self,
-        media_file,
-        link_media,
-        directory_input,
-        origin_language,
-        target_language,
-        tts_voice,
-        transcriber_model,
-        max_speakers,
-        is_gui=True,
-        progress=gr.Progress(),
-    ):
-        # Check device inside the GPU decorated function
-        self.device = "cuda" if torch.cuda.is_available() else "cpu"
-        logger.info(f"Working on device: {self.device}")
-        try:
-            progress(0.05, desc="Starting process...")
-            # 1. Handle Input
-            input_media = None
-            if media_file is not None:
-                input_media = media_file.name
-            elif link_media:
-                input_media = link_media
-            elif directory_input and os.path.exists(directory_input):
-                input_media = directory_input
-            if not input_media:
-                raise ValueError("No input media specified. Please upload a file or provide a URL.")
-            base_audio_wav = "audio.wav"
-            base_video_file = "video.mp4"
-            remove_files(base_audio_wav, base_video_file)
-            progress(0.1, desc="Processing input media...")
-            if is_audio_file(input_media):
-                audio_preprocessor(False, input_media, base_audio_wav)
-            else:
-                audio_video_preprocessor(False, input_media, base_video_file, base_audio_wav)
-            # 2. Transcription
-            progress(0.25, desc="Transcribing audio with WhisperX...")
-            source_lang_code = LANGUAGES[origin_language] if origin_language != "Automatic detection" else None
-            # Force float16 if cuda is available (ZeroGPU)
-            compute_type = "float16" if self.device == "cuda" else "int8"
-            audio, result = transcribe_speech(
-                base_audio_wav,
-                transcriber_model,
-                compute_type,
-                16,
-                source_lang_code
-            )
-            progress(0.4, desc="Aligning transcription...")
-            self.align_language = result["language"]
-            result = align_speech(audio, result)
-            # 3. Diarization
-            progress(0.5, desc="Separating speakers...")
-            hf_token = os.environ.get("HF_TOKEN")
-            if not hf_token:
-                logger.warning("Hugging Face token not found. Diarization might fail.")
-            self.result_diarize = diarize_speech(
-                base_audio_wav,
-                result,
-                1,
-                max_speakers,
-                hf_token,
-                diarization_models["pyannote_3.1"]
-            )
-            self.result_source_lang = copy.deepcopy(self.result_diarize)
-            # 4. Translation
-            progress(0.6, desc="Translating text...")
-            translate_to_code = LANGUAGES[target_language]
-            self.result_diarize["segments"] = translate_text(
-                self.result_diarize["segments"],
-                translate_to_code,
-                "google_translator_batch",
-                chunk_size=1800,
-                source=self.align_language,
-            )
-            # 5. Text-to-Speech
-            progress(0.75, desc="Generating dubbed audio...")
-            valid_speakers = audio_segmentation_to_voice(
-                self.result_diarize,
-                translate_to_code,
-                is_gui,
-                tts_voice
-            )
-            # 6. Audio Processing & Merging
-            progress(0.85, desc="Synchronizing and mixing audio...")
-            dub_audio_file = "audio_dub_solo.ogg"
-            remove_files(dub_audio_file)
-            audio_files, _ = accelerate_segments(self.result_diarize, 1.8, valid_speakers)
-            create_translated_audio(self.result_diarize, audio_files, dub_audio_file, False, False)
-            mix_audio_file = "audio_mix.mp3"
-            remove_files(mix_audio_file)
-            # Using os.system which relies on the PATH set at the top
-            command_volume_mix = f'ffmpeg -y -i {base_audio_wav} -i {dub_audio_file} -filter_complex "[0:0]volume=0.1[a];[1:0]volume=1.5[b];[a][b]amix=inputs=2:duration=longest" -c:a libmp3lame {mix_audio_file}'
-            os.system(command_volume_mix)
-            # 7. Final Video Creation
-            progress(0.95, desc="Creating final video...")
-            output_filename = "video_dub.mp4"
-            remove_files(output_filename)
-            if os.path.exists(base_video_file):
-                os.system(f"ffmpeg -i {base_video_file} -i {mix_audio_file} -c:v copy -c:a copy -map 0:v -map 1:a -shortest {output_filename}")
-                final_output = media_out(input_media, translate_to_code, "", "mp4", file_obj=output_filename)
-            else:
-                final_output = media_out(input_media, translate_to_code, "", "mp3", file_obj=mix_audio_file)
-            progress(1.0, desc="Done!")
-            return final_output
-        except Exception as e:
-            logger.error(f"An error occurred: {e}")
-            gr.Error(f"An error occurred: {e}")
-            return None
-# Instantiate the class
-SoniTr = SoniTranslate()
-# Create Gradio Interface
-with gr.Blocks(theme="Taithrah/Minimal") as app:
-    gr.Markdown("<center><h1>📽️ ابزار دوبله ویدیو با هوش مصنوعی 🈷️</h1></center>")
-    gr.Markdown("ساخته شده توسط [aigolden](https://youtube.com/@aigolden) - بر پایه [SoniTranslate](https://github.com/r3gm/SoniTranslate)")
-    with gr.Row():
-        with gr.Column():
-            gr.Markdown("### ۱. ورودی ویدیو")
-            video_file_input = gr.File(label="آپلود ویدیو")
-            link_media_input = gr.Textbox(label="یا لینک یوتیوب", placeholder="https://www.youtube.com/watch?v=...")
-            gr.Markdown("### ۲. تنظیمات دوبله")
-            origin_language_input = gr.Dropdown(LANGUAGES_LIST, value="Automatic detection", label="زبان اصلی ویدیو")
-            target_language_input = gr.Dropdown(LANGUAGES_LIST[1:], value="Persian (fa)", label="زبان مقصد دوبله")
-            tts_voice_input = gr.Dropdown(SoniTr.tts_info.tts_list(), value="fa-IR-FaridNeural", label="صدای گوینده")
-            with gr.Accordion("تنظیمات پیشرفته", open=False):
-                transcriber_model_input = gr.Dropdown(
-                    ASR_MODEL_OPTIONS + find_whisper_models(),
-                    value="large-v3",
-                    label="مدل استخراج متن (Whisper)",
-                    info="مدل‌های بزرگتر دقیق‌تر اما کندتر هستند."
-                )
-                max_speakers_input = gr.Slider(1, 10, value=2, step=1, label="حداکثر تعداد گوینده")
-            process_button = gr.Button("شروع دوبله", variant="primary")
-        with gr.Column():
-            gr.Markdown("### ۳. خروجی")
-            output_video = gr.Video(label="ویدیوی دوبله شده")
-            output_file = gr.File(label="دانلود فایل")
-    process_button.click(
-        SoniTr.multilingual_media_conversion,
-        inputs=[
-            video_file_input,
-            link_media_input,
-            gr.Textbox(visible=False),
-            origin_language_input,
-            target_language_input,
-            tts_voice_input,
-            transcriber_model_input,
-            max_speakers_input,
-        ],
-        outputs=[output_file]
-    )
-if __name__ == "__main__":
-    app.launch(server_name="0.0.0.0", server_port=7860)

+gradio
+torch
+torchvision
+torchaudio
+spaces
+imageio-ffmpeg
+# SoniTranslate Core Dependencies
+git+https://github.com/m-bain/whisperX.git
+pyannote.audio>=3.3.2
+fairseq
+yt-dlp
+pysrt
+pydub
+faster-whisper
+audiostretchy
+# Translation and TTS
+google-generativeai
+openai
+edge-tts
+piper-tts==1.2.0
+TTS==0.21.1
+# Other utilities
+# Important: numpy must be <2 for audio libraries to work
+numpy<2
+soundfile
+librosa
+onnxruntime-gpu
+tqdm
+demucs
+python-multipart
+tenacity
+youtube-transcript-api
+ffmpeg-python