Peeble committed on
Commit
57524e6
·
verified ·
1 Parent(s): 9f756c6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +138 -49
app.py CHANGED
@@ -8,14 +8,20 @@ from pydub import AudioSegment
8
  from moviepy.editor import VideoFileClip, AudioFileClip
9
  from google.cloud import texttospeech
10
  from google.cloud import translate_v2 as translate
11
- import whisper
 
 
12
  import spacy
13
  from spacy_syllables import SpacySyllables
14
  from tqdm import tqdm
15
 
16
- # If FFmpeg is not in PATH, set this to your ffmpeg binary
17
- # Example for Replit: "/home/runner/<your-repl-name>/ffmpeg"
18
- # AudioSegment.converter = "/path/to/ffmpeg"
 
 
 
 
19
 
20
  spacy_models = {
21
  "english": "en_core_web_sm",
@@ -29,7 +35,7 @@ spacy_models = {
29
  "dutch": "nl_core_news_sm",
30
  "finnish": "fi_core_news_sm",
31
  "greek": "el_core_news_sm",
32
- "japanese": "ja_core_news_sm",
33
  "korean": "ko_core_news_sm",
34
  "lithuanian": "lt_core_news_sm",
35
  "macedonian": "mk_core_news_sm",
@@ -63,10 +69,14 @@ ABBREVIATIONS = {
63
  "Corp.": "corporation"
64
  }
65
 
66
- ISWORD = re.compile(r'.*\w.*')
 
67
 
 
 
 
68
 
69
- def extract_audio_from_video(video_file: str) -> str:
70
  try:
71
  print("Extracting audio track")
72
  video = VideoFileClip(video_file)
@@ -79,33 +89,99 @@ def extract_audio_from_video(video_file: str) -> str:
79
  return None
80
 
81
 
82
- def transcribe_audio(audio_file: str, source_language: str):
 
 
 
 
 
 
 
 
 
 
 
 
83
  try:
84
- print("Transcribing audio track")
85
- model = whisper.load_model("large")
86
- trans = model.transcribe(
 
 
 
 
 
 
 
 
87
  audio_file,
88
- language=source_language,
89
- verbose=False,
90
- word_timestamps=True
91
  )
92
- return trans
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
  except Exception as e:
94
- print(f"Error transcribing audio: {e}")
95
  return None
96
 
97
 
98
- def translate_text(texts, target_language: str):
 
 
 
 
99
  try:
100
  translate_client = translate.Client()
101
  results = translate_client.translate(texts, target_language=target_language)
102
- return [result['translatedText'] for result in results]
103
  except Exception as e:
104
  print(f"Error translating texts: {e}")
105
  return None
106
 
107
 
108
- def create_audio_from_text(text: str, target_language: str, target_voice: str) -> str:
109
  audio_file = "translated_" + str(uuid.uuid4()) + ".wav"
110
  try:
111
  client = texttospeech.TextToSpeechClient()
@@ -130,6 +206,10 @@ def create_audio_from_text(text: str, target_language: str, target_voice: str) -
130
  raise Exception(f"Error creating audio from text: {e}")
131
 
132
 
 
 
 
 
133
  def merge_audio_files(transcription, source_language, target_language, target_voice, audio_file):
134
  temp_files = []
135
  try:
@@ -138,6 +218,7 @@ def merge_audio_files(transcription, source_language, target_language, target_vo
138
  if spacy_models[source_language] not in spacy.util.get_installed_models():
139
  import spacy.cli
140
  spacy.cli.download(spacy_models[source_language])
 
141
  nlp = spacy.load(spacy_models[source_language])
142
  nlp.add_pipe("syllables", after="tagger")
143
 
@@ -145,11 +226,10 @@ def merge_audio_files(transcription, source_language, target_language, target_vo
145
  sentences = []
146
  sentence_starts = []
147
  sentence_ends = []
148
-
149
  sentence = ""
150
  sent_start = 0
151
 
152
- print("Composing sentences")
153
  for segment in tqdm(transcription["segments"]):
154
  if segment["text"].isupper():
155
  continue
@@ -163,26 +243,25 @@ def merge_audio_files(transcription, source_language, target_language, target_vo
163
  sentence += word["word"] + " "
164
 
165
  word_syllables = sum(
166
- token._.syllables_count
167
- for token in nlp(word["word"])
168
- if token._.syllables_count
169
  )
170
  segment_syllables = sum(
171
- token._.syllables_count
172
- for token in nlp(segment["text"])
173
- if token._.syllables_count
174
  )
175
 
176
  if i == 0 or sent_start == 0:
177
- word_speed = word_syllables / (word["end"] - word["start"])
 
178
  if word_speed < 3:
179
- sent_start = word["end"] - word_syllables / 3
180
  else:
181
  sent_start = word["start"]
182
 
183
  if i == len(segment["words"]) - 1:
184
- word_speed = word_syllables / (word["end"] - word["start"])
185
- segment_speed = segment_syllables / (segment["end"] - segment["start"])
 
 
186
  if word_speed < 1.0 or segment_speed < 2.0:
187
  word["word"] += "."
188
 
@@ -202,7 +281,7 @@ def merge_audio_files(transcription, source_language, target_language, target_vo
202
  raise Exception("Translation failed")
203
  translated_texts.extend(translated_chunk)
204
 
205
- print("Creating translated audio track")
206
  prev_end_time = 0
207
  for i, translated_text in enumerate(tqdm(translated_texts)):
208
  translated_audio_file = create_audio_from_text(
@@ -210,6 +289,7 @@ def merge_audio_files(transcription, source_language, target_language, target_vo
210
  )
211
  if translated_audio_file is None:
212
  raise Exception("Audio creation failed")
 
213
  temp_files.append(translated_audio_file)
214
  translated_audio = AudioSegment.from_wav(translated_audio_file)
215
 
@@ -247,6 +327,7 @@ def merge_audio_files(transcription, source_language, target_language, target_vo
247
  merged_audio += padding + translated_audio
248
 
249
  return merged_audio, ducked_audio
 
250
  except Exception as e:
251
  print(f"Error merging audio files: {e}")
252
  return None, None
@@ -258,7 +339,11 @@ def merge_audio_files(transcription, source_language, target_language, target_vo
258
  print(f"Error removing temporary file {file}: {e}")
259
 
260
 
261
- def save_audio_to_file(audio, filename: str):
 
 
 
 
262
  try:
263
  audio.export(filename, format="wav")
264
  print(f"Audio track with translation only saved to {filename}")
@@ -266,7 +351,7 @@ def save_audio_to_file(audio, filename: str):
266
  print(f"Error saving audio to file: {e}")
267
 
268
 
269
- def replace_audio_in_video(video_file: str, new_audio):
270
  temp_audio_file = None
271
  try:
272
  video = VideoFileClip(video_file)
@@ -283,18 +368,18 @@ def replace_audio_in_video(video_file: str, new_audio):
283
  return
284
 
285
  if new_audio_clip.duration < video.duration:
286
- print("Warning: The new audio is shorter than the video. The remaining video will have no sound.")
287
  elif new_audio_clip.duration > video.duration:
288
- print("Warning: The new audio is longer than the video. The extra audio will be cut off.")
289
  new_audio_clip = new_audio_clip.subclip(0, video.duration)
290
 
291
  video = video.set_audio(new_audio_clip)
292
 
293
  output_filename = os.path.splitext(video_file)[0] + "_translated.mp4"
294
  try:
295
- video.write_videofile(output_filename, audio_codec='aac')
296
  except Exception as e:
297
- print(f"Error writing the new video file: {e}")
298
  return
299
 
300
  print(f"Translated video saved as {output_filename}")
@@ -306,26 +391,30 @@ def replace_audio_in_video(video_file: str, new_audio):
306
  os.remove(temp_audio_file.name)
307
 
308
 
 
 
 
 
309
  def main():
310
  parser = argparse.ArgumentParser()
311
- parser.add_argument('--input', type=str, help='Path to the source video file', required=True)
312
  parser.add_argument(
313
- '--voice',
314
  type=str,
315
  default="es-US-Neural2-B",
316
- help='Target dubbing voice name from https://cloud.google.com/text-to-speech/docs/voices'
317
  )
318
  parser.add_argument(
319
- '--credentials',
320
  type=str,
321
- help='Path to the Google Cloud credentials JSON file',
322
- required=True
323
  )
324
  parser.add_argument(
325
- '--source_language',
326
  type=str,
327
- help=f'Source language, e.g. english. Supported: {list(spacy_models.keys())}',
328
- default="english"
329
  )
330
  args = parser.parse_args()
331
 
@@ -335,14 +424,14 @@ def main():
335
  if audio_file is None:
336
  return
337
 
338
- transcription = transcribe_audio(audio_file, args.source_language.lower())
339
  if transcription is None:
340
  return
341
 
342
  merged_audio, ducked_audio = merge_audio_files(
343
  transcription,
344
  args.source_language.lower(),
345
- args.voice[:5],
346
  args.voice,
347
  audio_file
348
  )
 
8
  from moviepy.editor import VideoFileClip, AudioFileClip
9
  from google.cloud import texttospeech
10
  from google.cloud import translate_v2 as translate
11
+
12
+ from transformers import pipeline
13
+
14
  import spacy
15
  from spacy_syllables import SpacySyllables
16
  from tqdm import tqdm
17
 
18
+ # ---------------- Hugging Face Whisper config ----------------
19
+
20
+ HF_WHISPER_MODEL_ID = "openai/whisper-large-v3" # change if you want smaller models
21
+
22
+ # -------------------------------------------------------------
23
+ # SpaCy models
24
+ # -------------------------------------------------------------
25
 
26
  spacy_models = {
27
  "english": "en_core_web_sm",
 
35
  "dutch": "nl_core_news_sm",
36
  "finnish": "fi_core_news_sm",
37
  "greek": "el_core_news_sm",
38
+ "japanese": "ja_core_news_sm",
39
  "korean": "ko_core_news_sm",
40
  "lithuanian": "lt_core_news_sm",
41
  "macedonian": "mk_core_news_sm",
 
69
  "Corp.": "corporation"
70
  }
71
 
72
+ ISWORD = re.compile(r".*\w.*")
73
+
74
 
75
+ # -------------------------------------------------------------
76
+ # Audio / video helpers
77
+ # -------------------------------------------------------------
78
 
79
+ def extract_audio_from_video(video_file):
80
  try:
81
  print("Extracting audio track")
82
  video = VideoFileClip(video_file)
 
89
  return None
90
 
91
 
92
+ # -------------------------------------------------------------
93
+ # Hugging Face Whisper transcription
94
+ # -------------------------------------------------------------
95
+
96
+ def transcribe_audio_hf(audio_file, source_language: str):
97
+ """
98
+ Use Hugging Face Transformers Whisper pipeline to transcribe with timestamps.
99
+ Returns a structure similar enough to your original Whisper output to reuse
100
+ the sentence-building logic.
101
+
102
+ We rely on HF's `automatic-speech-recognition` pipeline, with
103
+ `return_timestamps=True` to get segment/chunk timing. [web:62][web:64][web:71]
104
+ """
105
  try:
106
+ print("Loading HF Whisper pipeline")
107
+ # device=-1 means CPU; for GPU use device=0
108
+ asr = pipeline(
109
+ task="automatic-speech-recognition",
110
+ model=HF_WHISPER_MODEL_ID,
111
+ device=-1, # change to 0 if you have CUDA
112
+ return_timestamps=True
113
+ )
114
+
115
+ print("Transcribing audio via Hugging Face Whisper")
116
+ result = asr(
117
  audio_file,
118
+ generate_kwargs={"language": source_language}
 
 
119
  )
120
+
121
+ # HF Whisper pipeline with return_timestamps usually returns:
122
+ # {"text": "...", "chunks": [{"text": "...", "timestamp": (start, end)}, ...]} [web:62][web:71]
123
+ # We convert it to a shape compatible with your previous merge logic.
124
+ segments = []
125
+ if "chunks" in result:
126
+ for ch in result["chunks"]:
127
+ start, end = ch.get("timestamp", (0.0, 0.0))
128
+ text = ch.get("text", "")
129
+ if not text:
130
+ continue
131
+ segments.append(
132
+ {
133
+ "start": float(start),
134
+ "end": float(end),
135
+ "text": text,
136
+ # No per-word timing from HF pipeline, but we emulate a single-word segment
137
+ "words": [
138
+ {
139
+ "word": text.strip(),
140
+ "start": float(start),
141
+ "end": float(end)
142
+ }
143
+ ]
144
+ }
145
+ )
146
+ else:
147
+ # Fallback: single segment, no timestamps
148
+ segments.append(
149
+ {
150
+ "start": 0.0,
151
+ "end": 0.0,
152
+ "text": result.get("text", ""),
153
+ "words": [
154
+ {
155
+ "word": result.get("text", "").strip(),
156
+ "start": 0.0,
157
+ "end": 0.0
158
+ }
159
+ ]
160
+ }
161
+ )
162
+
163
+ return {"segments": segments}
164
+
165
  except Exception as e:
166
+ print(f"Error transcribing audio with HF Whisper: {e}")
167
  return None
168
 
169
 
170
+ # -------------------------------------------------------------
171
+ # Translation + TTS
172
+ # -------------------------------------------------------------
173
+
174
def translate_text(texts, target_language):
    """Translate *texts* into *target_language* with Google Cloud Translate.

    Returns a list with one translated string per input text, or None when
    the API call fails.
    """
    try:
        client = translate.Client()
        response = client.translate(texts, target_language=target_language)
        translated = []
        for entry in response:
            translated.append(entry["translatedText"])
        return translated
    except Exception as e:
        print(f"Error translating texts: {e}")
        return None
182
 
183
 
184
+ def create_audio_from_text(text, target_language, target_voice):
185
  audio_file = "translated_" + str(uuid.uuid4()) + ".wav"
186
  try:
187
  client = texttospeech.TextToSpeechClient()
 
206
  raise Exception(f"Error creating audio from text: {e}")
207
 
208
 
209
+ # -------------------------------------------------------------
210
+ # Merge translated audio with original using ducking
211
+ # -------------------------------------------------------------
212
+
213
  def merge_audio_files(transcription, source_language, target_language, target_voice, audio_file):
214
  temp_files = []
215
  try:
 
218
  if spacy_models[source_language] not in spacy.util.get_installed_models():
219
  import spacy.cli
220
  spacy.cli.download(spacy_models[source_language])
221
+
222
  nlp = spacy.load(spacy_models[source_language])
223
  nlp.add_pipe("syllables", after="tagger")
224
 
 
226
  sentences = []
227
  sentence_starts = []
228
  sentence_ends = []
 
229
  sentence = ""
230
  sent_start = 0
231
 
232
+ print("Composing sentences from segments")
233
  for segment in tqdm(transcription["segments"]):
234
  if segment["text"].isupper():
235
  continue
 
243
  sentence += word["word"] + " "
244
 
245
  word_syllables = sum(
246
+ token._.syllables_count for token in nlp(word["word"]) if token._.syllables_count
 
 
247
  )
248
  segment_syllables = sum(
249
+ token._.syllables_count for token in nlp(segment["text"]) if token._.syllables_count
 
 
250
  )
251
 
252
  if i == 0 or sent_start == 0:
253
+ duration = max(word["end"] - word["start"], 1e-6)
254
+ word_speed = word_syllables / duration if word_syllables else 1.0
255
  if word_speed < 3:
256
+ sent_start = word["end"] - word_syllables / 3 if word_syllables else word["start"]
257
  else:
258
  sent_start = word["start"]
259
 
260
  if i == len(segment["words"]) - 1:
261
+ duration = max(word["end"] - word["start"], 1e-6)
262
+ word_speed = word_syllables / duration if word_syllables else 1.0
263
+ seg_duration = max(segment["end"] - segment["start"], 1e-6)
264
+ segment_speed = segment_syllables / seg_duration if segment_syllables else 2.0
265
  if word_speed < 1.0 or segment_speed < 2.0:
266
  word["word"] += "."
267
 
 
281
  raise Exception("Translation failed")
282
  translated_texts.extend(translated_chunk)
283
 
284
+ print("Creating translated audio track and ducking original")
285
  prev_end_time = 0
286
  for i, translated_text in enumerate(tqdm(translated_texts)):
287
  translated_audio_file = create_audio_from_text(
 
289
  )
290
  if translated_audio_file is None:
291
  raise Exception("Audio creation failed")
292
+
293
  temp_files.append(translated_audio_file)
294
  translated_audio = AudioSegment.from_wav(translated_audio_file)
295
 
 
327
  merged_audio += padding + translated_audio
328
 
329
  return merged_audio, ducked_audio
330
+
331
  except Exception as e:
332
  print(f"Error merging audio files: {e}")
333
  return None, None
 
339
  print(f"Error removing temporary file {file}: {e}")
340
 
341
 
342
+ # -------------------------------------------------------------
343
+ # Save audio / replace in video
344
+ # -------------------------------------------------------------
345
+
346
+ def save_audio_to_file(audio, filename):
347
  try:
348
  audio.export(filename, format="wav")
349
  print(f"Audio track with translation only saved to {filename}")
 
351
  print(f"Error saving audio to file: {e}")
352
 
353
 
354
+ def replace_audio_in_video(video_file, new_audio):
355
  temp_audio_file = None
356
  try:
357
  video = VideoFileClip(video_file)
 
368
  return
369
 
370
  if new_audio_clip.duration < video.duration:
371
+ print("Warning: new audio is shorter than video.")
372
  elif new_audio_clip.duration > video.duration:
373
+ print("Warning: new audio is longer than video, trimming.")
374
  new_audio_clip = new_audio_clip.subclip(0, video.duration)
375
 
376
  video = video.set_audio(new_audio_clip)
377
 
378
  output_filename = os.path.splitext(video_file)[0] + "_translated.mp4"
379
  try:
380
+ video.write_videofile(output_filename, audio_codec="aac")
381
  except Exception as e:
382
+ print(f"Error writing new video file: {e}")
383
  return
384
 
385
  print(f"Translated video saved as {output_filename}")
 
391
  os.remove(temp_audio_file.name)
392
 
393
 
394
+ # -------------------------------------------------------------
395
+ # CLI
396
+ # -------------------------------------------------------------
397
+
398
  def main():
399
  parser = argparse.ArgumentParser()
400
+ parser.add_argument("--input", type=str, required=True, help="Path to source video file")
401
  parser.add_argument(
402
+ "--voice",
403
  type=str,
404
  default="es-US-Neural2-B",
405
+ help="Target dubbing voice name from Google TTS voices"
406
  )
407
  parser.add_argument(
408
+ "--credentials",
409
  type=str,
410
+ required=True,
411
+ help="Path to Google Cloud credentials JSON file"
412
  )
413
  parser.add_argument(
414
+ "--source_language",
415
  type=str,
416
+ default="english",
417
+ help=f"Source language, e.g. english. Supported: {list(spacy_models.keys())}"
418
  )
419
  args = parser.parse_args()
420
 
 
424
  if audio_file is None:
425
  return
426
 
427
+ transcription = transcribe_audio_hf(audio_file, args.source_language.lower())
428
  if transcription is None:
429
  return
430
 
431
  merged_audio, ducked_audio = merge_audio_files(
432
  transcription,
433
  args.source_language.lower(),
434
+ args.voice[:5], # "es-US" style language_code for Google TTS
435
  args.voice,
436
  audio_file
437
  )