Dionisii Nuzhnyi committed on
Commit
1971594
·
1 Parent(s): 84a0a1a

timestamp sync

Browse files
Files changed (1) hide show
  1. app.py +120 -25
app.py CHANGED
@@ -16,14 +16,21 @@ if hasattr(torch, "load"):
16
  import tempfile
17
  from pathlib import Path
18
  import numpy as np
 
19
  import gradio as gr
20
- import spaces
21
  import whisper
22
  from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
23
  from TTS.api import TTS
24
  import yt_dlp
25
  from moviepy import VideoFileClip, AudioFileClip, AudioClip, concatenate_audioclips
26
 
 
 
 
 
 
 
 
27
  # ---------------------------------------------------------------------------
28
  # B. Global model loading on CPU (ZeroGPU has no CUDA at import time)
29
  # ---------------------------------------------------------------------------
@@ -36,6 +43,14 @@ trans_model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled
36
  tts_model = TTS("tts_models/multilingual/multi-dataset/xtts_v2")
37
  print("All models loaded on CPU.")
38
 
 
 
 
 
 
 
 
 
39
  # ---------------------------------------------------------------------------
40
  # C. Helper functions
41
  # ---------------------------------------------------------------------------
@@ -54,18 +69,97 @@ def download_youtube_video(url: str, output_dir: str) -> str:
54
  return ydl.prepare_filename(info)
55
 
56
 
57
- def translate_uk_to_en(text: str, device: str) -> str:
58
- """Translate Ukrainian text to English using NLLB-200."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
  trans_tokenizer.src_lang = "ukr_Cyrl"
60
- inputs = trans_tokenizer(text, return_tensors="pt").to(device)
61
  translated_tokens = trans_model.generate(
62
  **inputs,
63
  forced_bos_token_id=trans_tokenizer.convert_tokens_to_ids("eng_Latn"),
64
- max_length=256,
65
  num_beams=5,
66
  repetition_penalty=1.5,
67
  )
68
- return trans_tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
 
70
 
71
  def swap_audio_in_video(video_path: str, audio_path: str, output_path: str):
@@ -101,7 +195,7 @@ def swap_audio_in_video(video_path: str, audio_path: str, output_path: str):
101
  # D. Main processing function
102
  # ---------------------------------------------------------------------------
103
 
104
- @spaces.GPU(duration=120)
105
  def process_video(youtube_url, video_file, progress=gr.Progress()):
106
  if not youtube_url and video_file is None:
107
  raise gr.Error("Please provide a YouTube URL or upload a video file.")
@@ -131,29 +225,30 @@ def process_video(youtube_url, video_file, progress=gr.Progress()):
131
  with VideoFileClip(video_path) as video:
132
  video.audio.write_audiofile(ref_audio_path, logger=None)
133
 
134
- # Step 3: Transcribe with Whisper
135
  progress(0.40, desc="Transcribing Ukrainian audio...")
136
  result = whisper_model.transcribe(ref_audio_path, task="transcribe", language="uk")
 
137
  ukrainian_text = result["text"]
138
 
139
- # Step 4: Translate with NLLB
140
- progress(0.55, desc="Translating to English...")
141
- english_text = translate_uk_to_en(ukrainian_text, device)
142
-
143
- # Step 5: Voice clone with XTTS
144
- progress(0.70, desc="Cloning voice...")
145
- cloned_audio_path = os.path.join(tmp_dir, "cloned_audio.wav")
146
- tts_model.tts_to_file(
147
- text=english_text,
148
- speaker_wav=ref_audio_path,
149
- language="en",
150
- file_path=cloned_audio_path,
151
- )
152
-
153
- # Step 6: Swap audio
154
- progress(0.85, desc="Combining video and dubbed audio...")
155
  output_path = os.path.join(tmp_dir, "dubbed_output.mp4")
156
- swap_audio_in_video(video_path, cloned_audio_path, output_path)
 
 
157
 
158
  progress(1.0, desc="Done!")
159
  return output_path, ukrainian_text, english_text
 
16
  import tempfile
17
  from pathlib import Path
18
  import numpy as np
19
+ import soundfile as sf
20
  import gradio as gr
 
21
  import whisper
22
  from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
23
  from TTS.api import TTS
24
  import yt_dlp
25
  from moviepy import VideoFileClip, AudioFileClip, AudioClip, concatenate_audioclips
26
 
27
+
28
def gpu_decorator(fn):
    """Apply the ZeroGPU decorator only when running on a Hugging Face Space.

    Locally there is no ZeroGPU scheduler (and usually no `spaces` package),
    so the function is returned unchanged.  On a Space — detected via the
    SPACE_ID environment variable — the call is wrapped with `spaces.GPU`
    so it gets scheduled on a GPU worker for up to 120 seconds.

    Args:
        fn: the function to (conditionally) decorate.

    Returns:
        Either `fn` unchanged or `spaces.GPU(duration=120)(fn)`.
    """
    if os.environ.get("SPACE_ID"):
        try:
            import spaces
        except ImportError:
            # SPACE_ID is set but the spaces package is unavailable:
            # degrade gracefully to the undecorated function instead of
            # crashing at import time.
            return fn
        return spaces.GPU(duration=120)(fn)
    return fn
33
+
34
  # ---------------------------------------------------------------------------
35
  # B. Global model loading on CPU (ZeroGPU has no CUDA at import time)
36
  # ---------------------------------------------------------------------------
 
43
  tts_model = TTS("tts_models/multilingual/multi-dataset/xtts_v2")
44
  print("All models loaded on CPU.")
45
 
46
# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------
XTTS_SAMPLE_RATE = 24000  # assumed XTTS v2 output sample rate (Hz) — confirm against model config
MIN_SEGMENT_DURATION = 1.5  # seconds; Whisper segments shorter than this get merged
MAX_STRETCH_RATE = 2.0  # upper clamp on the TTS speed factor (max 2x faster)
MIN_STRETCH_RATE = 0.5  # lower clamp on the TTS speed factor (max 2x slower)
53
+
54
  # ---------------------------------------------------------------------------
55
  # C. Helper functions
56
  # ---------------------------------------------------------------------------
 
69
  return ydl.prepare_filename(info)
70
 
71
 
72
def merge_short_segments(segments, min_duration=None):
    """Merge consecutive short Whisper segments into longer ones.

    Very short segments tend to produce garbage TTS output, so while the
    accumulated chunk is shorter than *min_duration* seconds the following
    segment is absorbed into it (its end time extended and its text
    concatenated).  Segments whose stripped text is empty are dropped.

    Args:
        segments: list of dicts with "start"/"end" (seconds) and "text".
        min_duration: minimum chunk duration in seconds; defaults to the
            module-level MIN_SEGMENT_DURATION.

    Returns:
        A new list of merged segment dicts.
    """
    if min_duration is None:
        min_duration = MIN_SEGMENT_DURATION
    if not segments:
        return []

    merged = []
    current = {
        "start": segments[0]["start"],
        "end": segments[0]["end"],
        "text": segments[0]["text"].strip(),
    }

    for seg in segments[1:]:
        text = seg["text"].strip()
        if not text:
            # No speech content: skip entirely.
            continue
        if current["end"] - current["start"] < min_duration:
            # Current chunk is still too short: absorb this segment.
            current["end"] = seg["end"]
            # strip() avoids a leading space when the current text is empty.
            current["text"] = (current["text"] + " " + text).strip()
        else:
            if current["text"]:
                merged.append(current)
            current = {"start": seg["start"], "end": seg["end"], "text": text}

    if current["text"]:
        merged.append(current)

    return merged
101
+
102
+
103
def translate_segments_uk_to_en(segments, device):
    """Translate a batch of Ukrainian segments to English with NLLB-200.

    All segment texts are tokenized together (padded and truncated to 512
    tokens) so a single `generate` call translates the whole batch.

    Args:
        segments: list of dicts carrying a "text" key with Ukrainian text.
        device: torch device the tokenized inputs are moved to (must match
            where `trans_model` lives).

    Returns:
        A list of stripped English strings, one per input segment; an empty
        list when there is nothing to translate.
    """
    texts = [seg["text"] for seg in segments]
    if not texts:
        # Avoid calling the tokenizer/model on an empty batch.
        return []

    trans_tokenizer.src_lang = "ukr_Cyrl"
    inputs = trans_tokenizer(
        texts, return_tensors="pt", padding=True, truncation=True, max_length=512
    ).to(device)
    translated_tokens = trans_model.generate(
        **inputs,
        # Force English as the NLLB-200 target language.
        forced_bos_token_id=trans_tokenizer.convert_tokens_to_ids("eng_Latn"),
        max_length=512,
        num_beams=5,
        # Discourage the repeated phrases NLLB tends to emit on long inputs.
        repetition_penalty=1.5,
    )
    return [
        t.strip()
        for t in trans_tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)
    ]
117
+
118
+
119
def build_audio_canvas(segments, translated_texts, ref_audio_path, video_duration, tmp_dir, progress):
    """Generate per-segment TTS, time-stretch to fit, and assemble a dubbed track.

    For each (segment, translation) pair, XTTS synthesizes English speech
    cloned from *ref_audio_path*.  If the natural duration does not fit the
    segment's time slot, synthesis is retried once with a clamped `speed`
    factor.  Each clip is written at its segment's original start offset on a
    silent canvas so the dub stays time-aligned with the video.

    Args:
        segments: merged Whisper segments with "start"/"end" in seconds.
        translated_texts: English text per segment (parallel to segments).
        ref_audio_path: wav file used as the voice-cloning reference.
        video_duration: video length in seconds (defines the canvas length).
        tmp_dir: directory where the canvas wav is written.
        progress: gradio-style callable, progress(fraction, desc=...).

    Returns:
        Path to the assembled wav file (XTTS_SAMPLE_RATE Hz).
    """
    # float32 matches the TTS output dtype and halves memory vs float64.
    canvas = np.zeros(int(video_duration * XTTS_SAMPLE_RATE), dtype=np.float32)
    total = len(segments)

    for i, (seg, text) in enumerate(zip(segments, translated_texts)):
        progress(0.55 + 0.30 * (i / total), desc=f"Synthesizing segment {i+1}/{total}...")

        # Very short strings make XTTS produce artifacts; skip them.
        if len(text) < 5:
            continue

        seg_start = seg["start"]
        target_duration = seg["end"] - seg_start
        start_sample = int(seg_start * XTTS_SAMPLE_RATE)

        # Skip degenerate slots (zero/negative duration would divide by zero
        # below) and segments starting at or past the end of the video
        # (a negative slice would corrupt the placement logic).
        if target_duration <= 0 or start_sample >= len(canvas):
            continue

        # First pass: generate at natural speed to measure duration.
        tts_audio = np.asarray(
            tts_model.tts(text=text, speaker_wav=ref_audio_path, language="en"),
            dtype=np.float32,
        )
        tts_duration = len(tts_audio) / XTTS_SAMPLE_RATE

        # If the natural duration is noticeably off, regenerate with a
        # clamped speed factor to keep the speech intelligible.
        speed = max(MIN_STRETCH_RATE, min(MAX_STRETCH_RATE, tts_duration / target_duration))
        if abs(speed - 1.0) >= 0.05:
            tts_audio = np.asarray(
                tts_model.tts(text=text, speaker_wav=ref_audio_path, language="en", speed=speed),
                dtype=np.float32,
            )

        # Truncate if the clip would overlap the next segment's start.
        if i + 1 < total:
            max_samples = int((segments[i + 1]["start"] - seg_start) * XTTS_SAMPLE_RATE)
            if len(tts_audio) > max_samples:
                tts_audio = tts_audio[:max_samples]

        # Place on the canvas, clipping at the end of the video.
        end_sample = min(start_sample + len(tts_audio), len(canvas))
        canvas[start_sample:end_sample] = tts_audio[:end_sample - start_sample]

    canvas_path = os.path.join(tmp_dir, "dubbed_canvas.wav")
    sf.write(canvas_path, canvas, XTTS_SAMPLE_RATE)
    return canvas_path
163
 
164
 
165
  def swap_audio_in_video(video_path: str, audio_path: str, output_path: str):
 
195
  # D. Main processing function
196
  # ---------------------------------------------------------------------------
197
 
198
+ @gpu_decorator
199
  def process_video(youtube_url, video_file, progress=gr.Progress()):
200
  if not youtube_url and video_file is None:
201
  raise gr.Error("Please provide a YouTube URL or upload a video file.")
 
225
  with VideoFileClip(video_path) as video:
226
  video.audio.write_audiofile(ref_audio_path, logger=None)
227
 
228
+ # Step 3: Transcribe with Whisper (segment-level)
229
  progress(0.40, desc="Transcribing Ukrainian audio...")
230
  result = whisper_model.transcribe(ref_audio_path, task="transcribe", language="uk")
231
+ raw_segments = result["segments"]
232
  ukrainian_text = result["text"]
233
 
234
+ # Step 4: Merge short segments
235
+ merged = merge_short_segments(raw_segments)
236
+
237
+ # Step 5: Context-aware translate
238
+ progress(0.50, desc="Translating to English...")
239
+ translated_texts = translate_segments_uk_to_en(merged, device)
240
+ english_text = " ".join(translated_texts)
241
+
242
+ # Step 6: Per-segment TTS + time-stretch + canvas
243
+ with VideoFileClip(video_path) as v:
244
+ video_duration = v.duration
245
+ canvas_path = build_audio_canvas(merged, translated_texts, ref_audio_path, video_duration, tmp_dir, progress)
246
+
247
+ # Step 7: Combine video and audio
 
 
248
  output_path = os.path.join(tmp_dir, "dubbed_output.mp4")
249
+
250
+ progress(0.90, desc="Combining video and dubbed audio...")
251
+ swap_audio_in_video(video_path, canvas_path, output_path)
252
 
253
  progress(1.0, desc="Done!")
254
  return output_path, ukrainian_text, english_text