Peeble committed on
Commit
57524e6
·
verified ·
1 Parent(s): 9f756c6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +138 -49
app.py CHANGED
@@ -8,14 +8,20 @@ from pydub import AudioSegment
8
  from moviepy.editor import VideoFileClip, AudioFileClip
9
  from google.cloud import texttospeech
10
  from google.cloud import translate_v2 as translate
11
- import whisper
 
 
12
  import spacy
13
  from spacy_syllables import SpacySyllables
14
  from tqdm import tqdm
15
 
16
- # If FFmpeg is not in PATH, set this to your ffmpeg binary
17
- # Example for Replit: "/home/runner/<your-repl-name>/ffmpeg"
18
- # AudioSegment.converter = "/path/to/ffmpeg"
 
 
 
 
19
 
20
  spacy_models = {
21
  "english": "en_core_web_sm",
@@ -29,7 +35,7 @@ spacy_models = {
29
  "dutch": "nl_core_news_sm",
30
  "finnish": "fi_core_news_sm",
31
  "greek": "el_core_news_sm",
32
- "japanese": "ja_core_news_sm",
33
  "korean": "ko_core_news_sm",
34
  "lithuanian": "lt_core_news_sm",
35
  "macedonian": "mk_core_news_sm",
@@ -63,10 +69,14 @@ ABBREVIATIONS = {
63
  "Corp.": "corporation"
64
  }
65
 
66
- ISWORD = re.compile(r'.*\w.*')
 
67
 
 
 
 
68
 
69
- def extract_audio_from_video(video_file: str) -> str:
70
  try:
71
  print("Extracting audio track")
72
  video = VideoFileClip(video_file)
@@ -79,33 +89,99 @@ def extract_audio_from_video(video_file: str) -> str:
79
  return None
80
 
81
 
82
- def transcribe_audio(audio_file: str, source_language: str):
 
 
 
 
 
 
 
 
 
 
 
 
83
  try:
84
- print("Transcribing audio track")
85
- model = whisper.load_model("large")
86
- trans = model.transcribe(
 
 
 
 
 
 
 
 
87
  audio_file,
88
- language=source_language,
89
- verbose=False,
90
- word_timestamps=True
91
  )
92
- return trans
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
  except Exception as e:
94
- print(f"Error transcribing audio: {e}")
95
  return None
96
 
97
 
98
- def translate_text(texts, target_language: str):
 
 
 
 
99
  try:
100
  translate_client = translate.Client()
101
  results = translate_client.translate(texts, target_language=target_language)
102
- return [result['translatedText'] for result in results]
103
  except Exception as e:
104
  print(f"Error translating texts: {e}")
105
  return None
106
 
107
 
108
- def create_audio_from_text(text: str, target_language: str, target_voice: str) -> str:
109
  audio_file = "translated_" + str(uuid.uuid4()) + ".wav"
110
  try:
111
  client = texttospeech.TextToSpeechClient()
@@ -130,6 +206,10 @@ def create_audio_from_text(text: str, target_language: str, target_voice: str) -
130
  raise Exception(f"Error creating audio from text: {e}")
131
 
132
 
 
 
 
 
133
  def merge_audio_files(transcription, source_language, target_language, target_voice, audio_file):
134
  temp_files = []
135
  try:
@@ -138,6 +218,7 @@ def merge_audio_files(transcription, source_language, target_language, target_vo
138
  if spacy_models[source_language] not in spacy.util.get_installed_models():
139
  import spacy.cli
140
  spacy.cli.download(spacy_models[source_language])
 
141
  nlp = spacy.load(spacy_models[source_language])
142
  nlp.add_pipe("syllables", after="tagger")
143
 
@@ -145,11 +226,10 @@ def merge_audio_files(transcription, source_language, target_language, target_vo
145
  sentences = []
146
  sentence_starts = []
147
  sentence_ends = []
148
-
149
  sentence = ""
150
  sent_start = 0
151
 
152
- print("Composing sentences")
153
  for segment in tqdm(transcription["segments"]):
154
  if segment["text"].isupper():
155
  continue
@@ -163,26 +243,25 @@ def merge_audio_files(transcription, source_language, target_language, target_vo
163
  sentence += word["word"] + " "
164
 
165
  word_syllables = sum(
166
- token._.syllables_count
167
- for token in nlp(word["word"])
168
- if token._.syllables_count
169
  )
170
  segment_syllables = sum(
171
- token._.syllables_count
172
- for token in nlp(segment["text"])
173
- if token._.syllables_count
174
  )
175
 
176
  if i == 0 or sent_start == 0:
177
- word_speed = word_syllables / (word["end"] - word["start"])
 
178
  if word_speed < 3:
179
- sent_start = word["end"] - word_syllables / 3
180
  else:
181
  sent_start = word["start"]
182
 
183
  if i == len(segment["words"]) - 1:
184
- word_speed = word_syllables / (word["end"] - word["start"])
185
- segment_speed = segment_syllables / (segment["end"] - segment["start"])
 
 
186
  if word_speed < 1.0 or segment_speed < 2.0:
187
  word["word"] += "."
188
 
@@ -202,7 +281,7 @@ def merge_audio_files(transcription, source_language, target_language, target_vo
202
  raise Exception("Translation failed")
203
  translated_texts.extend(translated_chunk)
204
 
205
- print("Creating translated audio track")
206
  prev_end_time = 0
207
  for i, translated_text in enumerate(tqdm(translated_texts)):
208
  translated_audio_file = create_audio_from_text(
@@ -210,6 +289,7 @@ def merge_audio_files(transcription, source_language, target_language, target_vo
210
  )
211
  if translated_audio_file is None:
212
  raise Exception("Audio creation failed")
 
213
  temp_files.append(translated_audio_file)
214
  translated_audio = AudioSegment.from_wav(translated_audio_file)
215
 
@@ -247,6 +327,7 @@ def merge_audio_files(transcription, source_language, target_language, target_vo
247
  merged_audio += padding + translated_audio
248
 
249
  return merged_audio, ducked_audio
 
250
  except Exception as e:
251
  print(f"Error merging audio files: {e}")
252
  return None, None
@@ -258,7 +339,11 @@ def merge_audio_files(transcription, source_language, target_language, target_vo
258
  print(f"Error removing temporary file {file}: {e}")
259
 
260
 
261
- def save_audio_to_file(audio, filename: str):
 
 
 
 
262
  try:
263
  audio.export(filename, format="wav")
264
  print(f"Audio track with translation only saved to {filename}")
@@ -266,7 +351,7 @@ def save_audio_to_file(audio, filename: str):
266
  print(f"Error saving audio to file: {e}")
267
 
268
 
269
- def replace_audio_in_video(video_file: str, new_audio):
270
  temp_audio_file = None
271
  try:
272
  video = VideoFileClip(video_file)
@@ -283,18 +368,18 @@ def replace_audio_in_video(video_file: str, new_audio):
283
  return
284
 
285
  if new_audio_clip.duration < video.duration:
286
- print("Warning: The new audio is shorter than the video. The remaining video will have no sound.")
287
  elif new_audio_clip.duration > video.duration:
288
- print("Warning: The new audio is longer than the video. The extra audio will be cut off.")
289
  new_audio_clip = new_audio_clip.subclip(0, video.duration)
290
 
291
  video = video.set_audio(new_audio_clip)
292
 
293
  output_filename = os.path.splitext(video_file)[0] + "_translated.mp4"
294
  try:
295
- video.write_videofile(output_filename, audio_codec='aac')
296
  except Exception as e:
297
- print(f"Error writing the new video file: {e}")
298
  return
299
 
300
  print(f"Translated video saved as {output_filename}")
@@ -306,26 +391,30 @@ def replace_audio_in_video(video_file: str, new_audio):
306
  os.remove(temp_audio_file.name)
307
 
308
 
 
 
 
 
309
  def main():
310
  parser = argparse.ArgumentParser()
311
- parser.add_argument('--input', type=str, help='Path to the source video file', required=True)
312
  parser.add_argument(
313
- '--voice',
314
  type=str,
315
  default="es-US-Neural2-B",
316
- help='Target dubbing voice name from https://cloud.google.com/text-to-speech/docs/voices'
317
  )
318
  parser.add_argument(
319
- '--credentials',
320
  type=str,
321
- help='Path to the Google Cloud credentials JSON file',
322
- required=True
323
  )
324
  parser.add_argument(
325
- '--source_language',
326
  type=str,
327
- help=f'Source language, e.g. english. Supported: {list(spacy_models.keys())}',
328
- default="english"
329
  )
330
  args = parser.parse_args()
331
 
@@ -335,14 +424,14 @@ def main():
335
  if audio_file is None:
336
  return
337
 
338
- transcription = transcribe_audio(audio_file, args.source_language.lower())
339
  if transcription is None:
340
  return
341
 
342
  merged_audio, ducked_audio = merge_audio_files(
343
  transcription,
344
  args.source_language.lower(),
345
- args.voice[:5],
346
  args.voice,
347
  audio_file
348
  )
 
8
  from moviepy.editor import VideoFileClip, AudioFileClip
9
  from google.cloud import texttospeech
10
  from google.cloud import translate_v2 as translate
11
+
12
+ from transformers import pipeline
13
+
14
  import spacy
15
  from spacy_syllables import SpacySyllables
16
  from tqdm import tqdm
17
 
18
+ # ---------------- Hugging Face Whisper config ----------------
19
+
20
+ HF_WHISPER_MODEL_ID = "openai/whisper-large-v3" # change if you want smaller models
21
+
22
+ # -------------------------------------------------------------
23
+ # SpaCy models
24
+ # -------------------------------------------------------------
25
 
26
  spacy_models = {
27
  "english": "en_core_web_sm",
 
35
  "dutch": "nl_core_news_sm",
36
  "finnish": "fi_core_news_sm",
37
  "greek": "el_core_news_sm",
38
+ "japanese": "ja_core_news_sm",
39
  "korean": "ko_core_news_sm",
40
  "lithuanian": "lt_core_news_sm",
41
  "macedonian": "mk_core_news_sm",
 
69
  "Corp.": "corporation"
70
  }
71
 
72
+ ISWORD = re.compile(r".*\w.*")
73
+
74
 
75
+ # -------------------------------------------------------------
76
+ # Audio / video helpers
77
+ # -------------------------------------------------------------
78
 
79
+ def extract_audio_from_video(video_file):
80
  try:
81
  print("Extracting audio track")
82
  video = VideoFileClip(video_file)
 
89
  return None
90
 
91
 
92
+ # -------------------------------------------------------------
93
+ # Hugging Face Whisper transcription
94
+ # -------------------------------------------------------------
95
+
96
+ def transcribe_audio_hf(audio_file, source_language: str):
97
+ """
98
+ Use Hugging Face Transformers Whisper pipeline to transcribe with timestamps.
99
+ Returns a structure similar enough to your original Whisper output to reuse
100
+ the sentence-building logic.
101
+
102
+ We rely on HF's `automatic-speech-recognition` pipeline, with
103
+ `return_timestamps=True` to get segment/chunk timing. [web:62][web:64][web:71]
104
+ """
105
  try:
106
+ print("Loading HF Whisper pipeline")
107
+ # device=-1 means CPU; for GPU use device=0
108
+ asr = pipeline(
109
+ task="automatic-speech-recognition",
110
+ model=HF_WHISPER_MODEL_ID,
111
+ device=-1, # change to 0 if you have CUDA
112
+ return_timestamps=True
113
+ )
114
+
115
+ print("Transcribing audio via Hugging Face Whisper")
116
+ result = asr(
117
  audio_file,
118
+ generate_kwargs={"language": source_language}
 
 
119
  )
120
+
121
+ # HF Whisper pipeline with return_timestamps usually returns:
122
+ # {"text": "...", "chunks": [{"text": "...", "timestamp": (start, end)}, ...]} [web:62][web:71]
123
+ # We convert it to a shape compatible with your previous merge logic.
124
+ segments = []
125
+ if "chunks" in result:
126
+ for ch in result["chunks"]:
127
+ start, end = ch.get("timestamp", (0.0, 0.0))
128
+ text = ch.get("text", "")
129
+ if not text:
130
+ continue
131
+ segments.append(
132
+ {
133
+ "start": float(start),
134
+ "end": float(end),
135
+ "text": text,
136
+ # No per-word timing from HF pipeline, but we emulate a single-word segment
137
+ "words": [
138
+ {
139
+ "word": text.strip(),
140
+ "start": float(start),
141
+ "end": float(end)
142
+ }
143
+ ]
144
+ }
145
+ )
146
+ else:
147
+ # Fallback: single segment, no timestamps
148
+ segments.append(
149
+ {
150
+ "start": 0.0,
151
+ "end": 0.0,
152
+ "text": result.get("text", ""),
153
+ "words": [
154
+ {
155
+ "word": result.get("text", "").strip(),
156
+ "start": 0.0,
157
+ "end": 0.0
158
+ }
159
+ ]
160
+ }
161
+ )
162
+
163
+ return {"segments": segments}
164
+
165
  except Exception as e:
166
+ print(f"Error transcribing audio with HF Whisper: {e}")
167
  return None
168
 
169
 
170
+ # -------------------------------------------------------------
171
+ # Translation + TTS
172
+ # -------------------------------------------------------------
173
+
174
def translate_text(texts, target_language):
    """Translate *texts* into *target_language* with Google Cloud Translate.

    Returns a list with one translated string per input text, or None when
    the API call fails.
    """
    try:
        client = translate.Client()
        response = client.translate(texts, target_language=target_language)
        translated = []
        for entry in response:
            translated.append(entry["translatedText"])
        return translated
    except Exception as e:
        print(f"Error translating texts: {e}")
        return None
182
 
183
 
184
+ def create_audio_from_text(text, target_language, target_voice):
185
  audio_file = "translated_" + str(uuid.uuid4()) + ".wav"
186
  try:
187
  client = texttospeech.TextToSpeechClient()
 
206
  raise Exception(f"Error creating audio from text: {e}")
207
 
208
 
209
+ # -------------------------------------------------------------
210
+ # Merge translated audio with original using ducking
211
+ # -------------------------------------------------------------
212
+
213
  def merge_audio_files(transcription, source_language, target_language, target_voice, audio_file):
214
  temp_files = []
215
  try:
 
218
  if spacy_models[source_language] not in spacy.util.get_installed_models():
219
  import spacy.cli
220
  spacy.cli.download(spacy_models[source_language])
221
+
222
  nlp = spacy.load(spacy_models[source_language])
223
  nlp.add_pipe("syllables", after="tagger")
224
 
 
226
  sentences = []
227
  sentence_starts = []
228
  sentence_ends = []
 
229
  sentence = ""
230
  sent_start = 0
231
 
232
+ print("Composing sentences from segments")
233
  for segment in tqdm(transcription["segments"]):
234
  if segment["text"].isupper():
235
  continue
 
243
  sentence += word["word"] + " "
244
 
245
  word_syllables = sum(
246
+ token._.syllables_count for token in nlp(word["word"]) if token._.syllables_count
 
 
247
  )
248
  segment_syllables = sum(
249
+ token._.syllables_count for token in nlp(segment["text"]) if token._.syllables_count
 
 
250
  )
251
 
252
  if i == 0 or sent_start == 0:
253
+ duration = max(word["end"] - word["start"], 1e-6)
254
+ word_speed = word_syllables / duration if word_syllables else 1.0
255
  if word_speed < 3:
256
+ sent_start = word["end"] - word_syllables / 3 if word_syllables else word["start"]
257
  else:
258
  sent_start = word["start"]
259
 
260
  if i == len(segment["words"]) - 1:
261
+ duration = max(word["end"] - word["start"], 1e-6)
262
+ word_speed = word_syllables / duration if word_syllables else 1.0
263
+ seg_duration = max(segment["end"] - segment["start"], 1e-6)
264
+ segment_speed = segment_syllables / seg_duration if segment_syllables else 2.0
265
  if word_speed < 1.0 or segment_speed < 2.0:
266
  word["word"] += "."
267
 
 
281
  raise Exception("Translation failed")
282
  translated_texts.extend(translated_chunk)
283
 
284
+ print("Creating translated audio track and ducking original")
285
  prev_end_time = 0
286
  for i, translated_text in enumerate(tqdm(translated_texts)):
287
  translated_audio_file = create_audio_from_text(
 
289
  )
290
  if translated_audio_file is None:
291
  raise Exception("Audio creation failed")
292
+
293
  temp_files.append(translated_audio_file)
294
  translated_audio = AudioSegment.from_wav(translated_audio_file)
295
 
 
327
  merged_audio += padding + translated_audio
328
 
329
  return merged_audio, ducked_audio
330
+
331
  except Exception as e:
332
  print(f"Error merging audio files: {e}")
333
  return None, None
 
339
  print(f"Error removing temporary file {file}: {e}")
340
 
341
 
342
+ # -------------------------------------------------------------
343
+ # Save audio / replace in video
344
+ # -------------------------------------------------------------
345
+
346
+ def save_audio_to_file(audio, filename):
347
  try:
348
  audio.export(filename, format="wav")
349
  print(f"Audio track with translation only saved to {filename}")
 
351
  print(f"Error saving audio to file: {e}")
352
 
353
 
354
+ def replace_audio_in_video(video_file, new_audio):
355
  temp_audio_file = None
356
  try:
357
  video = VideoFileClip(video_file)
 
368
  return
369
 
370
  if new_audio_clip.duration < video.duration:
371
+ print("Warning: new audio is shorter than video.")
372
  elif new_audio_clip.duration > video.duration:
373
+ print("Warning: new audio is longer than video, trimming.")
374
  new_audio_clip = new_audio_clip.subclip(0, video.duration)
375
 
376
  video = video.set_audio(new_audio_clip)
377
 
378
  output_filename = os.path.splitext(video_file)[0] + "_translated.mp4"
379
  try:
380
+ video.write_videofile(output_filename, audio_codec="aac")
381
  except Exception as e:
382
+ print(f"Error writing new video file: {e}")
383
  return
384
 
385
  print(f"Translated video saved as {output_filename}")
 
391
  os.remove(temp_audio_file.name)
392
 
393
 
394
+ # -------------------------------------------------------------
395
+ # CLI
396
+ # -------------------------------------------------------------
397
+
398
  def main():
399
  parser = argparse.ArgumentParser()
400
+ parser.add_argument("--input", type=str, required=True, help="Path to source video file")
401
  parser.add_argument(
402
+ "--voice",
403
  type=str,
404
  default="es-US-Neural2-B",
405
+ help="Target dubbing voice name from Google TTS voices"
406
  )
407
  parser.add_argument(
408
+ "--credentials",
409
  type=str,
410
+ required=True,
411
+ help="Path to Google Cloud credentials JSON file"
412
  )
413
  parser.add_argument(
414
+ "--source_language",
415
  type=str,
416
+ default="english",
417
+ help=f"Source language, e.g. english. Supported: {list(spacy_models.keys())}"
418
  )
419
  args = parser.parse_args()
420
 
 
424
  if audio_file is None:
425
  return
426
 
427
+ transcription = transcribe_audio_hf(audio_file, args.source_language.lower())
428
  if transcription is None:
429
  return
430
 
431
  merged_audio, ducked_audio = merge_audio_files(
432
  transcription,
433
  args.source_language.lower(),
434
+ args.voice[:5], # "es-US" style language_code for Google TTS
435
  args.voice,
436
  audio_file
437
  )