Spaces:
Paused
Paused
File size: 6,145 Bytes
8b4831d 275e48a 8b4831d 1768850 3219df3 2de3a57 275e48a a6be743 7848193 e00b158 f7fa4ee 889d885 baa7f8b 9fa54f4 0bc447a 738eaed a1bbd1f e00b158 5a957c1 2de3a57 0bc447a 8b4831d 0bc447a 3219df3 8b4831d bbd0c30 09459a8 738eaed d41a72f 098d1fb 275e48a 640aa41 740becd 640aa41 275e48a d41a72f fabf10e d544473 f00d512 d41a72f fabf10e a0e508d f00d512 d41a72f 640aa41 bc5e2dc 740becd fabf10e 7117787 fabf10e 9fd2740 fabf10e fdbd781 fabf10e 098d1fb 740becd ebd5b31 e304087 ebd5b31 e304087 740becd 99202f6 740becd 99202f6 f7fa4ee 740becd 99202f6 275e48a f2de2c9 698dfc3 f2de2c9 8b4831d 1768850 b8713f0 d41a72f 8b4831d 4684a0c f2de2c9 d899511 1768850 43c56bd ec0b887 1f8e6c3 8b4831d 1f8e6c3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 |
import gradio as gr
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from subprocess import run
from faster_whisper import WhisperModel
import json
import tempfile
import os
import ffmpeg
from zipfile import ZipFile
import stat
import uuid
import subprocess
import torch
import bitsandbytes
import scipy
from googletrans import Translator
import re
import subprocess
import datetime
# --- One-time environment setup (runs at import time) ---

# Unpack the bundled static ffmpeg binary into the working directory and
# mark it executable — the Space container may not ship ffmpeg on PATH.
ZipFile("ffmpeg.zip").extractall()
st = os.stat('ffmpeg')
os.chmod('ffmpeg', st.st_mode | stat.S_IEXEC)

# Mapping of human-readable language names -> Google Translate language codes,
# used both for the UI dropdown and for translation requests.
with open('google_lang_codes.json', 'r') as f:
    google_lang_codes = json.load(f)

# Shared clients reused across requests: Google Translate wrapper and the
# Whisper large-v2 model on GPU with float16 inference.
translator = Translator()
whisper_model = WhisperModel("large-v2", device="cuda", compute_type="float16")

# Debug: show the working directory and its contents at startup.
print("cwd", os.getcwd())
print(os.listdir())
def _format_srt_timestamp(seconds):
    """Convert a float time in seconds to an SRT timestamp 'HH:MM:SS,mmm'."""
    hours = int(seconds // 3600)
    minutes = int((seconds % 3600) // 60)
    secs = int(seconds % 60)
    millis = int((seconds - int(seconds)) * 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"


def _write_srt(segments, transcript_file):
    """Write Whisper segments to *transcript_file* in SRT format (UTF-8)."""
    with open(transcript_file, "w", encoding="utf-8") as f:
        for counter, segment in enumerate(segments, start=1):
            f.write(f"{counter}\n")
            f.write(f"{_format_srt_timestamp(segment.start)} --> {_format_srt_timestamp(segment.end)}\n")
            f.write(f"{segment.text}\n\n")


def _translate_srt(transcript_file, target_language_code):
    """Translate the cue-text lines of an SRT file in place.

    Index lines and timestamp lines (containing '-->') are kept verbatim;
    only non-empty subtitle text lines are sent to Google Translate.
    """
    translated_lines = []
    with open(transcript_file, "r", encoding="utf-8") as f:
        for line in f:
            if line.strip().isnumeric() or "-->" in line:
                translated_lines.append(line)
            elif line.strip() != "":
                translated_text = translator.translate(line.strip(), dest=target_language_code).text
                translated_lines.append(translated_text + "\n")
            else:
                translated_lines.append("\n")
    with open(transcript_file, "w", encoding="utf-8") as f:
        f.writelines(translated_lines)


def process_video(Video, target_language, translate_video):
    """Transcribe *Video* with Whisper, optionally translate the subtitles,
    and burn them into a new MP4.

    Args:
        Video: filesystem path of the uploaded video (Gradio ``gr.Video`` value).
        target_language: human-readable language name, mapped to a Google
            Translate code via ``google_lang_codes`` (falls back to "en").
        translate_video: when True, the SRT is translated before embedding;
            when False the original-language transcription is embedded.

    Returns:
        Path of the subtitled output video (``<uuid>_output_video.mp4``).
    """
    current_path = os.getcwd()
    print("Iniciando process_video")
    common_uuid = uuid.uuid4()

    print("Checking FFmpeg availability...")
    run(["ffmpeg", "-version"])

    # Extract the audio track as WAV for Whisper.  "-y" prevents ffmpeg from
    # blocking on an interactive overwrite prompt (no stdin is attached).
    audio_file = f"{common_uuid}.wav"
    run(["ffmpeg", "-y", "-i", Video, audio_file])

    transcript_file = f"{current_path}/{common_uuid}.srt"
    target_language_code = google_lang_codes.get(target_language, "en")

    try:
        print("Iniciando transcrição com Whisper")
        segments, _ = whisper_model.transcribe(audio_file, beam_size=5)
        _write_srt(list(segments), transcript_file)

        if translate_video:
            _translate_srt(transcript_file, target_language_code)

        output_video = f"{common_uuid}_output_video.mp4"

        # Debugging: validate the SRT before handing it to ffmpeg.
        print("Validating FFmpeg command for subtitle embedding...")
        print(f"Translated SRT file: {transcript_file}")
        with open(transcript_file, 'r', encoding='utf-8') as f:
            print(f"First few lines of translated SRT: {f.readlines()[:10]}")
        if os.path.exists(transcript_file):
            print(f"{transcript_file} exists.")
        else:
            print(f"{transcript_file} does not exist.")

        try:
            # Japanese needs a CJK-capable font; everything else uses a broad
            # Unicode font.  Styling is yellow text with a dark box.
            if target_language_code == 'ja':
                subtitle_style = "FontName=Noto Sans CJK JP,PrimaryColour=&H00FFFF,OutlineColour=&H000000,BackColour=&H80000000,BorderStyle=3,Outline=2,Shadow=1"
            else:
                subtitle_style = "FontName=Arial Unicode MS,PrimaryColour=&H00FFFF,OutlineColour=&H000000,BackColour=&H80000000,BorderStyle=3,Outline=2,Shadow=1"
            result = subprocess.run(
                ["ffmpeg", "-y", "-i", Video, "-vf",
                 f"subtitles={transcript_file}:force_style='{subtitle_style}'",
                 output_video],
                capture_output=True, text=True)
            if result.returncode == 0:
                print("FFmpeg executado com sucesso.")
            else:
                print(f"FFmpeg falhou com o código de retorno {result.returncode}.")
                print("Stdout:", result.stdout)
                print("Stderr:", result.stderr)
        except Exception as e:
            print(f"Ocorreu uma exceção: {e}")

        print("process_video concluído com sucesso")
        print(f"Returning output video path: {output_video}")
        return output_video
    finally:
        # Always remove intermediates, even when transcription or ffmpeg
        # raises — previously they leaked on any failure path.
        for path in (audio_file, transcript_file):
            if os.path.exists(path):
                os.unlink(path)
# Gradio UI: a video upload plus language/translate controls wired to
# process_video; the single output is the subtitled video path it returns.
iface = gr.Interface(
    fn=process_video,
    inputs=[
        gr.Video(),
        # Dropdown keys come from google_lang_codes.json; default English.
        gr.Dropdown(choices=list(google_lang_codes.keys()), label="Target Language for Translation", value="English"),
        gr.Checkbox(label="Translate Video", value=True, info="Check to translate the video to the selected language. Uncheck for transcription only."),
    ],
    outputs=[
        gr.Video(),
        #gr.FileExplorer()
    ],
    live=False,  # run only on explicit submit, not on every input change
    title="VIDEO TRANSCRIPTION AND TRANSLATION",
    description="""This tool was developed by [@artificialguybr](https://twitter.com/artificialguybr) using entirely open-source tools. Special thanks to Hugging Face for the GPU support. Test the [Video Dubbing](https://huggingface.co/spaces/artificialguybr/video-dubbing) space!""",
    # NOTE(review): newer Gradio releases expect allow_flagging="never"
    # rather than a boolean — confirm against the pinned gradio version.
    allow_flagging=False
)
# Render the Interface inside a Blocks container so usage notes can be
# appended below it, then launch with a serialized job queue.
with gr.Blocks() as demo:
    iface.render()
    # Fixed grammar in the user-facing note ("15 minute" -> "15 minutes",
    # "translate of subtitles" -> "translation of subtitles",
    # "a alpha" -> "an alpha").
    gr.Markdown("""
    **Note:**
    - Video limit is 15 minutes. It will do the transcription and translation of subtitles.
    - The tool uses open-source models for all models. It's an alpha version.
    """)

# One job at a time (a single GPU model instance), queue capped at 15.
# NOTE(review): concurrency_count was removed in Gradio 4 — confirm the
# pinned gradio version still supports this keyword.
demo.queue(concurrency_count=1, max_size=15)
demo.launch()