File size: 6,145 Bytes
8b4831d
 
275e48a
8b4831d
1768850
3219df3
2de3a57
275e48a
a6be743
7848193
e00b158
f7fa4ee
889d885
baa7f8b
9fa54f4
0bc447a
738eaed
 
a1bbd1f
e00b158
5a957c1
 
 
2de3a57
0bc447a
 
8b4831d
0bc447a
3219df3
8b4831d
bbd0c30
 
09459a8
738eaed
d41a72f
098d1fb
275e48a
640aa41
740becd
 
640aa41
275e48a
d41a72f
 
fabf10e
d544473
f00d512
 
 
d41a72f
fabf10e
a0e508d
f00d512
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d41a72f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
640aa41
bc5e2dc
740becd
fabf10e
7117787
fabf10e
9fd2740
fabf10e
 
fdbd781
fabf10e
098d1fb
740becd
ebd5b31
e304087
ebd5b31
e304087
 
740becd
99202f6
740becd
99202f6
f7fa4ee
 
740becd
99202f6
275e48a
f2de2c9
 
698dfc3
f2de2c9
8b4831d
 
 
 
1768850
b8713f0
d41a72f
8b4831d
4684a0c
 
f2de2c9
d899511
1768850
43c56bd
ec0b887
1f8e6c3
8b4831d
1f8e6c3
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
import gradio as gr
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from subprocess import run
from faster_whisper import WhisperModel
import json
import tempfile
import os
import ffmpeg
from zipfile import ZipFile
import stat
import uuid
import subprocess
import torch 
import bitsandbytes
import scipy
from googletrans import Translator
import re
import subprocess
import datetime

# Unpack the bundled ffmpeg binary into the working directory and mark it
# executable so the subprocess calls below can invoke it.
ZipFile("ffmpeg.zip").extractall()
st = os.stat('ffmpeg')
os.chmod('ffmpeg', st.st_mode | stat.S_IEXEC)

# Mapping of human-readable language names -> Google Translate language codes.
with open('google_lang_codes.json', 'r') as f:
    google_lang_codes = json.load(f)

# Module-level singletons shared by every request: one Translator client and
# one Whisper large-v2 model loaded on the GPU in float16.
translator = Translator()
whisper_model = WhisperModel("large-v2", device="cuda", compute_type="float16")

# Startup debug output: confirm working directory and its contents.
print("cwd", os.getcwd())
print(os.listdir())

        
def _srt_timestamp(seconds):
    """Format a time offset in seconds as an SRT timestamp (HH:MM:SS,mmm)."""
    hours = int(seconds // 3600)
    minutes = int((seconds % 3600) // 60)
    whole_seconds = int(seconds % 60)
    milliseconds = int((seconds - int(seconds)) * 1000)
    return f"{hours:02d}:{minutes:02d}:{whole_seconds:02d},{milliseconds:03d}"


def process_video(Video, target_language, translate_video):
    """Transcribe a video with Whisper, optionally translate the subtitles,
    and burn them into a new video file with ffmpeg.

    Args:
        Video: Path to the input video file (supplied by the Gradio Video input).
        target_language: Human-readable language name; resolved through
            google_lang_codes, falling back to "en" when unknown.
        translate_video: When True, the subtitle text lines are translated with
            Google Translate before embedding; when False the raw transcription
            is embedded unchanged.

    Returns:
        Path of the subtitled output video. NOTE(review): ffmpeg failures are
        only logged, so the returned path may not exist on disk in that case.
    """
    current_path = os.getcwd()
    print("Starting process_video")
    # Unique basename so concurrent/successive requests don't clobber files.
    common_uuid = uuid.uuid4()
    print("Checking FFmpeg availability...")
    run(["ffmpeg", "-version"])
    # Extract the audio track to WAV for Whisper.
    audio_file = f"{common_uuid}.wav"
    run(["ffmpeg", "-i", Video, audio_file])
    transcript_file = f"{current_path}/{common_uuid}.srt"

    # Transcription with Whisper.
    target_language_code = google_lang_codes.get(target_language, "en")
    print("Starting transcription with Whisper")
    segments, _ = whisper_model.transcribe(audio_file, beam_size=5)
    segments = list(segments)

    # Write the transcription out in SRT format (index, timing line, text).
    with open(transcript_file, "w+", encoding="utf-8") as f:
        for counter, segment in enumerate(segments, start=1):
            f.write(f"{counter}\n")
            f.write(f"{_srt_timestamp(segment.start)} --> {_srt_timestamp(segment.end)}\n")
            f.write(f"{segment.text}\n\n")

        # Check if translation is needed
        if translate_video:
            # Translate only the text lines, preserving the numeric index
            # lines and "-->" timing lines exactly as written.
            translated_lines = []
            f.seek(0)  # Re-read the SRT we just wrote.
            for line in f:
                if line.strip().isnumeric() or "-->" in line:
                    translated_lines.append(line)
                elif line.strip() != "":
                    translated_text = translator.translate(line.strip(), dest=target_language_code).text
                    translated_lines.append(translated_text + "\n")
                else:
                    translated_lines.append("\n")

            f.seek(0)  # Rewind and truncate before rewriting in place.
            f.truncate()
            f.writelines(translated_lines)
    output_video = f"{common_uuid}_output_video.mp4"
    # Debugging: Validate FFmpeg command for subtitle embedding
    print("Validating FFmpeg command for subtitle embedding...")
    print(f"Translated SRT file: {transcript_file}")

    with open(transcript_file, 'r', encoding='utf-8') as f:
        print(f"First few lines of translated SRT: {f.readlines()[:10]}")
    if os.path.exists(transcript_file):
        print(f"{transcript_file} exists.")
    else:
        print(f"{transcript_file} does not exist.")
    try:
        # Japanese needs a CJK-capable font; other languages use a broad
        # Unicode font. Yellow text, black outline, translucent background.
        if target_language_code == 'ja':
            subtitle_style = "FontName=Noto Sans CJK JP,PrimaryColour=&H00FFFF,OutlineColour=&H000000,BackColour=&H80000000,BorderStyle=3,Outline=2,Shadow=1"
        else:
            subtitle_style = "FontName=Arial Unicode MS,PrimaryColour=&H00FFFF,OutlineColour=&H000000,BackColour=&H80000000,BorderStyle=3,Outline=2,Shadow=1"
        result = subprocess.run(["ffmpeg", "-i", Video, "-vf", f"subtitles={transcript_file}:force_style='{subtitle_style}'", output_video], capture_output=True, text=True)
        if result.returncode == 0:
            print("FFmpeg executado com sucesso.")
        else:
            print(f"FFmpeg falhou com o código de retorno {result.returncode}.")
            print("Stdout:", result.stdout)
            print("Stderr:", result.stderr)
    except Exception as e:
        print(f"Ocorreu uma exceção: {e}")
    print("process_video finished")
    # Remove intermediates; only the output video is handed back to Gradio.
    os.unlink(audio_file)
    os.unlink(transcript_file)
    print(f"Returning output video path: {output_video}")
    return output_video

# Gradio interface: wires process_video to a video upload, a target-language
# dropdown (keys of google_lang_codes, defaulting to English), and a checkbox
# that toggles between translate-and-subtitle and transcription-only.
iface = gr.Interface(
    fn=process_video,
    inputs=[
        gr.Video(),
        gr.Dropdown(choices=list(google_lang_codes.keys()), label="Target Language for Translation", value="English"),
        gr.Checkbox(label="Translate Video", value=True, info="Check to translate the video to the selected language. Uncheck for transcription only."),
    ],
    outputs=[
        gr.Video(),
        #gr.FileExplorer()
    ],
    live=False,  # only run when the user submits, not on every input change
    title="VIDEO TRANSCRIPTION AND TRANSLATION",
    description="""This tool was developed by [@artificialguybr](https://twitter.com/artificialguybr) using entirely open-source tools. Special thanks to Hugging Face for the GPU support. Test the [Video Dubbing](https://huggingface.co/spaces/artificialguybr/video-dubbing) space!""",
    allow_flagging=False
)
# Render the interface inside a Blocks container so the usage notes can be
# appended below it, then serve with a queue limited to one job at a time
# and at most 15 waiting requests.
with gr.Blocks() as demo:
    iface.render()
    gr.Markdown("""
    **Note:**
    - Video limit is 15 minute. It will do the transcription and translate of subtitles.
    - The tool uses open-source models for all models. It's a alpha version.
    """)
demo.queue(concurrency_count=1, max_size=15)
demo.launch()