Spaces:

RASMUS
/

Whisper-youtube-crosslingual-subtitles

Running

App Files Files Community

RASMUS committed on Dec 20, 2022

Commit

3e2d289

1 Parent(s): 178aea2

Create app.py

Browse files

Files changed (1) hide show

app.py +378 -0

app.py ADDED Viewed

	@@ -0,0 +1,378 @@

+import os
+os.system('git clone https://github.com/ggerganov/whisper.cpp.git')
+os.system('make -C ./whisper.cpp')
+os.system('bash ./whisper.cpp/models/download-ggml-model.sh small')
+os.system('bash ./whisper.cpp/models/download-ggml-model.sh base')
+os.system('bash ./whisper.cpp/models/download-ggml-model.sh medium')
+os.system('bash ./whisper.cpp/models/download-ggml-model.sh base.en')
+#os.system('./whisper.cpp/main -m whisper.cpp/models/ggml-base.en.bin -f whisper.cpp/samples/jfk.wav')
+#print("SEURAAVAKSI SMALL TESTI")
+#os.system('./whisper.cpp/main -m whisper.cpp/models/ggml-small.bin -f whisper.cpp/samples/jfk.wav')
+#print("MOI")
+import os
+import gradio as gr
+import os
+from pathlib import Path
+import pysrt
+import pandas as pd
+import re
+import time
+import os
+from pytube import YouTube
+from transformers import MarianMTModel, MarianTokenizer
+import psutil
+num_cores = psutil.cpu_count()
+os.environ["OMP_NUM_THREADS"] = f"{num_cores}"
+import torch
+finnish_marian_nmt_model = "Helsinki-NLP/opus-mt-tc-big-en-fi"
+finnish_tokenizer_marian = MarianTokenizer.from_pretrained(finnish_marian_nmt_model, max_length=40)
+finnish_tokenizer_marian.max_new_tokens = 30
+finnish_translation_model = MarianMTModel.from_pretrained(finnish_marian_nmt_model)
+swedish_marian_nmt_model = "Helsinki-NLP/opus-mt-en-sv"
+swedish_tokenizer_marian = MarianTokenizer.from_pretrained(swedish_marian_nmt_model, max_length=40)
+swedish_tokenizer_marian.max_new_tokens = 30
+swedish_translation_model = MarianMTModel.from_pretrained(swedish_marian_nmt_model)
+danish_marian_nmt_model = "Helsinki-NLP/opus-mt-en-da"
+danish_tokenizer_marian = MarianTokenizer.from_pretrained(danish_marian_nmt_model, max_length=40)
+danish_tokenizer_marian.max_new_tokens = 30
+danish_translation_model = MarianMTModel.from_pretrained(danish_marian_nmt_model)
+translation_models = {
+"Finnish": [finnish_tokenizer_marian, finnish_translation_model],
+"Swedish": [swedish_tokenizer_marian, swedish_translation_model],
+"Danish": [danish_tokenizer_marian, danish_translation_model]
+}
+whisper_models = ["base", "small", "medium", "base.en"]
+source_languages = {
+"Arabic": "ar",
+"Asturian ":"st",
+"Belarusian":"be",
+"Bulgarian":"bg",
+"Czech":"cs",
+"Danish":"da",
+"German":"de",
+"Greeek":"el",
+"English":"en",
+"Estonian":"et",
+"Finnish":"fi",
+"Swedish": "sv",
+"Spanish":"es",
+"Let the model analyze": "Let the model analyze"
+}
+source_languages_2 = {
+"English":"en",
+}
+transcribe_options = dict(beam_size=3, best_of=3, without_timestamps=False)
+source_language_list = [key[0] for key in source_languages.items()]
+source_language_list_2 = [key[0] for key in source_languages_2.items()]
+translation_models_list = [key[0] for key in translation_models.items()]
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+print("DEVICE IS: ")
+print(device)
+videos_out_path = Path("./videos_out")
+videos_out_path.mkdir(parents=True, exist_ok=True)
+def get_youtube(video_url):
+    yt = YouTube(video_url)
+    abs_video_path = yt.streams.filter(progressive=True, file_extension='mp4').order_by('resolution').desc().first().download()
+    print("LADATATTU POLKUUN")
+    print(abs_video_path)
+    return abs_video_path
+def speech_to_text(video_file_path, selected_source_lang, whisper_model):
+    """
+    # Youtube with translated subtitles using OpenAI Whisper and Opus-MT models.
+    # Currently supports only English audio
+    This space allows you to:
+    1. Download youtube video with a given url
+    2. Watch it in the first video component
+    3. Run automatic speech recognition on the video using Whisper
+    4. Translate the recognized transcriptions to Finnish, Swedish, Danish
+    5. Burn the translations to the original video and watch the video in the 2nd video component
+    Speech Recognition is based on OpenAI Whisper https://github.com/openai/whisper
+    """
+    if(video_file_path == None):
+        raise ValueError("Error no video input")
+    print(video_file_path)
+    try:
+        _,file_ending = os.path.splitext(f'{video_file_path}')
+        print(f'file enging is {file_ending}')
+        print("starting conversion to wav")
+        os.system(f'ffmpeg -i "{video_file_path}" -ar 16000 -ac 1 -c:a pcm_s16le "{video_file_path.replace(file_ending, ".wav")}"')
+        print("conversion to wav ready")
+        print("starting whisper c++")
+        srt_path = str(video_file_path.replace(file_ending, ".wav")) + ".srt"
+        os.system(f'rm -f {srt_path}')
+        if selected_source_lang == "Let the model analyze":
+            os.system(f'./whisper.cpp/main "{video_file_path.replace(file_ending, ".wav")}" -t 4 -m ./whisper.cpp/models/ggml-{whisper_model}.bin -osrt')
+        else:
+            os.system(f'./whisper.cpp/main "{video_file_path.replace(file_ending, ".wav")}" -t 4 -l {source_languages.get(selected_source_lang)} -m ./whisper.cpp/models/ggml-{whisper_model}.bin -osrt')
+        print("starting whisper done with whisper")
+    except Exception as e:
+        raise RuntimeError("Error converting video to audio")
+    try:
+        df = pd.DataFrame(columns = ['start','end','text'])
+        srt_path = str(video_file_path.replace(file_ending, ".wav")) + ".srt"
+        subs = pysrt.open(srt_path)
+        objects = []
+        for sub in subs:
+            start_hours = str(str(sub.start.hours) + "00")[0:2] if len(str(sub.start.hours)) == 2 else str("0" + str(sub.start.hours) + "00")[0:2]
+            end_hours = str(str(sub.end.hours) + "00")[0:2] if len(str(sub.end.hours)) == 2 else str("0" + str(sub.end.hours) + "00")[0:2]
+            start_minutes = str(str(sub.start.minutes) + "00")[0:2] if len(str(sub.start.minutes)) == 2 else str("0" + str(sub.start.minutes) + "00")[0:2]
+            end_minutes = str(str(sub.end.minutes) + "00")[0:2] if len(str(sub.end.minutes)) == 2 else str("0" + str(sub.end.minutes) + "00")[0:2]
+            start_seconds = str(str(sub.start.seconds) + "00")[0:2] if len(str(sub.start.seconds)) == 2 else str("0" + str(sub.start.seconds) + "00")[0:2]
+            end_seconds = str(str(sub.end.seconds) + "00")[0:2] if len(str(sub.end.seconds)) == 2 else str("0" + str(sub.end.seconds) + "00")[0:2]
+            start_millis = str(str(sub.start.milliseconds) + "000")[0:3]
+            end_millis = str(str(sub.end.milliseconds) + "000")[0:3]
+            objects.append([sub.text, f'{start_hours}:{start_minutes}:{start_seconds}.{start_millis}', f'{end_hours}:{end_minutes}:{end_seconds}.{end_millis}'])
+        for object in objects:
+            srt_to_df = {
+            'start': [object[1]],
+            'end': [object[2]],
+            'text': [object[0]]
+            }
+            df = pd.concat([df, pd.DataFrame(srt_to_df)])
+        return df
+    except Exception as e:
+        raise RuntimeError("Error Running inference with local model", e)
+def translate_transcriptions(df, selected_translation_lang_2, selected_source_lang_2):
+    print("IN TRANSLATE")
+    if selected_translation_lang_2 is None:
+            selected_translation_lang_2 = 'Finnish'
+    df.reset_index(inplace=True)
+    print("Getting models")
+    tokenizer_marian = translation_models.get(selected_translation_lang_2)[0]
+    translation_model = translation_models.get(selected_translation_lang_2)[1]
+    print("start_translation")
+    translations = []
+    print(df.head())
+    if selected_translation_lang_2 != selected_source_lang_2:
+        print("TRASNLATING")
+        sentences = list(df['text'])
+        sentences = [stringi.replace('[','').replace(']','') for stringi in sentences]
+        translations = translation_model.generate(**tokenizer_marian(sentences, return_tensors="pt", padding=True, truncation=True))
+        print(translations)
+        df['translation'] = translations
+    else:
+        df['translation'] = df['text']
+    print("translations done")
+    return (df)
+def create_srt_and_burn(df, video_in):
+    print("Starting creation of video wit srt")
+    print("video in path is:")
+    print(video_in)
+    with open('testi.srt','w', encoding="utf-8") as file:
+        for i in range(len(df)):
+            file.write(str(i+1))
+            file.write('\n')
+            start = df.iloc[i]['start']
+            file.write(f"{start}")
+            stop = df.iloc[i]['end']
+            file.write(' --> ')
+            file.write(f"{stop}")
+            file.write('\n')
+            file.writelines(df.iloc[i]['translation'])
+            if int(i) != len(df)-1:
+                file.write('\n\n')
+    print("SRT DONE")
+    try:
+        file1 = open('./testi.srt', 'r', encoding="utf-8")
+        Lines = file1.readlines()
+        count = 0
+        # Strips the newline character
+        for line in Lines:
+            count += 1
+            print("{}".format(line))
+        print(type(video_in))
+        print(video_in)
+        video_out = video_in.replace('.mp4', '_out.mp4')
+        print("video_out_path")
+        print(video_out)
+        command = 'ffmpeg -i "{}" -y -vf subtitles=./testi.srt "{}"'.format(video_in, video_out)
+        print(command)
+        os.system(command)
+        return video_out
+    except Exception as e:
+        print(e)
+        return video_out
+# ---- Gradio Layout -----
+# Components are created here at module level and placed into the page later
+# with .render() inside the `with demo:` block.
+video_in = gr.Video(label="Video file", mirror_webcam=False)
+youtube_url_in = gr.Textbox(label="Youtube url", lines=1, interactive=True)
+video_out = gr.Video(label="Video Out", mirror_webcam=False)
+# Empty frames that give the two tables their initial column headers.
+df_init = pd.DataFrame(columns=['start','end','text'])
+df_init_2 = pd.DataFrame(columns=['start','end','text','translation'])
+# NOTE(review): selected_translation_lang is never rendered below; only
+# selected_translation_lang_2 is placed in the page.
+# NOTE(review): the default value "English" is not one of the choices
+# (translation_models_list holds Finnish, Swedish, Danish).
+selected_translation_lang = gr.Dropdown(choices=translation_models_list, type="value", value="English", label="In which language you want the transcriptions?", interactive=True)
+selected_source_lang = gr.Dropdown(choices=source_language_list, type="value", value="Let the model analyze", label="Spoken language in video", interactive=True)
+selected_source_lang_2 = gr.Dropdown(choices=source_language_list_2, type="value", value="English", label="Spoken language in video", interactive=True)
+selected_translation_lang_2 = gr.Dropdown(choices=translation_models_list, type="value", value="English", label="In which language you want the transcriptions?", interactive=True)
+selected_whisper_model = gr.Dropdown(choices=whisper_models, type="value", value="base", label="Selected Whisper model", interactive=True)
+transcription_df = gr.DataFrame(value=df_init,label="Transcription dataframe", row_count=(0, "dynamic"), max_rows = 10, wrap=True, overflow_row_behaviour='paginate')
+transcription_and_translation_df = gr.DataFrame(value=df_init_2,label="Transcription and translation dataframe", max_rows = 10, wrap=True, overflow_row_behaviour='paginate')
+demo = gr.Blocks(css='''
+#cut_btn, #reset_btn { align-self:stretch; }
+#\\31 3 { max-width: 540px; }
+.output-markdown {max-width: 65ch !important;}
+''')
+demo.encrypt = False
+with demo:
+    # Not referenced anywhere else in this file.
+    transcription_var = gr.Variable()
+    # Intro row: what the space does (left) and example URLs (right).
+    with gr.Row():
+        with gr.Column():
+            gr.Markdown('''
+            ### This space allows you to:
+            ##### 1. Download youtube video with a given URL
+            ##### 2. Watch it in the first video component
+            ##### 3. Run automatic speech recognition on the video using Whisper (Please remember to select translation language)
+            ##### 4. Translate the recognized transcriptions to Finnish, Swedish, Danish
+            ##### 5. Burn the translations to the original video and watch the video in the 2nd video component
+            ''')
+        with gr.Column():
+            gr.Markdown('''
+            ### 1. Insert Youtube URL below (Some examples below which I suggest to use for first tests)
+            ##### 1. https://www.youtube.com/watch?v=nlMuHtV82q8&ab_channel=NothingforSale24
+            ##### 2. https://www.youtube.com/watch?v=JzPfMbG1vrE&ab_channel=ExplainerVideosByLauren
+            ##### 3. https://www.youtube.com/watch?v=S68vvV0kod8&ab_channel=Pearl-CohnTelevision
+            ''')
+    # Step 1: get_youtube downloads the video and its returned path fills video_in.
+    with gr.Row():
+        with gr.Column():
+            youtube_url_in.render()
+            download_youtube_btn = gr.Button("Step 1. Download Youtube video")
+            download_youtube_btn.click(get_youtube, [youtube_url_in], [
+                video_in])
+            # Debug output; runs once while the layout is built, not per click.
+            print(video_in)
+    # Step 2: speech_to_text transcribes video_in into transcription_df.
+    with gr.Row():
+        with gr.Column():
+            video_in.render()
+            with gr.Column():
+                gr.Markdown('''
+                ##### Here you can start the transcription and translation process.
+                ##### Be aware that processing will last for a while (35 second video took around 20 seconds in my testing and might fail for longer videos)
+                ''')
+            selected_source_lang.render()
+            selected_whisper_model.render()
+            transcribe_btn = gr.Button("Step 2. Transcribe audio")
+            transcribe_btn.click(speech_to_text, [video_in, selected_source_lang, selected_whisper_model], transcription_df)
+    with gr.Row():
+        gr.Markdown('''
+        ##### Here you will get transcription  output
+        ##### ''')
+    with gr.Row():
+        with gr.Column():
+            transcription_df.render()
+    # Step 3: translate_transcriptions fills transcription_and_translation_df.
+    with gr.Row():
+        with gr.Column():
+            gr.Markdown('''
+            ##### Here you will get translated transcriptions.
+            ##### Please remember to select Spoken Language and wanted translation language
+            ##### ''')
+            selected_source_lang_2.render()
+            selected_translation_lang_2.render()
+            translate_transcriptions_button = gr.Button("Step 3. Translate transcription")
+            translate_transcriptions_button.click(translate_transcriptions, [transcription_df, selected_translation_lang_2, selected_source_lang_2], transcription_and_translation_df)
+            transcription_and_translation_df.render()
+    # Step 4: create_srt_and_burn writes the SRT file and produces video_out.
+    with gr.Row():
+        with gr.Column():
+            gr.Markdown('''
+            ##### Now press the Step 4. Button to create output video with translated transcriptions
+            ##### ''')
+            translate_and_make_srt_btn = gr.Button("Step 4. Create and burn srt to video")
+            # Debug output; runs once while the layout is built, not per click.
+            print(video_in)
+            translate_and_make_srt_btn.click(create_srt_and_burn, [transcription_and_translation_df,video_in], [
+                video_out])
+            video_out.render()
+# Start the Gradio server as soon as the module is executed.
+demo.launch()