Update app.py
Browse files
app.py
CHANGED
|
@@ -1,8 +1,14 @@
|
|
| 1 |
import os
|
| 2 |
|
|
|
|
|
|
|
| 3 |
|
| 4 |
os.system('git clone https://github.com/ggerganov/whisper.cpp.git')
|
| 5 |
os.system('make -C ./whisper.cpp')
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
os.system('bash ./whisper.cpp/models/download-ggml-model.sh small')
|
| 7 |
os.system('bash ./whisper.cpp/models/download-ggml-model.sh base')
|
| 8 |
os.system('bash ./whisper.cpp/models/download-ggml-model.sh medium')
|
|
@@ -15,17 +21,14 @@ os.system('bash ./whisper.cpp/models/download-ggml-model.sh base.en')
|
|
| 15 |
|
| 16 |
|
| 17 |
|
| 18 |
-
import os
|
| 19 |
-
|
| 20 |
-
|
| 21 |
import gradio as gr
|
| 22 |
-
import os
|
| 23 |
from pathlib import Path
|
| 24 |
import pysrt
|
| 25 |
import pandas as pd
|
| 26 |
import re
|
| 27 |
import time
|
| 28 |
import os
|
|
|
|
| 29 |
|
| 30 |
from pytube import YouTube
|
| 31 |
from transformers import MarianMTModel, MarianTokenizer
|
|
@@ -33,32 +36,7 @@ from transformers import MarianMTModel, MarianTokenizer
|
|
| 33 |
import psutil
|
| 34 |
num_cores = psutil.cpu_count()
|
| 35 |
os.environ["OMP_NUM_THREADS"] = f"{num_cores}"
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
import torch
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
finnish_marian_nmt_model = "Helsinki-NLP/opus-mt-tc-big-en-fi"
|
| 42 |
-
finnish_tokenizer_marian = MarianTokenizer.from_pretrained(finnish_marian_nmt_model, max_length=40)
|
| 43 |
-
finnish_tokenizer_marian.max_new_tokens = 30
|
| 44 |
-
finnish_translation_model = MarianMTModel.from_pretrained(finnish_marian_nmt_model)
|
| 45 |
-
|
| 46 |
-
swedish_marian_nmt_model = "Helsinki-NLP/opus-mt-en-sv"
|
| 47 |
-
swedish_tokenizer_marian = MarianTokenizer.from_pretrained(swedish_marian_nmt_model, max_length=40)
|
| 48 |
-
swedish_tokenizer_marian.max_new_tokens = 30
|
| 49 |
-
swedish_translation_model = MarianMTModel.from_pretrained(swedish_marian_nmt_model)
|
| 50 |
-
|
| 51 |
-
danish_marian_nmt_model = "Helsinki-NLP/opus-mt-en-da"
|
| 52 |
-
danish_tokenizer_marian = MarianTokenizer.from_pretrained(danish_marian_nmt_model, max_length=40)
|
| 53 |
-
danish_tokenizer_marian.max_new_tokens = 30
|
| 54 |
-
danish_translation_model = MarianMTModel.from_pretrained(danish_marian_nmt_model)
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
translation_models = {
|
| 58 |
-
"Finnish": [finnish_tokenizer_marian, finnish_translation_model],
|
| 59 |
-
"Swedish": [swedish_tokenizer_marian, swedish_translation_model],
|
| 60 |
-
"Danish": [danish_tokenizer_marian, danish_translation_model]
|
| 61 |
-
}
|
| 62 |
|
| 63 |
whisper_models = ["base", "small", "medium", "base.en"]
|
| 64 |
|
|
@@ -80,8 +58,34 @@ source_languages = {
|
|
| 80 |
"Let the model analyze": "Let the model analyze"
|
| 81 |
}
|
| 82 |
|
| 83 |
-
|
| 84 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 85 |
}
|
| 86 |
|
| 87 |
|
|
@@ -90,7 +94,7 @@ transcribe_options = dict(beam_size=3, best_of=3, without_timestamps=False)
|
|
| 90 |
|
| 91 |
|
| 92 |
source_language_list = [key[0] for key in source_languages.items()]
|
| 93 |
-
source_language_list_2 = [key[0] for key in
|
| 94 |
translation_models_list = [key[0] for key in translation_models.items()]
|
| 95 |
|
| 96 |
|
|
@@ -190,27 +194,32 @@ def speech_to_text(video_file_path, selected_source_lang, whisper_model):
|
|
| 190 |
|
| 191 |
|
| 192 |
def translate_transcriptions(df, selected_translation_lang_2, selected_source_lang_2):
|
| 193 |
-
print("IN TRANSLATE")
|
| 194 |
-
|
| 195 |
if selected_translation_lang_2 is None:
|
| 196 |
-
selected_translation_lang_2 = '
|
| 197 |
df.reset_index(inplace=True)
|
| 198 |
|
| 199 |
-
print("Getting models")
|
| 200 |
-
|
| 201 |
-
tokenizer_marian = translation_models.get(selected_translation_lang_2)[0]
|
| 202 |
-
translation_model = translation_models.get(selected_translation_lang_2)[1]
|
| 203 |
-
|
| 204 |
print("start_translation")
|
| 205 |
translations = []
|
| 206 |
-
|
| 207 |
if selected_translation_lang_2 != selected_source_lang_2:
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 214 |
else:
|
| 215 |
df['translation'] = df['text']
|
| 216 |
print("translations done")
|
|
|
|
| 1 |
import os

# Fetch and build ggerganov/whisper.cpp — the fast C++ whisper implementation
# this app shells out to for transcription.
os.system('git clone https://github.com/ggerganov/whisper.cpp.git')
os.system('make -C ./whisper.cpp')

# Pull the ggml model weights the app offers. Models are downloaded on the
# fly, so more sizes can be added here later (e.g. after finetuning events).
for _model_name in ('small', 'base', 'medium'):
    os.system(f'bash ./whisper.cpp/models/download-ggml-model.sh {_model_name}')
|
|
|
|
| 21 |
|
| 22 |
|
| 23 |
|
|
|
|
|
|
|
|
|
|
| 24 |
# Application dependencies.
import json
import os
import re
import time
from pathlib import Path

import requests  # BUG FIX: used by the DeepL calls below but was never imported
import gradio as gr
import pandas as pd
import pysrt
from pytube import YouTube
from transformers import MarianMTModel, MarianTokenizer

import psutil

# Let OpenMP (used by whisper.cpp) saturate every available core.
num_cores = psutil.cpu_count()
os.environ["OMP_NUM_THREADS"] = f"{num_cores}"

# Auth header for the DeepL REST API.
# NOTE(review): DeepL expects 'Authorization: DeepL-Auth-Key <key>' — confirm the
# DeepL_API_KEY env var already contains the 'DeepL-Auth-Key ' prefix, otherwise
# prepend it here.
headers = {'Authorization': os.environ['DeepL_API_KEY']}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
|
| 41 |
whisper_models = ["base", "small", "medium", "base.en"]
|
| 42 |
|
|
|
|
| 58 |
"Let the model analyze": "Let the model analyze"
|
| 59 |
}
|
| 60 |
|
| 61 |
+
# Human-readable language name -> DeepL `target_lang` code.
DeepL_language_codes_for_translation = {
    "Bulgarian": "BG",
    "Czech": "CS",
    "Danish": "DA",
    "German": "DE",
    "Greek": "EL",
    "English": "EN",
    "Spanish": "ES",
    "Estonian": "ET",
    "Finnish": "FI",
    "French": "FR",
    "Hungarian": "HU",
    "Indonesian": "ID",
    "Italian": "IT",
    "Japanese": "JA",
    "Lithuanian": "LT",
    "Latvian": "LV",
    "Dutch": "NL",
    "Polish": "PL",
    "Portuguese": "PT",
    "Romanian": "RO",
    "Russian": "RU",
    "Slovak": "SK",
    "Slovenian": "SL",
    "Swedish": "SV",
    "Turkish": "TR",
    "Ukrainian": "UK",
    "Chinese": "ZH"
}


# Dropdown choices for the UI.
source_language_list = [key[0] for key in source_languages.items()]
source_language_list_2 = list(DeepL_language_codes_for_translation)
# BUG FIX: the Marian `translation_models` dict was removed in this change, so
# `translation_models.items()` raised NameError at import. Translation targets
# are now the DeepL-supported languages.
translation_models_list = list(DeepL_language_codes_for_translation)
|
| 99 |
|
| 100 |
|
|
|
|
| 194 |
|
| 195 |
|
| 196 |
def translate_transcriptions(df, selected_translation_lang_2, selected_source_lang_2):
    """Translate df['text'] into the selected target language via the DeepL API.

    Mutates ``df`` in place, adding a ``'translation'`` column. When the target
    language equals the source language the text is copied verbatim and no API
    call is made.

    Args:
        df: DataFrame with a 'text' column of transcribed sentences.
        selected_translation_lang_2: target language name (key of
            DeepL_language_codes_for_translation); defaults to 'English' if None.
        selected_source_lang_2: language the transcript is already in.
    """
    if selected_translation_lang_2 is None:
        selected_translation_lang_2 = 'English'
    df.reset_index(inplace=True)

    print("start_translation")
    translations = []

    if selected_translation_lang_2 != selected_source_lang_2:
        # Join all sentences with newlines so a single API call translates the
        # whole transcript. (BUG FIX: original iterated `init__df`, which is
        # undefined — the parameter is `df`.)
        text_combined = '\n'.join(df['text'])

        data = {
            'text': text_combined,
            # BUG FIX: original sent 'tag_spitting', which DeepL ignores; the
            # real parameter is 'tag_handling'.
            'tag_handling': 'xml',
            # BUG FIX: target_lang was looked up in the undefined name
            # `DeepL_language_codes` AND keyed by the *source* language; the
            # target must be the selected translation language.
            'target_lang': DeepL_language_codes_for_translation.get(selected_translation_lang_2),
        }
        response = requests.post('https://api-free.deepl.com/v2/translate',
                                 headers=headers, data=data)

        # BUG FIX: original computed .split('\n') but discarded the result and
        # assigned the raw response dict to the column.
        translated_sentences = json.loads(response.text)
        translations = translated_sentences['translations'][0]['text'].split('\n')
        df['translation'] = translations
    else:
        # Same language — no translation needed.
        df['translation'] = df['text']
    print("translations done")
|