Spaces:
Runtime error
Runtime error
Commit
·
613b97e
1
Parent(s):
480e8fe
- .gitattributes +0 -31
- .streamlit/config.toml +8 -0
- app.py → 01_🎥_Input_YouTube_Link.py +14 -15
- LICENSE +21 -0
- README.md +21 -12
- pages +0 -0
- pages/02_📼_Upload_Video_File.py +230 -0
- pages/03_🔊_Upload_Audio_File.py +205 -0
.gitattributes
DELETED
|
@@ -1,31 +0,0 @@
|
|
| 1 |
-
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
-
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
-
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
-
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
-
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 6 |
-
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
-
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 8 |
-
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 9 |
-
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 10 |
-
*.model filter=lfs diff=lfs merge=lfs -text
|
| 11 |
-
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 12 |
-
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 13 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 14 |
-
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 15 |
-
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 16 |
-
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 17 |
-
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 18 |
-
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 19 |
-
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
-
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 21 |
-
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 22 |
-
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 23 |
-
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 24 |
-
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 25 |
-
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 26 |
-
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 27 |
-
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 28 |
-
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 29 |
-
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 30 |
-
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 31 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
.streamlit/config.toml
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[theme]
|
| 2 |
+
primaryColor="#F63366"
|
| 3 |
+
backgroundColor="#FFFFFF"
|
| 4 |
+
secondaryBackgroundColor="#F0F2F6"
|
| 5 |
+
textColor="#262730"
|
| 6 |
+
font="sans serif"
|
| 7 |
+
[server]
|
| 8 |
+
maxUploadSize=1028
|
app.py → 01_🎥_Input_YouTube_Link.py
RENAMED
|
@@ -75,7 +75,7 @@ def change_model(current_size, size):
|
|
| 75 |
@st.cache(allow_output_mutation=True)
|
| 76 |
def inference(link, loaded_model, task):
|
| 77 |
yt = YouTube(link)
|
| 78 |
-
path = yt.streams.filter(only_audio=True)[0].download(filename="audio.
|
| 79 |
if task == "Transcribe":
|
| 80 |
options = dict(task="transcribe", best_of=5)
|
| 81 |
results = loaded_model.transcribe(path, **options)
|
|
@@ -153,18 +153,18 @@ def main():
|
|
| 153 |
with open(os.path.join(os.getcwd(), "transcript.txt"), "rb") as f:
|
| 154 |
datatxt = f.read()
|
| 155 |
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
datavtt = f.read()
|
| 162 |
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
|
|
|
| 168 |
with col5:
|
| 169 |
st.download_button(label="Download Transcript (.txt)",
|
| 170 |
data=datatxt,
|
|
@@ -184,7 +184,7 @@ def main():
|
|
| 184 |
|
| 185 |
with col4:
|
| 186 |
with st.spinner("Generating Subtitled Video"):
|
| 187 |
-
video_with_subs = generate_subtitled_video(video, "audio.
|
| 188 |
st.video(video_with_subs)
|
| 189 |
st.balloons()
|
| 190 |
with col8:
|
|
@@ -212,7 +212,6 @@ def main():
|
|
| 212 |
with open(os.path.join(os.getcwd(), "transcript.txt"), "rb") as f:
|
| 213 |
datatxt = f.read()
|
| 214 |
|
| 215 |
-
|
| 216 |
with open("transcript.vtt", "w+",encoding='utf8') as f:
|
| 217 |
f.writelines(results[1])
|
| 218 |
f.close()
|
|
@@ -243,7 +242,7 @@ def main():
|
|
| 243 |
|
| 244 |
with col4:
|
| 245 |
with st.spinner("Generating Subtitled Video"):
|
| 246 |
-
video_with_subs = generate_subtitled_video(video, "audio.
|
| 247 |
st.video(video_with_subs)
|
| 248 |
st.balloons()
|
| 249 |
with col8:
|
|
|
|
| 75 |
@st.cache(allow_output_mutation=True)
|
| 76 |
def inference(link, loaded_model, task):
|
| 77 |
yt = YouTube(link)
|
| 78 |
+
path = yt.streams.filter(only_audio=True)[0].download(filename="audio.mp3")
|
| 79 |
if task == "Transcribe":
|
| 80 |
options = dict(task="transcribe", best_of=5)
|
| 81 |
results = loaded_model.transcribe(path, **options)
|
|
|
|
| 153 |
with open(os.path.join(os.getcwd(), "transcript.txt"), "rb") as f:
|
| 154 |
datatxt = f.read()
|
| 155 |
|
| 156 |
+
with open("transcript.vtt", "w+",encoding='utf8') as f:
|
| 157 |
+
f.writelines(results[1])
|
| 158 |
+
f.close()
|
| 159 |
+
with open(os.path.join(os.getcwd(), "transcript.vtt"), "rb") as f:
|
| 160 |
+
datavtt = f.read()
|
|
|
|
| 161 |
|
| 162 |
+
with open("transcript.srt", "w+",encoding='utf8') as f:
|
| 163 |
+
f.writelines(results[2])
|
| 164 |
+
f.close()
|
| 165 |
+
with open(os.path.join(os.getcwd(), "transcript.srt"), "rb") as f:
|
| 166 |
+
datasrt = f.read()
|
| 167 |
+
|
| 168 |
with col5:
|
| 169 |
st.download_button(label="Download Transcript (.txt)",
|
| 170 |
data=datatxt,
|
|
|
|
| 184 |
|
| 185 |
with col4:
|
| 186 |
with st.spinner("Generating Subtitled Video"):
|
| 187 |
+
video_with_subs = generate_subtitled_video(video, "audio.mp3", "transcript.srt")
|
| 188 |
st.video(video_with_subs)
|
| 189 |
st.balloons()
|
| 190 |
with col8:
|
|
|
|
| 212 |
with open(os.path.join(os.getcwd(), "transcript.txt"), "rb") as f:
|
| 213 |
datatxt = f.read()
|
| 214 |
|
|
|
|
| 215 |
with open("transcript.vtt", "w+",encoding='utf8') as f:
|
| 216 |
f.writelines(results[1])
|
| 217 |
f.close()
|
|
|
|
| 242 |
|
| 243 |
with col4:
|
| 244 |
with st.spinner("Generating Subtitled Video"):
|
| 245 |
+
video_with_subs = generate_subtitled_video(video, "audio.mp3", "transcript.srt")
|
| 246 |
st.video(video_with_subs)
|
| 247 |
st.balloons()
|
| 248 |
with col8:
|
LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
MIT License
|
| 2 |
+
|
| 3 |
+
Copyright (c) 2022 Batuhan Yılmaz
|
| 4 |
+
|
| 5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 6 |
+
of this software and associated documentation files (the "Software"), to deal
|
| 7 |
+
in the Software without restriction, including without limitation the rights
|
| 8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 9 |
+
copies of the Software, and to permit persons to whom the Software is
|
| 10 |
+
furnished to do so, subject to the following conditions:
|
| 11 |
+
|
| 12 |
+
The above copyright notice and this permission notice shall be included in all
|
| 13 |
+
copies or substantial portions of the Software.
|
| 14 |
+
|
| 15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
| 18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
| 19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
| 20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
| 21 |
+
SOFTWARE.
|
README.md
CHANGED
|
@@ -1,13 +1,22 @@
|
|
| 1 |
-
|
| 2 |
-
title: Auto Subtitled Video Generator
|
| 3 |
-
emoji: 📚
|
| 4 |
-
colorFrom: yellow
|
| 5 |
-
colorTo: blue
|
| 6 |
-
sdk: streamlit
|
| 7 |
-
sdk_version: 1.10.0
|
| 8 |
-
app_file: app.py
|
| 9 |
-
pinned: false
|
| 10 |
-
license: mit
|
| 11 |
-
---
|
| 12 |
|
| 13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
## Auto-Subtitled-Video-Generator
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
+

|
| 4 |
+

|
| 5 |
+

|
| 6 |
+
|
| 7 |
+
#### About this project
|
| 8 |
+
- This project is an automatic speech recognition application that takes a YouTube video link or a video file as input to generate a video with subtitles.
|
| 9 |
+
- You can also upload an audio file to generate a transcript as .txt, .vtt, .srt files.
|
| 10 |
+
- The application performs 2 tasks:
|
| 11 |
+
- Detects the language, transcribes the input video in its original language.
|
| 12 |
+
- Detects the language, translates it into English and then transcribes.
|
| 13 |
+
- Downloaded the video of the input link using [pytube](https://github.com/pytube/pytube).
|
| 14 |
+
- Generated a transcription of the video using the [OpenAI Whisper](https://openai.com/blog/whisper) model.
|
| 15 |
+
- Saved the transcriptions as .txt, .vtt and .srt files.
|
| 16 |
+
- Generated a subtitled version of the input video using [ffmpeg](https://github.com/FFmpeg).
|
| 17 |
+
- Displayed the original video and the subtitled video side by side.
|
| 18 |
+
- Built a multipage web app using [Streamlit](https://streamlit.io) and hosted on [HuggingFace Spaces](https://huggingface.co/spaces).
|
| 19 |
+
- You can download the generated .txt, .vtt, .srt files and the subtitled video.
|
| 20 |
+
- You can use the app via this [link](https://huggingface.co/spaces/BatuhanYilmaz/Auto-Subtitled-Video-Generator).
|
| 21 |
+
|
| 22 |
+

|
pages
DELETED
|
File without changes
|
pages/02_📼_Upload_Video_File.py
ADDED
|
@@ -0,0 +1,230 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import whisper
|
| 2 |
+
import streamlit as st
|
| 3 |
+
from streamlit_lottie import st_lottie
|
| 4 |
+
from utils import write_vtt, write_srt
|
| 5 |
+
import ffmpeg
|
| 6 |
+
import requests
|
| 7 |
+
from typing import Iterator
|
| 8 |
+
from io import StringIO
|
| 9 |
+
import numpy as np
|
| 10 |
+
import pathlib
|
| 11 |
+
import os
|
| 12 |
+
|
| 13 |
+
st.set_page_config(page_title="Auto Subtitled Video Generator", page_icon=":movie_camera:", layout="wide")
|
| 14 |
+
|
| 15 |
+
# Define a function that we can use to load lottie files from a link.
|
| 16 |
+
@st.cache(allow_output_mutation=True)
def load_lottieurl(url: str):
    """Download a Lottie animation definition.

    Args:
        url: Address of a Lottie JSON document.

    Returns:
        The decoded JSON payload, or ``None`` when the server answers with a
        status other than 200.

    Raises:
        requests.RequestException: If the request fails or times out. The
            error is deliberately not swallowed: the result of this function
            is cached, and caching a ``None`` produced by a transient network
            failure would hide the animation until the cache is cleared.
    """
    # Without a timeout, requests waits forever on a stalled host and the
    # whole page render hangs with it.
    r = requests.get(url, timeout=10)
    if r.status_code != 200:
        return None
    return r.json()
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
APP_DIR = pathlib.Path(__file__).parent.absolute()
|
| 25 |
+
|
| 26 |
+
LOCAL_DIR = APP_DIR / "local"
|
| 27 |
+
LOCAL_DIR.mkdir(exist_ok=True)
|
| 28 |
+
save_dir = LOCAL_DIR / "output"
|
| 29 |
+
save_dir.mkdir(exist_ok=True)
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
loaded_model = whisper.load_model("base")
|
| 33 |
+
current_size = "None"
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
col1, col2 = st.columns([1, 3])
|
| 37 |
+
with col1:
|
| 38 |
+
lottie = load_lottieurl("https://assets1.lottiefiles.com/packages/lf20_HjK9Ol.json")
|
| 39 |
+
st_lottie(lottie, speed=1, height=250, width=250)
|
| 40 |
+
|
| 41 |
+
with col2:
|
| 42 |
+
st.write("""
|
| 43 |
+
## Auto Subtitled Video Generator
|
| 44 |
+
##### Upload a video file and get a video with subtitles.
|
| 45 |
+
###### ➠ If you want to transcribe the video in its original language, select the task as "Transcribe"
|
| 46 |
+
###### ➠ If you want to translate the subtitles to English, select the task as "Translate"
|
| 47 |
+
###### I recommend starting with the base model and then experimenting with the larger models, the small and medium models often work well. """)
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
@st.cache(allow_output_mutation=True)
def change_model(current_size, size):
    """Return the Whisper model for ``size``.

    Args:
        current_size: Identifier of the model size considered already loaded.
        size: Model size requested by the user.

    Returns:
        The model object produced by ``whisper.load_model(size)``.

    Raises:
        Exception: If ``size`` equals ``current_size``.
    """
    if current_size == size:
        raise Exception("Model size is the same as the current size.")
    return whisper.load_model(size)
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
@st.cache(allow_output_mutation=True)
def inferecence(loaded_model, uploaded_file, task):
    """Transcribe or translate the audio track of an uploaded video.

    The upload is written to ``save_dir/input.mp4``, its audio is extracted
    with ffmpeg to a mono 16 kHz PCM file at ``save_dir/output.wav``, and that
    file is passed to ``loaded_model.transcribe``.

    Args:
        loaded_model: Object exposing ``transcribe(path, **options)`` that
            returns a mapping with ``"text"``, ``"segments"`` and
            ``"language"`` entries.
        uploaded_file: Binary file-like object providing ``read()``.
        task: Either ``"Transcribe"`` or ``"Translate"``.

    Returns:
        A 4-tuple ``(text, vtt, srt, language)``.

    Raises:
        ValueError: If ``task`` is not supported or no file was uploaded.
    """
    # Validate the cheap arguments first so that no file is written and no
    # ffmpeg process is started for a request that cannot succeed.
    whisper_tasks = {"Transcribe": "transcribe", "Translate": "translate"}
    if task not in whisper_tasks:
        raise ValueError("Task not supported")
    if uploaded_file is None:
        raise ValueError("No file uploaded")

    video_path = f"{save_dir}/input.mp4"
    audio_path = f"{save_dir}/output.wav"

    with open(video_path, "wb") as f:
        f.write(uploaded_file.read())

    audio = ffmpeg.input(video_path)
    audio = ffmpeg.output(audio, audio_path, acodec="pcm_s16le", ac=1, ar="16k")
    ffmpeg.run(audio, overwrite_output=True)

    # Both tasks share the same pipeline; only the task name differs.
    options = dict(task=whisper_tasks[task], best_of=5)
    results = loaded_model.transcribe(audio_path, **options)
    segments = results["segments"]
    vtt = getSubs(segments, "vtt", 80)
    srt = getSubs(segments, "srt", 80)
    return results["text"], vtt, srt, results["language"]
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
def getSubs(segments: Iterator[dict], format: str, maxLineWidth: int) -> str:
    """Render transcription segments as subtitle text.

    Args:
        segments: Segment records forwarded unchanged to the subtitle writer.
        format: Subtitle flavour, ``"vtt"`` or ``"srt"``.
        maxLineWidth: Line width forwarded to the subtitle writer.

    Returns:
        Everything the selected writer wrote, as a single string.

    Raises:
        ValueError: If ``format`` is neither ``"vtt"`` nor ``"srt"``.
            ``ValueError`` is a subclass of ``Exception``, so callers that
            catch ``Exception`` keep working.
    """
    segmentStream = StringIO()

    if format == 'vtt':
        write_vtt(segments, file=segmentStream, maxLineWidth=maxLineWidth)
    elif format == 'srt':
        write_srt(segments, file=segmentStream, maxLineWidth=maxLineWidth)
    else:
        raise ValueError("Unknown format " + format)

    # getvalue() returns the whole buffer regardless of the stream position.
    return segmentStream.getvalue()
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
def generate_subtitled_video(video, audio, transcript):
    """Burn subtitles into a video and return the rendered result.

    ffmpeg applies the ``subtitles`` filter to ``video``, joins it with
    ``audio`` and writes ``final.mp4`` in the working directory, overwriting
    any existing file.

    Args:
        video: Path of the source video.
        audio: Path of the audio track to attach.
        transcript: Path of the subtitle file given to the ``subtitles``
            filter.

    Returns:
        A binary file-like object positioned at the start of the rendered
        video. It is held in memory, so no operating-system file handle is
        left open after the call.
    """
    # Local import: the module itself only imports StringIO from io.
    from io import BytesIO

    video_file = ffmpeg.input(video)
    audio_file = ffmpeg.input(audio)
    subtitled = video_file.filter("subtitles", transcript)
    ffmpeg.concat(subtitled, audio_file, v=1, a=1).output("final.mp4").run(quiet=True, overwrite_output=True)

    # The original returned the open file object, which nothing ever closed.
    with open("final.mp4", "rb") as rendered:
        return BytesIO(rendered.read())
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
def _save_transcript(name, text):
    """Write ``text`` to ``name`` in the working directory and return its bytes."""
    path = os.path.join(os.getcwd(), name)
    with open(path, "w+", encoding='utf8') as f:
        f.write(text)
    with open(path, "rb") as f:
        return f.read()


def _show_video_results(results, input_file, filename):
    """Render the source video, transcript downloads and the subtitled video."""
    col3, col4 = st.columns(2)
    col5, col6, col7, col8 = st.columns(4)
    col9, col10 = st.columns(2)

    with col3:
        st.video(input_file)

    # The files are also left on disk: transcript.srt is read by ffmpeg below.
    datatxt = _save_transcript("transcript.txt", results[0])
    datavtt = _save_transcript("transcript.vtt", results[1])
    datasrt = _save_transcript("transcript.srt", results[2])

    with col5:
        st.download_button(label="Download Transcript (.txt)",
                           data=datatxt,
                           file_name="transcript.txt")
    with col6:
        st.download_button(label="Download Transcript (.vtt)",
                           data=datavtt,
                           file_name="transcript.vtt")
    with col7:
        st.download_button(label="Download Transcript (.srt)",
                           data=datasrt,
                           file_name="transcript.srt")
    with col9:
        st.success("You can download the transcript in .srt format, edit it (if you need to) and upload it to YouTube to create subtitles for your video.")
    with col10:
        st.info("Streamlit refreshes after the download button is clicked. The data is cached so you can download the transcript again without having to transcribe the video again.")

    with col4:
        with st.spinner("Generating Subtitled Video"):
            video_with_subs = generate_subtitled_video(f"{save_dir}/input.mp4", f"{save_dir}/output.wav", "transcript.srt")
        st.video(video_with_subs)
        st.snow()
    with col8:
        st.download_button(label="Download Video with Subtitles",
                           data=video_with_subs,
                           file_name=f"{filename}_with_subs.mp4")


def main():
    """Drive the page: pick a model, take an upload, run the selected task.

    The Transcribe and Translate tasks differ only in the button label and
    the task name passed to ``inferecence``, so both share one rendering
    path.
    """
    size = st.selectbox("Select Model Size (The larger the model, the more accurate the transcription will be, but it will take longer)", ["tiny", "base", "small", "medium", "large"], index=1)
    loaded_model = change_model(current_size, size)
    st.write(f"Model is {'multilingual' if loaded_model.is_multilingual else 'English-only'} "
             f"and has {sum(np.prod(p.shape) for p in loaded_model.parameters()):,} parameters.")
    input_file = st.file_uploader("File", type=["mp4", "avi", "mov", "mkv"])

    # splitext strips the extension whatever its length; slicing off four
    # characters only worked for three-letter extensions.
    filename = os.path.splitext(input_file.name)[0] if input_file is not None else None

    task = st.selectbox("Select Task", ["Transcribe", "Translate"], index=0)
    button_labels = {"Transcribe": "Transcribe", "Translate": "Translate to English"}
    if task not in button_labels:
        st.error("Please select a task.")
        return

    if st.button(button_labels[task]):
        # Pressing the button before uploading used to crash on None.read().
        if input_file is None:
            st.error("Please upload a video file first.")
            return
        results = inferecence(loaded_model, input_file, task)
        _show_video_results(results, input_file, filename)
|
| 226 |
+
|
| 227 |
+
|
| 228 |
+
if __name__ == "__main__":
|
| 229 |
+
main()
|
| 230 |
+
st.markdown("###### Made with :heart: by [@BatuhanYılmaz](https://twitter.com/batuhan3326) [](https://www.buymeacoffee.com/batuhanylmz)")
|
pages/03_🔊_Upload_Audio_File.py
ADDED
|
@@ -0,0 +1,205 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import whisper
|
| 2 |
+
import streamlit as st
|
| 3 |
+
from streamlit_lottie import st_lottie
|
| 4 |
+
from utils import write_vtt, write_srt
|
| 5 |
+
import ffmpeg
|
| 6 |
+
import requests
|
| 7 |
+
from typing import Iterator
|
| 8 |
+
from io import StringIO
|
| 9 |
+
import numpy as np
|
| 10 |
+
import pathlib
|
| 11 |
+
import os
|
| 12 |
+
|
| 13 |
+
st.set_page_config(page_title="Auto Transcriber", page_icon="🔊", layout="wide")
|
| 14 |
+
|
| 15 |
+
# Define a function that we can use to load lottie files from a link.
|
| 16 |
+
@st.cache(allow_output_mutation=True)
def load_lottieurl(url: str):
    """Download a Lottie animation definition.

    Args:
        url: Address of a Lottie JSON document.

    Returns:
        The decoded JSON payload, or ``None`` when the server answers with a
        status other than 200.

    Raises:
        requests.RequestException: If the request fails or times out. The
            error is deliberately not swallowed: the result of this function
            is cached, and caching a ``None`` produced by a transient network
            failure would hide the animation until the cache is cleared.
    """
    # Without a timeout, requests waits forever on a stalled host and the
    # whole page render hangs with it.
    r = requests.get(url, timeout=10)
    if r.status_code != 200:
        return None
    return r.json()
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
APP_DIR = pathlib.Path(__file__).parent.absolute()
|
| 25 |
+
|
| 26 |
+
LOCAL_DIR = APP_DIR / "local_audio"
|
| 27 |
+
LOCAL_DIR.mkdir(exist_ok=True)
|
| 28 |
+
save_dir = LOCAL_DIR / "output"
|
| 29 |
+
save_dir.mkdir(exist_ok=True)
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
col1, col2 = st.columns([1, 3])
|
| 33 |
+
with col1:
|
| 34 |
+
lottie = load_lottieurl("https://assets1.lottiefiles.com/packages/lf20_1xbk4d2v.json")
|
| 35 |
+
st_lottie(lottie, speed=1, height=250, width=250)
|
| 36 |
+
|
| 37 |
+
with col2:
|
| 38 |
+
st.write("""
|
| 39 |
+
## Auto Transcriber
|
| 40 |
+
##### Input an audio file and get a transcript.
|
| 41 |
+
###### ➠ If you want to transcribe the audio in its original language, select the task as "Transcribe"
|
| 42 |
+
###### ➠ If you want to translate the transcription to English, select the task as "Translate"
|
| 43 |
+
###### I recommend starting with the base model and then experimenting with the larger models, the small and medium models often work well. """)
|
| 44 |
+
|
| 45 |
+
loaded_model = whisper.load_model("base")
|
| 46 |
+
current_size = "None"
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
@st.cache(allow_output_mutation=True)
def change_model(current_size, size):
    """Return the Whisper model for ``size``.

    Args:
        current_size: Identifier of the model size considered already loaded.
        size: Model size requested by the user.

    Returns:
        The model object produced by ``whisper.load_model(size)``.

    Raises:
        Exception: If ``size`` equals ``current_size``.
    """
    if current_size == size:
        raise Exception("Model size is the same as the current size.")
    return whisper.load_model(size)
|
| 56 |
+
|
| 57 |
+
@st.cache(allow_output_mutation=True)
def inferecence(loaded_model, uploaded_file, task):
    """Transcribe or translate an uploaded audio file.

    The upload is written to ``save_dir/input.mp3``, converted with ffmpeg to
    a mono 16 kHz PCM file at ``save_dir/output.wav``, and that file is
    passed to ``loaded_model.transcribe``.

    Args:
        loaded_model: Object exposing ``transcribe(path, **options)`` that
            returns a mapping with ``"text"``, ``"segments"`` and
            ``"language"`` entries.
        uploaded_file: Binary file-like object providing ``read()``.
        task: Either ``"Transcribe"`` or ``"Translate"``.

    Returns:
        A 4-tuple ``(text, vtt, srt, language)``.

    Raises:
        ValueError: If ``task`` is not supported or no file was uploaded.
    """
    # Validate the cheap arguments first so that no file is written and no
    # ffmpeg process is started for a request that cannot succeed.
    whisper_tasks = {"Transcribe": "transcribe", "Translate": "translate"}
    if task not in whisper_tasks:
        raise ValueError("Task not supported")
    if uploaded_file is None:
        raise ValueError("No file uploaded")

    source_path = f"{save_dir}/input.mp3"
    audio_path = f"{save_dir}/output.wav"

    with open(source_path, "wb") as f:
        f.write(uploaded_file.read())

    audio = ffmpeg.input(source_path)
    audio = ffmpeg.output(audio, audio_path, acodec="pcm_s16le", ac=1, ar="16k")
    ffmpeg.run(audio, overwrite_output=True)

    # Both tasks share the same pipeline; only the task name differs.
    options = dict(task=whisper_tasks[task], best_of=5)
    results = loaded_model.transcribe(audio_path, **options)
    segments = results["segments"]
    vtt = getSubs(segments, "vtt", 80)
    srt = getSubs(segments, "srt", 80)
    return results["text"], vtt, srt, results["language"]
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
def getSubs(segments: Iterator[dict], format: str, maxLineWidth: int) -> str:
    """Render transcription segments as subtitle text.

    Args:
        segments: Segment records forwarded unchanged to the subtitle writer.
        format: Subtitle flavour, ``"vtt"`` or ``"srt"``.
        maxLineWidth: Line width forwarded to the subtitle writer.

    Returns:
        Everything the selected writer wrote, as a single string.

    Raises:
        ValueError: If ``format`` is neither ``"vtt"`` nor ``"srt"``.
            ``ValueError`` is a subclass of ``Exception``, so callers that
            catch ``Exception`` keep working.
    """
    segmentStream = StringIO()

    if format == 'vtt':
        write_vtt(segments, file=segmentStream, maxLineWidth=maxLineWidth)
    elif format == 'srt':
        write_srt(segments, file=segmentStream, maxLineWidth=maxLineWidth)
    else:
        raise ValueError("Unknown format " + format)

    # getvalue() returns the whole buffer regardless of the stream position.
    return segmentStream.getvalue()
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
def _save_transcript(name, text):
    """Write ``text`` to ``name`` in the working directory and return its bytes."""
    path = os.path.join(os.getcwd(), name)
    with open(path, "w+", encoding='utf8') as f:
        f.write(text)
    with open(path, "rb") as f:
        return f.read()


def _show_audio_results(results, input_file):
    """Render the audio player and the transcript download buttons."""
    col3, _ = st.columns(2)
    col5, col6, col7 = st.columns(3)
    col9, col10 = st.columns(2)

    with col3:
        st.audio(input_file)

    datatxt = _save_transcript("transcript.txt", results[0])
    datavtt = _save_transcript("transcript.vtt", results[1])
    datasrt = _save_transcript("transcript.srt", results[2])

    with col5:
        st.download_button(label="Download Transcript (.txt)",
                           data=datatxt,
                           file_name="transcript.txt")
    with col6:
        st.download_button(label="Download Transcript (.vtt)",
                           data=datavtt,
                           file_name="transcript.vtt")
    with col7:
        st.download_button(label="Download Transcript (.srt)",
                           data=datasrt,
                           file_name="transcript.srt")
    with col9:
        st.success("You can download the transcript in .srt format, edit it (if you need to) and upload it to YouTube to create subtitles for your video.")
    with col10:
        st.info("Streamlit refreshes after the download button is clicked. The data is cached so you can download the transcript again without having to transcribe the video again.")


def main():
    """Drive the page: pick a model, take an upload, run the selected task.

    The Transcribe and Translate tasks differ only in the button label and
    the task name passed to ``inferecence``, so both share one rendering
    path.
    """
    size = st.selectbox("Select Model Size (The larger the model, the more accurate the transcription will be, but it will take longer)", ["tiny", "base", "small", "medium", "large"], index=1)
    loaded_model = change_model(current_size, size)
    st.write(f"Model is {'multilingual' if loaded_model.is_multilingual else 'English-only'} "
             f"and has {sum(np.prod(p.shape) for p in loaded_model.parameters()):,} parameters.")
    input_file = st.file_uploader("Upload an audio file", type=["mp3", "wav", "m4a"])

    # splitext strips the extension whatever its length; slicing off four
    # characters only worked for three-letter extensions. The value is not
    # used further on this page and is kept only to mirror the original.
    filename = os.path.splitext(input_file.name)[0] if input_file is not None else None

    task = st.selectbox("Select Task", ["Transcribe", "Translate"], index=0)
    button_labels = {"Transcribe": "Transcribe", "Translate": "Translate to English"}
    if task not in button_labels:
        st.error("Please select a task.")
        return

    if st.button(button_labels[task]):
        # Pressing the button before uploading used to crash on None.read().
        if input_file is None:
            st.error("Please upload an audio file first.")
            return
        results = inferecence(loaded_model, input_file, task)
        _show_audio_results(results, input_file)
|
| 201 |
+
|
| 202 |
+
|
| 203 |
+
if __name__ == "__main__":
|
| 204 |
+
main()
|
| 205 |
+
st.markdown("###### Made with :heart: by [@BatuhanYılmaz](https://twitter.com/batuhan3326) [](https://www.buymeacoffee.com/batuhanylmz)")
|