# AutoDub — app.py
# Streamlit app: download a YouTube video, transcribe it with Whisper,
# translate the transcript with Claude, synthesize dubs with ElevenLabs,
# and mux the dubbed audio back onto the video with ffmpeg.
import streamlit as st
import whisper
from pytube import YouTube
from pydub import AudioSegment
import pandas as pd
import anthropic
from anthropic import Anthropic, HUMAN_PROMPT, AI_PROMPT
import io
from elevenlabs import generate, set_api_key
import subprocess
from moviepy.video.io.ffmpeg_tools import ffmpeg_extract_subclip
import os
# Configure the ElevenLabs API key once at startup from Streamlit secrets.
# (set_api_key is already imported above; the duplicate import was removed.)
set_api_key(st.secrets["xi_api_key"])
def shorten_audio(filename, duration_s=60):
    """Trim an audio file to its first `duration_s` seconds.

    Parameters:
        filename: path to the source audio file (any format ffmpeg/pydub reads).
        duration_s: length of the clip to keep, in seconds. Defaults to 60,
            matching the 60-second video cut done in combine_video.

    Returns:
        Path of the exported clip ("cut_audio.mp4").
    """
    cut_filename = "cut_audio.mp4"
    audio = AudioSegment.from_file(filename)
    # pydub slices in milliseconds, hence the * 1000.
    cut_audio = audio[: duration_s * 1000]
    cut_audio.export(cut_filename, format="mp4")
    return cut_filename
def generate_translation(original_text, destination_language):
    """Translate a transcript with the Anthropic Claude completions API.

    Parameters:
        original_text: transcript text to translate.
        destination_language: target language name (e.g. "French").

    Returns:
        The raw completion text from the model (the translation).
    """
    # Named `client` so it does not shadow the imported `anthropic` module.
    client = Anthropic(api_key=st.secrets["anthropic"])
    # Few-shot style priming so the model answers with the translation only,
    # without any preamble.
    prompt = (
        f"{HUMAN_PROMPT} Please translate this video transcript into {destination_language}. You will get "
        f"to the translation directly after I prompted 'the translation:'"
        f"{AI_PROMPT} Understood, I will get to the translation without any opening lines."
        f"{HUMAN_PROMPT} Great! this is the transcript: {original_text}; the translation:"
    )
    resp = client.completions.create(
        prompt=f"{prompt} {AI_PROMPT}",
        model="claude-2",
        # Stop before the model starts a new human turn.
        stop_sequences=[HUMAN_PROMPT],
        max_tokens_to_sample=900,
    )
    print(resp.completion)
    return resp.completion
def generate_dubs(text):
    """Synthesize `text` to speech via ElevenLabs and write it to disk.

    Parameters:
        text: the (translated) transcript to voice.

    Returns:
        Path of the saved dub track ("output.mp3").
    """
    out_path = "output.mp3"
    set_api_key(st.secrets["xi_api_key"])
    raw_audio = generate(text=text, voice="Liam Evans", model="eleven_multilingual_v1")
    # Wrap the returned bytes so pydub can decode and re-export them as mp3.
    segment = AudioSegment.from_file(io.BytesIO(raw_audio), format="mp3")
    segment.export(out_path, format="mp3")
    return out_path
def combine_video(video_filename, audio_filename):
    """Mux the dubbed audio track onto the first 60 seconds of the video.

    Parameters:
        video_filename: path to the downloaded (video-only) stream.
        audio_filename: path to the generated dub audio (mp3).

    Returns:
        Path of the muxed file ("output.mp4").

    Raises:
        subprocess.CalledProcessError: if ffmpeg exits with a non-zero status.
    """
    # Keep only the first minute of video, matching the 60-second audio cut.
    ffmpeg_extract_subclip(video_filename, 0, 60, targetname="cut_video.mp4")
    output_filename = "output.mp4"
    command = [
        "ffmpeg",
        "-y",
        "-i",
        "cut_video.mp4",
        "-i",
        audio_filename,
        # Explicitly take video from input 0 and audio from input 1 instead of
        # relying on ffmpeg's automatic "best stream" selection.
        "-map",
        "0:v:0",
        "-map",
        "1:a:0",
        "-c:v",
        "copy",
        "-c:a",
        "aac",
        # Stop at the shorter stream so a dub longer than 60 s doesn't
        # extend the output past the video.
        "-shortest",
        output_filename,
    ]
    # check=True surfaces ffmpeg failures instead of silently producing nothing.
    subprocess.run(command, check=True)
    return output_filename
st.title("AutoDubs πŸ“ΊπŸŽ΅")
link = st.text_input("Link to Youtube Video", key="link")
language = st.selectbox(
"Translate to",
("French", "German", "Hindi", "Italian", "Polish", "Portuguese", "Spanish"),
)
# Main pipeline: download -> transcribe -> translate -> dub -> mux -> play.
if st.button("Transcribe!"):
    print(f"downloading from link: {link}")
    # Whisper "base" model; reloaded on every button press.
    model = whisper.load_model("base")
    yt = YouTube(link)
    if yt is not None:
        st.subheader(yt.title)
        st.image(yt.thumbnail_url)
        audio_name = st.caption("Downloading audio stream...")
        # Audio-only stream keeps the download small; first() takes the first match.
        audio_streams = yt.streams.filter(only_audio=True)
        filename = audio_streams.first().download()
        print("filename: ", filename)
        if filename:
            audio_name.caption(filename)
            # Only the first 60 seconds are processed (see shorten_audio).
            cut_filename = shorten_audio(filename)
            transcription = model.transcribe(cut_filename)
            print(transcription)
            if transcription:
                # Show per-segment timings and text to the user.
                df = pd.DataFrame(
                    transcription["segments"], columns=["start", "end", "text"]
                )
                st.dataframe(df)
                print(transcription["text"])
                dubbing_caption = st.caption("Dubbing...")
                translation = generate_translation(transcription["text"], language)
                # NOTE(review): rebinds dubbing_caption to a NEW caption widget;
                # the earlier "Dubbing..." caption stays on screen.
                dubbing_caption = st.caption("Begin dubbing...")
                dubs_audio = generate_dubs(translation)
                dubbing_caption.caption("Dubs generated! combining with the video...")
                # Video-only stream: its (absent) audio is replaced by the dub.
                video_streams = yt.streams.filter(only_video=True)
                video_filename = video_streams.first().download()
                if video_filename:
                    dubbing_caption.caption(
                        "Video downloaded! combining the video and the dubs..."
                    )
                    output_filename = combine_video(video_filename, dubs_audio)
                    # combine_video writes output.mp4; only show it if muxing
                    # actually produced the file.
                    if os.path.exists(output_filename):
                        dubbing_caption.caption("Video successfully dubbed! Enjoy! 😀")
                        st.video(output_filename)