File size: 4,236 Bytes
f86362b
 
 
 
 
 
 
 
 
 
 
 
 
 
7abe680
f86362b
 
 
7abe680
f86362b
7abe680
f86362b
0452b90
f86362b
 
 
7abe680
f86362b
 
 
 
 
 
 
 
 
 
0c670e4
f86362b
 
 
 
 
 
 
 
 
 
7abe680
f86362b
7abe680
f86362b
e69dac4
f86362b
 
7abe680
f86362b
 
 
 
 
 
 
 
7abe680
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f86362b
 
 
 
 
 
 
 
 
 
7abe680
 
 
 
f86362b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7abe680
 
 
f86362b
7abe680
f86362b
7abe680
f86362b
 
 
 
 
 
 
 
7abe680
 
 
f86362b
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
import streamlit as st
import whisper
from pytube import YouTube
from pydub import AudioSegment
import pandas as pd
import anthropic
from anthropic import Anthropic, HUMAN_PROMPT, AI_PROMPT
import io
from elevenlabs import generate, set_api_key
import subprocess
from moviepy.video.io.ffmpeg_tools import ffmpeg_extract_subclip
import os
from elevenlabs import set_api_key

# Configure the ElevenLabs API key once at startup from Streamlit secrets.
set_api_key(st.secrets["xi_api_key"])


def shorten_audio(filename, duration_seconds=60, cut_filename="cut_audio.mp4"):
    """Trim an audio file to its opening seconds and export the clip.

    Args:
        filename: Path to the source audio file (any format ffmpeg/pydub reads).
        duration_seconds: Length of the clip to keep from the start (default 60,
            matching the 60-second video cut in combine_video).
        cut_filename: Output path for the trimmed clip (default "cut_audio.mp4").

    Returns:
        The path of the exported trimmed audio file.
    """
    audio = AudioSegment.from_file(filename)
    # pydub slicing is in milliseconds.
    cut_audio = audio[: duration_seconds * 1000]
    cut_audio.export(cut_filename, format="mp4")
    return cut_filename


def generate_translation(original_text, destination_language):
    """Translate a video transcript into *destination_language* via Claude.

    Args:
        original_text: The transcript text to translate.
        destination_language: Target language name (e.g. "French").

    Returns:
        The model's completion text (the translation).
    """
    # Named `client` so we don't shadow the imported `anthropic` module.
    client = Anthropic(api_key=st.secrets["anthropic"])

    prompt = (
        f"{HUMAN_PROMPT} Please translate this video transcript into {destination_language}. You will get "
        f"to the translation directly after I prompted 'the translation:'"
        f"{AI_PROMPT} Understood, I will get to the translation without any opening lines."
        f"{HUMAN_PROMPT} Great! this is the transcript: {original_text}; the translation:"
    )

    resp = client.completions.create(
        # Legacy completions prompts must end with AI_PROMPT so Claude replies.
        prompt=f"{prompt} {AI_PROMPT}",
        model="claude-2",
        stop_sequences=[HUMAN_PROMPT],
        max_tokens_to_sample=900,
    )

    print(resp.completion)

    return resp.completion


def generate_dubs(text):
    """Synthesize *text* to speech with ElevenLabs and save it as an MP3.

    Args:
        text: The (translated) text to vocalize.

    Returns:
        Path of the written MP3 file ("output.mp3").
    """
    filename = "output.mp3"

    # The API key is already configured at module import via set_api_key;
    # no need to set it again on every call.

    audio = generate(text=text, voice="Liam Evans", model="eleven_multilingual_v1")

    # `generate` returns raw MP3 bytes — write them straight to disk instead
    # of decoding and re-encoding through pydub (lossy and slower).
    with open(filename, "wb") as f:
        f.write(audio)

    return filename


def combine_video(video_filename, audio_filename):
    """Mux the first 60 seconds of a video with a replacement audio track.

    Args:
        video_filename: Path to the downloaded (video-only) source file.
        audio_filename: Path to the dubbed audio track.

    Returns:
        Path of the combined output file ("output.mp4").

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    # Keep the video cut in sync with the 60-second audio clip.
    ffmpeg_extract_subclip(video_filename, 0, 60, targetname="cut_video.mp4")

    output_filename = "output.mp4"

    command = [
        "ffmpeg",
        "-y",  # overwrite a previous output without prompting
        "-i",
        "cut_video.mp4",
        "-i",
        audio_filename,
        # Explicit stream mapping: video from input 0, audio from input 1,
        # rather than relying on ffmpeg's implicit best-stream selection.
        "-map",
        "0:v:0",
        "-map",
        "1:a:0",
        "-c:v",
        "copy",  # no re-encode of the video stream
        "-c:a",
        "aac",
        "-shortest",  # stop at the shorter of the two streams
        output_filename,
    ]

    # check=True surfaces ffmpeg failures instead of silently returning a
    # path that may never have been written.
    subprocess.run(command, check=True)

    return output_filename


# --- Streamlit UI: collect a YouTube link and a target language ---
st.title("AutoDubs 📺🎵")

link = st.text_input("Link to Youtube Video", key="link")

language = st.selectbox(
    "Translate to",
    ("French", "German", "Hindi", "Italian", "Polish", "Portuguese", "Spanish"),
)

# Pipeline on click: download audio -> trim to 60s -> transcribe ->
# translate -> synthesize dubs -> download video -> mux audio + video.
if st.button("Transcribe!"):
    print(f"downloading from link: {link}")

    # Whisper "base" model: small download, adequate for a short clip.
    model = whisper.load_model("base")

    yt = YouTube(link)

    if yt is not None:
        st.subheader(yt.title)
        st.image(yt.thumbnail_url)
        # Placeholder caption, overwritten once the download finishes.
        audio_name = st.caption("Downloading audio stream...")
        audio_streams = yt.streams.filter(only_audio=True)
        filename = audio_streams.first().download()
        print("filename: ", filename)

        if filename:
            audio_name.caption(filename)
            # Only the first 60 seconds are transcribed (see shorten_audio).
            cut_filename = shorten_audio(filename)
            transcription = model.transcribe(cut_filename)
            print(transcription)

            if transcription:
                # Show per-segment timings alongside the recognized text.
                df = pd.DataFrame(
                    transcription["segments"], columns=["start", "end", "text"]
                )
                st.dataframe(df)
                print(transcription["text"])
                dubbing_caption = st.caption("Dubbing...")
                translation = generate_translation(transcription["text"], language)
                # NOTE(review): this rebinds dubbing_caption to a *new* caption
                # widget rather than updating the first one — looks unintended;
                # confirm whether two captions on screen is the desired UX.
                dubbing_caption = st.caption("Begin dubbing...")
                dubs_audio = generate_dubs(translation)
                dubbing_caption.caption("Dubs generated! combining with the video...")

                # Video-only stream: its (absent) audio is replaced by the dubs.
                video_streams = yt.streams.filter(only_video=True)
                video_filename = video_streams.first().download()

                if video_filename:
                    dubbing_caption.caption(
                        "Video downloaded! combining the video and the dubs..."
                    )
                    output_filename = combine_video(video_filename, dubs_audio)

                    # combine_video returns the path; render only if it exists.
                    if os.path.exists(output_filename):
                        dubbing_caption.caption("Video successfully dubbed! Enjoy! 😀")
                        st.video(output_filename)