File size: 2,990 Bytes
d219730
62e2507
9e51fe9
2974e01
e6ed917
2974e01
e6ed917
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d219730
 
 
62e2507
d219730
 
 
62e2507
d219730
62e2507
 
 
 
 
e6ed917
62e2507
 
 
 
 
 
 
e6ed917
62e2507
 
 
 
 
 
 
 
 
e6ed917
62e2507
 
 
 
 
e6ed917
62e2507
e6ed917
 
 
 
 
62e2507
 
e6ed917
 
62e2507
e6ed917
 
62e2507
93476c0
 
 
e6ed917
93476c0
 
 
e6ed917
93476c0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
from transformers import pipeline
from gtts import gTTS
import subprocess
import streamlit as st
import os

# Step 1: Extract audio from the video
def extract_audio_from_video(video_path, audio_path="extracted_audio.mp3"):
    """Extract the audio track of a video into an MP3 file using FFmpeg.

    Parameters
    ----------
    video_path : str
        Path to the input video file.
    audio_path : str
        Destination path for the extracted MP3 (default "extracted_audio.mp3").

    Returns
    -------
    str
        The path of the written audio file (same as ``audio_path``).

    Raises
    ------
    subprocess.CalledProcessError
        If FFmpeg exits with a non-zero status.
    """
    ffmpeg_command = [
        "ffmpeg",
        "-y",                       # Overwrite existing output; without this FFmpeg
                                    # prompts interactively and hangs the app on re-run
        "-i", video_path,           # Input video
        "-vn",                      # Disable video processing
        "-acodec", "libmp3lame",    # Set audio codec to mp3
        "-ar", "44100",             # Set audio sample rate
        "-ac", "2",                 # Set number of audio channels
        audio_path,
    ]
    # check=True: fail loudly if FFmpeg errors, instead of silently continuing
    # the pipeline with a missing/partial audio file.
    subprocess.run(ffmpeg_command, check=True)
    print(f"Audio extracted to {audio_path}")
    return audio_path

# Step 2: Extract text from the audio using Hugging Face Transformers (Whisper)
def extract_text_from_audio(audio_path):
    """Transcribe an audio file to text with a Whisper ASR pipeline.

    Also writes the transcript to ``video_text.txt`` as a side effect.

    Parameters
    ----------
    audio_path : str
        Path to the audio file to transcribe.

    Returns
    -------
    str
        The transcribed text.
    """
    # Load the ASR pipeline from Hugging Face with a Whisper-like model
    transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-base")

    # Transcribe the audio file
    transcription = transcriber(audio_path)
    text = transcription["text"]

    # Save transcribed text to a file (optional).
    # encoding="utf-8": transcripts can contain non-ASCII characters, and the
    # platform default encoding (e.g. cp1252 on Windows) would raise
    # UnicodeEncodeError on them.
    with open("video_text.txt", "w", encoding="utf-8") as f:
        f.write(text)

    return text

# Step 3: Generate voice-over using gTTS
def generate_voice_over(text, output_audio_path="voice_over.mp3"):
    """Synthesize an English voice-over MP3 from *text* using gTTS.

    Parameters
    ----------
    text : str
        The text to speak.
    output_audio_path : str
        Where to save the generated MP3 (default "voice_over.mp3").

    Returns
    -------
    str
        The path of the saved audio file.
    """
    speech = gTTS(text=text, lang="en")
    speech.save(output_audio_path)
    print(f"Voice-over saved as {output_audio_path}")
    return output_audio_path

# Step 4: Combine voice-over with original video using FFmpeg
def add_voice_over_to_video(video_path, audio_path, output_video_path="output_video_with_voice.mp4"):
    """Mux a new audio track onto a video using FFmpeg, copying the video stream.

    Parameters
    ----------
    video_path : str
        Path to the source video (its video stream is copied unchanged).
    audio_path : str
        Path to the replacement audio track (e.g. the generated voice-over).
    output_video_path : str
        Destination path for the combined file
        (default "output_video_with_voice.mp4").

    Raises
    ------
    subprocess.CalledProcessError
        If FFmpeg exits with a non-zero status.
    """
    ffmpeg_command = [
        "ffmpeg",
        "-y",              # Overwrite existing output; without this FFmpeg
                           # prompts interactively and hangs the app on re-run
        "-i", video_path,
        "-i", audio_path,
        "-c:v", "copy",    # Copy the video stream without re-encoding
        "-map", "0:v:0",   # Video from the first input
        "-map", "1:a:0",   # Audio from the second input
        "-shortest",       # Ensure the video ends when the audio ends
        output_video_path,
    ]
    # check=True: surface FFmpeg failures instead of printing success anyway.
    subprocess.run(ffmpeg_command, check=True)
    print(f"Final video with voice-over saved as {output_video_path}")

# Step 5: Run the complete process
def main(video_path):
    """Run the full pipeline on *video_path*.

    Steps: extract the audio track, transcribe it, synthesize a voice-over
    from the transcript, then mux the voice-over back onto the video.
    """
    extracted_audio = extract_audio_from_video(video_path)

    transcript = extract_text_from_audio(extracted_audio)
    print("Extracted Text:", transcript)

    narration_path = generate_voice_over(transcript)

    add_voice_over_to_video(video_path, narration_path)

# Streamlit interface: let the user upload an MP4, then run the pipeline on it.
uploaded_file = st.file_uploader("Upload a video file", type=["mp4"])
if uploaded_file is not None:
    # Persist the upload to disk so FFmpeg can read it by path.
    with open("input_video.mp4", "wb") as video_out:
        video_out.write(uploaded_file.getbuffer())
    # Kick off the full extract/transcribe/voice-over pipeline.
    main("input_video.mp4")