File size: 10,035 Bytes
43060fa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a510082
 
 
 
 
 
 
43060fa
 
 
 
 
 
 
 
 
 
 
 
 
a510082
 
 
 
 
 
 
43060fa
 
 
 
 
 
 
 
 
a510082
43060fa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
import os
import gradio as gr
from cerebras.cloud.sdk import Cerebras
from gtts import gTTS
import assemblyai as aai
from moviepy import VideoFileClip,concatenate_videoclips, AudioFileClip, TextClip, CompositeVideoClip
import requests

# Initialize Cerebras client (API key read from the "Ckey" environment variable).
# NOTE(review): os.getenv returns None when the variable is unset, so a missing
# key fails later at request time rather than here — confirm env vars are set
# in the deployment environment.
Cerekey = os.getenv("Ckey")
client = Cerebras(api_key= Cerekey)

# Pexels stock-video API key (read from the "Pkey" environment variable);
# sent as the Authorization header in search_and_download_videos.
pexkey = os.getenv("Pkey")
PEXELS_API_KEY = pexkey

# AssemblyAI transcription API key (read from the "Akey" environment variable);
# used for word-level narration timestamps in generate_video.
asskey = os.getenv("Akey")
aai.settings.api_key = asskey

# Modify the system prompt to include the estimated word count based on video duration
def generate_script(prompt, max_duration):
    system_message = f"You are an expert video content creator and narration writer who is proficient in generating narration from user prompts and crafting a concise and poetic narration that aligns with the prompt. Craft a concise, poetic narration for the prompt. Go straight to the narration, don't write a foreward or a description of your action. The narration should be suitable for a video that can be read in less than {max_duration} seconds."

    stream = client.chat.completions.create(
        messages=[{"role": "system", "content": system_message}, {"role": "user", "content": prompt}],
        model="llama-3.3-70b",
        stream=False,
        max_completion_tokens=1024,
        temperature=0.7,
        top_p=1
    )
    return stream.choices[0].message.content


def search_and_download_videos(query, max_duration, aspect_ratio, download_folder, max_results=6):
    """Search Pexels for stock clips and download those matching the filters.

    Args:
        query: Search term sent to the Pexels video search endpoint.
        max_duration: Maximum clip duration (seconds) to accept.
        aspect_ratio: "landscape", "portrait" or "square".
        download_folder: Directory to save matching clips into (created if absent).
        max_results: Number of search results requested (per_page).

    Returns:
        List of local file paths of the downloaded videos; [] on request error.
    """
    url = "https://api.pexels.com/videos/search"
    headers = {"Authorization": PEXELS_API_KEY}
    params = {"query": query, "per_page": max_results}

    try:
        response = requests.get(url, headers=headers, params=params)
        response.raise_for_status()
        videos = response.json().get("videos", [])

        # exist_ok avoids a race between the existence check and makedirs.
        os.makedirs(download_folder, exist_ok=True)

        downloaded_files = []
        for video in videos:
            duration = video.get("duration")
            width = video.get("width")
            height = video.get("height")
            # Bug fix: skip entries with missing metadata — a None duration
            # previously raised TypeError on the `duration <= max_duration`
            # comparison whenever width/height were present but duration wasn't.
            if not (duration and width and height):
                continue
            video_aspect_ratio = "landscape" if width > height else "portrait" if height > width else "square"
            if duration <= max_duration and video_aspect_ratio == aspect_ratio:
                # First rendition in video_files; presumably highest priority —
                # TODO(review): confirm ordering/quality against the Pexels API.
                video_url = video["video_files"][0]["link"]
                video_id = video["id"]
                video_filename = os.path.join(download_folder, f"{video_id}.mp4")
                video_response = requests.get(video_url, stream=True)
                # Bug fix: without this, an HTTP error page would be written
                # to disk as if it were a valid .mp4.
                video_response.raise_for_status()
                with open(video_filename, "wb") as file:
                    for chunk in video_response.iter_content(chunk_size=1024):
                        file.write(chunk)

                downloaded_files.append(video_filename)
        return downloaded_files
    except requests.exceptions.RequestException as e:
        print(f"Error: {e}")
        return []


def generate_narration(script, output_file="narration.mp3"):
    """Synthesize English speech for *script* via gTTS and save it as MP3.

    Returns:
        The path of the written audio file (defaults to "narration.mp3").
    """
    gTTS(script, lang="en").save(output_file)
    return output_file


def load_videos_from_folder(folder_path):
    """List video files (mp4/mov/avi/mkv) found directly in *folder_path*.

    Args:
        folder_path: Directory to scan (non-recursive).

    Returns:
        List of full paths to matching files; [] (with a printed error) when
        the folder does not exist.
    """
    if not os.path.exists(folder_path):
        print(f"Error: The folder '{folder_path}' does not exist.")
        return []

    extensions = ('.mp4', '.mov', '.avi', '.mkv')
    # Bug fix: match extensions case-insensitively — the original missed
    # files like "CLIP.MP4" or "shot.MOV".
    return [
        os.path.join(folder_path, name)
        for name in os.listdir(folder_path)
        if name.lower().endswith(extensions)
    ]


def aggregate_videos(clips):
    """Concatenate *clips* into a single video clip.

    Returns:
        The concatenated clip (compose mode), or None when *clips* is empty.
    """
    if clips:
        return concatenate_videoclips(clips, method="compose")
    return None


def trim_video_to_audio_length(final_video, audio_length):
    """Cut *final_video* down to *audio_length* seconds when it runs longer.

    Uses subclipped(), the MoviePy 2.x trimming method; videos at or under
    the audio length are returned unchanged.
    """
    if final_video.duration <= audio_length:
        return final_video
    return final_video.subclipped(0, audio_length)



def add_narration_to_video(final_video, narration_path):
    """Attach the narration MP3 at *narration_path* to *final_video*.

    The audio clip's duration is forced to the video's duration via
    with_duration, then attached with with_audio (MoviePy 2.x API).
    When the narration file does not exist, the video is returned unchanged.
    """
    if not os.path.exists(narration_path):
        return final_video
    narration_audio = AudioFileClip(narration_path).with_duration(final_video.duration)
    return final_video.with_audio(narration_audio)



def save_final_video(final_video, output_path):
    """Encode *final_video* to *output_path* as H.264/AAC (ultrafast preset)."""
    final_video.write_videofile(
        output_path,
        codec="libx264",
        audio_codec="aac",
        preset="ultrafast",
    )

def split_text_into_lines(data):
    """Group word-level timestamps into line-level subtitle entries.

    A line is flushed when adding the next word would exceed MAX_CHARS
    characters, when the running line duration exceeds MAX_DURATION seconds,
    or when a silence longer than MAX_GAP seconds follows the current word.

    Args:
        data: List of dicts with 'word', 'start', 'end' keys (times in seconds).

    Returns:
        List of dicts with 'word' (space-joined line text), 'start', 'end',
        and 'textcontents' (the word dicts making up the line).
    """
    MAX_CHARS = 40
    MAX_DURATION = 2.5
    MAX_GAP = 1.5

    subtitles = []
    line = []
    line_duration = 0
    line_chars = 0

    def _flush():
        # Emit the accumulated words as one subtitle entry.
        subtitles.append({
            "word": " ".join(w['word'] for w in line),
            "start": line[0]['start'],
            "end": line[-1]['end'],
            "textcontents": list(line),
        })

    for idx, wd in enumerate(data):
        word_len = len(wd['word'])

        # Flush before adding if the line would overflow. Bug fix: guard on
        # `line` being non-empty — the original flushed unconditionally and
        # raised IndexError (line[0] of []) when the first word of a fresh
        # line was itself longer than MAX_CHARS.
        if line and (line_chars + word_len > MAX_CHARS or line_duration > MAX_DURATION):
            _flush()
            line = []
            line_chars = 0
            line_duration = 0

        line.append(wd)
        line_chars += word_len
        line_duration = wd['end'] - line[0]['start']

        # Also break the line after a long pause before the next word.
        if idx < len(data) - 1 and data[idx + 1]['start'] - wd['end'] > MAX_GAP:
            _flush()
            line = []
            line_chars = 0
            line_duration = 0

    # Flush any trailing words.
    if line:
        _flush()

    return subtitles

def generate_video(
    prompt: str,
    max_duration: int,
    aspect_ratio: str,
    download_folder: str = "downloaded_videos",
    max_results: int = 6
):
    """End-to-end pipeline: script -> stock footage -> narration -> subtitles.

    Args:
        prompt: Topic used both for the narration script and the Pexels search.
        max_duration: Maximum clip/narration length in seconds.
        aspect_ratio: "portrait", "landscape" or "square".
        download_folder: Directory where Pexels clips are downloaded.
        max_results: Maximum Pexels search results to consider.

    Returns:
        (narration_audio_path, final_video_path, script) on success, or
        (error_message, None, script) when no usable footage was obtained.
    """
    # 1️⃣ Generate the narration script
    script = generate_script(prompt, max_duration)

    # 2️⃣ Search & download Pexels videos
    videos = search_and_download_videos(
        prompt, max_duration, aspect_ratio, download_folder, max_results
    )
    if not videos:
        return "No videos were downloaded.", None, script

    # 3️⃣ Load and concatenate downloaded clips
    video_clips = [VideoFileClip(path) for path in videos]
    final_video = aggregate_videos(video_clips)
    if final_video is None:
        return "Error generating video.", None, script

    # 4️⃣ Generate TTS narration and attach audio
    narration_file = generate_narration(script)
    audio_len = AudioFileClip(narration_file).duration
    final_video = trim_video_to_audio_length(final_video, audio_len)
    final_video = add_narration_to_video(final_video, narration_file)

    # 5️⃣ Transcribe narration for word-level timings (AssemblyAI reports ms,
    # converted to seconds here)
    transcript = aai.Transcriber().transcribe(narration_file)
    wordlevel_info = [
        {
            "word": w.text,
            "start": w.start / 1000.0,
            "end":   w.end   / 1000.0
        }
        for w in transcript.words
    ]

    # 6️⃣ Split word-timestamps into line-level subtitles
    linelevel_subs = split_text_into_lines(wordlevel_info)

    # 7️⃣ Build subtitle clips (static + highlights).
    # Bug fix: this file imports MoviePy 2.x (`from moviepy import ...`) and
    # uses its API elsewhere (with_duration/with_audio/subclipped), but this
    # section used the removed 1.x API (positional TextClip text, fontsize=,
    # .set_start/.set_duration/.set_position/.set_audio) which raises at
    # runtime under 2.x; converted to the 2.x equivalents below.
    fw, fh = final_video.size
    font, fs, ypos = "Helvetica", 44, fh - 64
    # NOTE(review): MoviePy 2.x expects `font` to be a font *file* path; the
    # bare name "Helvetica" may need replacing with a .ttf path — confirm.
    all_clips = [final_video]

    for line in linelevel_subs:
        # ─ Static full-line text, horizontally centered
        txt = TextClip(
            text=line["word"],
            font=font,
            font_size=fs,
            color="white",
            stroke_color="black",
            stroke_width=1
        )
        x0 = (fw - txt.w) / 2
        static = (
            txt
            .with_start(line["start"])
            .with_duration(line["end"] - line["start"])
            .with_position((x0, ypos))
        )
        all_clips.append(static)

        # ─ Word-by-word highlight overlaid on the static line
        cursor = x0
        for wd in line["textcontents"]:
            wc = TextClip(
                text=wd["word"],
                font=font,
                font_size=fs,
                color="yellow",
                stroke_color="black",
                stroke_width=1
            )
            hl = (
                wc
                .with_start(wd["start"])
                .with_duration(wd["end"] - wd["start"])
                .with_position((cursor, ypos))
            )
            all_clips.append(hl)

            # advance cursor by measuring the rendered width of "word + space"
            dummy = TextClip(text=wd["word"] + " ", font=font, font_size=fs)
            cursor += dummy.w

    # 8️⃣ Composite all clips and export
    subtitled = CompositeVideoClip(all_clips, size=(fw, fh)) \
                   .with_audio(final_video.audio)
    output_path = "final_with_subtitles.mp4"
    subtitled.write_videofile(
        output_path,
        fps=24,
        codec="libx264",
        audio_codec="aac",
        preset="ultrafast"
    )

    # Return TTS audio path, final video path, and the script
    return narration_file, output_path, script

# Gradio front end: prompt, target length and aspect ratio in;
# narration audio, finished video and the generated script out.
_inputs = [
    gr.Textbox(label="Enter Text Prompt", placeholder="Enter the text to generate the video script."),
    gr.Slider(minimum=1, maximum=30, step=1, label="Video Length (seconds)", value=10),
    gr.Radio(choices=["portrait", "landscape", "square"], label="Select Aspect Ratio", value="landscape"),
]
_outputs = [
    gr.Audio(label="Narration Audio"),
    gr.Video(label="Generated Video"),
    gr.Textbox(label="Generated Script", interactive=False),
]

iface = gr.Interface(
    fn=generate_video,
    inputs=_inputs,
    outputs=_outputs,
    title="Sepia Text-to-Video Generator",
    description="Enter a text prompt, specify the length of the video (maximum 30 seconds), select the aspect ratio, and click 'Submit' to get the narrated audio, the video and the script.",
    live=False,
)

iface.launch(debug=True)