File size: 9,110 Bytes
9da0f47
 
 
520aa4f
 
9da0f47
 
 
520aa4f
 
 
9da0f47
520aa4f
 
 
 
 
 
 
 
9da0f47
520aa4f
 
 
 
 
 
 
 
 
9da0f47
520aa4f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9da0f47
 
520aa4f
 
 
 
 
 
 
 
 
9da0f47
520aa4f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9da0f47
 
520aa4f
 
 
 
 
9da0f47
520aa4f
 
9da0f47
520aa4f
 
9da0f47
520aa4f
 
 
 
 
 
 
 
 
 
9da0f47
 
520aa4f
 
 
 
 
9da0f47
520aa4f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9da0f47
520aa4f
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
from diffusers import StableVideoDiffusionPipeline
from audiocraft.models import MusicGen
from audiocraft.data.audio import audio_write
import torch
import tempfile
import os
import cv2
import numpy as np
from PIL import Image

# ---------------------------------------------------------------------------
# Model loading (runs once at import time; downloads weights on first run).
# ---------------------------------------------------------------------------

# Load SmolLM2-1.7B instruct model for scene-script generation.
print("Loading text generation model...")
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-1.7B-Instruct")
model = AutoModelForCausalLM.from_pretrained(
    "HuggingFaceTB/SmolLM2-1.7B-Instruct",
    torch_dtype=torch.float16,
    device_map="auto"  # accelerate places layers on available devices
)

# Load Stable Video Diffusion (image-to-video) in fp16.
print("Loading video generation model...")
video_pipe = StableVideoDiffusionPipeline.from_pretrained(
    "stabilityai/stable-video-diffusion-img2vid-xt",
    torch_dtype=torch.float16,
    variant="fp16"
)
if torch.cuda.is_available():
    # BUG FIX: enable_model_cpu_offload() manages device placement itself and
    # conflicts with a prior .to("cuda") (diffusers errors/warns on that
    # combination). Only enable offload — it moves submodules to the GPU on
    # demand. It is also guarded by the CUDA check because offload requires an
    # accelerator; the old code called it unconditionally and would fail on
    # CPU-only hosts.
    video_pipe.enable_model_cpu_offload()
# NOTE(review): enable_vae_slicing may not be exposed on the SVD pipeline
# (SVD uses decode_chunk_size for memory control) — confirm against the
# installed diffusers version.
video_pipe.enable_vae_slicing()

# Load MusicGen for background-music generation.
print("Loading music generation model...")
music_model = MusicGen.get_pretrained('facebook/musicgen-small')
music_model.set_generation_params(duration=8)  # 8 seconds music

def generate_music(prompt: str):
    """Generate background music for *prompt* with MusicGen.

    Returns the path to the written .mp3 file, or None if generation fails.
    """
    try:
        # MusicGen returns a batch of waveforms; we passed a single prompt,
        # so index 0 is our clip.
        waveforms = music_model.generate([prompt], progress=True)
        stem = os.path.join(tempfile.mkdtemp(), "music")
        # audio_write appends the format extension to the stem it is given.
        audio_write(stem, waveforms[0].cpu(), music_model.sample_rate, format="mp3")
        return f"{stem}.mp3"
    except Exception as e:
        print(f"Music generation error: {e}")
        return None

def generate_scenes_with_smol(script, style):
    """Break *script* into 3-5 cinematic scene descriptions using SmolLM2.

    Args:
        script: The user's raw video script text.
        style: Display name of the video style (e.g. "Cinematic").

    Returns:
        A list of at most 5 dicts: {"scene_id": int, "description": str}.
        Always returns at least one scene (falls back to a stub on failure).
    """
    try:
        # ChatML-formatted prompt expected by SmolLM2-Instruct.
        prompt = f"""<|im_start|>system
You are a professional video director. Break down scripts into detailed cinematic scenes.
<|im_end|>
<|im_start|>user
Break this {style.lower()} script into 3-5 cinematic scenes with camera angles, characters, and mood.

Script: {script}

Format each scene as:
Scene X: [Detailed visual description with camera angle, lighting, characters, and action]
<|im_end|>
<|im_start|>assistant"""

        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)
        if torch.cuda.is_available():
            inputs = {k: v.to("cuda") for k, v in inputs.items()}

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=512,
                temperature=0.7,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id
            )

        # BUG FIX: the ChatML markers ("<|im_start|>" etc.) are special tokens,
        # so decoding with skip_special_tokens=True strips them — the old
        # decoded.split("<|im_start|>assistant")[-1] therefore returned the
        # WHOLE text, prompt included, and prompt lines were parsed as scenes.
        # Decode only the tokens generated beyond the prompt instead.
        prompt_len = inputs["input_ids"].shape[1]
        response = tokenizer.decode(
            outputs[0][prompt_len:], skip_special_tokens=True
        ).strip()

        # Parse scenes: accept lines labelled "Scene" and, as a fallback, any
        # substantial line, so free-form model output still yields scenes.
        scenes = []
        for line in response.split('\n'):
            stripped = line.strip()
            if stripped and ('Scene' in stripped or len(stripped) > 20):
                scenes.append({
                    "scene_id": len(scenes) + 1,
                    "description": stripped
                })

        # Ensure we always return at least one scene.
        if not scenes:
            scenes = [{"scene_id": 1, "description": f"A {style.lower()} scene: {script[:100]}..."}]

        return scenes[:5]  # Limit to 5 scenes max
    except Exception as e:
        print(f"Scene generation error: {e}")
        return [{"scene_id": 1, "description": f"A {style.lower()} scene based on the script"}]

def create_initial_image(prompt, width=1024, height=576):
    """Create a deterministic placeholder image to condition SVD.

    SVD is image-to-video, so it needs a starting frame. Ideally this would
    come from a text-to-image model driven by *prompt*; until one is wired
    in, we synthesize a smooth horizontal gradient. Unlike the previous
    unseeded per-pixel noise, a gradient is reproducible across runs and
    gives the motion model coherent structure to animate.

    *prompt* is currently unused but kept for interface compatibility with a
    future text-to-image backend.
    """
    # Horizontal luminance ramp covering the same 50-200 intensity range the
    # original noise image targeted.
    ramp = np.linspace(50, 200, width).astype(np.uint8)
    gray = np.broadcast_to(ramp, (height, width))
    rgb = np.stack([gray, gray, gray], axis=-1)
    return Image.fromarray(rgb)

def generate_video_with_svd(prompt):
    """Generate a short clip for *prompt* via Stable Video Diffusion.

    Returns the path of the written .mp4 file, or None if generation fails.
    """
    try:
        # SVD is image-to-video: seed it with a synthetic starting frame
        # (a real text-to-image model would be used here in production).
        seed_image = create_initial_image(prompt)

        result = video_pipe(
            image=seed_image,
            decode_chunk_size=2,
            generator=torch.manual_seed(42),
            motion_bucket_id=127,
            noise_aug_strength=0.02,
        )
        frames = result.frames[0]

        output_path = os.path.join(tempfile.mkdtemp(), "scene.mp4")

        # Encode the PIL frames with OpenCV. PIL .size is (width, height);
        # VideoWriter wants (width, height) too, and BGR pixel order.
        width, height = frames[0].size
        writer = cv2.VideoWriter(
            output_path,
            cv2.VideoWriter_fourcc(*'mp4v'),
            6,  # SVD typically generates ~6 fps
            (width, height),
        )
        for pil_frame in frames:
            writer.write(cv2.cvtColor(np.array(pil_frame), cv2.COLOR_RGB2BGR))
        writer.release()

        return output_path

    except Exception as e:
        print(f"Video generation error: {e}")
        # Caller treats None as "skip this scene".
        return None

def process_script(script, style, want_music):
    """Run the full pipeline: script -> scenes -> per-scene clips (+ music).

    Returns (video_clips, music_path) where video_clips is a list of
    (description, video_path) tuples and music_path is a file path or None.
    """
    if not script.strip():
        return [], None

    print("Generating scenes...")
    scenes = generate_scenes_with_smol(script, style)

    print("Generating videos...")
    video_clips = []
    total = len(scenes)
    for index, scene in enumerate(scenes, start=1):
        print(f"Processing scene {index}/{total}")
        clip_path = generate_video_with_svd(scene['description'])
        # Failed generations return None and are simply skipped.
        if clip_path:
            video_clips.append((scene['description'], clip_path))

    music_path = None
    if want_music:
        print("Generating music...")
        music_path = generate_music(
            f"Background music for {style.lower()} video: {script[:100]}"
        )

    return video_clips, music_path

# Gradio Interface
# Layout: script textbox + style/music controls on top, video + audio players
# below, and a gallery of scene descriptions at the bottom. All output
# components start hidden and are revealed by the submit callback.
with gr.Blocks(title="Vividly MVP", theme=gr.themes.Soft()) as app:
    gr.Markdown("# 🎬 Vividly MVP – AI Video Creator")
    gr.Markdown("Transform your script into cinematic scenes with AI-generated videos and music!")
    
    with gr.Row():
        with gr.Column(scale=2):
            # Free-form script input; the pipeline rejects blank scripts.
            script_input = gr.Textbox(
                label="Video Script", 
                lines=6,
                placeholder="Enter your video script here..."
            )
        with gr.Column(scale=1):
            # Style flows into the LLM prompt and the music prompt.
            style_input = gr.Dropdown(
                ["Cinematic", "Vlog", "Explainer", "Documentary"], 
                value="Cinematic", 
                label="Video Style"
            )
            music_toggle = gr.Checkbox(label="Generate background music", value=True)
            submit_btn = gr.Button("🎬 Generate Video", variant="primary", size="lg")
    
    with gr.Row():
        with gr.Column():
            # Shows only the FIRST generated clip (see wrap_processing below).
            video_outputs = gr.Video(
                label="Generated Video Clip", 
                interactive=False, 
                visible=False
            )
        with gr.Column():
            music_player = gr.Audio(
                label="Generated Background Music", 
                visible=False
            )
            
    # NOTE(review): gr.Gallery is normally an image gallery, but this app
    # feeds it plain description strings — confirm this renders as intended
    # on the deployed Gradio version; a gr.Dataframe/Markdown may fit better.
    scene_gallery = gr.Gallery(
        label="Scene Descriptions",
        visible=False,
        columns=1,
        height="auto"
    )
    
    def wrap_processing(script, style, music):
        # Callback for submit_btn. Returns a 3-tuple of gr.update objects in
        # the exact order of outputs=[video_outputs, music_player,
        # scene_gallery]; keep that ordering in sync.
        if not script.strip():
            # Blank script: keep everything hidden.
            return (
                gr.update(visible=False), 
                gr.update(visible=False),
                gr.update(visible=False)
            )
            
        try:
            # scenes is a list of (description, video_path) tuples.
            scenes, music_path = process_script(script, style, music)
            
            # Show first video if available
            first_video = scenes[0][1] if scenes else None
            
            # Create scene descriptions for gallery
            scene_descriptions = [scene[0] for scene in scenes] if scenes else []
            
            # Each component is shown only when it has content to display.
            return (
                gr.update(value=first_video, visible=bool(first_video)),
                gr.update(value=music_path, visible=bool(music_path)),
                gr.update(value=scene_descriptions, visible=bool(scene_descriptions))
            )
        except Exception as e:
            # Top-level UI boundary: log and hide outputs rather than crash
            # the Gradio event loop.
            print(f"Processing error: {e}")
            return (
                gr.update(visible=False),
                gr.update(visible=False), 
                gr.update(visible=False)
            )
    
    submit_btn.click(
        wrap_processing, 
        inputs=[script_input, style_input, music_toggle], 
        outputs=[video_outputs, music_player, scene_gallery]
    )

# Script entry point: start the Gradio server when run directly.
if __name__ == "__main__":
    print("Starting Vividly MVP...")
    launch_options = {
        "server_name": "0.0.0.0",  # listen on all interfaces
        "server_port": 7860,
        "share": False,
        "debug": True,
    }
    app.launch(**launch_options)