|
|
import gradio as gr |
|
|
from transformers import AutoTokenizer, AutoModelForCausalLM |
|
|
from diffusers import StableVideoDiffusionPipeline |
|
|
from audiocraft.models import MusicGen |
|
|
from audiocraft.data.audio import audio_write |
|
|
import torch |
|
|
import tempfile |
|
|
import os |
|
|
import cv2 |
|
|
import numpy as np |
|
|
from PIL import Image |
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Text generation model — used to break scripts into scene descriptions.
# ---------------------------------------------------------------------------
print("Loading text generation model...")

tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-1.7B-Instruct")

model = AutoModelForCausalLM.from_pretrained(
    "HuggingFaceTB/SmolLM2-1.7B-Instruct",
    # fp16 halves memory; device_map="auto" lets accelerate place layers
    # across GPU/CPU depending on what is available.
    torch_dtype=torch.float16,
    device_map="auto"
)
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Video generation model — Stable Video Diffusion (image-to-video).
# ---------------------------------------------------------------------------
print("Loading video generation model...")

video_pipe = StableVideoDiffusionPipeline.from_pretrained(
    "stabilityai/stable-video-diffusion-img2vid-xt",
    torch_dtype=torch.float16,  # fp16 weights to fit consumer GPUs
    variant="fp16"
)
if torch.cuda.is_available():
    # FIX: do NOT call video_pipe.to("cuda") before enabling offload.
    # enable_model_cpu_offload() manages device placement itself (each
    # sub-model is moved to the GPU only while it runs); moving the whole
    # pipeline to CUDA first pins all weights in VRAM and defeats the
    # offload's memory savings.
    video_pipe.enable_model_cpu_offload()
    # Decode latents in slices to lower peak VRAM during VAE decode.
    video_pipe.enable_vae_slicing()
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Music generation model — MusicGen (small checkpoint for speed).
# ---------------------------------------------------------------------------
print("Loading music generation model...")

music_model = MusicGen.get_pretrained('facebook/musicgen-small')
# Each generation call produces 8 seconds of audio.
music_model.set_generation_params(duration=8)
|
|
|
|
|
def generate_music(prompt: str):
    """Generate a short background-music clip from a text prompt.

    Returns:
        Path to the written MP3 file, or None if generation failed.
    """
    try:
        # MusicGen takes a batch of prompts; we only need a single clip.
        batch = music_model.generate([prompt], progress=True)
        clip = batch[0].cpu()

        # Write into a fresh temp directory; audio_write appends the
        # ".mp3" extension itself, so the target path has no suffix.
        target = os.path.join(tempfile.mkdtemp(), "music")
        audio_write(target, clip, music_model.sample_rate, format="mp3")
        return f"{target}.mp3"
    except Exception as e:
        print(f"Music generation error: {e}")
        return None
|
|
|
|
|
def generate_scenes_with_smol(script, style):
    """Break a script into cinematic scene descriptions using SmolLM2.

    Args:
        script: The user's raw video script text.
        style: Video style label (e.g. "Cinematic"); folded into the prompt.

    Returns:
        A list of up to 5 dicts of the form
        {"scene_id": int, "description": str}. Always returns at least one
        scene, even on model failure.
    """
    try:
        prompt = f"""<|im_start|>system
You are a professional video director. Break down scripts into detailed cinematic scenes.
<|im_end|>
<|im_start|>user
Break this {style.lower()} script into 3-5 cinematic scenes with camera angles, characters, and mood.

Script: {script}

Format each scene as:
Scene X: [Detailed visual description with camera angle, lighting, characters, and action]
<|im_end|>
<|im_start|>assistant"""

        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)
        if torch.cuda.is_available():
            inputs = {k: v.to("cuda") for k, v in inputs.items()}

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=512,
                temperature=0.7,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id
            )

        # BUG FIX: the previous code decoded the full sequence with
        # skip_special_tokens=True and then split on "<|im_start|>assistant".
        # The chat markers are special tokens and are stripped by that
        # decode, so the split was a no-op and "response" contained the
        # whole prompt as well as the answer. Decode only the tokens the
        # model actually generated by slicing past the prompt length.
        prompt_len = inputs["input_ids"].shape[1]
        response = tokenizer.decode(
            outputs[0][prompt_len:], skip_special_tokens=True
        ).strip()

        # Keep lines that look like scene descriptions: either explicitly
        # labelled "Scene ..." or long enough to be a real sentence.
        scenes = []
        for line in response.split('\n'):
            if line.strip() and ('Scene' in line or len(line.strip()) > 20):
                scenes.append({
                    "scene_id": len(scenes) + 1,
                    "description": line.strip()
                })

        # Fallback: guarantee at least one scene so downstream stages run.
        if not scenes:
            scenes = [{"scene_id": 1, "description": f"A {style.lower()} scene: {script[:100]}..."}]

        return scenes[:5]  # cap at 5 scenes as promised in the prompt
    except Exception as e:
        print(f"Scene generation error: {e}")
        return [{"scene_id": 1, "description": f"A {style.lower()} scene based on the script"}]
|
|
|
|
|
def create_initial_image(prompt, width=1024, height=576):
    """Produce a placeholder conditioning image for Stable Video Diffusion.

    SVD is image-to-video and therefore requires an input frame; for this
    MVP we hand it mid-tone random noise. The text prompt is accepted for
    interface symmetry but is not used here.
    """
    noise = np.random.randint(50, 200, (height, width, 3), dtype=np.uint8)
    return Image.fromarray(noise)
|
|
|
|
|
def generate_video_with_svd(prompt):
    """Generate a short MP4 clip with Stable Video Diffusion.

    Args:
        prompt: Scene description. SVD itself is image-conditioned, so the
            prompt is only forwarded to the placeholder image generator.

    Returns:
        Path to the written MP4 file on success, or None on failure.
    """
    try:
        # SVD requires a conditioning image; build the placeholder.
        initial_image = create_initial_image(prompt)

        frames = video_pipe(
            image=initial_image,
            decode_chunk_size=2,              # small chunks keep VRAM low
            generator=torch.manual_seed(42),  # fixed seed for reproducibility
            motion_bucket_id=127,             # moderate amount of motion
            noise_aug_strength=0.02,
        ).frames[0]

        tmp_dir = tempfile.mkdtemp()
        output_path = os.path.join(tmp_dir, "scene.mp4")

        # Encode the PIL frames to MP4 with OpenCV. PIL's .size is
        # (width, height); cv2.VideoWriter also wants (width, height).
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        fps = 6
        height, width = frames[0].size[1], frames[0].size[0]

        out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
        try:
            for frame in frames:
                # PIL frames are RGB; OpenCV expects BGR.
                frame_array = np.array(frame)
                frame_bgr = cv2.cvtColor(frame_array, cv2.COLOR_RGB2BGR)
                out.write(frame_bgr)
        finally:
            # FIX: always release the writer so the file is flushed and the
            # handle closed even if a frame conversion/write raises mid-loop
            # (previously the writer leaked on that path).
            out.release()

        return output_path

    except Exception as e:
        print(f"Video generation error: {e}")
        return None
|
|
|
|
|
def process_script(script, style, want_music):
    """Run the full pipeline: script -> scenes -> video clips (+ music).

    Returns:
        (clips, music_path): clips is a list of
        (scene_description, video_path) tuples; music_path is an MP3 path
        or None when music was not requested or failed.
    """
    # Guard: nothing to do for a blank script.
    if not script.strip():
        return [], None

    print("Generating scenes...")
    scenes = generate_scenes_with_smol(script, style)

    print("Generating videos...")
    clips = []
    total = len(scenes)
    for index, scene in enumerate(scenes, start=1):
        print(f"Processing scene {index}/{total}")
        clip_path = generate_video_with_svd(scene['description'])
        # Skip scenes whose video generation failed (returned None).
        if clip_path:
            clips.append((scene['description'], clip_path))

    music_path = None
    if want_music:
        print("Generating music...")
        music_prompt = f"Background music for {style.lower()} video: {script[:100]}"
        music_path = generate_music(music_prompt)

    return clips, music_path
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Gradio UI: script/style inputs on top, generated media outputs below.
# ---------------------------------------------------------------------------
with gr.Blocks(title="Vividly MVP", theme=gr.themes.Soft()) as app:
    gr.Markdown("# 🎬 Vividly MVP – AI Video Creator")
    gr.Markdown("Transform your script into cinematic scenes with AI-generated videos and music!")

    # --- Inputs ---
    with gr.Row():
        with gr.Column(scale=2):
            script_input = gr.Textbox(
                label="Video Script",
                lines=6,
                placeholder="Enter your video script here..."
            )
        with gr.Column(scale=1):
            style_input = gr.Dropdown(
                ["Cinematic", "Vlog", "Explainer", "Documentary"],
                value="Cinematic",
                label="Video Style"
            )
            music_toggle = gr.Checkbox(label="Generate background music", value=True)
            submit_btn = gr.Button("🎬 Generate Video", variant="primary", size="lg")

    # --- Outputs (hidden until something is generated) ---
    with gr.Row():
        with gr.Column():
            video_outputs = gr.Video(
                label="Generated Video Clip",
                interactive=False,
                visible=False
            )
        with gr.Column():
            music_player = gr.Audio(
                label="Generated Background Music",
                visible=False
            )

    # NOTE(review): this Gallery is fed plain description *strings* by
    # wrap_processing below, but gr.Gallery is normally an image component —
    # confirm it renders text as intended, or consider gr.Markdown/Dataframe.
    scene_gallery = gr.Gallery(
        label="Scene Descriptions",
        visible=False,
        columns=1,
        height="auto"
    )

    def wrap_processing(script, style, music):
        """Adapter between process_script() and the three output widgets.

        Returns three gr.update()s in output order: (video, audio, gallery).
        On blank input or any processing error, all three stay hidden.
        """
        if not script.strip():
            return (
                gr.update(visible=False),
                gr.update(visible=False),
                gr.update(visible=False)
            )

        try:
            # "scenes" here is actually a list of
            # (description, video_path) tuples from process_script.
            scenes, music_path = process_script(script, style, music)

            # Only the first clip fits in the single Video widget.
            first_video = scenes[0][1] if scenes else None

            scene_descriptions = [scene[0] for scene in scenes] if scenes else []

            return (
                gr.update(value=first_video, visible=bool(first_video)),
                gr.update(value=music_path, visible=bool(music_path)),
                gr.update(value=scene_descriptions, visible=bool(scene_descriptions))
            )
        except Exception as e:
            # Keep the UI alive on failure; details go to the server log.
            print(f"Processing error: {e}")
            return (
                gr.update(visible=False),
                gr.update(visible=False),
                gr.update(visible=False)
            )

    submit_btn.click(
        wrap_processing,
        inputs=[script_input, style_input, music_toggle],
        outputs=[video_outputs, music_player, scene_gallery]
    )
|
|
|
|
|
if __name__ == "__main__":
    print("Starting Vividly MVP...")
    # Bind on all interfaces so the app is reachable from containers /
    # remote hosts; standard Gradio port 7860; no public share link.
    app.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        debug=True
    )