# vividly / app.py
# Source: Hugging Face Space "vividly" by AshBlanc (commit 520aa4f, verified)
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
from diffusers import StableVideoDiffusionPipeline
from audiocraft.models import MusicGen
from audiocraft.data.audio import audio_write
import torch
import tempfile
import os
import cv2
import numpy as np
from PIL import Image
# ---------------------------------------------------------------------------
# Model loading (runs once at import time; downloads weights on first run).
# ---------------------------------------------------------------------------

# Text generation: SmolLM2-1.7B-Instruct, used to break scripts into scenes.
print("Loading text generation model...")
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-1.7B-Instruct")
model = AutoModelForCausalLM.from_pretrained(
    "HuggingFaceTB/SmolLM2-1.7B-Instruct",
    torch_dtype=torch.float16,
    device_map="auto",  # accelerate places layers on available devices
)

# Video generation: Stable Video Diffusion (image-to-video, fp16 variant).
print("Loading video generation model...")
video_pipe = StableVideoDiffusionPipeline.from_pretrained(
    "stabilityai/stable-video-diffusion-img2vid-xt",
    torch_dtype=torch.float16,
    variant="fp16",
)
if torch.cuda.is_available():
    # BUG FIX: the original called video_pipe.to("cuda") and then
    # enable_model_cpu_offload(). Per the diffusers docs, CPU offload manages
    # device placement itself and must NOT be combined with a prior .to("cuda")
    # (doing so defeats the offloading and doubles peak memory). The offload /
    # slicing calls also belong under the CUDA check, since offloading
    # requires an accelerator to offload to.
    video_pipe.enable_model_cpu_offload()
    video_pipe.enable_vae_slicing()

# Music generation: MusicGen-small, fixed-length background clips.
print("Loading music generation model...")
music_model = MusicGen.get_pretrained('facebook/musicgen-small')
music_model.set_generation_params(duration=8)  # 8 seconds music
def generate_music(prompt: str):
    """Synthesize a short background-music clip for the given text prompt.

    Returns the path of the written .mp3 file, or None if generation fails
    (errors are logged and swallowed so the UI degrades gracefully).
    """
    try:
        # MusicGen takes a batch of prompts; we only need a single clip.
        batch = music_model.generate([prompt], progress=True)
        out_base = os.path.join(tempfile.mkdtemp(), "music")
        # audio_write appends the format extension to the stem it is given.
        audio_write(out_base, batch[0].cpu(), music_model.sample_rate, format="mp3")
        return out_base + ".mp3"
    except Exception as e:
        print(f"Music generation error: {e}")
        return None
def _parse_scene_lines(response):
    """Turn the model's free-text reply into scene dicts.

    Keeps non-empty lines that either mention 'Scene' or are long enough
    (>20 chars) to plausibly be a description; scene_id is assigned 1-based
    in order of appearance.
    """
    scenes = []
    for line in response.split('\n'):
        if line.strip() and ('Scene' in line or len(line.strip()) > 20):
            scenes.append({
                "scene_id": len(scenes) + 1,
                "description": line.strip()
            })
    return scenes

def generate_scenes_with_smol(script, style):
    """Break a script into 3-5 cinematic scene descriptions using SmolLM2.

    Args:
        script: Raw user script text.
        style: Video style label (e.g. "Cinematic"); lower-cased in the prompt.

    Returns:
        List of {"scene_id": int, "description": str} dicts — always at least
        one, at most five. Falls back to a generic single scene on any error.
    """
    try:
        prompt = f"""<|im_start|>system
You are a professional video director. Break down scripts into detailed cinematic scenes.
<|im_end|>
<|im_start|>user
Break this {style.lower()} script into 3-5 cinematic scenes with camera angles, characters, and mood.
Script: {script}
Format each scene as:
Scene X: [Detailed visual description with camera angle, lighting, characters, and action]
<|im_end|>
<|im_start|>assistant"""
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)
        if torch.cuda.is_available():
            inputs = {k: v.to("cuda") for k, v in inputs.items()}
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=512,
                temperature=0.7,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id
            )
        # BUG FIX: the original decoded with skip_special_tokens=True and then
        # split on "<|im_start|>assistant". The ChatML markers ARE special
        # tokens, so they were stripped by the decode and the split never
        # matched — the whole prompt leaked into the "response". Decode only
        # the newly generated tokens instead.
        prompt_len = inputs["input_ids"].shape[1]
        response = tokenizer.decode(
            outputs[0][prompt_len:], skip_special_tokens=True
        ).strip()
        scenes = _parse_scene_lines(response)
        # Ensure we have at least one scene
        if not scenes:
            scenes = [{"scene_id": 1, "description": f"A {style.lower()} scene: {script[:100]}..."}]
        return scenes[:5]  # Limit to 5 scenes max
    except Exception as e:
        print(f"Scene generation error: {e}")
        return [{"scene_id": 1, "description": f"A {style.lower()} scene based on the script"}]
def create_initial_image(prompt, width=1024, height=576, seed=None):
    """Create a placeholder conditioning image for Stable Video Diffusion.

    SVD is an image-to-video model, so it needs a starting frame; this stub
    fabricates one as uniform random noise. In practice `prompt` should be
    rendered with a text-to-image model (e.g. Stable Diffusion) instead.

    Args:
        prompt: Text prompt (currently unused by this placeholder).
        width, height: Output size; defaults match SVD's native 1024x576.
        seed: Optional RNG seed so the starting frame (and therefore the
            downstream video, which already uses a fixed torch seed) is
            reproducible. None preserves the original nondeterministic noise.

    Returns:
        PIL.Image.Image, RGB, of size (width, height).
    """
    rng = np.random.default_rng(seed)
    # integers() has an exclusive upper bound, matching np.random.randint.
    pixels = rng.integers(50, 200, size=(height, width, 3), dtype=np.uint8)
    return Image.fromarray(pixels)
def generate_video_with_svd(prompt):
    """Generate a short clip with Stable Video Diffusion.

    SVD is image-to-video, so a placeholder starting frame is synthesized
    from the prompt first. Returns the path of the written .mp4 file, or
    None on failure (errors are logged, not raised).
    """
    try:
        # SVD needs a conditioning image; in practice this should come from
        # a proper text-to-image model.
        start_frame = create_initial_image(prompt)
        result = video_pipe(
            image=start_frame,
            decode_chunk_size=2,
            generator=torch.manual_seed(42),  # fixed seed for reproducibility
            motion_bucket_id=127,
            noise_aug_strength=0.02,
        )
        frames = result.frames[0]
        output_path = os.path.join(tempfile.mkdtemp(), "scene.mp4")
        # Encode the PIL frames as mp4 via OpenCV. PIL .size is (width, height).
        width, height = frames[0].size
        writer = cv2.VideoWriter(
            output_path,
            cv2.VideoWriter_fourcc(*'mp4v'),
            6,  # SVD typically generates 6 fps
            (width, height),
        )
        for frame in frames:
            # OpenCV expects BGR channel order.
            writer.write(cv2.cvtColor(np.array(frame), cv2.COLOR_RGB2BGR))
        writer.release()
        return output_path
    except Exception as e:
        print(f"Video generation error: {e}")
        # Return a placeholder or None
        return None
def process_script(script, style, want_music):
    """Run the full pipeline: script -> scenes -> video clips (+ music).

    Returns a (clips, music_path) pair, where clips is a list of
    (description, video_path) tuples and music_path may be None.
    Blank scripts short-circuit to ([], None).
    """
    if not script.strip():
        return [], None

    print("Generating scenes...")
    scenes = generate_scenes_with_smol(script, style)

    print("Generating videos...")
    video_clips = []
    total = len(scenes)
    for index, scene in enumerate(scenes, start=1):
        print(f"Processing scene {index}/{total}")
        clip_path = generate_video_with_svd(scene['description'])
        # Failed clips are silently skipped; the UI shows whatever succeeded.
        if clip_path:
            video_clips.append((scene['description'], clip_path))

    music_path = None
    if want_music:
        print("Generating music...")
        music_path = generate_music(
            f"Background music for {style.lower()} video: {script[:100]}"
        )
    return video_clips, music_path
# Gradio Interface
with gr.Blocks(title="Vividly MVP", theme=gr.themes.Soft()) as app:
    gr.Markdown("# 🎬 Vividly MVP – AI Video Creator")
    gr.Markdown("Transform your script into cinematic scenes with AI-generated videos and music!")

    # Input controls: script text on the left, style/music options on the right.
    with gr.Row():
        with gr.Column(scale=2):
            script_input = gr.Textbox(
                label="Video Script",
                lines=6,
                placeholder="Enter your video script here..."
            )
        with gr.Column(scale=1):
            style_input = gr.Dropdown(
                ["Cinematic", "Vlog", "Explainer", "Documentary"],
                value="Cinematic",
                label="Video Style"
            )
            music_toggle = gr.Checkbox(label="Generate background music", value=True)

    submit_btn = gr.Button("🎬 Generate Video", variant="primary", size="lg")

    # Output widgets: all start hidden and are revealed by wrap_processing.
    with gr.Row():
        with gr.Column():
            # NOTE(review): only the FIRST generated clip is ever displayed
            # (see wrap_processing); remaining clips are discarded by the UI.
            video_outputs = gr.Video(
                label="Generated Video Clip",
                interactive=False,
                visible=False
            )
        with gr.Column():
            music_player = gr.Audio(
                label="Generated Background Music",
                visible=False
            )

    # NOTE(review): gr.Gallery normally expects images / file paths, but
    # plain text descriptions are passed to it below — confirm this renders
    # as intended (gr.Dataframe or gr.Markdown may be a better fit).
    scene_gallery = gr.Gallery(
        label="Scene Descriptions",
        visible=False,
        columns=1,
        height="auto"
    )

    def wrap_processing(script, style, music):
        """Adapt process_script() output to Gradio component updates.

        Returns a 3-tuple of gr.update() objects targeting
        (video_outputs, music_player, scene_gallery); each component is
        hidden when there is nothing to show.
        """
        # Blank input: keep every output hidden.
        if not script.strip():
            return (
                gr.update(visible=False),
                gr.update(visible=False),
                gr.update(visible=False)
            )
        try:
            scenes, music_path = process_script(script, style, music)
            # Show first video if available
            first_video = scenes[0][1] if scenes else None
            # Create scene descriptions for gallery
            scene_descriptions = [scene[0] for scene in scenes] if scenes else []
            return (
                gr.update(value=first_video, visible=bool(first_video)),
                gr.update(value=music_path, visible=bool(music_path)),
                gr.update(value=scene_descriptions, visible=bool(scene_descriptions))
            )
        except Exception as e:
            # Best-effort UI: log the failure and hide everything rather
            # than surface a traceback to the user.
            print(f"Processing error: {e}")
            return (
                gr.update(visible=False),
                gr.update(visible=False),
                gr.update(visible=False)
            )

    submit_btn.click(
        wrap_processing,
        inputs=[script_input, style_input, music_toggle],
        outputs=[video_outputs, music_player, scene_gallery]
    )
if __name__ == "__main__":
    print("Starting Vividly MVP...")
    # Bind to all interfaces on 7860 — the standard Hugging Face Spaces port.
    app.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        debug=True
    )