# vividly / app.py
# Source: Hugging Face Space "vividly" by AshBlanc (commit 520aa4f, verified)
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
from diffusers import StableVideoDiffusionPipeline
from audiocraft.models import MusicGen
from audiocraft.data.audio import audio_write
import torch
import tempfile
import os
import cv2
import numpy as np
from PIL import Image
# ---------------------------------------------------------------------------
# Model loading (runs once at import time; downloads weights on first run).
# ---------------------------------------------------------------------------

# Text generation: SmolLM2-1.7B-Instruct, used to break scripts into scenes.
print("Loading text generation model...")
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-1.7B-Instruct")
model = AutoModelForCausalLM.from_pretrained(
    "HuggingFaceTB/SmolLM2-1.7B-Instruct",
    torch_dtype=torch.float16,
    device_map="auto",  # accelerate places layers on available devices
)

# Video generation: Stable Video Diffusion (image-to-video, fp16 variant).
print("Loading video generation model...")
video_pipe = StableVideoDiffusionPipeline.from_pretrained(
    "stabilityai/stable-video-diffusion-img2vid-xt",
    torch_dtype=torch.float16,
    variant="fp16",
)
if torch.cuda.is_available():
    # BUG FIX: the original called video_pipe.to("cuda") and then
    # enable_model_cpu_offload(). Per the diffusers docs, CPU offload manages
    # device placement itself and must NOT be combined with a prior .to("cuda")
    # (doing so defeats the offloading and doubles peak memory). The offload /
    # slicing calls also belong under the CUDA check, since offloading
    # requires an accelerator to offload to.
    video_pipe.enable_model_cpu_offload()
    video_pipe.enable_vae_slicing()

# Music generation: MusicGen-small, fixed-length background clips.
print("Loading music generation model...")
music_model = MusicGen.get_pretrained('facebook/musicgen-small')
music_model.set_generation_params(duration=8)  # 8 seconds music
def generate_music(prompt: str):
    """Synthesize a short background-music clip for the given text prompt.

    Returns the path of the written .mp3 file, or None if generation fails
    (errors are logged and swallowed so the UI degrades gracefully).
    """
    try:
        # MusicGen takes a batch of prompts; we only need a single clip.
        batch = music_model.generate([prompt], progress=True)
        out_base = os.path.join(tempfile.mkdtemp(), "music")
        # audio_write appends the format extension to the stem it is given.
        audio_write(out_base, batch[0].cpu(), music_model.sample_rate, format="mp3")
        return out_base + ".mp3"
    except Exception as e:
        print(f"Music generation error: {e}")
        return None
def _parse_scene_lines(response):
    """Turn the model's free-text reply into scene dicts.

    Keeps non-empty lines that either mention 'Scene' or are long enough
    (>20 chars) to plausibly be a description; scene_id is assigned 1-based
    in order of appearance.
    """
    scenes = []
    for line in response.split('\n'):
        if line.strip() and ('Scene' in line or len(line.strip()) > 20):
            scenes.append({
                "scene_id": len(scenes) + 1,
                "description": line.strip()
            })
    return scenes

def generate_scenes_with_smol(script, style):
    """Break a script into 3-5 cinematic scene descriptions using SmolLM2.

    Args:
        script: Raw user script text.
        style: Video style label (e.g. "Cinematic"); lower-cased in the prompt.

    Returns:
        List of {"scene_id": int, "description": str} dicts — always at least
        one, at most five. Falls back to a generic single scene on any error.
    """
    try:
        prompt = f"""<|im_start|>system
You are a professional video director. Break down scripts into detailed cinematic scenes.
<|im_end|>
<|im_start|>user
Break this {style.lower()} script into 3-5 cinematic scenes with camera angles, characters, and mood.
Script: {script}
Format each scene as:
Scene X: [Detailed visual description with camera angle, lighting, characters, and action]
<|im_end|>
<|im_start|>assistant"""
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)
        if torch.cuda.is_available():
            inputs = {k: v.to("cuda") for k, v in inputs.items()}
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=512,
                temperature=0.7,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id
            )
        # BUG FIX: the original decoded with skip_special_tokens=True and then
        # split on "<|im_start|>assistant". The ChatML markers ARE special
        # tokens, so they were stripped by the decode and the split never
        # matched — the whole prompt leaked into the "response". Decode only
        # the newly generated tokens instead.
        prompt_len = inputs["input_ids"].shape[1]
        response = tokenizer.decode(
            outputs[0][prompt_len:], skip_special_tokens=True
        ).strip()
        scenes = _parse_scene_lines(response)
        # Ensure we have at least one scene
        if not scenes:
            scenes = [{"scene_id": 1, "description": f"A {style.lower()} scene: {script[:100]}..."}]
        return scenes[:5]  # Limit to 5 scenes max
    except Exception as e:
        print(f"Scene generation error: {e}")
        return [{"scene_id": 1, "description": f"A {style.lower()} scene based on the script"}]
def create_initial_image(prompt, width=1024, height=576, seed=None):
    """Create a placeholder conditioning image for Stable Video Diffusion.

    SVD is an image-to-video model, so it needs a starting frame; this stub
    fabricates one as uniform random noise. In practice `prompt` should be
    rendered with a text-to-image model (e.g. Stable Diffusion) instead.

    Args:
        prompt: Text prompt (currently unused by this placeholder).
        width, height: Output size; defaults match SVD's native 1024x576.
        seed: Optional RNG seed so the starting frame (and therefore the
            downstream video, which already uses a fixed torch seed) is
            reproducible. None preserves the original nondeterministic noise.

    Returns:
        PIL.Image.Image, RGB, of size (width, height).
    """
    rng = np.random.default_rng(seed)
    # integers() has an exclusive upper bound, matching np.random.randint.
    pixels = rng.integers(50, 200, size=(height, width, 3), dtype=np.uint8)
    return Image.fromarray(pixels)
def generate_video_with_svd(prompt):
    """Generate a short clip with Stable Video Diffusion.

    SVD is image-to-video, so a placeholder starting frame is synthesized
    from the prompt first. Returns the path of the written .mp4 file, or
    None on failure (errors are logged, not raised).
    """
    try:
        # SVD needs a conditioning image; in practice this should come from
        # a proper text-to-image model.
        start_frame = create_initial_image(prompt)
        result = video_pipe(
            image=start_frame,
            decode_chunk_size=2,
            generator=torch.manual_seed(42),  # fixed seed for reproducibility
            motion_bucket_id=127,
            noise_aug_strength=0.02,
        )
        frames = result.frames[0]
        output_path = os.path.join(tempfile.mkdtemp(), "scene.mp4")
        # Encode the PIL frames as mp4 via OpenCV. PIL .size is (width, height).
        width, height = frames[0].size
        writer = cv2.VideoWriter(
            output_path,
            cv2.VideoWriter_fourcc(*'mp4v'),
            6,  # SVD typically generates 6 fps
            (width, height),
        )
        for frame in frames:
            # OpenCV expects BGR channel order.
            writer.write(cv2.cvtColor(np.array(frame), cv2.COLOR_RGB2BGR))
        writer.release()
        return output_path
    except Exception as e:
        print(f"Video generation error: {e}")
        # Return a placeholder or None
        return None
def process_script(script, style, want_music):
    """Run the full pipeline: script -> scenes -> video clips (+ music).

    Returns a (clips, music_path) pair, where clips is a list of
    (description, video_path) tuples and music_path may be None.
    Blank scripts short-circuit to ([], None).
    """
    if not script.strip():
        return [], None

    print("Generating scenes...")
    scenes = generate_scenes_with_smol(script, style)

    print("Generating videos...")
    video_clips = []
    total = len(scenes)
    for index, scene in enumerate(scenes, start=1):
        print(f"Processing scene {index}/{total}")
        clip_path = generate_video_with_svd(scene['description'])
        # Failed clips are silently skipped; the UI shows whatever succeeded.
        if clip_path:
            video_clips.append((scene['description'], clip_path))

    music_path = None
    if want_music:
        print("Generating music...")
        music_path = generate_music(
            f"Background music for {style.lower()} video: {script[:100]}"
        )
    return video_clips, music_path
# Gradio Interface
with gr.Blocks(title="Vividly MVP", theme=gr.themes.Soft()) as app:
    gr.Markdown("# 🎬 Vividly MVP – AI Video Creator")
    gr.Markdown("Transform your script into cinematic scenes with AI-generated videos and music!")

    # Input controls: script text on the left, style/music options on the right.
    with gr.Row():
        with gr.Column(scale=2):
            script_input = gr.Textbox(
                label="Video Script",
                lines=6,
                placeholder="Enter your video script here..."
            )
        with gr.Column(scale=1):
            style_input = gr.Dropdown(
                ["Cinematic", "Vlog", "Explainer", "Documentary"],
                value="Cinematic",
                label="Video Style"
            )
            music_toggle = gr.Checkbox(label="Generate background music", value=True)

    submit_btn = gr.Button("🎬 Generate Video", variant="primary", size="lg")

    # Output widgets: all start hidden and are revealed by wrap_processing.
    with gr.Row():
        with gr.Column():
            # NOTE(review): only the FIRST generated clip is ever displayed
            # (see wrap_processing); remaining clips are discarded by the UI.
            video_outputs = gr.Video(
                label="Generated Video Clip",
                interactive=False,
                visible=False
            )
        with gr.Column():
            music_player = gr.Audio(
                label="Generated Background Music",
                visible=False
            )

    # NOTE(review): gr.Gallery normally expects images / file paths, but
    # plain text descriptions are passed to it below — confirm this renders
    # as intended (gr.Dataframe or gr.Markdown may be a better fit).
    scene_gallery = gr.Gallery(
        label="Scene Descriptions",
        visible=False,
        columns=1,
        height="auto"
    )

    def wrap_processing(script, style, music):
        """Adapt process_script() output to Gradio component updates.

        Returns a 3-tuple of gr.update() objects targeting
        (video_outputs, music_player, scene_gallery); each component is
        hidden when there is nothing to show.
        """
        # Blank input: keep every output hidden.
        if not script.strip():
            return (
                gr.update(visible=False),
                gr.update(visible=False),
                gr.update(visible=False)
            )
        try:
            scenes, music_path = process_script(script, style, music)
            # Show first video if available
            first_video = scenes[0][1] if scenes else None
            # Create scene descriptions for gallery
            scene_descriptions = [scene[0] for scene in scenes] if scenes else []
            return (
                gr.update(value=first_video, visible=bool(first_video)),
                gr.update(value=music_path, visible=bool(music_path)),
                gr.update(value=scene_descriptions, visible=bool(scene_descriptions))
            )
        except Exception as e:
            # Best-effort UI: log the failure and hide everything rather
            # than surface a traceback to the user.
            print(f"Processing error: {e}")
            return (
                gr.update(visible=False),
                gr.update(visible=False),
                gr.update(visible=False)
            )

    submit_btn.click(
        wrap_processing,
        inputs=[script_input, style_input, music_toggle],
        outputs=[video_outputs, music_player, scene_gallery]
    )
if __name__ == "__main__":
    print("Starting Vividly MVP...")
    # Bind to all interfaces on 7860 — the standard Hugging Face Spaces port.
    app.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        debug=True
    )