Spaces:

tomiconic
/

VideoGen

Sleeping

App Files Files Community

VideoGen / app.py

tomiconic

Update app.py

83f4a0d verified 10 days ago

raw

history blame contribute delete

12.3 kB

	import gradio as gr
	import torch
	import spaces
	import os
	import tempfile
	import random
	from PIL import Image
	from diffusers import AutoencoderKLWan, WanImageToVideoPipeline
	from diffusers.utils import export_to_video
	from transformers import CLIPVisionModel
	from huggingface_hub import InferenceClient

	# ── Config ────────────────────────────────────────────────────────────────────
	# Hugging Face access token read from the environment; None when HF_TOKEN is unset.
	HF_TOKEN = os.environ.get("HF_TOKEN", None)
	# Hub repository that the image encoder, VAE and pipeline are all loaded from.
	MODEL_REPO = "Wan-AI/Wan2.1-I2V-14B-480P-Diffusers"

	# ── Prompt expansion LLM ──────────────────────────────────────────────────────
	# Inference client used by expand_video_prompt() to turn the user's short
	# motion description into a fuller prompt. HF_TOKEN is forwarded as the token.
	llm_client = InferenceClient(
	model="mistralai/Mistral-7B-Instruct-v0.3",
	token=HF_TOKEN,
	)

	# System message sent with every expansion request. It instructs the LLM to
	# describe motion only, return just the prompt text, and stay under 80 words.
	VIDEO_SYSTEM = """You are an expert at writing motion prompts for AI video generation using Wan I2V.

	Your job: take a short description of desired motion/animation and expand it into a detailed video motion prompt.

	Rules:
	- Focus on MOTION — what moves, how it moves, camera movement
	- Be specific: "hair gently blowing in breeze", "camera slowly pulls back", "eyes blink naturally"
	- Keep subjects consistent with what is already in the image
	- Describe lighting changes if relevant e.g. "light flickers softly"
	- Do NOT describe the static image content — only the motion
	- Return ONLY the prompt, no explanation, no preamble
	- Keep under 80 words"""

	def expand_video_prompt(raw_prompt):
	    """Expand a short motion description into a detailed video motion prompt.

	    Args:
	        raw_prompt: User-supplied motion description. ``None``, an empty
	            string, or whitespace-only text all count as "no prompt".

	    Returns:
	        The LLM-expanded prompt with surrounding whitespace and quote
	        characters removed. When no prompt was supplied a generic default
	        motion prompt is returned. When the LLM call fails or yields an
	        empty reply, the stripped user prompt is returned instead.
	    """
	    # Normalise once so that None (e.g. from a direct API call) cannot crash
	    # on .strip() and every later branch works with the same cleaned text.
	    cleaned = (raw_prompt or "").strip()
	    if not cleaned:
	        return "subtle natural movement, gentle camera drift, cinematic atmosphere"

	    try:
	        response = llm_client.chat_completion(
	            messages=[
	                {"role": "system", "content": VIDEO_SYSTEM},
	                {"role": "user", "content": f"Expand this motion description:\n{cleaned}"},
	            ],
	            max_tokens=150,
	            temperature=0.6,
	        )
	        # Models often wrap the answer in quotes; drop them and any
	        # whitespace that was hiding inside the quotes.
	        expanded = response.choices[0].message.content.strip().strip('"').strip("'").strip()
	    except Exception as e:
	        # Deliberate best-effort: prompt expansion is optional, so any
	        # failure degrades to the user's own text rather than aborting.
	        print(f"LLM expansion failed, using raw prompt: {e}")
	        return cleaned

	    # An empty completion would send a blank prompt to the video model.
	    return expanded or cleaned

	# ── Load pipeline ─────────────────────────────────────────────────────────────
	# Everything in this section runs once at import time, before the UI is built.
	print("Loading Wan2.1 I2V pipeline...")

	# The image encoder and VAE are loaded separately in float32 and handed to
	# the pipeline below, whose remaining components are loaded in bfloat16.
	image_encoder = CLIPVisionModel.from_pretrained(
	MODEL_REPO,
	subfolder="image_encoder",
	torch_dtype=torch.float32,
	)

	vae = AutoencoderKLWan.from_pretrained(
	MODEL_REPO,
	subfolder="vae",
	torch_dtype=torch.float32,
	)

	pipe = WanImageToVideoPipeline.from_pretrained(
	MODEL_REPO,
	vae=vae,
	image_encoder=image_encoder,
	torch_dtype=torch.bfloat16,
	)

	# NOTE(review): per the diffusers docs this keeps sub-models on the CPU and
	# moves each one to the GPU only while it runs — confirm it behaves as
	# intended under the ZeroGPU runtime used by generate_video().
	pipe.enable_model_cpu_offload()
	print("Pipeline ready.")

	# ── Negative prompt ───────────────────────────────────────────────────────────
	# Fixed negative prompt passed to every pipeline call in generate_video().
	VIDEO_NEG = (
	"static, no movement, blurry, low quality, worst quality, "
	"inconsistent motion, flickering, jitter, artifacts, "
	"watermark, text, deformed"
	)

	# ── Generation ────────────────────────────────────────────────────────────────
	@spaces.GPU(duration=300)
	def generate_video(input_image, motion_prompt, num_frames, guidance, seed, randomize):
	    """Animate a still image with the Wan I2V pipeline and export it as an MP4.

	    Args:
	        input_image: Uploaded image as an array accepted by
	            ``Image.fromarray``, or ``None`` when nothing was uploaded.
	        motion_prompt: Short description of the desired motion; it is
	            expanded by ``expand_video_prompt`` before reaching the pipeline.
	        num_frames: Number of frames to generate (cast to ``int``).
	        guidance: Guidance scale (cast to ``float``).
	        seed: Seed used when ``randomize`` is falsy. ``None`` is treated the
	            same as ``randomize=True``.
	        randomize: When truthy, ignore ``seed`` and draw a random 32-bit seed.

	    Returns:
	        A ``(video_path, seed, message)`` tuple: the path of the exported
	        ``.mp4`` file, the integer seed actually used, and a text block
	        showing the expanded motion prompt that was sent to the model.

	    Raises:
	        gr.Error: If ``input_image`` is ``None``.
	    """
	    if input_image is None:
	        raise gr.Error("Please upload an image first.")

	    # A missing seed (None) would make int(seed) raise TypeError, so fall
	    # back to a random seed in that case as well.
	    if randomize or seed is None:
	        seed = random.randint(0, 2**32 - 1)
	    seed = int(seed)

	    # Expand motion prompt via LLM
	    expanded_motion = expand_video_prompt(motion_prompt)
	    print(f"Expanded motion: {expanded_motion}")

	    # Resize — Wan I2V works best at 832x480. Landscape and square inputs
	    # become 832x480, portrait inputs 480x832; the image is stretched to the
	    # target size, it is not cropped or padded.
	    img = Image.fromarray(input_image).convert("RGB")
	    orig_w, orig_h = img.size
	    if orig_w >= orig_h:
	        new_w, new_h = 832, 480
	    else:
	        new_w, new_h = 480, 832
	    img = img.resize((new_w, new_h), Image.LANCZOS)

	    generator = torch.Generator(device="cpu").manual_seed(seed)

	    output = pipe(
	        image=img,
	        prompt=expanded_motion,
	        negative_prompt=VIDEO_NEG,
	        height=new_h,
	        width=new_w,
	        num_frames=int(num_frames),
	        guidance_scale=float(guidance),
	        num_inference_steps=30,
	        generator=generator,
	    )

	    frames = output.frames[0]

	    # Reserve a unique path, then close the handle before the exporter
	    # writes to it: the handle is no longer leaked on every request and the
	    # file is not held open while another writer targets the same path.
	    # delete=False keeps the file on disk so the caller can serve it.
	    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp:
	        video_path = tmp.name
	    export_to_video(frames, video_path, fps=16)

	    return video_path, seed, f"Motion prompt sent to model:\n\n{expanded_motion}"

	# ── CSS ───────────────────────────────────────────────────────────────────────
	# Stylesheet passed to gr.Blocks(css=css) for the dark, narrow (max 500px)
	# single-column layout. Besides generic element selectors it defines the
	# classes used by the UI below: topbar / topbar-title / gpu-pill / card /
	# card-label / hint-box (raw HTML snippets) and upload-area / video-out /
	# seed-out / hint-box / gen-btn (attached through elem_classes).
	css = """
	* { box-sizing: border-box; margin: 0; padding: 0; }

	body, .gradio-container {
	background: #07070e !important;
	font-family: 'Inter', system-ui, sans-serif !important;
	max-width: 500px !important;
	margin: 0 auto !important;
	padding: 8px !important;
	}

	.topbar {
	display: flex;
	align-items: center;
	justify-content: space-between;
	padding: 10px 2px 14px;
	}
	.topbar-title {
	color: #e8e0ff;
	font-size: 0.95em;
	font-weight: 800;
	}
	.gpu-pill {
	background: #1aff7a18;
	border: 1px solid #1aff7a44;
	color: #1aff7a;
	font-size: 0.6em;
	font-weight: 800;
	padding: 4px 12px;
	border-radius: 20px;
	letter-spacing: 1.5px;
	text-transform: uppercase;
	}

	.upload-area {
	background: #0d0d1a;
	border: 2px dashed #1e1e35;
	border-radius: 18px;
	overflow: hidden;
	margin-bottom: 8px;
	min-height: 260px;
	display: flex;
	align-items: center;
	justify-content: center;
	}

	.video-out {
	background: #0d0d1a;
	border: 1px solid #16162a;
	border-radius: 18px;
	overflow: hidden;
	margin-bottom: 8px;
	min-height: 260px;
	}

	.card {
	background: #0d0d1a;
	border: 1px solid #16162a;
	border-radius: 14px;
	padding: 14px;
	margin-bottom: 8px;
	}
	.card-label {
	color: #3d3060;
	font-size: 0.62em;
	font-weight: 800;
	text-transform: uppercase;
	letter-spacing: 2px;
	margin-bottom: 8px;
	}

	textarea {
	background: transparent !important;
	border: none !important;
	color: #c8b8f0 !important;
	font-size: 15px !important;
	line-height: 1.6 !important;
	padding: 0 !important;
	resize: none !important;
	box-shadow: none !important;
	width: 100% !important;
	outline: none !important;
	}
	textarea::placeholder { color: #252038 !important; }
	textarea:focus {
	outline: none !important;
	box-shadow: none !important;
	border: none !important;
	}

	.gradio-accordion {
	background: #0d0d1a !important;
	border: 1px solid #16162a !important;
	border-radius: 14px !important;
	margin-bottom: 8px !important;
	overflow: hidden !important;
	}
	.gradio-accordion .label-wrap button {
	color: #4a3a6a !important;
	font-size: 0.72em !important;
	font-weight: 700 !important;
	text-transform: uppercase !important;
	letter-spacing: 1.5px !important;
	padding: 12px 16px !important;
	}

	.gradio-slider {
	background: transparent !important;
	border: none !important;
	padding: 4px 0 10px !important;
	}
	input[type=range] {
	accent-color: #3366bb !important;
	width: 100% !important;
	}

	input[type=number] {
	background: #0a0a14 !important;
	border: 1px solid #18182a !important;
	border-radius: 10px !important;
	color: #7799cc !important;
	font-size: 13px !important;
	padding: 8px 10px !important;
	}

	input[type=checkbox] { accent-color: #3366bb !important; }
	.gradio-checkbox label span {
	color: #4a3a6a !important;
	font-size: 0.75em !important;
	font-weight: 600 !important;
	}

	label > span:first-child {
	color: #3a2d55 !important;
	font-size: 0.7em !important;
	font-weight: 700 !important;
	text-transform: uppercase !important;
	letter-spacing: 1px !important;
	}

	.seed-out input[type=number] {
	background: transparent !important;
	border: none !important;
	color: #2e2848 !important;
	font-size: 0.7em !important;
	text-align: center !important;
	padding: 2px !important;
	}

	.hint-box {
	background: #0a0a14;
	border: 1px solid #111122;
	border-radius: 10px;
	padding: 10px 14px;
	color: #443366;
	font-size: 0.72em;
	line-height: 1.7;
	margin-bottom: 8px;
	word-break: break-word;
	}

	.gen-btn button {
	background: linear-gradient(135deg, #1a3aaa 0%, #0e1e77 100%) !important;
	border: 1px solid #2255cc !important;
	border-radius: 14px !important;
	color: #fff !important;
	font-size: 0.88em !important;
	font-weight: 900 !important;
	padding: 17px !important;
	width: 100% !important;
	letter-spacing: 2px !important;
	text-transform: uppercase !important;
	box-shadow: 0 4px 24px #1a3aaa55 !important;
	transition: all 0.15s ease !important;
	margin-top: 6px !important;
	}
	.gen-btn button:hover {
	box-shadow: 0 6px 32px #1a3aaa99 !important;
	transform: translateY(-1px) !important;
	}
	.gen-btn button:active {
	transform: scale(0.98) !important;
	box-shadow: 0 2px 12px #1a3aaa33 !important;
	}

	footer, .built-with { display: none !important; }
	"""

	# ── UI ────────────────────────────────────────────────────────────────────────
	# Single-column layout, top to bottom: header bar, usage hint, image upload,
	# motion-prompt card, generate button, outputs (video, seed, expanded prompt)
	# and a collapsed settings accordion.
	with gr.Blocks(css=css, title="VideoGen") as demo:

	    gr.HTML("""
	<div class="topbar">
	<span class="topbar-title">🎬 Wan I2V — Image to Video</span>
	<span class="gpu-pill">⚡ ZeroGPU</span>
	</div>
	""")

	    gr.HTML("""
	<div class="hint-box">
	Upload any image → describe the motion → get a ~3–5 second 480P video.<br><br>
	<strong>Motion tips:</strong> describe what moves, not what's in the image.<br>
	e.g. <em>"hair gently blowing, eyes blink, camera slowly pulls back"</em>
	</div>
	""")

	    input_image = gr.Image(
	        label="Input Image",
	        type="numpy",
	        height=300,
	        elem_classes="upload-area",
	    )

	    # The card must be a real container component. An opening
	    # '<div class="card">' in one gr.HTML and a '</div>' in another cannot
	    # enclose the textbox, because every gr.HTML is rendered as its own
	    # self-contained element.
	    with gr.Group(elem_classes="card"):
	        gr.HTML('<div class="card-label">✦ Motion — what should move?</div>')
	        motion_prompt = gr.Textbox(
	            show_label=False,
	            placeholder="hair gently blowing, eyes blinking slowly, soft light shimmer...",
	            lines=2,
	        )

	    generate_btn = gr.Button(
	        "Generate Video ✦",
	        variant="primary",
	        size="lg",
	        elem_classes="gen-btn",
	    )

	    output_video = gr.Video(
	        label="Generated Video",
	        elem_classes="video-out",
	        height=300,
	    )

	    # Read-only echo of the seed that generate_video() actually used.
	    used_seed = gr.Number(
	        label="seed",
	        interactive=False,
	        elem_classes="seed-out",
	    )

	    # Shows the expanded motion prompt returned by generate_video().
	    expanded_out = gr.Markdown(
	        value="",
	        elem_classes="hint-box",
	    )

	    with gr.Accordion("⚙️ Settings", open=False):
	        gr.HTML('<div style="height:6px"></div>')

	        # Selectable frame counts: 17, 33, 49, 65 and 81.
	        num_frames = gr.Slider(
	            minimum=17,
	            maximum=81,
	            value=49,
	            step=16,
	            label="Frames — 17≈1s 49≈3s 81≈5s (at 16fps)",
	        )
	        guidance = gr.Slider(
	            minimum=1.0,
	            maximum=10.0,
	            value=5.0,
	            step=0.5,
	            label="Guidance Scale",
	        )
	        with gr.Row():
	            seed = gr.Number(
	                label="Seed",
	                value=42,
	                precision=0,
	                minimum=0,
	                maximum=2**32 - 1,
	                scale=3,
	            )
	            randomize = gr.Checkbox(
	                label="Random seed",
	                value=True,
	                scale=1,
	            )

	    # Input order must match the positional parameters of generate_video();
	    # output order must match the tuple it returns.
	    generate_btn.click(
	        fn=generate_video,
	        inputs=[
	            input_image, motion_prompt, num_frames,
	            guidance, seed, randomize,
	        ],
	        outputs=[output_video, used_seed, expanded_out],
	    )

	demo.launch()