Spaces:

jiuface
/

wan-fast

Running on Zero

App Files Files Community

wan-fast / app.py

jiuface

Update app.py

53db54f verified 2 months ago

raw

history blame contribute delete

12.4 kB

	import spaces
	import torch
	from diffusers.pipelines.wan.pipeline_wan_i2v import WanImageToVideoPipeline
	from diffusers.models.transformers.transformer_wan import WanTransformer3DModel
	from diffusers.utils import export_to_video
	import gradio as gr
	import tempfile
	import numpy as np
	from PIL import Image
	import random
	from datetime import datetime
	import os
	import time
	from PIL import Image
	import json
	import boto3
	from io import BytesIO
	from diffusers.utils import load_image
	import random
	import gc

	from torchao.quantization import quantize_
	from torchao.quantization import Float8DynamicActivationFloat8WeightConfig
	from torchao.quantization import Int8WeightOnlyConfig
	import aoti



	# Hub repository the image-to-video pipeline is loaded from.
	MODEL_ID = "Wan-AI/Wan2.2-I2V-A14B-Diffusers"

	# Output geometry, in pixels: each side is kept within [MIN_DIM, MAX_DIM]
	# and snapped to a multiple of MULTIPLE_OF; square inputs use SQUARE_DIM.
	MIN_DIM, MAX_DIM = 480, 832
	SQUARE_DIM = 640
	MULTIPLE_OF = 16

	# Largest seed value offered by the UI and used for random draws.
	MAX_SEED = np.iinfo(np.int32).max

	# Frame budget: videos are exported at FIXED_FPS and the requested frame
	# count is clamped to [MIN_FRAMES_MODEL, MAX_FRAMES_MODEL].
	FIXED_FPS = 24
	MIN_FRAMES_MODEL, MAX_FRAMES_MODEL = 8, 120

	# Bounds of the duration slider, in seconds, derived from the frame budget.
	MIN_DURATION = round(MIN_FRAMES_MODEL / FIXED_FPS, 1)
	MAX_DURATION = round(MAX_FRAMES_MODEL / FIXED_FPS, 1)


	def _load_transformer(subfolder):
	    """Load one bf16 transformer sub-model from the given subfolder onto CUDA."""
	    return WanTransformer3DModel.from_pretrained(
	        'cbensimon/Wan2.2-I2V-A14B-bf16-Diffusers',
	        subfolder=subfolder,
	        torch_dtype=torch.bfloat16,
	        device_map='cuda',
	    )


	# Build the pipeline around the two separately loaded transformers and move
	# the whole thing to the GPU. This runs once, at import time.
	pipe = WanImageToVideoPipeline.from_pretrained(
	    MODEL_ID,
	    transformer=_load_transformer('transformer'),
	    transformer_2=_load_transformer('transformer_2'),
	    torch_dtype=torch.bfloat16,
	).to('cuda')

	# The same LoRA file is loaded twice under two adapter names; the second
	# load passes load_into_transformer_2=True.
	_LORA_REPO = "Kijai/WanVideo_comfy"
	_LORA_WEIGHTS = "Lightx2v/lightx2v_I2V_14B_480p_cfg_step_distill_rank128_bf16.safetensors"

	pipe.load_lora_weights(_LORA_REPO, weight_name=_LORA_WEIGHTS, adapter_name="lightx2v")
	kwargs_lora = {"load_into_transformer_2": True}
	pipe.load_lora_weights(
	    _LORA_REPO,
	    weight_name=_LORA_WEIGHTS,
	    adapter_name="lightx2v_2",
	    **kwargs_lora,
	)

	# Fuse each adapter into its transformer (scale 3.0 for `transformer`,
	# 1.0 for `transformer_2`), then unload the LoRA weights.
	pipe.set_adapters(["lightx2v", "lightx2v_2"], adapter_weights=[1., 1.])
	pipe.fuse_lora(adapter_names=["lightx2v"], lora_scale=3., components=["transformer"])
	pipe.fuse_lora(adapter_names=["lightx2v_2"], lora_scale=1., components=["transformer_2"])
	pipe.unload_lora_weights()

	# Quantize after fusing: int8 weight-only for the text encoder, float8
	# dynamic-activation / float8-weight for both transformers.
	quantize_(pipe.text_encoder, Int8WeightOnlyConfig())
	for _stage in (pipe.transformer, pipe.transformer_2):
	    quantize_(_stage, Float8DynamicActivationFloat8WeightConfig())

	# Load the 'fp8da' variant of the AOTI blocks for each transformer.
	for _stage in (pipe.transformer, pipe.transformer_2):
	    aoti.aoti_blocks_load(_stage, 'zerogpu-aoti/Wan2', variant='fp8da')



	# Initial value of the "Prompt" textbox in the UI.
	default_prompt_i2v = "make this image come alive, cinematic motion, smooth animation"
	# Initial value of the "Negative Prompt" textbox in the UI.
	default_negative_prompt = "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards, watermark, text, signature"


	class calculateDuration:
	    """Context manager that prints when an activity starts and how long it took.

	    On entry it records and prints the wall-clock start time; on exit it
	    records the end time and prints the elapsed seconds. The values stay
	    available afterwards as ``start_time``, ``end_time``, ``elapsed_time``
	    and the two ``*_formatted`` strings. Exceptions raised inside the
	    ``with`` body are not suppressed.
	    """

	    def __init__(self, activity_name=""):
	        self.activity_name = activity_name

	    @staticmethod
	    def _as_local_text(epoch_seconds):
	        """Render an epoch timestamp as local 'YYYY-MM-DD HH:MM:SS' text."""
	        return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(epoch_seconds))

	    def __enter__(self):
	        started = time.time()
	        self.start_time = started
	        self.start_time_formatted = self._as_local_text(started)
	        print(f"Activity: {self.activity_name}, Start time: {self.start_time_formatted}")
	        return self

	    def __exit__(self, exc_type, exc_value, traceback):
	        finished = time.time()
	        self.end_time = finished
	        self.elapsed_time = finished - self.start_time
	        self.end_time_formatted = self._as_local_text(finished)

	        # Unnamed activities get the shorter report line.
	        if not self.activity_name:
	            print(f"Elapsed time: {self.elapsed_time:.6f} seconds")
	            return
	        print(f"Elapsed time for {self.activity_name}: {self.elapsed_time:.6f} seconds")


	def resize_image(image: Image.Image) -> Image.Image:
	    """
	    Bring an image to a size the model accepts.

	    Square images become SQUARE_DIM x SQUARE_DIM. Images wider than
	    MAX_DIM:MIN_DIM or taller than MIN_DIM:MAX_DIM are center-cropped to
	    that limiting aspect ratio first. Everything else has its longer side
	    set to MAX_DIM and the other side scaled by the aspect ratio. Both
	    sides are then snapped to a multiple of MULTIPLE_OF and clamped to
	    [MIN_DIM, MAX_DIM] before a LANCZOS resize.
	    """
	    src_w, src_h = image.size

	    # Squares skip all of the aspect-ratio handling.
	    if src_w == src_h:
	        return image.resize((SQUARE_DIM, SQUARE_DIM), Image.LANCZOS)

	    ratio = src_w / src_h
	    widest_ratio = MAX_DIM / MIN_DIM
	    tallest_ratio = MIN_DIM / MAX_DIM

	    def snap(length):
	        """Round to the nearest multiple of MULTIPLE_OF, then clamp."""
	        snapped = round(length / MULTIPLE_OF) * MULTIPLE_OF
	        return max(MIN_DIM, min(MAX_DIM, snapped))

	    source = image
	    if ratio > widest_ratio:
	        # Too wide: keep the full height and trim the sides equally.
	        kept_w = int(round(src_h * widest_ratio))
	        x0 = (src_w - kept_w) // 2
	        source = image.crop((x0, 0, x0 + kept_w, src_h))
	        goal_w, goal_h = MAX_DIM, MIN_DIM
	    elif ratio < tallest_ratio:
	        # Too tall: keep the full width and trim top and bottom equally.
	        kept_h = int(round(src_w / tallest_ratio))
	        y0 = (src_h - kept_h) // 2
	        source = image.crop((0, y0, src_w, y0 + kept_h))
	        goal_w, goal_h = MIN_DIM, MAX_DIM
	    elif src_w > src_h:
	        # Landscape within limits: width is pinned, height follows.
	        goal_w = MAX_DIM
	        goal_h = int(round(goal_w / ratio))
	    else:
	        # Portrait within limits: height is pinned, width follows.
	        goal_h = MAX_DIM
	        goal_w = int(round(goal_h * ratio))

	    return source.resize((snap(goal_w), snap(goal_h)), Image.LANCZOS)


	def get_num_frames(duration_seconds: float):
	    """Convert a duration in seconds to the frame count passed to the pipeline.

	    The duration is multiplied by FIXED_FPS and rounded, clamped to
	    [MIN_FRAMES_MODEL, MAX_FRAMES_MODEL], and one extra frame is added.
	    """
	    requested = int(round(duration_seconds * FIXED_FPS))
	    bounded = max(MIN_FRAMES_MODEL, min(MAX_FRAMES_MODEL, requested))
	    return 1 + bounded



	def upload_video_to_r2(video_file, account_id, access_key, secret_key, bucket_name):
	    """Upload a local video file to a Cloudflare R2 bucket and return its object key.

	    Args:
	        video_file: Path of the local file to upload.
	        account_id: R2 account id, used to build the endpoint URL.
	        access_key: Access key id for the S3-compatible client.
	        secret_key: Secret access key for the S3-compatible client.
	        bucket_name: Destination bucket.

	    Returns:
	        The object key, ``generated_videos/<Y/m/d/HMS>_<random int>.mp4``.
	        This is the key within the bucket, not a full URL.
	    """
	    with calculateDuration("Upload video"):
	        endpoint = f"https://{account_id}.r2.cloudflarestorage.com"
	        client = boto3.client(
	            's3',
	            endpoint_url=endpoint,
	            region_name='auto',
	            aws_access_key_id=access_key,
	            aws_secret_access_key=secret_key,
	        )

	        # Date-based prefix plus a random suffix to keep keys distinct.
	        stamp = datetime.now().strftime("%Y/%m/%d/%H%M%S")
	        suffix = random.randint(0, MAX_SEED)
	        object_key = f"generated_videos/{stamp}_{suffix}.mp4"

	        # Key fix: hand boto3 an open binary file object, not a path.
	        with open(video_file, "rb") as handle:
	            client.upload_fileobj(handle, bucket_name, object_key)
	        print("upload finish", object_key)

	    return object_key

	def get_duration(
	    image_url,
	    prompt,
	    height,
	    width,
	    negative_prompt,
	    duration_seconds,
	    guidance_scale,
	    steps,
	    seed,
	    randomize_seed,
	    upload_to_r2,
	    account_id,
	    access_key,
	    secret_key,
	    bucket,
	    progress
	):
	    """Estimate how many seconds a generation with these inputs will take.

	    Takes the same arguments as ``generate_video``; only ``image_url``,
	    ``duration_seconds`` and ``steps`` affect the result. The incoming
	    ``height`` and ``width`` are ignored: the size is taken from the
	    resized input image. The estimate is a fixed 10 seconds plus a
	    per-step cost of 15 seconds scaled by
	    ``(frames * width * height / (81 * 832 * 624)) ** 1.5``.

	    NOTE(review): nothing in this file calls this function.
	    """
	    reference_volume = 81 * 832 * 624
	    reference_step_seconds = 15

	    # The image is fetched here only to learn the output resolution.
	    resized = resize_image(load_image(image_url))
	    out_w, out_h = resized.size

	    volume = get_num_frames(duration_seconds) * out_w * out_h
	    scale = volume / reference_volume
	    per_step_seconds = reference_step_seconds * scale ** 1.5
	    return 10 + int(steps) * per_step_seconds


	@spaces.GPU(duration=120)
	def generate_video(
	    image_url,
	    prompt,
	    height,
	    width,
	    negative_prompt,
	    duration_seconds,
	    guidance_scale,
	    steps,
	    seed,
	    randomize_seed,
	    upload_to_r2,
	    account_id,
	    access_key,
	    secret_key,
	    bucket,
	    progress=gr.Progress(track_tqdm=True),
	):
	    """Generate a video from an input image and return a JSON status string.

	    Args:
	        image_url: Location of the source image, passed to ``load_image``.
	        prompt: Text prompt for the pipeline.
	        height: Accepted for interface compatibility but not used; the
	            output size comes from ``resize_image``.
	        width: Accepted for interface compatibility but not used.
	        negative_prompt: Negative prompt for the pipeline.
	        duration_seconds: Requested length, converted by ``get_num_frames``.
	        guidance_scale: Passed as both ``guidance_scale`` and
	            ``guidance_scale_2``.
	        steps: Number of inference steps.
	        seed: Seed used when ``randomize_seed`` is false.
	        randomize_seed: When true, draw a seed in ``[0, MAX_SEED]`` instead.
	        upload_to_r2: When true, upload the video with ``upload_video_to_r2``.
	        account_id: R2 account id, forwarded to the upload helper.
	        access_key: R2 access key, forwarded to the upload helper.
	        secret_key: R2 secret key, forwarded to the upload helper.
	        bucket: R2 bucket name, forwarded to the upload helper.
	        progress: Gradio progress tracker; not referenced in the body.

	    Returns:
	        A JSON string with ``status``, ``message`` and ``url``. After an
	        upload, ``url`` is the object key returned by the upload helper;
	        otherwise it is the path of the local temporary ``.mp4``.

	    Raises:
	        gr.Error: If ``image_url`` is missing, empty or only whitespace.
	    """
	    # A Gradio Textbox submits "" rather than None when left empty, so a
	    # None-only check lets blank input fall through to load_image.
	    if isinstance(image_url, str):
	        image_url = image_url.strip()
	    if not image_url:
	        raise gr.Error("Please upload an input image.")

	    input_image = load_image(image_url)
	    num_frames = get_num_frames(duration_seconds)

	    current_seed = random.randint(0, MAX_SEED) if randomize_seed else int(seed)

	    resized_image = resize_image(input_image)
	    print("final size:", resized_image.width, resized_image.height)

	    with torch.inference_mode():
	        output_frames_list = pipe(
	            image=resized_image,
	            prompt=prompt,
	            negative_prompt=negative_prompt,
	            height=resized_image.height,
	            width=resized_image.width,
	            num_frames=num_frames,
	            guidance_scale=float(guidance_scale),
	            guidance_scale_2=float(guidance_scale),
	            num_inference_steps=int(steps),
	            generator=torch.Generator(device="cuda").manual_seed(current_seed),
	        ).frames[0]

	    # Reserve a named temp path; delete=False keeps the file after the
	    # handle closes so the exporter can write to it.
	    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmpfile:
	        video_path = tmpfile.name

	    # The local file is kept only when its path is what the caller gets
	    # back. After an upload, or if export/upload fails, it is removed so
	    # temp files do not pile up on a long-running server.
	    keep_local_file = False
	    try:
	        export_to_video(output_frames_list, video_path, fps=FIXED_FPS)
	        if upload_to_r2:
	            video_url = upload_video_to_r2(video_path, account_id, access_key, secret_key, bucket)
	            result = {"status": "success", "message": "upload video success", "url": video_url}
	        else:
	            keep_local_file = True
	            result = {"status": "success", "message": "Image generated but not uploaded", "url": video_path}
	    finally:
	        if not keep_local_file:
	            try:
	                os.remove(video_path)
	            except OSError:
	                # Best-effort cleanup; never mask the real outcome.
	                pass

	    return json.dumps(result)


	# Gradio UI: inputs in the left column, JSON result in the right one.
	# NOTE(review): the original indentation was lost, so the exact nesting of
	# rows and accordions below is a reconstruction - confirm the layout.
	with gr.Blocks() as demo:
	    gr.Markdown("# Fast 4 steps Wan 2.2 I2V (14B)")
	    with gr.Row():
	        with gr.Column():
	            image_url_input = gr.Textbox(
	                label="Orginal image url",
	                show_label=True,
	                max_lines=1,
	                placeholder="Enter image url for inpainting",
	                container=False,
	            )
	            prompt_input = gr.Textbox(label="Prompt", value=default_prompt_i2v)
	            duration_seconds_input = gr.Slider(
	                minimum=MIN_DURATION,
	                maximum=MAX_DURATION,
	                step=0.1,
	                value=3.5,
	                label="Duration (seconds)",
	                info=f"Clamped to model's {MIN_FRAMES_MODEL}-{MAX_FRAMES_MODEL} frames at {FIXED_FPS}fps.",
	            )

	            with gr.Accordion("Advanced Settings", open=False):
	                negative_prompt_input = gr.Textbox(label="Negative Prompt", value=default_negative_prompt, lines=3)
	                seed_input = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, step=1, value=42, interactive=True)
	                randomize_seed_checkbox = gr.Checkbox(label="Randomize seed", value=True, interactive=True)
	                # Height and width are forwarded to generate_video, which
	                # ignores them and sizes the output from the input image.
	                with gr.Row():
	                    height_input = gr.Slider(minimum=480, maximum=1024, step=1, value=640, label="Output Height")
	                    width_input = gr.Slider(minimum=480, maximum=1024, step=1, value=540, label="Output Width")
	                steps_slider = gr.Slider(minimum=1, maximum=30, step=1, value=4, label="Inference Steps")
	                guidance_scale_input = gr.Slider(minimum=0.0, maximum=20.0, step=0.5, value=1.0, label="Guidance Scale", visible=True)

	            with gr.Accordion("R2 Settings", open=False):
	                upload_to_r2 = gr.Checkbox(label="Upload to R2", value=False)
	                with gr.Row():
	                    account_id = gr.Textbox(label="Account Id", placeholder="Enter R2 account id", value="")
	                    bucket = gr.Textbox(label="Bucket Name", placeholder="Enter R2 bucket name here", value="")

	                with gr.Row():
	                    access_key = gr.Textbox(label="Access Key", placeholder="Enter R2 access key here", value="")
	                    # Mask the secret so it is not shown in clear text on
	                    # screen; the submitted value is unchanged.
	                    secret_key = gr.Textbox(
	                        label="Secret Key",
	                        placeholder="Enter R2 secret key here",
	                        value="",
	                        type="password",
	                    )

	            generate_button = gr.Button("Generate Video", variant="primary")
	        with gr.Column():
	            output_json_component = gr.Code(label="JSON Result", language="json", value="{}")

	    # Order must match generate_video's positional parameters.
	    ui_inputs = [
	        image_url_input, prompt_input, height_input, width_input,
	        negative_prompt_input, duration_seconds_input,
	        guidance_scale_input, steps_slider, seed_input, randomize_seed_checkbox,
	        upload_to_r2, account_id, access_key, secret_key, bucket,
	    ]
	    generate_button.click(
	        fn=generate_video,
	        inputs=ui_inputs,
	        outputs=output_json_component,
	        api_name="predict",
	    )

	if __name__ == "__main__":
	    # Turn on the request queue with the API left open, then start the
	    # server and ask for a public share link.
	    demo.queue(api_open=True).launch(share=True)