|
|
import spaces |
|
|
import torch |
|
|
from diffusers.pipelines.wan.pipeline_wan_i2v import WanImageToVideoPipeline |
|
|
from diffusers.models.transformers.transformer_wan import WanTransformer3DModel |
|
|
from diffusers.utils import export_to_video |
|
|
import gradio as gr |
|
|
import tempfile |
|
|
import numpy as np |
|
|
from PIL import Image |
|
|
import random |
|
|
from datetime import datetime |
|
|
import os |
|
|
import time |
|
|
from PIL import Image |
|
|
import json |
|
|
import boto3 |
|
|
from io import BytesIO |
|
|
from diffusers.utils import load_image |
|
|
import random |
|
|
import gc |
|
|
|
|
|
from torchao.quantization import quantize_ |
|
|
from torchao.quantization import Float8DynamicActivationFloat8WeightConfig |
|
|
from torchao.quantization import Int8WeightOnlyConfig |
|
|
import aoti |
|
|
|
|
|
|
|
|
|
|
|
# Hugging Face repo id of the image-to-video pipeline loaded below.
MODEL_ID = "Wan-AI/Wan2.2-I2V-A14B-Diffusers"

# Output-size limits (pixels) applied by resize_image.
MAX_DIM = 832
MIN_DIM = 480
SQUARE_DIM = 640    # edge length used when the input image is square
MULTIPLE_OF = 16    # final width/height are rounded to a multiple of this

# Upper bound for the seed slider and for randomly drawn seeds (int32 max).
MAX_SEED = np.iinfo(np.int32).max

# Frame rate of the exported video and the frame-count clamp range.
FIXED_FPS = 24
MIN_FRAMES_MODEL = 8
MAX_FRAMES_MODEL = 120

# Bounds of the duration slider in seconds, derived from the frame limits.
MIN_DURATION, MAX_DURATION = (
    round(frame_limit / FIXED_FPS, 1)
    for frame_limit in (MIN_FRAMES_MODEL, MAX_FRAMES_MODEL)
)
|
|
|
|
|
|
|
|
# Repo that provides the bf16 weights for both transformers of the pipeline.
_TRANSFORMER_REPO = 'cbensimon/Wan2.2-I2V-A14B-bf16-Diffusers'


def _load_transformer(subfolder):
    """Load one WanTransformer3DModel from `subfolder` of the bf16 repo onto CUDA."""
    return WanTransformer3DModel.from_pretrained(
        _TRANSFORMER_REPO,
        subfolder=subfolder,
        torch_dtype=torch.bfloat16,
        device_map='cuda',
    )


# Build the pipeline from MODEL_ID, substituting the two separately loaded
# transformers, then move the assembled pipeline to the GPU.
pipe = WanImageToVideoPipeline.from_pretrained(
    MODEL_ID,
    transformer=_load_transformer('transformer'),
    transformer_2=_load_transformer('transformer_2'),
    torch_dtype=torch.bfloat16,
)
pipe = pipe.to('cuda')
|
|
|
|
|
# The same LoRA weight file is loaded twice under different adapter names:
# once with default arguments and once with load_into_transformer_2=True.
_LORA_REPO = "Kijai/WanVideo_comfy"
_LORA_WEIGHT_NAME = "Lightx2v/lightx2v_I2V_14B_480p_cfg_step_distill_rank128_bf16.safetensors"

pipe.load_lora_weights(
    _LORA_REPO,
    weight_name=_LORA_WEIGHT_NAME,
    adapter_name="lightx2v",
)

kwargs_lora = {"load_into_transformer_2": True}
pipe.load_lora_weights(
    _LORA_REPO,
    weight_name=_LORA_WEIGHT_NAME,
    adapter_name="lightx2v_2",
    **kwargs_lora,
)

# Activate both adapters, fuse each into its transformer (scale 3.0 for
# `transformer`, 1.0 for `transformer_2`), then drop the LoRA weights.
pipe.set_adapters(["lightx2v", "lightx2v_2"], adapter_weights=[1., 1.])
pipe.fuse_lora(adapter_names=["lightx2v"], lora_scale=3., components=["transformer"])
pipe.fuse_lora(adapter_names=["lightx2v_2"], lora_scale=1., components=["transformer_2"])
pipe.unload_lora_weights()
|
|
|
|
|
# Quantize the text encoder with the int8 weight-only config, and both
# transformers with the float8 dynamic-activation / float8-weight config.
quantize_(pipe.text_encoder, Int8WeightOnlyConfig())
for _transformer in (pipe.transformer, pipe.transformer_2):
    quantize_(_transformer, Float8DynamicActivationFloat8WeightConfig())

# After quantization, load the 'fp8da' variant of the 'zerogpu-aoti/Wan2'
# blocks into each transformer (same order as the quantization above).
for _transformer in (pipe.transformer, pipe.transformer_2):
    aoti.aoti_blocks_load(_transformer, 'zerogpu-aoti/Wan2', variant='fp8da')
|
|
|
|
|
|
|
|
|
|
|
# Initial value of the "Prompt" textbox.
default_prompt_i2v = "make this image come alive, cinematic motion, smooth animation"

# Initial value of the "Negative Prompt" textbox (one comma-separated string,
# split across lines here only for readability).
default_negative_prompt = (
    "Bright tones, overexposed, static, blurred details, subtitles, style, works, "
    "paintings, images, static, overall gray, worst quality, low quality, "
    "JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, "
    "poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, "
    "still picture, messy background, three legs, many people in the background, "
    "walking backwards, watermark, text, signature"
)
|
|
|
|
|
|
|
|
class calculateDuration:
    """Context manager that prints when an activity starts and how long it took.

    On entry it records the wall-clock start time and prints it; on exit it
    records the wall-clock end time and prints the elapsed seconds. The
    elapsed time is measured with ``time.perf_counter()`` so that system
    clock adjustments during the activity cannot skew (or negate) it.

    Attributes set while the context is active / after it exits:
        start_time, end_time: wall-clock timestamps from ``time.time()``.
        start_time_formatted, end_time_formatted: local-time strings.
        elapsed_time: duration of the ``with`` body in seconds.

    Args:
        activity_name: Optional label included in the printed messages.
    """

    _TIME_FORMAT = "%Y-%m-%d %H:%M:%S"

    def __init__(self, activity_name=""):
        self.activity_name = activity_name

    def __enter__(self):
        self.start_time = time.time()
        # Separate monotonic reference used only for the elapsed computation.
        self._start_counter = time.perf_counter()
        self.start_time_formatted = time.strftime(self._TIME_FORMAT, time.localtime(self.start_time))
        print(f"Activity: {self.activity_name}, Start time: {self.start_time_formatted}")
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.elapsed_time = time.perf_counter() - self._start_counter
        self.end_time = time.time()
        self.end_time_formatted = time.strftime(self._TIME_FORMAT, time.localtime(self.end_time))

        if self.activity_name:
            print(f"Elapsed time for {self.activity_name}: {self.elapsed_time:.6f} seconds")
        else:
            print(f"Elapsed time: {self.elapsed_time:.6f} seconds")
        # Never suppress an exception raised inside the with-body.
        return False
|
|
|
|
|
|
|
|
def resize_image(image: Image.Image) -> Image.Image:
    """Fit an image into the allowed output sizes, keeping its aspect ratio where possible.

    Square inputs are resized straight to SQUARE_DIM x SQUARE_DIM. Inputs
    whose aspect ratio lies outside [MIN_DIM/MAX_DIM, MAX_DIM/MIN_DIM] are
    centre-cropped to the nearest limit first. Otherwise the longer side is
    set to MAX_DIM and the other side follows the aspect ratio. Both target
    sides are rounded to a multiple of MULTIPLE_OF and clamped to
    [MIN_DIM, MAX_DIM] before the final LANCZOS resize.
    """
    src_w, src_h = image.size
    if src_w == src_h:
        return image.resize((SQUARE_DIM, SQUARE_DIM), Image.LANCZOS)

    ratio = src_w / src_h
    widest_ratio = MAX_DIM / MIN_DIM
    tallest_ratio = MIN_DIM / MAX_DIM

    def snap(side):
        # Round to the nearest multiple, then clamp into the allowed range.
        snapped = round(side / MULTIPLE_OF) * MULTIPLE_OF
        return max(MIN_DIM, min(MAX_DIM, snapped))

    source = image
    if ratio > widest_ratio:
        # Too wide: keep the full height and crop the width around the centre.
        cropped_w = int(round(src_h * widest_ratio))
        x0 = (src_w - cropped_w) // 2
        source = image.crop((x0, 0, x0 + cropped_w, src_h))
        goal_w, goal_h = MAX_DIM, MIN_DIM
    elif ratio < tallest_ratio:
        # Too tall: keep the full width and crop the height around the centre.
        cropped_h = int(round(src_w / tallest_ratio))
        y0 = (src_h - cropped_h) // 2
        source = image.crop((0, y0, src_w, y0 + cropped_h))
        goal_w, goal_h = MIN_DIM, MAX_DIM
    elif src_w > src_h:
        goal_w = MAX_DIM
        goal_h = int(round(goal_w / ratio))
    else:
        goal_h = MAX_DIM
        goal_w = int(round(goal_h * ratio))

    return source.resize((snap(goal_w), snap(goal_h)), Image.LANCZOS)
|
|
|
|
|
|
|
|
def get_num_frames(duration_seconds: float):
    """Convert a duration in seconds into the frame count passed to the pipeline.

    The duration is multiplied by FIXED_FPS and rounded, clamped to
    [MIN_FRAMES_MODEL, MAX_FRAMES_MODEL], and one extra frame is added.
    """
    requested = int(round(duration_seconds * FIXED_FPS))
    clamped = np.clip(requested, MIN_FRAMES_MODEL, MAX_FRAMES_MODEL)
    return int(clamped) + 1
|
|
|
|
|
|
|
|
|
|
|
def upload_video_to_r2(video_file, account_id, access_key, secret_key, bucket_name):
    """Upload a local video file to an R2 bucket through the S3-compatible API.

    Args:
        video_file: Path of the local file to upload.
        account_id: R2 account id; used to build the endpoint URL.
        access_key: Access key id passed to boto3.
        secret_key: Secret access key passed to boto3.
        bucket_name: Name of the destination bucket.

    Returns:
        The object key the file was stored under:
        ``generated_videos/<YYYY/MM/DD/HHMMSS>_<random int>.mp4``.
    """
    with calculateDuration("Upload video"):
        client = boto3.client(
            's3',
            endpoint_url=f"https://{account_id}.r2.cloudflarestorage.com",
            region_name='auto',
            aws_access_key_id=access_key,
            aws_secret_access_key=secret_key,
        )

        # Timestamped key plus a random suffix to keep object names distinct.
        timestamp = datetime.now().strftime("%Y/%m/%d/%H%M%S")
        suffix = random.randint(0, MAX_SEED)
        object_key = f"generated_videos/{timestamp}_{suffix}.mp4"

        with open(video_file, "rb") as handle:
            client.upload_fileobj(handle, bucket_name, object_key)
        print("upload finish", object_key)

    return object_key
|
|
|
|
|
def get_duration(
    image_url,
    prompt,
    height,
    width,
    negative_prompt,
    duration_seconds,
    guidance_scale,
    steps,
    seed,
    randomize_seed,
    upload_to_r2,
    account_id,
    access_key,
    secret_key,
    bucket,
    progress,
):
    """Estimate a run time in seconds for a generation request.

    Takes the same arguments as generate_video, but only `image_url`,
    `duration_seconds` and `steps` influence the result; the incoming
    `height` and `width` are ignored in favour of the resized image's size.
    The estimate is ``10 + steps * 15 * factor ** 1.5`` where ``factor`` is
    the ratio of frames * width * height to a reference workload of
    81 frames at 832x624.

    NOTE(review): nothing in the visible source references this function
    (generate_video is decorated with a fixed duration) - confirm whether it
    is still needed. It also loads the image from `image_url`.
    """
    reference_workload = 81 * 832 * 624
    seconds_per_reference_step = 15

    resized_w, resized_h = resize_image(load_image(image_url)).size
    frame_count = get_num_frames(duration_seconds)

    workload_ratio = frame_count * resized_w * resized_h / reference_workload
    return 10 + int(steps) * (seconds_per_reference_step * workload_ratio ** 1.5)
|
|
|
|
|
|
|
|
@spaces.GPU(duration=120)
def generate_video(image_url,
                   prompt,
                   height,
                   width,
                   negative_prompt,
                   duration_seconds,
                   guidance_scale,
                   steps,
                   seed,
                   randomize_seed,
                   upload_to_r2,
                   account_id,
                   access_key,
                   secret_key,
                   bucket,
                   progress=gr.Progress(track_tqdm=True)):
    """Generate a video from an image URL and return a JSON status string.

    Args:
        image_url: Location of the input image, handed to `load_image`.
        prompt: Text prompt for the pipeline.
        height, width: Accepted for interface compatibility but unused; the
            output size is taken from `resize_image(input_image)`.
        negative_prompt: Negative text prompt for the pipeline.
        duration_seconds: Requested length; converted by `get_num_frames`.
        guidance_scale: Used for both `guidance_scale` and `guidance_scale_2`.
        steps: Number of inference steps.
        seed: Seed used when `randomize_seed` is false.
        randomize_seed: Draw a random seed in [0, MAX_SEED] instead of `seed`.
        upload_to_r2: When true, upload the video via `upload_video_to_r2`.
        account_id, access_key, secret_key, bucket: R2 connection settings.
        progress: Gradio progress tracker.

    Returns:
        A JSON string with keys "status", "message" and "url". "url" is the
        uploaded object key when `upload_to_r2` is true, otherwise the path
        of the local temporary .mp4 file.

    Raises:
        gr.Error: If no image URL was provided.
    """
    # An untouched textbox may deliver an empty string rather than None, so
    # reject both before attempting to load the image.
    if image_url is None or not str(image_url).strip():
        raise gr.Error("Please upload an input image.")

    input_image = load_image(image_url)
    num_frames = get_num_frames(duration_seconds)

    current_seed = random.randint(0, MAX_SEED) if randomize_seed else int(seed)

    resized_image = resize_image(input_image)
    print("final size:", resized_image.width, resized_image.height)

    with torch.inference_mode():
        output_frames_list = pipe(
            image=resized_image,
            prompt=prompt,
            negative_prompt=negative_prompt,
            height=resized_image.height,
            width=resized_image.width,
            num_frames=num_frames,
            guidance_scale=float(guidance_scale),
            guidance_scale_2=float(guidance_scale),
            num_inference_steps=int(steps),
            generator=torch.Generator(device="cuda").manual_seed(current_seed),
        ).frames[0]

    # Reserve a unique .mp4 path; the file must outlive this block so it is
    # created with delete=False and cleaned up explicitly where possible.
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmpfile:
        video_path = tmpfile.name
    export_to_video(output_frames_list, video_path, fps=FIXED_FPS)

    if upload_to_r2:
        try:
            video_url = upload_video_to_r2(video_path, account_id, access_key, secret_key, bucket)
        finally:
            # The local copy is not referenced after an upload attempt;
            # remove it so temp files do not accumulate (best effort).
            try:
                os.remove(video_path)
            except OSError:
                pass
        result = {"status": "success", "message": "upload video success", "url": video_url}
    else:
        # The local file is the result here, so it is intentionally kept.
        result = {"status": "success", "message": "Video generated but not uploaded", "url": video_path}
    return json.dumps(result)
|
|
|
|
|
|
|
|
# Gradio UI: inputs on the left, JSON result on the right. The button calls
# generate_video and is also exposed over the API as "predict".
with gr.Blocks() as demo:
    gr.Markdown("# Fast 4 steps Wan 2.2 I2V (14B)")
    with gr.Row():
        with gr.Column():
            image_url_input = gr.Textbox(
                label="Original image URL",
                show_label=True,
                max_lines=1,
                placeholder="Enter the URL of the image to animate",
                container=False
            )
            prompt_input = gr.Textbox(label="Prompt", value=default_prompt_i2v)
            duration_seconds_input = gr.Slider(minimum=MIN_DURATION, maximum=MAX_DURATION, step=0.1, value=3.5, label="Duration (seconds)", info=f"Clamped to model's {MIN_FRAMES_MODEL}-{MAX_FRAMES_MODEL} frames at {FIXED_FPS}fps.")

            with gr.Accordion("Advanced Settings", open=False):
                negative_prompt_input = gr.Textbox(label="Negative Prompt", value=default_negative_prompt, lines=3)
                seed_input = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, step=1, value=42, interactive=True)
                randomize_seed_checkbox = gr.Checkbox(label="Randomize seed", value=True, interactive=True)
                # NOTE: generate_video receives these two values but sizes the
                # output from the resized input image instead.
                with gr.Row():
                    height_input = gr.Slider(minimum=480, maximum=1024, step=1, value=640, label="Output Height")
                    width_input = gr.Slider(minimum=480, maximum=1024, step=1, value=540, label="Output Width")
                steps_slider = gr.Slider(minimum=1, maximum=30, step=1, value=4, label="Inference Steps")
                guidance_scale_input = gr.Slider(minimum=0.0, maximum=20.0, step=0.5, value=1.0, label="Guidance Scale", visible=True)

            with gr.Accordion("R2 Settings", open=False):
                upload_to_r2 = gr.Checkbox(label="Upload to R2", value=False)
                with gr.Row():
                    account_id = gr.Textbox(label="Account Id", placeholder="Enter R2 account id", value="")
                    bucket = gr.Textbox(label="Bucket Name", placeholder="Enter R2 bucket name here", value="")

                with gr.Row():
                    access_key = gr.Textbox(label="Access Key", placeholder="Enter R2 access key here", value="")
                    # Mask the secret so it is not displayed in clear text.
                    secret_key = gr.Textbox(label="Secret Key", placeholder="Enter R2 secret key here", value="", type="password")

            generate_button = gr.Button("Generate Video", variant="primary")
        with gr.Column():
            output_json_component = gr.Code(label="JSON Result", language="json", value="{}")

    # Order must match generate_video's positional parameters.
    ui_inputs = [
        image_url_input, prompt_input, height_input, width_input,
        negative_prompt_input, duration_seconds_input,
        guidance_scale_input, steps_slider, seed_input, randomize_seed_checkbox,
        upload_to_r2, account_id, access_key, secret_key, bucket
    ]
    generate_button.click(
        fn=generate_video,
        inputs=ui_inputs,
        outputs=output_json_component,
        api_name="predict"
    )
|
|
|
|
|
# When run as a script: enable the request queue with its API open, then
# start the app with a public share link.
if __name__ == "__main__":
    demo.queue(api_open=True).launch(share=True)