# Spaces: Running on Zero (Hugging Face ZeroGPU Space)
import gc
import json
import os
import random
import tempfile
import time
from datetime import datetime
from io import BytesIO

import boto3
import gradio as gr
import numpy as np
import spaces
import torch
from diffusers.models.transformers.transformer_wan import WanTransformer3DModel
from diffusers.pipelines.wan.pipeline_wan_i2v import WanImageToVideoPipeline
from diffusers.utils import export_to_video, load_image
from PIL import Image
from torchao.quantization import (
    Float8DynamicActivationFloat8WeightConfig,
    Int8WeightOnlyConfig,
    quantize_,
)

import aoti
# ---------------------------------------------------------------------------
# Model and sampling configuration.
# ---------------------------------------------------------------------------
MODEL_ID = "Wan-AI/Wan2.2-I2V-A14B-Diffusers"

# Output-resolution constraints (pixels). Final dimensions are snapped to a
# multiple of MULTIPLE_OF and clamped into [MIN_DIM, MAX_DIM]; square inputs
# map to SQUARE_DIM x SQUARE_DIM.
MAX_DIM = 832
MIN_DIM = 480
SQUARE_DIM = 640
MULTIPLE_OF = 16

MAX_SEED = np.iinfo(np.int32).max  # largest RNG seed accepted by the UI

FIXED_FPS = 24          # videos are always exported at this frame rate
MIN_FRAMES_MODEL = 8    # model's lower bound on frame count
MAX_FRAMES_MODEL = 120  # model's upper bound on frame count

# Duration slider limits (seconds), derived from the frame-count bounds.
MIN_DURATION = round(MIN_FRAMES_MODEL / FIXED_FPS, 1)
MAX_DURATION = round(MAX_FRAMES_MODEL / FIXED_FPS, 1)
# ---------------------------------------------------------------------------
# Pipeline construction: load both Wan 2.2 transformers in bf16 directly onto
# the GPU, fuse the Lightx2v step-distillation LoRA into both, quantize, and
# load ahead-of-time compiled transformer blocks.
# ---------------------------------------------------------------------------
pipe = WanImageToVideoPipeline.from_pretrained(MODEL_ID,
    transformer=WanTransformer3DModel.from_pretrained('cbensimon/Wan2.2-I2V-A14B-bf16-Diffusers',
        subfolder='transformer',
        torch_dtype=torch.bfloat16,
        device_map='cuda',
    ),
    transformer_2=WanTransformer3DModel.from_pretrained('cbensimon/Wan2.2-I2V-A14B-bf16-Diffusers',
        subfolder='transformer_2',
        torch_dtype=torch.bfloat16,
        device_map='cuda',
    ),
    torch_dtype=torch.bfloat16,
).to('cuda')

# Load the same distillation LoRA twice, under separate adapter names, so one
# copy can be routed into transformer_2; then fuse and drop the LoRA weights
# so inference runs on plain fused weights.
pipe.load_lora_weights(
    "Kijai/WanVideo_comfy",
    weight_name="Lightx2v/lightx2v_I2V_14B_480p_cfg_step_distill_rank128_bf16.safetensors",
    adapter_name="lightx2v"
)
kwargs_lora = {}
kwargs_lora["load_into_transformer_2"] = True  # route this copy into transformer_2
pipe.load_lora_weights(
    "Kijai/WanVideo_comfy",
    weight_name="Lightx2v/lightx2v_I2V_14B_480p_cfg_step_distill_rank128_bf16.safetensors",
    adapter_name="lightx2v_2", **kwargs_lora
)
pipe.set_adapters(["lightx2v", "lightx2v_2"], adapter_weights=[1., 1.])
# NOTE(review): the fuse scales deliberately differ (3.0 for transformer,
# 1.0 for transformer_2) — presumably tuned per expert; confirm upstream.
pipe.fuse_lora(adapter_names=["lightx2v"], lora_scale=3., components=["transformer"])
pipe.fuse_lora(adapter_names=["lightx2v_2"], lora_scale=1., components=["transformer_2"])
pipe.unload_lora_weights()

# Quantize: int8 weight-only for the text encoder, dynamic fp8 for both
# transformers — matching the 'fp8da' AoT-compiled block variant loaded below.
quantize_(pipe.text_encoder, Int8WeightOnlyConfig())
quantize_(pipe.transformer, Float8DynamicActivationFloat8WeightConfig())
quantize_(pipe.transformer_2, Float8DynamicActivationFloat8WeightConfig())
aoti.aoti_blocks_load(pipe.transformer, 'zerogpu-aoti/Wan2', variant='fp8da')
aoti.aoti_blocks_load(pipe.transformer_2, 'zerogpu-aoti/Wan2', variant='fp8da')

# Default prompt texts surfaced in the UI.
default_prompt_i2v = "make this image come alive, cinematic motion, smooth animation"
default_negative_prompt = "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards, watermark, text, signature"
class calculateDuration:
    """Context manager that logs wall-clock time spent inside a `with` block.

    Prints a start line on entry and an elapsed-time line on exit; the
    measured interval is left on the instance as ``elapsed_time``.
    """

    # Timestamp layout shared by the entry and exit log lines.
    _TS_FORMAT = "%Y-%m-%d %H:%M:%S"

    def __init__(self, activity_name=""):
        # Optional label included in both log lines.
        self.activity_name = activity_name

    def __enter__(self):
        self.start_time = time.time()
        self.start_time_formatted = time.strftime(self._TS_FORMAT, time.localtime(self.start_time))
        print(f"Activity: {self.activity_name}, Start time: {self.start_time_formatted}")
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.end_time = time.time()
        self.elapsed_time = self.end_time - self.start_time
        self.end_time_formatted = time.strftime(self._TS_FORMAT, time.localtime(self.end_time))
        # Single print path: the label segment collapses when no name was given.
        label = f" for {self.activity_name}" if self.activity_name else ""
        print(f"Elapsed time{label}: {self.elapsed_time:.6f} seconds")
def resize_image(image: Image.Image) -> Image.Image:
    """Resize *image* to dimensions the model accepts.

    Square inputs map to SQUARE_DIM x SQUARE_DIM. Inputs with an aspect
    ratio beyond the supported range are center-cropped to the most extreme
    supported ratio first; the result is then scaled so the long side is
    MAX_DIM, snapped to a multiple of MULTIPLE_OF, and clamped into
    [MIN_DIM, MAX_DIM].
    """
    width, height = image.size

    # Square input: fixed square output, nothing to crop.
    if width == height:
        return image.resize((SQUARE_DIM, SQUARE_DIM), Image.LANCZOS)

    aspect_ratio = width / height
    max_ratio = MAX_DIM / MIN_DIM  # widest supported shape (e.g. 832x480)
    min_ratio = MIN_DIM / MAX_DIM  # tallest supported shape (e.g. 480x832)

    cropped = image
    if aspect_ratio > max_ratio:
        # Wider than supported: center-crop width down to the widest ratio.
        target_w, target_h = MAX_DIM, MIN_DIM
        crop_w = int(round(height * max_ratio))
        x0 = (width - crop_w) // 2
        cropped = image.crop((x0, 0, x0 + crop_w, height))
    elif aspect_ratio < min_ratio:
        # Taller than supported: center-crop height down to the tallest ratio.
        target_w, target_h = MIN_DIM, MAX_DIM
        crop_h = int(round(width / min_ratio))
        y0 = (height - crop_h) // 2
        cropped = image.crop((0, y0, width, y0 + crop_h))
    elif width > height:
        # Landscape within range: pin the long side to MAX_DIM.
        target_w = MAX_DIM
        target_h = int(round(target_w / aspect_ratio))
    else:
        # Portrait within range: pin the long side to MAX_DIM.
        target_h = MAX_DIM
        target_w = int(round(target_h * aspect_ratio))

    # Snap to the model's multiple-of requirement, then clamp to bounds.
    final_w = round(target_w / MULTIPLE_OF) * MULTIPLE_OF
    final_h = round(target_h / MULTIPLE_OF) * MULTIPLE_OF
    final_w = max(MIN_DIM, min(MAX_DIM, final_w))
    final_h = max(MIN_DIM, min(MAX_DIM, final_h))

    return cropped.resize((final_w, final_h), Image.LANCZOS)
def get_num_frames(duration_seconds: float) -> int:
    """Convert a duration in seconds into a frame count the model accepts.

    The raw count (duration * FIXED_FPS, rounded) is clamped to the model's
    [MIN_FRAMES_MODEL, MAX_FRAMES_MODEL] range; the result is 1 + that
    clamped count, mirroring the pipeline's frame-count convention.
    """
    requested = int(round(duration_seconds * FIXED_FPS))
    clamped = max(MIN_FRAMES_MODEL, min(MAX_FRAMES_MODEL, requested))
    return 1 + clamped
def upload_video_to_r2(video_file, account_id, access_key, secret_key, bucket_name):
    """Upload a local video file to a Cloudflare R2 bucket via the S3 API.

    Parameters:
        video_file: path to the local .mp4 file to upload.
        account_id: Cloudflare account id (forms the R2 endpoint URL).
        access_key / secret_key: R2 S3-compatible credentials.
        bucket_name: destination bucket.

    Returns:
        The remote object key inside the bucket, e.g.
        "generated_videos/2024/01/02/123456_42.mp4".
    """
    with calculateDuration("Upload video"):
        # R2 exposes an S3-compatible endpoint scoped to the account.
        connectionUrl = f"https://{account_id}.r2.cloudflarestorage.com"
        s3 = boto3.client(
            's3',
            endpoint_url=connectionUrl,
            region_name='auto',
            aws_access_key_id=access_key,
            aws_secret_access_key=secret_key
        )
        # Timestamped path plus a random suffix keeps object keys unique.
        current_time = datetime.now().strftime("%Y/%m/%d/%H%M%S")
        video_remote_path = f"generated_videos/{current_time}_{random.randint(0, MAX_SEED)}.mp4"
        with open(video_file, "rb") as f:  # key fix: stream the file object instead of reading it into memory
            s3.upload_fileobj(f, bucket_name, video_remote_path)
        print("upload finish", video_remote_path)
        return video_remote_path
def get_duration(
    image_url,
    prompt,
    height,
    width,
    negative_prompt,
    duration_seconds,
    guidance_scale,
    steps,
    seed,
    randomize_seed,
    upload_to_r2,
    account_id,
    access_key,
    secret_key,
    bucket,
    progress
):
    """Estimate the wall-clock seconds a generation request will take.

    The estimate scales with the workload (frames * width * height relative
    to a measured baseline, raised to the power 1.5) times the step count,
    plus a fixed overhead. The signature mirrors generate_video so it can
    serve as a per-request duration estimator; most parameters are unused.
    """
    BASE_FRAMES_HEIGHT_WIDTH = 81 * 832 * 624  # baseline workload size
    BASE_STEP_DURATION = 15                    # seconds per step at baseline

    # The real output size comes from resize_image, not the height/width args.
    resized = resize_image(load_image(image_url))
    actual_w, actual_h = resized.size
    frames = get_num_frames(duration_seconds)

    workload = frames * actual_w * actual_h / BASE_FRAMES_HEIGHT_WIDTH
    per_step = BASE_STEP_DURATION * workload ** 1.5
    return 10 + int(steps) * per_step
def generate_video(image_url,
                   prompt,
                   height,
                   width,
                   negative_prompt,
                   duration_seconds,
                   guidance_scale,
                   steps,
                   seed,
                   randomize_seed,
                   upload_to_r2,
                   account_id,
                   access_key,
                   secret_key,
                   bucket,
                   progress=gr.Progress(track_tqdm=True)):
    """Generate an image-to-video clip with the Wan 2.2 pipeline.

    Downloads the image at `image_url`, resizes it to a supported resolution
    (the `height`/`width` arguments are accepted but not used — resize_image
    decides the final size), runs the diffusion pipeline, and exports an
    .mp4 at FIXED_FPS. Optionally uploads the result to R2.

    Returns:
        A JSON string {"status", "message", "url"}; "url" is the R2 object
        key when upload_to_r2 is set, otherwise the local temp-file path.

    Raises:
        gr.Error: when image_url is missing.

    NOTE(review): `import spaces` and get_duration() are otherwise unused in
    this file — this function was presumably decorated with
    @spaces.GPU(duration=get_duration) on the ZeroGPU Space; confirm whether
    the decorator was lost during extraction.
    """
    if image_url is None:
        raise gr.Error("Please upload an input image.")
    input_image = load_image(image_url)
    num_frames = get_num_frames(duration_seconds)
    # Fresh random seed unless the caller pinned one via the UI.
    current_seed = random.randint(0, MAX_SEED) if randomize_seed else int(seed)
    resized_image = resize_image(input_image)
    print("final size:", resized_image.width, resized_image.height)
    with torch.inference_mode():
        output_frames_list = pipe(
            image=resized_image,
            prompt=prompt,
            negative_prompt=negative_prompt,
            height=resized_image.height,
            width=resized_image.width,
            num_frames=num_frames,
            guidance_scale=float(guidance_scale),
            # Both of the pipeline's guidance scales get the same value here.
            guidance_scale_2=float(guidance_scale),
            num_inference_steps=int(steps),
            generator=torch.Generator(device="cuda").manual_seed(current_seed)
        ).frames[0]
    # delete=False: the file must outlive this context for export/upload/return.
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmpfile:
        video_path = tmpfile.name
    export_to_video(output_frames_list, video_path, fps=FIXED_FPS)
    if upload_to_r2:
        video_url = upload_video_to_r2(video_path, account_id, access_key, secret_key, bucket)
        result = {"status": "success", "message": "upload video success", "url": video_url}
    else:
        result = {"status": "success", "message": "Image generated but not uploaded", "url": video_path}
    return json.dumps(result)
# ---------------------------------------------------------------------------
# Gradio UI: image-URL input, prompt/duration controls, optional R2 upload
# credentials, and a JSON result panel wired to generate_video.
# ---------------------------------------------------------------------------
with gr.Blocks() as demo:
    # NOTE(review): headline says Wan 2.1 + CausVid but the pipeline above
    # loads Wan 2.2 with the Lightx2v LoRA — user-visible text left unchanged.
    gr.Markdown("# Fast 4 steps Wan 2.1 I2V (14B) with CausVid LoRA")
    gr.Markdown("[CausVid](https://github.com/tianweiy/CausVid) is a distilled version of Wan 2.1 to run faster in just 4-8 steps, [extracted as LoRA by Kijai](https://huggingface.co/Kijai/WanVideo_comfy/blob/main/Wan21_CausVid_14B_T2V_lora_rank32.safetensors) and is compatible with 🧨 diffusers")
    with gr.Row():
        with gr.Column():
            # NOTE(review): label typo "Orginal" is user-visible; left unchanged here.
            image_url_input = gr.Textbox(
                label="Orginal image url",
                show_label=True,
                max_lines=1,
                placeholder="Enter image url for inpainting",
                container=False
            )
            prompt_input = gr.Textbox(label="Prompt", value=default_prompt_i2v)
            duration_seconds_input = gr.Slider(minimum=MIN_DURATION, maximum=MAX_DURATION, step=0.1, value=3.5, label="Duration (seconds)", info=f"Clamped to model's {MIN_FRAMES_MODEL}-{MAX_FRAMES_MODEL} frames at {FIXED_FPS}fps.")
            with gr.Accordion("Advanced Settings", open=False):
                negative_prompt_input = gr.Textbox(label="Negative Prompt", value=default_negative_prompt, lines=3)
                seed_input = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, step=1, value=42, interactive=True)
                randomize_seed_checkbox = gr.Checkbox(label="Randomize seed", value=True, interactive=True)
                # These two sliders are passed to generate_video but ignored
                # there: the output size is decided by resize_image instead.
                with gr.Row():
                    height_input = gr.Slider(minimum=512, maximum=1024, step=1, value=640, label=f"Output Height")
                    width_input = gr.Slider(minimum=512, maximum=1024, step=1, value=540, label=f"Output Width")
                steps_slider = gr.Slider(minimum=1, maximum=30, step=1, value=4, label="Inference Steps")
                guidance_scale_input = gr.Slider(minimum=0.0, maximum=20.0, step=0.5, value=1.0, label="Guidance Scale", visible=True)
            # Cloudflare R2 credentials for optional result upload.
            with gr.Accordion("R2 Settings", open=False):
                upload_to_r2 = gr.Checkbox(label="Upload to R2", value=False)
                with gr.Row():
                    account_id = gr.Textbox(label="Account Id", placeholder="Enter R2 account id", value="")
                    bucket = gr.Textbox(label="Bucket Name", placeholder="Enter R2 bucket name here", value="")
                with gr.Row():
                    access_key = gr.Textbox(label="Access Key", placeholder="Enter R2 access key here", value="")
                    secret_key = gr.Textbox(label="Secret Key", placeholder="Enter R2 secret key here", value="")
            generate_button = gr.Button("Generate Video", variant="primary")
        with gr.Column():
            output_json_component = gr.Code(label="JSON Result", language="json", value="{}")
    # Order must match generate_video's positional parameters.
    ui_inputs = [
        image_url_input, prompt_input, height_input, width_input,
        negative_prompt_input, duration_seconds_input,
        guidance_scale_input, steps_slider, seed_input, randomize_seed_checkbox,
        upload_to_r2, account_id, access_key, secret_key, bucket
    ]
    generate_button.click(
        fn=generate_video,
        inputs=ui_inputs,
        outputs=output_json_component,
        api_name="predict"
    )

if __name__ == "__main__":
    # api_open=True exposes the queued endpoint for API clients.
    demo.queue(api_open=True)
    demo.launch(share=True)