Tsmith2024
/

wan22-ti2v-endpoint-handler

Model card Files and versions

xet

Community

Tsmith2024 committed on Apr 10

Commit

627335d

verified ·

1 Parent(s): 90b1f7c

Fix: use WanImageToVideoPipeline not WanPipeline

Browse files

Files changed (1) hide show

handler.py +18 -83

handler.py CHANGED Viewed

@@ -1,107 +1,51 @@
-"""
-HuggingFace Inference Endpoint handler for Wan2.2-TI2V-5B
-Accepts first + last frame images, returns interpolated video.
-Input JSON:
-  {
-    "inputs": {
-      "start_image": "<base64 png>",
-      "end_image":   "<base64 png>",
-      "prompt":      "...",
-      "num_frames":  41,
-      "guidance_scale": 5.0,
-      "num_inference_steps": 20
-    }
-  }
-Output JSON:
-  { "video": "<base64 mp4>" }
-"""
 import base64
 import io
 import os
 import tempfile
 from typing import Any, Dict
-import numpy as np
 import torch
 from PIL import Image
-from diffusers import WanPipeline, AutoencoderKLWan
 from diffusers.utils import export_to_video
 class EndpointHandler:
     def __init__(self, path: str = ""):
-        """Load Wan2.2-TI2V-5B from /repository (HF mounts model here)."""
-        model_path = path or "/repository"
         print(f"Loading Wan2.2-TI2V-5B from {model_path}…")
-        dtype = torch.bfloat16
         device = "cuda" if torch.cuda.is_available() else "cpu"
-        # VAE in float32 for better decoding quality
         vae = AutoencoderKLWan.from_pretrained(
-            model_path,
-            subfolder="vae",
-            torch_dtype=torch.float32,
         )
-        self.pipe = WanPipeline.from_pretrained(
-            model_path,
-            vae=vae,
-            torch_dtype=dtype,
         )
         self.pipe.to(device)
-        # Memory optimisation — helps on 24GB GPUs
         self.pipe.enable_attention_slicing()
         self.device = device
         print("✓ Model loaded and ready")
     def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
-        """
-        Called on every request.
-        data = { "inputs": { "start_image": b64, "end_image": b64, "prompt": str, ... } }
-        """
-        inputs = data.get("inputs", data)  # handle both wrapped and unwrapped
-        # Decode images
-        start_img = self._decode_image(inputs["start_image"])
-        end_img   = self._decode_image(inputs["end_image"])
         prompt     = inputs.get("prompt", "Smooth cinematic motion, natural movement")
-        num_frames = int(inputs.get("num_frames", 41))   # must be 4N+1
         guidance   = float(inputs.get("guidance_scale", 5.0))
         steps      = int(inputs.get("num_inference_steps", 20))
         fps        = int(inputs.get("fps", 16))
-        # Ensure num_frames follows 4N+1 pattern
         num_frames = max(9, ((num_frames - 1) // 4) * 4 + 1)
-        # Size from input image (snap to multiples of 32)
-        w, h = start_img.size
-        width  = (w // 32) * 32
-        height = (h // 32) * 32
-        # Build first+last frame conditioning using TI2V mask approach
-        # First frame = start_img, last frame = end_img, middle = grey
-        frames = [start_img.resize((width, height))]
-        grey = Image.new("RGB", (width, height), (128, 128, 128))
-        frames.extend([grey] * (num_frames - 2))
-        frames.append(end_img.resize((width, height)))
-        # Mask: 0 = conditioned (first/last), 1 = free generation (middle)
-        mask_black = Image.new("L", (width, height), 0)
-        mask_white = Image.new("L", (width, height), 255)
-        mask = [mask_black] + [mask_white] * (num_frames - 2) + [mask_black]
         with torch.inference_mode():
             output = self.pipe(
-                image=frames,
-                mask=mask,
                 prompt=prompt,
                 negative_prompt="",
                 height=height,
@@ -110,25 +54,16 @@ class EndpointHandler:
                 guidance_scale=guidance,
                 num_inference_steps=steps,
             ).frames[0]
-        # Export to temp MP4 and encode as base64
         with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp:
             tmp_path = tmp.name
         export_to_video(output, tmp_path, fps=fps)
         with open(tmp_path, "rb") as f:
             video_b64 = base64.b64encode(f.read()).decode("utf-8")
         os.unlink(tmp_path)
         return {"video": video_b64}
     @staticmethod
     def _decode_image(b64_str: str) -> Image.Image:
-        """Decode base64 string to PIL Image."""
-        # Strip data URI prefix if present
         if "," in b64_str:
             b64_str = b64_str.split(",", 1)[1]
-        img_bytes = base64.b64decode(b64_str)
-        return Image.open(io.BytesIO(img_bytes)).convert("RGB")

 import base64
 import io
 import os
 import tempfile
 from typing import Any, Dict
 import torch
 from PIL import Image
+from diffusers import AutoencoderKLWan, WanImageToVideoPipeline
 from diffusers.utils import export_to_video
 class EndpointHandler:
     def __init__(self, path: str = ""):
+        model_path = path or os.environ.get("MODEL_ID", "/repository")
         print(f"Loading Wan2.2-TI2V-5B from {model_path}…")
+        dtype  = torch.bfloat16
         device = "cuda" if torch.cuda.is_available() else "cpu"
         vae = AutoencoderKLWan.from_pretrained(
+            model_path, subfolder="vae", torch_dtype=torch.float32,
         )
+        self.pipe = WanImageToVideoPipeline.from_pretrained(
+            model_path, vae=vae, torch_dtype=dtype,
         )
         self.pipe.to(device)
         self.pipe.enable_attention_slicing()
         self.device = device
         print("✓ Model loaded and ready")
     def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
+        inputs     = data.get("inputs", data)
+        start_img  = self._decode_image(inputs["start_image"])
+        end_img    = self._decode_image(inputs["end_image"])
         prompt     = inputs.get("prompt", "Smooth cinematic motion, natural movement")
+        num_frames = int(inputs.get("num_frames", 41))
         guidance   = float(inputs.get("guidance_scale", 5.0))
         steps      = int(inputs.get("num_inference_steps", 20))
         fps        = int(inputs.get("fps", 16))
         num_frames = max(9, ((num_frames - 1) // 4) * 4 + 1)
+        w, h       = start_img.size
+        width      = (w // 32) * 32
+        height     = (h // 32) * 32
+        start_img  = start_img.resize((width, height))
+        end_img    = end_img.resize((width, height))
         with torch.inference_mode():
             output = self.pipe(
+                image=start_img,
+                last_image=end_img,
                 prompt=prompt,
                 negative_prompt="",
                 height=height,
                 guidance_scale=guidance,
                 num_inference_steps=steps,
             ).frames[0]
         with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp:
             tmp_path = tmp.name
         export_to_video(output, tmp_path, fps=fps)
         with open(tmp_path, "rb") as f:
             video_b64 = base64.b64encode(f.read()).decode("utf-8")
         os.unlink(tmp_path)
         return {"video": video_b64}
     @staticmethod
     def _decode_image(b64_str: str) -> Image.Image:
         if "," in b64_str:
             b64_str = b64_str.split(",", 1)[1]
+        return Image.open(io.BytesIO(base64.b64decode(b64_str))).convert("RGB")