LTX-Video

Diffusers

Safetensors

LTXPipeline

Model card Files Files and versions

xet

Community

qhillerich committed on Feb 17

Commit

a2cbd86

verified ·

1 Parent(s): d37ebe0

Update handler.py

Browse files

Files changed (1) hide show

handler.py +56 -27

handler.py CHANGED Viewed

@@ -2,18 +2,10 @@
 handler.py — Hugging Face Inference Endpoint custom handler
 Outputs: GIF, WebM, ZIP(frames)
-This version includes:
-- Defensive patches to avoid hosted runtime failing with:
-    "`ffmpeg` is not a registered plugin name."
-- Robust frame extraction for shapes like (T,H,W,C), (B,T,H,W,C), (T,C,H,W), (B,T,C,H,W)
-- Output encoders:
-  - GIF: Pillow only (no ffmpeg)
-  - ZIP: PNG frames zipped (no ffmpeg)
-  - WebM: imageio + imageio-ffmpeg via IMAGEIO_FFMPEG_EXE env var (NO executable= arg)
-IMPORTANT:
-- HF gateway often requires top-level { "inputs": {...} }.
-- Send requests wrapped in "inputs".
 """
 from __future__ import annotations
@@ -137,6 +129,13 @@ def _b64(data: bytes) -> str:
     return base64.b64encode(data).decode("utf-8")
 def _clamp_uint8_frame(frame: np.ndarray) -> np.ndarray:
     """
     Normalize a frame into uint8 RGB (H,W,3).
@@ -200,10 +199,6 @@ def _encode_gif(frames: List[np.ndarray], fps: int) -> bytes:
 def _encode_webm(frames: List[np.ndarray], fps: int, quality: str = "good") -> bytes:
     """
     Encode WebM (VP9) via imageio.
-    IMPORTANT:
-    - Do NOT pass executable=...; HF's imageio build can reject that parameter.
-    - We rely on IMAGEIO_FFMPEG_EXE env var set at import time.
     """
     if not frames:
         raise ValueError("No frames to encode WebM.")
@@ -280,6 +275,7 @@ class GenParams:
     seed: Optional[int]
     num_inference_steps: int
     guidance_scale: float
 def _unwrap_inputs(payload: Dict[str, Any]) -> Dict[str, Any]:
@@ -292,8 +288,8 @@ def _parse_request(payload: Dict[str, Any]) -> Tuple[GenParams, List[str], bool,
     data = _unwrap_inputs(payload)
     prompt = str(data.get("prompt") or data.get("inputs") or "").strip()
-    if not prompt:
-        raise ValueError("Missing `prompt` (or `inputs`).")
     negative_prompt = str(data.get("negative_prompt") or "").strip()
@@ -304,6 +300,9 @@ def _parse_request(payload: Dict[str, Any]) -> Tuple[GenParams, List[str], bool,
     seed = data.get("seed")
     seed = int(seed) if seed is not None and str(seed).strip() != "" else None
     num_inference_steps = int(data.get("num_inference_steps") or 30)
     guidance_scale = float(data.get("guidance_scale") or 7.5)
@@ -333,6 +332,7 @@ def _parse_request(payload: Dict[str, Any]) -> Tuple[GenParams, List[str], bool,
         seed=seed,
         num_inference_steps=max(1, num_inference_steps),
         guidance_scale=guidance_scale,
     )
     return params, outputs, return_base64, out_cfg
@@ -347,13 +347,13 @@ class EndpointHandler:
         self.pipe = None
         self.init_error: Optional[str] = None
-        print("=== CUSTOM handler.py LOADED (webm uses IMAGEIO_FFMPEG_EXE only) ===", flush=True)
         print(f"=== HF toolkit patch diag: {HF_TOOLKIT_PATCH_DIAG} ===", flush=True)
         print(f"=== imageio-ffmpeg exe: {_FFMPEG_EXE} ===", flush=True)
         try:
             import torch  # type: ignore
-            from diffusers import DiffusionPipeline  # type: ignore
             device = "cuda" if torch.cuda.is_available() else "cpu"
             dtype = torch.float16 if device == "cuda" else torch.float32
@@ -361,7 +361,14 @@ class EndpointHandler:
             subdir = os.getenv("HF_MODEL_SUBDIR", "").strip()
             model_path = self.repo_path if not subdir else os.path.join(self.repo_path, subdir)
-            self.pipe = DiffusionPipeline.from_pretrained(model_path, torch_dtype=dtype)
             try:
                 self.pipe.to(device)
@@ -373,6 +380,10 @@ class EndpointHandler:
                     self.pipe.enable_vae_slicing()
             except Exception:
                 pass
         except Exception as e:
             self.init_error = str(e)
@@ -434,6 +445,8 @@ class EndpointHandler:
             }
         except Exception as e:
             return {
                 "ok": False,
                 "error": str(e),
@@ -469,21 +482,37 @@ class EndpointHandler:
             "width": params.width,
             "num_inference_steps": params.num_inference_steps,
             "guidance_scale": params.guidance_scale,
         }
         # Try common frame arg names across video pipelines
         output = None
         last_err: Optional[Exception] = None
         for frame_arg in ("num_frames", "video_length", "num_video_frames"):
             try:
                 call_kwargs = dict(kwargs)
                 call_kwargs[frame_arg] = params.num_frames
                 if generator is not None:
                     call_kwargs["generator"] = generator
-                output = self.pipe(**{k: v for k, v in call_kwargs.items() if v is not None})
                 break
             except Exception as e:
                 last_err = e
                 continue
         if output is None:
@@ -491,6 +520,8 @@ class EndpointHandler:
         frames: List[np.ndarray] = []
         # 1) output.frames — may be list OR ndarray/tensor-like
         if hasattr(output, "frames") and getattr(output, "frames") is not None:
             frames_raw = getattr(output, "frames")
@@ -537,7 +568,7 @@ class EndpointHandler:
         # 3) output.images (single frame or list)
         elif hasattr(output, "images") and getattr(output, "images") is not None:
-            imgs = getattr(output, "images")
             if isinstance(imgs, list):
                 frames = [np.array(im) for im in imgs]
             else:
@@ -584,8 +615,6 @@ class EndpointHandler:
             "num_frames": len(frames_u8),
             "height": int(frames_u8[0].shape[0]),
             "width": int(frames_u8[0].shape[1]),
-            "num_inference_steps": params.num_inference_steps,
-            "guidance_scale": params.guidance_scale,
-            "seed": params.seed,
         }
-        return frames_u8, diag

 handler.py — Hugging Face Inference Endpoint custom handler
 Outputs: GIF, WebM, ZIP(frames)
+This version maintains UNIVERSAL compatibility:
+- Defensive argument guessing (num_frames vs video_length)
+- Robust output shape parsing (TBL, BCTHW, etc.)
+- Adds support for image-to-video via a base64 `image` (or `image_base64`) input
 """
 from __future__ import annotations
     return base64.b64encode(data).decode("utf-8")
+def _b64_to_pil(b64_str: str) -> Image.Image:
+    """
+    Decode a base64-encoded image string into an RGB PIL image.
+
+    If the string contains a comma, only the segment that follows the first
+    comma (up to any second comma) is decoded; presumably this drops a
+    data-URI header such as "data:image/png;base64," — confirm with callers.
+    The decoded bytes are opened with PIL and converted to "RGB" mode.
+    """
+    if "," in b64_str:
+        b64_str = b64_str.split(",")[1]
+    data = base64.b64decode(b64_str)
+    return Image.open(io.BytesIO(data)).convert("RGB")
 def _clamp_uint8_frame(frame: np.ndarray) -> np.ndarray:
     """
     Normalize a frame into uint8 RGB (H,W,3).
 def _encode_webm(frames: List[np.ndarray], fps: int, quality: str = "good") -> bytes:
     """
     Encode WebM (VP9) via imageio.
     """
     if not frames:
         raise ValueError("No frames to encode WebM.")
     seed: Optional[int]
     num_inference_steps: int
     guidance_scale: float
+    image_b64: Optional[str] = None
 def _unwrap_inputs(payload: Dict[str, Any]) -> Dict[str, Any]:
     data = _unwrap_inputs(payload)
     prompt = str(data.get("prompt") or data.get("inputs") or "").strip()
+    if not prompt and "image" not in data:
+         pass
     negative_prompt = str(data.get("negative_prompt") or "").strip()
     seed = data.get("seed")
     seed = int(seed) if seed is not None and str(seed).strip() != "" else None
+    # Image input for I2V
+    image_b64 = data.get("image") or data.get("image_base64")
     num_inference_steps = int(data.get("num_inference_steps") or 30)
     guidance_scale = float(data.get("guidance_scale") or 7.5)
         seed=seed,
         num_inference_steps=max(1, num_inference_steps),
         guidance_scale=guidance_scale,
+        image_b64=image_b64
     )
     return params, outputs, return_base64, out_cfg
         self.pipe = None
         self.init_error: Optional[str] = None
+        print("=== CUSTOM handler.py LOADED (Universal Mode) ===", flush=True)
         print(f"=== HF toolkit patch diag: {HF_TOOLKIT_PATCH_DIAG} ===", flush=True)
         print(f"=== imageio-ffmpeg exe: {_FFMPEG_EXE} ===", flush=True)
         try:
             import torch  # type: ignore
+            from diffusers import DiffusionPipeline, LTXConditionPipeline
             device = "cuda" if torch.cuda.is_available() else "cpu"
             dtype = torch.float16 if device == "cuda" else torch.float32
             subdir = os.getenv("HF_MODEL_SUBDIR", "").strip()
             model_path = self.repo_path if not subdir else os.path.join(self.repo_path, subdir)
+            # --- Attempt to load LTXConditionPipeline first (for I2V Support) ---
+            # If that fails (e.g. model isn't LTX or diffusers version old), fallback to generic.
+            try:
+                print("Attempting to load LTXConditionPipeline...", flush=True)
+                self.pipe = LTXConditionPipeline.from_pretrained(model_path, torch_dtype=dtype)
+            except Exception as e:
+                print(f"LTXConditionPipeline load failed ({e}), falling back to generic DiffusionPipeline...", flush=True)
+                self.pipe = DiffusionPipeline.from_pretrained(model_path, torch_dtype=dtype)
             try:
                 self.pipe.to(device)
                     self.pipe.enable_vae_slicing()
             except Exception:
                 pass
+            # Optimization for LTX / newer diffusers
+            if hasattr(self.pipe, "vae") and hasattr(self.pipe.vae, "enable_tiling"):
+                self.pipe.vae.enable_tiling()
         except Exception as e:
             self.init_error = str(e)
             }
         except Exception as e:
+            import traceback
+            traceback.print_exc()
             return {
                 "ok": False,
                 "error": str(e),
             "width": params.width,
             "num_inference_steps": params.num_inference_steps,
             "guidance_scale": params.guidance_scale,
+            # "num_frames" is intentionally OMITTED here to be handled by the loop below
         }
+        # Handle Image-to-Video
+        # Use simple argument passing if pipeline supports it (LTXConditionPipeline does)
+        # If image is present, we pass it.
+        if params.image_b64:
+            print("Received image input, performing Image-to-Video.", flush=True)
+            pil_image = _b64_to_pil(params.image_b64)
+            kwargs["image"] = pil_image
         # Try common frame arg names across video pipelines
         output = None
         last_err: Optional[Exception] = None
+        # UNIVERSAL LOOP: Try all known frame arguments
         for frame_arg in ("num_frames", "video_length", "num_video_frames"):
             try:
                 call_kwargs = dict(kwargs)
                 call_kwargs[frame_arg] = params.num_frames
                 if generator is not None:
                     call_kwargs["generator"] = generator
+                # Filter out None values just in case
+                clean_kwargs = {k: v for k, v in call_kwargs.items() if v is not None}
+                output = self.pipe(**clean_kwargs)
                 break
             except Exception as e:
                 last_err = e
+                # Don't print spam, just try next arg
                 continue
         if output is None:
         frames: List[np.ndarray] = []
+        # UNIVERSAL OUTPUT PARSING: Handle all known shapes
         # 1) output.frames — may be list OR ndarray/tensor-like
         if hasattr(output, "frames") and getattr(output, "frames") is not None:
             frames_raw = getattr(output, "frames")
         # 3) output.images (single frame or list)
         elif hasattr(output, "images") and getattr(output, "images") is not None:
+            imgs = getattr(output, "images")
             if isinstance(imgs, list):
                 frames = [np.array(im) for im in imgs]
             else:
             "num_frames": len(frames_u8),
             "height": int(frames_u8[0].shape[0]),
             "width": int(frames_u8[0].shape[1]),
+            "mode": "i2v" if params.image_b64 else "t2v"
         }
+        return frames_u8, diag