Update handler.py
Browse files- handler.py +466 -432
handler.py
CHANGED
|
@@ -1,513 +1,547 @@
|
|
| 1 |
"""
|
| 2 |
-
handler.py — Hugging Face Inference
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
-
|
| 7 |
-
-
|
| 8 |
-
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
"""
|
| 19 |
|
| 20 |
from __future__ import annotations
|
| 21 |
|
| 22 |
import base64
|
| 23 |
import io
|
| 24 |
-
import json
|
| 25 |
import os
|
| 26 |
-
import re
|
| 27 |
-
import stat
|
| 28 |
-
import subprocess
|
| 29 |
-
import tempfile
|
| 30 |
import time
|
|
|
|
|
|
|
| 31 |
from dataclasses import dataclass
|
| 32 |
-
from typing import Any, Dict, Optional, Tuple, Union
|
| 33 |
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
import imageio_ffmpeg # type: ignore
|
| 37 |
-
except Exception:
|
| 38 |
-
imageio_ffmpeg = None
|
| 39 |
|
| 40 |
-
#
|
| 41 |
-
|
|
|
|
|
|
|
| 42 |
try:
|
| 43 |
-
|
|
|
|
|
|
|
|
|
|
| 44 |
except Exception:
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
# -----------------------------
|
| 49 |
-
# Utilities: errors & logging
|
| 50 |
-
# -----------------------------
|
| 51 |
-
|
| 52 |
-
class HandlerError(RuntimeError):
|
| 53 |
-
"""Raised for user-facing errors in request handling."""
|
| 54 |
|
| 55 |
|
| 56 |
def _now_ms() -> int:
|
| 57 |
return int(time.time() * 1000)
|
| 58 |
|
| 59 |
|
| 60 |
-
def
|
| 61 |
-
|
| 62 |
-
return ""
|
| 63 |
-
s = str(s)
|
| 64 |
-
return s if len(s) <= n else s[:n] + f"...(truncated {len(s)-n} chars)"
|
| 65 |
|
| 66 |
|
| 67 |
-
|
| 68 |
-
# Utilities: subprocess runner
|
| 69 |
-
# -----------------------------
|
| 70 |
-
|
| 71 |
-
def _run(cmd: list[str], *, timeout_s: Optional[int] = None) -> subprocess.CompletedProcess:
|
| 72 |
"""
|
| 73 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
"""
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
# -----------------------------
|
| 100 |
-
# ffmpeg resolution
|
| 101 |
-
# -----------------------------
|
| 102 |
-
|
| 103 |
-
def _is_executable(path: str) -> bool:
|
| 104 |
-
return os.path.isfile(path) and os.access(path, os.X_OK)
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
def _chmod_exec(path: str) -> None:
|
| 108 |
-
try:
|
| 109 |
-
st = os.stat(path)
|
| 110 |
-
os.chmod(path, st.st_mode | stat.S_IEXEC | stat.S_IXGRP | stat.S_IXOTH)
|
| 111 |
-
except Exception:
|
| 112 |
-
# best-effort
|
| 113 |
-
pass
|
| 114 |
|
|
|
|
|
|
|
| 115 |
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
Resolve an ffmpeg executable path without any HF plugin mechanism.
|
| 119 |
-
Priority:
|
| 120 |
-
1) imageio-ffmpeg managed exe (if installed)
|
| 121 |
-
2) system ffmpeg on PATH
|
| 122 |
-
"""
|
| 123 |
-
# 1) imageio-ffmpeg
|
| 124 |
-
if imageio_ffmpeg is not None:
|
| 125 |
-
try:
|
| 126 |
-
p = imageio_ffmpeg.get_ffmpeg_exe()
|
| 127 |
-
if os.path.isfile(p):
|
| 128 |
-
_chmod_exec(p)
|
| 129 |
-
_run([p, "-version"], timeout_s=10)
|
| 130 |
-
return p
|
| 131 |
-
except Exception:
|
| 132 |
-
pass
|
| 133 |
|
| 134 |
-
#
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
"ffmpeg is not available.\n"
|
| 141 |
-
"Fix options:\n"
|
| 142 |
-
" - Add `imageio-ffmpeg>=0.4.9` to requirements.txt (recommended repo-only fix)\n"
|
| 143 |
-
" - Or ensure `ffmpeg` exists in the runtime image (custom container)\n"
|
| 144 |
-
f"Last error: {e}"
|
| 145 |
-
)
|
| 146 |
|
| 147 |
|
| 148 |
-
def
|
| 149 |
"""
|
| 150 |
-
|
| 151 |
-
If using imageio-ffmpeg, ffprobe may not be included; we treat it as optional.
|
| 152 |
"""
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
@dataclass
|
| 171 |
-
class MediaPayload:
|
| 172 |
-
raw_bytes: bytes
|
| 173 |
-
filename: str
|
| 174 |
-
content_type: str
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
_B64_RE = re.compile(r"^[A-Za-z0-9+/=\s]+$")
|
| 178 |
|
| 179 |
|
| 180 |
-
def
|
| 181 |
"""
|
| 182 |
-
|
| 183 |
-
|
| 184 |
"""
|
| 185 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 186 |
|
| 187 |
-
# handle data URL: data:audio/wav;base64,....
|
| 188 |
-
if ss.startswith("data:") and "base64," in ss:
|
| 189 |
try:
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
|
| 195 |
-
|
| 196 |
-
|
|
|
|
| 197 |
try:
|
| 198 |
-
|
| 199 |
except Exception:
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
return None
|
| 203 |
|
| 204 |
|
| 205 |
-
def
|
| 206 |
"""
|
| 207 |
-
|
| 208 |
-
Supported forms:
|
| 209 |
-
- bytes / bytearray
|
| 210 |
-
- base64 string (optionally data URL)
|
| 211 |
-
- dict {"inputs": <bytes|base64|string|dict>}
|
| 212 |
-
- dict with keys: "audio"/"data"/"content" containing bytes or base64
|
| 213 |
-
- dict may include "filename", "content_type"
|
| 214 |
"""
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
if isinstance(data, (bytes, bytearray)):
|
| 219 |
-
return MediaPayload(raw_bytes=bytes(data), filename=filename, content_type=content_type)
|
| 220 |
-
|
| 221 |
-
if isinstance(data, str):
|
| 222 |
-
decoded = _maybe_base64_decode(data)
|
| 223 |
-
if decoded is None:
|
| 224 |
-
raise HandlerError(
|
| 225 |
-
"String input must be base64 (or data:...;base64,...). "
|
| 226 |
-
"If you're sending JSON, wrap your bytes in base64."
|
| 227 |
-
)
|
| 228 |
-
return MediaPayload(raw_bytes=decoded, filename=filename, content_type=content_type)
|
| 229 |
-
|
| 230 |
-
if not isinstance(data, dict):
|
| 231 |
-
raise HandlerError("Unsupported input type. Send bytes, base64 string, or a JSON object.")
|
| 232 |
-
|
| 233 |
-
# Pull metadata if present
|
| 234 |
-
filename = str(data.get("filename") or data.get("name") or filename)
|
| 235 |
-
content_type = str(data.get("content_type") or data.get("mime_type") or content_type)
|
| 236 |
-
|
| 237 |
-
# Common HF: {"inputs": ...}
|
| 238 |
-
if "inputs" in data:
|
| 239 |
-
inner = data["inputs"]
|
| 240 |
-
# If inputs is a dict with richer structure
|
| 241 |
-
if isinstance(inner, dict):
|
| 242 |
-
# allow nested metadata
|
| 243 |
-
filename = str(inner.get("filename") or inner.get("name") or filename)
|
| 244 |
-
content_type = str(inner.get("content_type") or inner.get("mime_type") or content_type)
|
| 245 |
-
for k in ("data", "audio", "content", "bytes"):
|
| 246 |
-
if k in inner:
|
| 247 |
-
return _coerce_to_media_payload({**inner, "data": inner[k], "filename": filename, "content_type": content_type})
|
| 248 |
-
# If inputs is bytes/base64
|
| 249 |
-
return _coerce_to_media_payload(inner)
|
| 250 |
-
|
| 251 |
-
# Other common keys
|
| 252 |
-
for k in ("audio", "data", "content", "bytes"):
|
| 253 |
-
if k in data:
|
| 254 |
-
v = data[k]
|
| 255 |
-
if isinstance(v, (bytes, bytearray)):
|
| 256 |
-
return MediaPayload(raw_bytes=bytes(v), filename=filename, content_type=content_type)
|
| 257 |
-
if isinstance(v, str):
|
| 258 |
-
decoded = _maybe_base64_decode(v)
|
| 259 |
-
if decoded is None:
|
| 260 |
-
raise HandlerError(f"Field `{k}` is a string but not base64/data-url.")
|
| 261 |
-
return MediaPayload(raw_bytes=decoded, filename=filename, content_type=content_type)
|
| 262 |
-
if isinstance(v, dict):
|
| 263 |
-
# nested object containing base64
|
| 264 |
-
return _coerce_to_media_payload(v)
|
| 265 |
-
|
| 266 |
-
raise HandlerError(
|
| 267 |
-
"Could not find media bytes in request. "
|
| 268 |
-
"Provide bytes directly, a base64 string, or a JSON object with `inputs` or `audio`/`data`."
|
| 269 |
-
)
|
| 270 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 271 |
|
| 272 |
-
# -----------------------------
|
| 273 |
-
# Media conversion: any -> wav
|
| 274 |
-
# -----------------------------
|
| 275 |
|
| 276 |
-
|
| 277 |
-
|
| 278 |
-
|
| 279 |
-
|
| 280 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 281 |
|
| 282 |
|
| 283 |
-
|
| 284 |
-
media_bytes: bytes,
|
| 285 |
-
*,
|
| 286 |
-
ffmpeg_path: str,
|
| 287 |
-
target_sr: int = 16000,
|
| 288 |
-
target_channels: int = 1,
|
| 289 |
-
output_pcm: str = "s16le",
|
| 290 |
-
) -> str:
|
| 291 |
"""
|
| 292 |
-
|
| 293 |
-
Returns a temp WAV file path that the caller should delete.
|
| 294 |
"""
|
| 295 |
-
# Work inside a temp dir, then copy output to a NamedTemporaryFile outside the context
|
| 296 |
-
with tempfile.TemporaryDirectory() as d:
|
| 297 |
-
in_path = _write_temp_file(d, "input_media.bin", media_bytes)
|
| 298 |
-
out_path = os.path.join(d, "output.wav")
|
| 299 |
-
|
| 300 |
-
cmd = [
|
| 301 |
-
ffmpeg_path,
|
| 302 |
-
"-y",
|
| 303 |
-
"-hide_banner",
|
| 304 |
-
"-loglevel",
|
| 305 |
-
"error",
|
| 306 |
-
"-i",
|
| 307 |
-
in_path,
|
| 308 |
-
"-vn",
|
| 309 |
-
"-ac",
|
| 310 |
-
str(target_channels),
|
| 311 |
-
"-ar",
|
| 312 |
-
str(target_sr),
|
| 313 |
-
"-acodec",
|
| 314 |
-
f"pcm_{output_pcm}",
|
| 315 |
-
out_path,
|
| 316 |
-
]
|
| 317 |
-
_run(cmd, timeout_s=120)
|
| 318 |
-
|
| 319 |
-
tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
|
| 320 |
-
tmp_path = tmp.name
|
| 321 |
-
tmp.close()
|
| 322 |
-
|
| 323 |
-
with open(out_path, "rb") as src, open(tmp_path, "wb") as dst:
|
| 324 |
-
dst.write(src.read())
|
| 325 |
-
|
| 326 |
-
return tmp_path
|
| 327 |
-
|
| 328 |
-
|
| 329 |
-
def _read_file_bytes(path: str) -> bytes:
|
| 330 |
-
with open(path, "rb") as f:
|
| 331 |
-
return f.read()
|
| 332 |
-
|
| 333 |
-
|
| 334 |
-
# -----------------------------
|
| 335 |
-
# Output helpers
|
| 336 |
-
# -----------------------------
|
| 337 |
-
|
| 338 |
-
def _as_base64(b: bytes) -> str:
|
| 339 |
-
return base64.b64encode(b).decode("utf-8")
|
| 340 |
|
|
|
|
|
|
|
| 341 |
|
| 342 |
-
|
| 343 |
-
|
| 344 |
-
|
| 345 |
-
|
| 346 |
-
extra: Optional[Dict[str, Any]] = None,
|
| 347 |
-
diagnostics: Optional[Dict[str, Any]] = None,
|
| 348 |
-
) -> Dict[str, Any]:
|
| 349 |
-
out: Dict[str, Any] = {"ok": ok, "text": text}
|
| 350 |
-
if extra:
|
| 351 |
-
out.update(extra)
|
| 352 |
-
if diagnostics:
|
| 353 |
-
out["diagnostics"] = diagnostics
|
| 354 |
-
return out
|
| 355 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 356 |
|
| 357 |
-
#
|
| 358 |
-
|
| 359 |
-
|
|
|
|
|
|
|
| 360 |
|
| 361 |
-
|
| 362 |
-
|
| 363 |
-
|
|
|
|
|
|
|
|
|
|
| 364 |
|
| 365 |
-
|
| 366 |
-
|
| 367 |
-
|
| 368 |
-
- Runs ASR (example: Transformers pipeline)
|
| 369 |
-
- Returns text and optional timing/diagnostics
|
| 370 |
|
| 371 |
-
|
| 372 |
-
|
| 373 |
-
- Or set `self.asr` to your pipeline/model in __init__
|
| 374 |
-
"""
|
| 375 |
|
| 376 |
-
def
|
| 377 |
-
self.model_path = path or ""
|
| 378 |
-
self.ffmpeg_path = _get_ffmpeg_path()
|
| 379 |
-
self.ffprobe_path = _get_ffprobe_path(self.ffmpeg_path)
|
| 380 |
-
|
| 381 |
-
# Settings (can be overridden per-request)
|
| 382 |
-
self.default_sr = int(os.getenv("TARGET_SAMPLE_RATE", "16000"))
|
| 383 |
-
self.default_channels = int(os.getenv("TARGET_CHANNELS", "1"))
|
| 384 |
-
|
| 385 |
-
# Optional: initialize an ASR pipeline if transformers is available.
|
| 386 |
-
# If you're not doing ASR, delete this and implement your task.
|
| 387 |
-
self.asr = None
|
| 388 |
-
if pipeline is not None:
|
| 389 |
-
# If your repo contains a model, `path` is typically the model directory.
|
| 390 |
-
# If not, you can hardcode a model id here.
|
| 391 |
-
model_id_or_path = self.model_path if self.model_path else os.getenv("ASR_MODEL_ID", "").strip()
|
| 392 |
-
if model_id_or_path:
|
| 393 |
-
# You can choose task="automatic-speech-recognition"
|
| 394 |
-
# and pass device_map / torch_dtype in advanced setups.
|
| 395 |
-
self.asr = pipeline("automatic-speech-recognition", model=model_id_or_path)
|
| 396 |
-
|
| 397 |
-
# Startup self-test (fast fail)
|
| 398 |
-
_run([self.ffmpeg_path, "-version"], timeout_s=10)
|
| 399 |
-
|
| 400 |
-
def __call__(self, data: Union[Dict[str, Any], bytes, str]) -> Dict[str, Any]:
|
| 401 |
t0 = _now_ms()
|
| 402 |
|
| 403 |
-
# Parse request
|
| 404 |
try:
|
| 405 |
-
|
| 406 |
-
|
| 407 |
-
return _response(ok=False, text=str(e), diagnostics={"stage": "parse"})
|
| 408 |
-
|
| 409 |
-
# Allow per-request overrides
|
| 410 |
-
req: Dict[str, Any] = data if isinstance(data, dict) else {}
|
| 411 |
-
target_sr = int(req.get("target_sr") or req.get("sample_rate") or self.default_sr)
|
| 412 |
-
target_channels = int(req.get("target_channels") or req.get("channels") or self.default_channels)
|
| 413 |
-
|
| 414 |
-
# Convert to wav
|
| 415 |
-
wav_path = ""
|
| 416 |
-
try:
|
| 417 |
-
wav_path = _convert_to_wav(
|
| 418 |
-
payload.raw_bytes,
|
| 419 |
-
ffmpeg_path=self.ffmpeg_path,
|
| 420 |
-
target_sr=target_sr,
|
| 421 |
-
target_channels=target_channels,
|
| 422 |
-
)
|
| 423 |
t1 = _now_ms()
|
| 424 |
|
| 425 |
-
|
| 426 |
-
|
| 427 |
-
|
| 428 |
-
|
| 429 |
-
|
| 430 |
-
|
| 431 |
-
|
| 432 |
-
|
| 433 |
-
|
| 434 |
-
|
| 435 |
-
|
| 436 |
-
|
| 437 |
-
|
| 438 |
-
|
| 439 |
-
|
| 440 |
-
|
| 441 |
-
|
| 442 |
-
|
| 443 |
-
|
| 444 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 445 |
"timing_ms": {
|
| 446 |
-
"total":
|
| 447 |
-
"
|
| 448 |
-
"
|
| 449 |
-
|
| 450 |
-
|
| 451 |
-
"target_sr": target_sr,
|
| 452 |
-
"target_channels": target_channels,
|
| 453 |
},
|
|
|
|
|
|
|
|
|
|
| 454 |
},
|
| 455 |
-
|
| 456 |
|
| 457 |
except Exception as e:
|
| 458 |
-
return
|
| 459 |
-
ok
|
| 460 |
-
|
| 461 |
-
diagnostics
|
| 462 |
-
"
|
| 463 |
-
"
|
| 464 |
},
|
| 465 |
-
|
| 466 |
-
|
| 467 |
-
|
| 468 |
-
|
| 469 |
-
|
| 470 |
-
|
| 471 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 472 |
|
| 473 |
-
# ----------------------------
|
| 474 |
-
#
|
| 475 |
-
# ----------------------------
|
| 476 |
|
| 477 |
-
def
|
| 478 |
"""
|
| 479 |
-
|
| 480 |
-
|
| 481 |
|
| 482 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 483 |
"""
|
| 484 |
-
if self.
|
| 485 |
-
|
| 486 |
-
|
| 487 |
-
|
| 488 |
-
f"
|
| 489 |
-
"ASR pipeline not configured. Set ASR_MODEL_ID env var or pass a model path.",
|
| 490 |
-
{"type": "none", "note": "no ASR pipeline"},
|
| 491 |
)
|
| 492 |
|
| 493 |
-
#
|
| 494 |
-
|
| 495 |
-
|
| 496 |
-
if "chunk_length_s" in req:
|
| 497 |
-
asr_kwargs["chunk_length_s"] = float(req["chunk_length_s"])
|
| 498 |
-
if "stride_length_s" in req:
|
| 499 |
-
asr_kwargs["stride_length_s"] = float(req["stride_length_s"])
|
| 500 |
|
| 501 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 502 |
|
| 503 |
-
#
|
| 504 |
-
#
|
| 505 |
-
|
| 506 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 507 |
else:
|
| 508 |
-
|
| 509 |
-
|
| 510 |
-
|
| 511 |
-
|
| 512 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 513 |
}
|
|
|
|
|
|
| 1 |
"""
|
| 2 |
+
handler.py — Hugging Face Inference Endpoint custom handler
|
| 3 |
+
Outputs: GIF, WebM, ZIP(frames)
|
| 4 |
+
|
| 5 |
+
Key points:
|
| 6 |
+
- No "huggingface_inference_toolkit plugin" usage at all.
|
| 7 |
+
- WebM encoding uses imageio + imageio-ffmpeg (ffmpeg binary resolved internally).
|
| 8 |
+
- GIF encoding uses Pillow (no ffmpeg needed).
|
| 9 |
+
- ZIP output is a zip of PNG frames.
|
| 10 |
+
|
| 11 |
+
Request JSON (examples):
|
| 12 |
+
{
|
| 13 |
+
"prompt": "a cinematic shot of a hawk flying over snowy mountains",
|
| 14 |
+
"negative_prompt": "low quality, blurry",
|
| 15 |
+
"num_frames": 48,
|
| 16 |
+
"fps": 16,
|
| 17 |
+
"height": 512,
|
| 18 |
+
"width": 512,
|
| 19 |
+
"seed": 123,
|
| 20 |
+
"outputs": ["gif", "webm", "zip"], // any subset
|
| 21 |
+
"return_base64": true, // default true
|
| 22 |
+
"gif": {"fps": 12}, // optional overrides
|
| 23 |
+
"webm": {"fps": 24, "quality": "good"}, // quality: "fast"|"good"|"best"
|
| 24 |
+
"zip": {"format": "png"} // currently png only
|
| 25 |
+
}
|
| 26 |
+
|
| 27 |
+
Response JSON:
|
| 28 |
+
{
|
| 29 |
+
"ok": true,
|
| 30 |
+
"diagnostics": {...},
|
| 31 |
+
"outputs": {
|
| 32 |
+
"gif_base64": "...",
|
| 33 |
+
"webm_base64": "...",
|
| 34 |
+
"zip_base64": "..."
|
| 35 |
+
}
|
| 36 |
+
}
|
| 37 |
+
|
| 38 |
+
Notes on payload sizes:
|
| 39 |
+
- base64 video payloads can be large. For production, consider uploading to R2/S3
|
| 40 |
+
and returning a URL instead. This handler keeps it in-response for simplicity.
|
| 41 |
"""
|
| 42 |
|
| 43 |
from __future__ import annotations
|
| 44 |
|
| 45 |
import base64
|
| 46 |
import io
|
|
|
|
| 47 |
import os
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
import time
|
| 49 |
+
import tempfile
|
| 50 |
+
import zipfile
|
| 51 |
from dataclasses import dataclass
|
| 52 |
+
from typing import Any, Dict, List, Optional, Tuple, Union
|
| 53 |
|
| 54 |
+
import numpy as np
|
| 55 |
+
from PIL import Image
|
|
|
|
|
|
|
|
|
|
| 56 |
|
| 57 |
+
# WebM encoding (uses ffmpeg resolved by imageio-ffmpeg)
|
| 58 |
+
import imageio
|
| 59 |
+
|
| 60 |
+
# Ensure imageio uses the packaged ffmpeg binary (not HF toolkit plugins)
|
| 61 |
try:
|
| 62 |
+
import imageio_ffmpeg # type: ignore
|
| 63 |
+
_FFMPEG_EXE = imageio_ffmpeg.get_ffmpeg_exe()
|
| 64 |
+
# imageio reads this env var to locate ffmpeg
|
| 65 |
+
os.environ["IMAGEIO_FFMPEG_EXE"] = _FFMPEG_EXE
|
| 66 |
except Exception:
|
| 67 |
+
_FFMPEG_EXE = ""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
|
| 69 |
|
| 70 |
def _now_ms() -> int:
|
| 71 |
return int(time.time() * 1000)
|
| 72 |
|
| 73 |
|
| 74 |
+
def _b64(data: bytes) -> str:
|
| 75 |
+
return base64.b64encode(data).decode("utf-8")
|
|
|
|
|
|
|
|
|
|
| 76 |
|
| 77 |
|
| 78 |
+
def _clamp_uint8_frame(frame: np.ndarray) -> np.ndarray:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
"""
|
| 80 |
+
Ensure frame is uint8 HxWx3 (RGB).
|
| 81 |
+
Accepts:
|
| 82 |
+
- float in [0,1] or [-1,1]
|
| 83 |
+
- uint8 already
|
| 84 |
+
- grayscale (HxW) -> RGB
|
| 85 |
+
- RGBA -> RGB
|
| 86 |
"""
|
| 87 |
+
if not isinstance(frame, np.ndarray):
|
| 88 |
+
frame = np.array(frame)
|
| 89 |
+
|
| 90 |
+
# squeeze batch-like dims if present (best-effort)
|
| 91 |
+
if frame.ndim == 4 and frame.shape[0] == 1:
|
| 92 |
+
frame = frame[0]
|
| 93 |
+
|
| 94 |
+
if frame.ndim == 2:
|
| 95 |
+
frame = np.stack([frame, frame, frame], axis=-1)
|
| 96 |
+
|
| 97 |
+
if frame.ndim != 3:
|
| 98 |
+
raise ValueError(f"Frame must be HxW, HxWxC, or 1xHxWxC; got shape {frame.shape}")
|
| 99 |
+
|
| 100 |
+
# Channels fixups
|
| 101 |
+
if frame.shape[-1] == 4:
|
| 102 |
+
frame = frame[..., :3]
|
| 103 |
+
elif frame.shape[-1] == 1:
|
| 104 |
+
frame = np.repeat(frame, 3, axis=-1)
|
| 105 |
+
elif frame.shape[-1] != 3:
|
| 106 |
+
# sometimes CxHxW
|
| 107 |
+
if frame.shape[0] == 3 and frame.ndim == 3:
|
| 108 |
+
frame = np.transpose(frame, (1, 2, 0))
|
| 109 |
+
else:
|
| 110 |
+
raise ValueError(f"Unsupported channel dimension: {frame.shape}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 111 |
|
| 112 |
+
if frame.dtype == np.uint8:
|
| 113 |
+
return frame
|
| 114 |
|
| 115 |
+
# Convert float -> uint8
|
| 116 |
+
f = frame.astype(np.float32)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 117 |
|
| 118 |
+
# If looks like [-1,1], map to [0,1]
|
| 119 |
+
if f.min() < 0.0:
|
| 120 |
+
f = (f + 1.0) / 2.0
|
| 121 |
+
|
| 122 |
+
f = np.clip(f, 0.0, 1.0)
|
| 123 |
+
return (f * 255.0).round().astype(np.uint8)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 124 |
|
| 125 |
|
| 126 |
+
def _encode_gif(frames: List[np.ndarray], fps: int) -> bytes:
|
| 127 |
"""
|
| 128 |
+
Encode GIF using Pillow (no ffmpeg dependency).
|
|
|
|
| 129 |
"""
|
| 130 |
+
if not frames:
|
| 131 |
+
raise ValueError("No frames to encode.")
|
| 132 |
+
pil_frames = [Image.fromarray(_clamp_uint8_frame(f)) for f in frames]
|
| 133 |
+
duration_ms = int(1000 / max(1, fps))
|
| 134 |
+
|
| 135 |
+
buf = io.BytesIO()
|
| 136 |
+
pil_frames[0].save(
|
| 137 |
+
buf,
|
| 138 |
+
format="GIF",
|
| 139 |
+
save_all=True,
|
| 140 |
+
append_images=pil_frames[1:],
|
| 141 |
+
duration=duration_ms,
|
| 142 |
+
loop=0,
|
| 143 |
+
optimize=False,
|
| 144 |
+
disposal=2,
|
| 145 |
+
)
|
| 146 |
+
return buf.getvalue()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 147 |
|
| 148 |
|
| 149 |
+
def _encode_webm(frames: List[np.ndarray], fps: int, quality: str = "good") -> bytes:
|
| 150 |
"""
|
| 151 |
+
Encode WebM using imageio (ffmpeg under the hood via imageio-ffmpeg).
|
| 152 |
+
quality: "fast" | "good" | "best"
|
| 153 |
"""
|
| 154 |
+
if not frames:
|
| 155 |
+
raise ValueError("No frames to encode.")
|
| 156 |
+
|
| 157 |
+
# Choose VP9 settings. These are pragmatic defaults.
|
| 158 |
+
# For smaller file sizes: lower bitrate or higher crf.
|
| 159 |
+
# For quality: lower crf (but larger files).
|
| 160 |
+
quality = (quality or "good").lower()
|
| 161 |
+
if quality == "fast":
|
| 162 |
+
crf = 42
|
| 163 |
+
preset = "veryfast"
|
| 164 |
+
elif quality == "best":
|
| 165 |
+
crf = 28
|
| 166 |
+
preset = "slow"
|
| 167 |
+
else:
|
| 168 |
+
crf = 34
|
| 169 |
+
preset = "medium"
|
| 170 |
+
|
| 171 |
+
# Write to a temp file then return bytes
|
| 172 |
+
with tempfile.NamedTemporaryFile(suffix=".webm", delete=False) as tmp:
|
| 173 |
+
out_path = tmp.name
|
| 174 |
+
|
| 175 |
+
try:
|
| 176 |
+
writer = imageio.get_writer(
|
| 177 |
+
out_path,
|
| 178 |
+
fps=max(1, fps),
|
| 179 |
+
format="FFMPEG",
|
| 180 |
+
codec="libvpx-vp9",
|
| 181 |
+
# ffmpeg_params are passed through to ffmpeg invocation
|
| 182 |
+
ffmpeg_params=[
|
| 183 |
+
"-pix_fmt", "yuv420p",
|
| 184 |
+
"-crf", str(crf),
|
| 185 |
+
"-b:v", "0",
|
| 186 |
+
"-preset", preset,
|
| 187 |
+
],
|
| 188 |
+
)
|
| 189 |
|
|
|
|
|
|
|
| 190 |
try:
|
| 191 |
+
for f in frames:
|
| 192 |
+
writer.append_data(_clamp_uint8_frame(f))
|
| 193 |
+
finally:
|
| 194 |
+
writer.close()
|
| 195 |
|
| 196 |
+
with open(out_path, "rb") as f:
|
| 197 |
+
return f.read()
|
| 198 |
+
finally:
|
| 199 |
try:
|
| 200 |
+
os.remove(out_path)
|
| 201 |
except Exception:
|
| 202 |
+
pass
|
|
|
|
|
|
|
| 203 |
|
| 204 |
|
| 205 |
+
def _encode_zip_frames(frames: List[np.ndarray]) -> bytes:
|
| 206 |
"""
|
| 207 |
+
Zip frames as PNG images: frame_000000.png, ...
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 208 |
"""
|
| 209 |
+
if not frames:
|
| 210 |
+
raise ValueError("No frames to zip.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 211 |
|
| 212 |
+
buf = io.BytesIO()
|
| 213 |
+
with zipfile.ZipFile(buf, "w", compression=zipfile.ZIP_DEFLATED, compresslevel=6) as zf:
|
| 214 |
+
for i, f in enumerate(frames):
|
| 215 |
+
arr = _clamp_uint8_frame(f)
|
| 216 |
+
im = Image.fromarray(arr)
|
| 217 |
+
frame_buf = io.BytesIO()
|
| 218 |
+
im.save(frame_buf, format="PNG", optimize=True)
|
| 219 |
+
zf.writestr(f"frame_{i:06d}.png", frame_buf.getvalue())
|
| 220 |
+
return buf.getvalue()
|
| 221 |
|
|
|
|
|
|
|
|
|
|
| 222 |
|
| 223 |
+
@dataclass
|
| 224 |
+
class GenParams:
|
| 225 |
+
prompt: str
|
| 226 |
+
negative_prompt: str
|
| 227 |
+
num_frames: int
|
| 228 |
+
fps: int
|
| 229 |
+
height: int
|
| 230 |
+
width: int
|
| 231 |
+
seed: Optional[int]
|
| 232 |
|
| 233 |
|
| 234 |
+
class EndpointHandler:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 235 |
"""
|
| 236 |
+
Custom handler entrypoint for Hugging Face Inference Endpoints.
|
|
|
|
| 237 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 238 |
|
| 239 |
+
def __init__(self, path: str = "") -> None:
|
| 240 |
+
self.repo_path = path or ""
|
| 241 |
|
| 242 |
+
# Attempt to initialize a diffusers pipeline if available.
|
| 243 |
+
# If your repo uses a different entrypoint, edit `_generate_frames()`.
|
| 244 |
+
self.pipe = None
|
| 245 |
+
self._init_error = None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 246 |
|
| 247 |
+
try:
|
| 248 |
+
import torch # type: ignore
|
| 249 |
+
from diffusers import DiffusionPipeline # type: ignore
|
| 250 |
+
|
| 251 |
+
# Prefer fp16 if CUDA is available; otherwise float32.
|
| 252 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 253 |
+
dtype = torch.float16 if device == "cuda" else torch.float32
|
| 254 |
+
|
| 255 |
+
# Load from repository path (the model code/checkpoints are in the repo)
|
| 256 |
+
# This is the most generic path for "diffusers-like" repos.
|
| 257 |
+
self.pipe = DiffusionPipeline.from_pretrained(
|
| 258 |
+
self.repo_path if self.repo_path else None,
|
| 259 |
+
torch_dtype=dtype,
|
| 260 |
+
)
|
| 261 |
|
| 262 |
+
# Move to device if possible
|
| 263 |
+
try:
|
| 264 |
+
self.pipe.to(device)
|
| 265 |
+
except Exception:
|
| 266 |
+
pass
|
| 267 |
|
| 268 |
+
# Some pipelines benefit from enabling memory optimizations
|
| 269 |
+
try:
|
| 270 |
+
if hasattr(self.pipe, "enable_vae_slicing"):
|
| 271 |
+
self.pipe.enable_vae_slicing()
|
| 272 |
+
except Exception:
|
| 273 |
+
pass
|
| 274 |
|
| 275 |
+
except Exception as e:
|
| 276 |
+
self._init_error = str(e)
|
| 277 |
+
self.pipe = None
|
|
|
|
|
|
|
| 278 |
|
| 279 |
+
# Quick diagnostic: ensure imageio-ffmpeg resolved (for WebM)
|
| 280 |
+
self.ffmpeg_exe = _FFMPEG_EXE
|
|
|
|
|
|
|
| 281 |
|
| 282 |
+
def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 283 |
t0 = _now_ms()
|
| 284 |
|
|
|
|
| 285 |
try:
|
| 286 |
+
params, outputs, return_b64, out_cfg = self._parse_request(data)
|
| 287 |
+
frames, gen_diag = self._generate_frames(params, out_cfg=out_cfg)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 288 |
t1 = _now_ms()
|
| 289 |
|
| 290 |
+
result_outputs: Dict[str, Any] = {}
|
| 291 |
+
|
| 292 |
+
# GIF
|
| 293 |
+
if "gif" in outputs:
|
| 294 |
+
gif_fps = int((out_cfg.get("gif") or {}).get("fps") or params.fps)
|
| 295 |
+
gif_bytes = _encode_gif(frames, fps=gif_fps)
|
| 296 |
+
result_outputs["gif_base64" if return_b64 else "gif_bytes"] = _b64(gif_bytes) if return_b64 else gif_bytes
|
| 297 |
+
t_gif = _now_ms()
|
| 298 |
+
else:
|
| 299 |
+
t_gif = t1
|
| 300 |
+
|
| 301 |
+
# WebM
|
| 302 |
+
if "webm" in outputs:
|
| 303 |
+
webm_cfg = out_cfg.get("webm") or {}
|
| 304 |
+
webm_fps = int(webm_cfg.get("fps") or params.fps)
|
| 305 |
+
webm_quality = str(webm_cfg.get("quality") or "good")
|
| 306 |
+
webm_bytes = _encode_webm(frames, fps=webm_fps, quality=webm_quality)
|
| 307 |
+
result_outputs["webm_base64" if return_b64 else "webm_bytes"] = _b64(webm_bytes) if return_b64 else webm_bytes
|
| 308 |
+
t_webm = _now_ms()
|
| 309 |
+
else:
|
| 310 |
+
t_webm = t_gif
|
| 311 |
+
|
| 312 |
+
# ZIP frames
|
| 313 |
+
if "zip" in outputs:
|
| 314 |
+
zip_bytes = _encode_zip_frames(frames)
|
| 315 |
+
result_outputs["zip_base64" if return_b64 else "zip_bytes"] = _b64(zip_bytes) if return_b64 else zip_bytes
|
| 316 |
+
t_zip = _now_ms()
|
| 317 |
+
else:
|
| 318 |
+
t_zip = t_webm
|
| 319 |
+
|
| 320 |
+
return {
|
| 321 |
+
"ok": True,
|
| 322 |
+
"outputs": result_outputs,
|
| 323 |
+
"diagnostics": {
|
| 324 |
"timing_ms": {
|
| 325 |
+
"total": t_zip - t0,
|
| 326 |
+
"generate": t1 - t0,
|
| 327 |
+
"gif": (t_gif - t1) if "gif" in outputs else 0,
|
| 328 |
+
"webm": (t_webm - t_gif) if "webm" in outputs else 0,
|
| 329 |
+
"zip": (t_zip - t_webm) if "zip" in outputs else 0,
|
|
|
|
|
|
|
| 330 |
},
|
| 331 |
+
"generator": gen_diag,
|
| 332 |
+
"ffmpeg_exe": self.ffmpeg_exe,
|
| 333 |
+
"init_error": self._init_error,
|
| 334 |
},
|
| 335 |
+
}
|
| 336 |
|
| 337 |
except Exception as e:
|
| 338 |
+
return {
|
| 339 |
+
"ok": False,
|
| 340 |
+
"error": str(e),
|
| 341 |
+
"diagnostics": {
|
| 342 |
+
"ffmpeg_exe": self.ffmpeg_exe,
|
| 343 |
+
"init_error": self._init_error,
|
| 344 |
},
|
| 345 |
+
}
|
| 346 |
+
|
| 347 |
+
# ----------------------------
|
| 348 |
+
# Request parsing
|
| 349 |
+
# ----------------------------
|
| 350 |
+
|
| 351 |
+
def _parse_request(self, data: Dict[str, Any]) -> Tuple[GenParams, List[str], bool, Dict[str, Any]]:
|
| 352 |
+
if not isinstance(data, dict):
|
| 353 |
+
raise ValueError("Request must be a JSON object.")
|
| 354 |
+
|
| 355 |
+
prompt = str(data.get("prompt") or data.get("inputs") or "").strip()
|
| 356 |
+
if not prompt:
|
| 357 |
+
raise ValueError("Missing `prompt` (or `inputs`).")
|
| 358 |
+
|
| 359 |
+
negative_prompt = str(data.get("negative_prompt") or "").strip()
|
| 360 |
+
|
| 361 |
+
num_frames = int(data.get("num_frames") or data.get("frames") or 32)
|
| 362 |
+
fps = int(data.get("fps") or 12)
|
| 363 |
+
height = int(data.get("height") or 512)
|
| 364 |
+
width = int(data.get("width") or 512)
|
| 365 |
+
seed = data.get("seed")
|
| 366 |
+
seed = int(seed) if seed is not None and str(seed).strip() != "" else None
|
| 367 |
+
|
| 368 |
+
outputs = data.get("outputs") or ["gif"]
|
| 369 |
+
if isinstance(outputs, str):
|
| 370 |
+
outputs = [outputs]
|
| 371 |
+
outputs = [str(x).lower() for x in outputs]
|
| 372 |
+
|
| 373 |
+
allowed = {"gif", "webm", "zip"}
|
| 374 |
+
outputs = [o for o in outputs if o in allowed]
|
| 375 |
+
if not outputs:
|
| 376 |
+
outputs = ["gif"]
|
| 377 |
+
|
| 378 |
+
return_b64 = bool(data.get("return_base64", True))
|
| 379 |
+
out_cfg = data.get("output_config") or {}
|
| 380 |
+
# also allow top-level gif/webm/zip config objects
|
| 381 |
+
for k in ("gif", "webm", "zip"):
|
| 382 |
+
if k in data and isinstance(data[k], dict):
|
| 383 |
+
out_cfg[k] = data[k]
|
| 384 |
+
|
| 385 |
+
params = GenParams(
|
| 386 |
+
prompt=prompt,
|
| 387 |
+
negative_prompt=negative_prompt,
|
| 388 |
+
num_frames=max(1, num_frames),
|
| 389 |
+
fps=max(1, fps),
|
| 390 |
+
height=max(64, height),
|
| 391 |
+
width=max(64, width),
|
| 392 |
+
seed=seed,
|
| 393 |
+
)
|
| 394 |
+
return params, outputs, return_b64, out_cfg
|
| 395 |
|
| 396 |
+
# ----------------------------
|
| 397 |
+
# Frame generation
|
| 398 |
+
# ----------------------------
|
| 399 |
|
| 400 |
+
def _generate_frames(self, params: GenParams, out_cfg: Dict[str, Any]) -> Tuple[List[np.ndarray], Dict[str, Any]]:
    """
    Generates frames as a list of uint8 RGB numpy arrays.
    This is the only place you should need to customize for your specific repo/model.

    Current implementation:
      - If a diffusers pipeline is available:
          pipe(prompt=..., negative_prompt=..., height=..., width=..., num_frames=...)
        Then tries common output fields: frames / videos / images.
      - Otherwise: raises with init details.

    If your model call is different (e.g., special args like guidance_scale, num_inference_steps),
    add them here.

    Args:
        params: Normalized generation parameters (prompt, size, frame count, seed).
        out_cfg: Merged output config; also carries optional sampler knobs
            `num_inference_steps` and `guidance_scale`.

    Returns:
        Tuple of (frames, diag) where `frames` is a non-empty list of uint8
        RGB numpy arrays and `diag` is a dict of diagnostics about the run.

    Raises:
        RuntimeError: if the pipeline is uninitialized, if every frame-count
            argument name fails, or if no frames can be extracted.
        ValueError: if a `videos` tensor has an unexpected shape.
    """
    if self.pipe is None:
        raise RuntimeError(
            "Model pipeline is not initialized. "
            "If your repo doesn't use diffusers DiffusionPipeline, edit _generate_frames(). "
            f"Init error: {self._init_error}"
        )

    # Optional sampler knobs (safe defaults). Note: `or` means explicit 0 falls
    # back to the default as well.
    num_inference_steps = int(out_cfg.get("num_inference_steps") or 30)
    guidance_scale = float(out_cfg.get("guidance_scale") or 7.5)

    # Seed (best effort): if torch is unavailable or generator creation fails,
    # we silently run unseeded rather than failing the request.
    generator = None
    try:
        import torch  # type: ignore
        if params.seed is not None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
            generator = torch.Generator(device=device).manual_seed(params.seed)
    except Exception:
        generator = None

    # Call the pipeline (generic diffusers-style). None values are stripped
    # before the call so optional args don't override pipeline defaults.
    kwargs: Dict[str, Any] = {
        "prompt": params.prompt,
        "negative_prompt": params.negative_prompt if params.negative_prompt else None,
        "height": params.height,
        "width": params.width,
        "num_inference_steps": num_inference_steps,
        "guidance_scale": guidance_scale,
    }

    # Common frame-count arg names across repos: some pipelines use num_frames,
    # some video_length, some num_video_frames. Try each until one succeeds.
    # NOTE(review): any exception (not just TypeError on the arg name) advances
    # to the next candidate, so a real inference error may be retried — the
    # last error is surfaced below if all attempts fail.
    called = False
    last_err: Optional[Exception] = None
    output = None
    for frame_arg in ("num_frames", "video_length", "num_video_frames"):
        try:
            call_kwargs = dict(kwargs)
            call_kwargs[frame_arg] = params.num_frames
            if generator is not None:
                call_kwargs["generator"] = generator
            output = self.pipe(**{k: v for k, v in call_kwargs.items() if v is not None})
            called = True
            break
        except Exception as e:
            last_err = e
            continue

    if not called:
        raise RuntimeError(f"Pipeline call failed for frame args. Last error: {last_err}")

    # Extract frames from common output structures.
    frames: List[np.ndarray] = []

    # diffusers outputs vary:
    #   - output.frames (list of PIL/np)
    #   - output.videos (tensor/np)
    #   - output.images (list of PIL for single-frame)
    if hasattr(output, "frames") and output.frames is not None:
        # NOTE(review): some diffusers video pipelines return `frames` as a
        # nested list (one list per batch item); np.array on such an element
        # would not be a single frame — confirm against the actual pipeline.
        frames_raw = output.frames
        frames = [np.array(f) for f in frames_raw]
    elif hasattr(output, "videos") and output.videos is not None:
        vids = output.videos
        arr = None

        # torch tensor or numpy — detach/move to CPU if it's a tensor.
        try:
            import torch  # type: ignore
            if isinstance(vids, torch.Tensor):
                arr = vids.detach().cpu().numpy()
            else:
                arr = np.array(vids)
        except Exception:
            arr = np.array(vids)

        # Common shapes:
        #   (B, T, C, H, W) or (B, T, H, W, C) or (T, H, W, C)
        if arr.ndim == 5:
            # pick first batch
            arr = arr[0]
        if arr.ndim == 4 and arr.shape[1] in (1, 3, 4):
            # likely (T, C, H, W) -> (T, H, W, C)
            # NOTE(review): heuristic — a genuine (T, H, W, C) video with
            # height 1/3/4 would be transposed incorrectly.
            arr = np.transpose(arr, (0, 2, 3, 1))
        if arr.ndim != 4:
            raise ValueError(f"Unexpected video tensor shape: {arr.shape}")

        frames = [arr[t] for t in range(arr.shape[0])]
    elif hasattr(output, "images") and output.images is not None:
        imgs = output.images
        # if it's just one image, treat as 1-frame "video"
        if isinstance(imgs, list):
            frames = [np.array(im) for im in imgs]
        else:
            frames = [np.array(imgs)]
    else:
        # final fallback: try dict-like output with the same three keys.
        if isinstance(output, dict):
            for key in ("frames", "videos", "images"):
                if key in output and output[key] is not None:
                    v = output[key]
                    if key == "videos":
                        # Same shape normalization as the attribute path above.
                        arr = np.array(v)
                        if arr.ndim == 5:
                            arr = arr[0]
                        if arr.ndim == 4 and arr.shape[1] in (1, 3, 4):
                            arr = np.transpose(arr, (0, 2, 3, 1))
                        frames = [arr[t] for t in range(arr.shape[0])]
                    else:
                        if isinstance(v, list):
                            frames = [np.array(x) for x in v]
                        else:
                            frames = [np.array(v)]
                    break

    if not frames:
        raise RuntimeError("Could not extract frames from pipeline output (no frames/videos/images found).")

    # Normalize each frame to uint8 RGB via the module-level helper.
    frames_u8 = [_clamp_uint8_frame(f) for f in frames]

    # Diagnostics returned to the caller alongside the frames.
    diag = {
        "prompt_len": len(params.prompt),
        "negative_prompt_len": len(params.negative_prompt),
        "num_frames": len(frames_u8),
        "height": int(frames_u8[0].shape[0]),
        "width": int(frames_u8[0].shape[1]),
        "num_inference_steps": num_inference_steps,
        "guidance_scale": guidance_scale,
        "seed": params.seed,
    }
    return frames_u8, diag