dippoo's picture
Fix: save Kling Motion video to settings.paths.output_dir (matches serve endpoint)
5a8b9c0
raw
history blame
42.2 kB
"""Video generation routes — WAN 2.2 (image-to-video and Animate) on a RunPod pod, or the WaveSpeed / Higgsfield cloud APIs (including Kling Motion Control)."""
from __future__ import annotations
import asyncio
import base64
import logging
import os
import time
import uuid
from pathlib import Path
import runpod
from fastapi import APIRouter, File, Form, HTTPException, UploadFile
from pydantic import BaseModel
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Every route declared below is mounted under /api/video.
router = APIRouter(prefix="/api/video", tags=["video"])
# Video jobs tracking: job_id -> mutable status record. The records are
# created by the endpoints and updated in place by the background tasks.
# NOTE(review): this is a plain in-process dict and nothing in the visible
# code removes entries — confirm whether pruning/persistence is needed.
_video_jobs: dict[str, dict] = {}
# Cloud providers (initialized from main.py) via init_wavespeed() /
# init_higgsfield(); they stay None until those are called.
_wavespeed_provider = None
_higgsfield_provider = None
def init_wavespeed(provider) -> None:
    """Register the WaveSpeed provider used for cloud video generation.

    The object is stored in the module-level ``_wavespeed_provider`` slot,
    which the WaveSpeed-backed endpoints check before accepting a request.
    """
    global _wavespeed_provider
    _wavespeed_provider = provider
def init_higgsfield(provider) -> None:
    """Register the Higgsfield provider (Kling 3.0, Sora 2, etc.).

    The object is stored in the module-level ``_higgsfield_provider`` slot,
    which the Higgsfield endpoint checks before accepting a request.
    """
    global _higgsfield_provider
    _higgsfield_provider = provider
# Pod state is owned by routes_pod; it is looked up at call time rather than
# at import time (presumably to avoid a circular import — confirm).
def _get_pod_state():
    """Return the shared ``_pod_state`` object defined in ``routes_pod``."""
    from content_engine.api import routes_pod

    return routes_pod._pod_state
def _get_comfyui_url():
    """Return the ComfyUI base URL as resolved by ``routes_pod._get_comfyui_url``."""
    from content_engine.api import routes_pod

    return routes_pod._get_comfyui_url()
class VideoGenerateRequest(BaseModel):
    """Schema holding the parameters of a video generation request.

    NOTE(review): none of the routes visible in this file accept this model —
    they read the same values from multipart form fields. Confirm whether it
    is still referenced elsewhere.
    """

    # Text description of the desired video (required).
    prompt: str
    # Things the video should avoid; empty means "none given".
    negative_prompt: str = ""
    num_frames: int = 81  # ~3 seconds at 24fps
    fps: int = 24
    # The form endpoints in this file replace a negative seed with a random one.
    seed: int = -1
@router.post("/generate")
async def generate_video(
    image: UploadFile = File(...),
    prompt: str = Form(...),
    negative_prompt: str = Form(""),
    num_frames: int = Form(81),
    fps: int = Form(24),
    seed: int = Form(-1),
):
    """Generate a video from an image using WAN 2.2 I2V on the RunPod pod.

    Uploads the image to ComfyUI, submits the I2V workflow, registers a job
    in ``_video_jobs`` and starts a background poller for the result.

    Args:
        image: Source image (multipart upload).
        prompt: Positive text prompt.
        negative_prompt: Optional negative prompt.
        num_frames: Number of frames to generate.
        fps: Frame rate written into the workflow's save node.
        seed: Sampler seed; a negative value selects a random 32-bit seed.

    Returns:
        Dict with ``job_id``, ``status``, the effective ``seed`` and a rough
        ``estimated_time`` string.

    Raises:
        HTTPException: 400 if the pod is not running; 500 if the upload or
            the workflow submission fails.
    """
    import random

    import httpx

    pod_state = _get_pod_state()
    if pod_state["status"] != "running":
        raise HTTPException(400, "Pod not running - start it first in Status page")

    job_id = str(uuid.uuid4())[:8]
    seed = seed if seed >= 0 else random.randint(0, 2**32 - 1)
    image_bytes = await image.read()

    try:
        comfyui_url = _get_comfyui_url()
        async with httpx.AsyncClient(timeout=30) as client:
            # First upload the image to ComfyUI so the workflow can load it by name.
            upload_name = f"input_{job_id}.png"
            upload_resp = await client.post(
                f"{comfyui_url}/upload/image",
                files={"image": (upload_name, image_bytes, "image/png")},
            )
            if upload_resp.status_code != 200:
                raise HTTPException(500, "Failed to upload image to pod")
            uploaded_filename = upload_resp.json().get("name", upload_name)

            # Build the workflow once, now that the stored filename is known.
            # (Previously it was also built up-front from a base64 copy of the
            # image and immediately discarded.)
            workflow = _build_wan_i2v_workflow(
                uploaded_filename=uploaded_filename,
                prompt=prompt,
                negative_prompt=negative_prompt,
                num_frames=num_frames,
                fps=fps,
                seed=seed,
            )

            resp = await client.post(f"{comfyui_url}/prompt", json={"prompt": workflow})
            resp.raise_for_status()
            prompt_id = resp.json().get("prompt_id")
            if not prompt_id:
                # Surface a readable error instead of a bare KeyError.
                raise HTTPException(500, "Generation failed: pod returned no prompt_id")

        _video_jobs[job_id] = {
            "prompt_id": prompt_id,
            "status": "running",
            "seed": seed,
            "started_at": time.time(),
            "num_frames": num_frames,
            "fps": fps,
        }
        logger.info("Video generation started: %s -> %s", job_id, prompt_id)

        # Start background task to poll for completion.
        # NOTE(review): the task object is not retained anywhere; the asyncio
        # docs recommend keeping a reference to tasks created this way.
        asyncio.create_task(_poll_video_job(job_id, prompt_id))

        return {
            "job_id": job_id,
            "status": "running",
            "seed": seed,
            "estimated_time": f"~{num_frames * 2} seconds",
        }
    except httpx.HTTPError as e:
        logger.error("Video generation failed: %s", e)
        raise HTTPException(500, f"Generation failed: {e}") from e
@router.post("/generate/cloud")
async def generate_video_cloud(
    image: UploadFile = File(...),
    prompt: str = Form("smooth motion, high quality video"),
    negative_prompt: str = Form(""),
    model: str = Form("wan-2.6-i2v"),
    num_frames: int = Form(81),
    fps: int = Form(24),
    seed: int = Form(-1),
    backend: str = Form("wavespeed"),  # wavespeed or higgsfield
):
    """Generate a video using a cloud API (WaveSpeed or Higgsfield).

    Requests with ``backend == "higgsfield"`` or a ``kling-3*`` model are
    delegated to ``generate_video_higgsfield``; everything else is queued as a
    WaveSpeed background job.

    Args:
        image: Source image (multipart upload).
        prompt: Positive text prompt.
        negative_prompt: Optional negative prompt (WaveSpeed only).
        model: Model key, resolved to a provider model ID by the background task.
        num_frames: Requested frame count.
        fps: Requested frame rate; also used to convert frames to seconds.
        seed: Seed; a negative value selects a random 32-bit seed.
        backend: ``"wavespeed"`` or ``"higgsfield"``.

    Returns:
        Dict with ``job_id``, ``status``, ``seed``, ``model`` and an
        ``estimated_time`` string (plus ``backend`` on the Higgsfield path).

    Raises:
        HTTPException: 500 if the selected provider is not configured.
    """
    import random

    logger.info("Video cloud generation request: model=%s, backend=%s, frames=%d", model, backend, num_frames)

    # Route to Higgsfield for Kling 3.0 models
    if backend == "higgsfield" or model.startswith("kling-3"):
        logger.info("Routing to Higgsfield for model: %s", model)
        # Convert frames to seconds using the requested frame rate.
        duration = max(3, num_frames // max(fps, 1))
        # The route function is called directly, so FastAPI does not resolve
        # its Form(...) defaults: every parameter must be passed explicitly,
        # otherwise the callee receives the Form default objects themselves.
        return await generate_video_higgsfield(
            image=image,
            prompt=prompt,
            model=model,
            duration=duration,
            resolution="720p",
            enable_audio=False,
            seed=seed,
        )

    if not _wavespeed_provider:
        logger.error("WaveSpeed provider not configured!")
        raise HTTPException(500, "WaveSpeed API not configured")

    job_id = str(uuid.uuid4())[:8]
    seed = seed if seed >= 0 else random.randint(0, 2**32 - 1)
    image_bytes = await image.read()

    # Create job entry
    _video_jobs[job_id] = {
        "status": "running",
        "seed": seed,
        "started_at": time.time(),
        "num_frames": num_frames,
        "fps": fps,
        "model": model,
        "backend": "cloud",
    }
    logger.info("Cloud video generation started: %s (model=%s)", job_id, model)

    # Start background task for cloud video generation
    asyncio.create_task(_generate_cloud_video(job_id, image_bytes, prompt, negative_prompt, model, seed))

    return {
        "job_id": job_id,
        "status": "running",
        "seed": seed,
        "model": model,
        "estimated_time": "~30-120 seconds",
    }
@router.post("/generate/higgsfield")
async def generate_video_higgsfield(
    image: UploadFile = File(...),
    prompt: str = Form("smooth cinematic motion"),
    model: str = Form("kling-3.0"),
    duration: int = Form(5),
    resolution: str = Form("720p"),
    enable_audio: bool = Form(False),
    seed: int = Form(-1),
):
    """Queue a Higgsfield video generation job (Kling 3.0, Sora 2, Veo 3.1).

    Registers the job in ``_video_jobs``, hands the work to a background task
    and returns immediately with the job handle.

    Raises:
        HTTPException: 500 when no Higgsfield provider has been registered.
    """
    import random

    if not _higgsfield_provider:
        raise HTTPException(500, "Higgsfield API not configured - set HIGGSFIELD_API_KEY")

    # Short job handle: the first 8 hex digits of a random UUID.
    job_id = uuid.uuid4().hex[:8]
    if seed < 0:
        seed = random.randint(0, 2**32 - 1)

    source_image = await image.read()

    job_record = {
        "status": "running",
        "seed": seed,
        "started_at": time.time(),
        "duration": duration,
        "model": model,
        "backend": "higgsfield",
        "message": "Starting Higgsfield video generation...",
    }
    _video_jobs[job_id] = job_record
    logger.info("Higgsfield video generation started: %s (model=%s)", job_id, model)

    # The heavy lifting happens off the request path.
    worker = _generate_higgsfield_video(
        job_id, source_image, prompt, model, duration, resolution, enable_audio, seed
    )
    asyncio.create_task(worker)

    low, high = duration * 10, duration * 20
    return {
        "job_id": job_id,
        "status": "running",
        "seed": seed,
        "model": model,
        "backend": "higgsfield",
        "estimated_time": f"~{low}-{high} seconds",
    }
@router.post("/generate/kling-motion")
async def generate_kling_motion(
    image: UploadFile = File(...),
    driving_video: UploadFile = File(...),
    prompt: str = Form("smooth motion, high quality video"),
    duration: int = Form(5),
    seed: int = Form(-1),
    character_orientation: str = Form("image"),
):
    """Queue a Kling Motion Control job (character image + driving video).

    Both uploads are read into memory, a job record is created in
    ``_video_jobs`` and the actual generation runs as a background task.

    Raises:
        HTTPException: 500 when no WaveSpeed provider has been registered.
    """
    import random

    if not _wavespeed_provider:
        raise HTTPException(500, "WaveSpeed API not configured")

    job_id = uuid.uuid4().hex[:8]
    if seed < 0:
        seed = random.randint(0, 2**32 - 1)

    character_bytes = await image.read()
    motion_bytes = await driving_video.read()

    # The seed is recorded on the job; it is not forwarded to the worker.
    _video_jobs[job_id] = {
        "status": "running",
        "seed": seed,
        "started_at": time.time(),
        "model": "kling-motion",
        "backend": "cloud",
        "message": "Uploading files...",
    }

    worker = _generate_kling_motion_video(
        job_id, character_bytes, motion_bytes, prompt, duration, character_orientation
    )
    asyncio.create_task(worker)

    return {"job_id": job_id, "status": "running", "estimated_time": "~60-120 seconds"}
async def _generate_kling_motion_video(
    job_id: str,
    image_bytes: bytes,
    video_bytes: bytes,
    prompt: str,
    duration: int,
    character_orientation: str = "image",
):
    """Background task: upload image + driving video, call Kling Motion Control API.

    Progress and the final outcome are written to ``_video_jobs[job_id]``;
    nothing is returned. Any exception marks the job as failed.

    Args:
        job_id: Key of the job record to update.
        image_bytes: Character image; downscaled to fit 1280px if larger.
        video_bytes: Driving video, uploaded to temporary hosting.
        prompt: Text prompt sent to the API.
        duration: Requested clip duration, forwarded as-is.
        character_orientation: Forwarded as-is to the API.
    """
    job = _video_jobs[job_id]
    try:
        # Resize image if too large (Kling Motion limit)
        import io

        from PIL import Image

        img = Image.open(io.BytesIO(image_bytes))
        max_size = 1280
        if img.width > max_size or img.height > max_size:
            img.thumbnail((max_size, max_size), Image.LANCZOS)
            # JPEG cannot store alpha/palette modes; convert before saving so
            # PNG-style inputs do not make the save raise.
            if img.mode not in ("RGB", "L"):
                img = img.convert("RGB")
            buf = io.BytesIO()
            img.save(buf, format="JPEG", quality=90)
            image_bytes = buf.getvalue()
            logger.info("Resized character image to %dx%d", img.width, img.height)

        job["message"] = "Uploading character image..."
        image_url = await _wavespeed_provider._upload_temp_image(image_bytes)
        logger.info("Kling motion: character image uploaded: %s", image_url[:80])

        job["message"] = "Uploading driving video..."
        video_url = await _upload_temp_video(video_bytes)
        logger.info("Kling motion: driving video uploaded: %s", video_url[:80])

        api_key = _wavespeed_provider._api_key
        endpoint = "https://api.wavespeed.ai/api/v3/kwaivgi/kling-v2.6-pro/motion-control"
        payload = {
            "image": image_url,
            "video": video_url,
            "prompt": prompt,
            "duration": duration,
            "character_orientation": character_orientation,
            "enable_sync_mode": False,
        }

        job["message"] = "Calling Kling Motion Control API..."
        logger.info("Calling Kling Motion Control: %s", endpoint)

        # Reuse the provider's existing httpx client (avoids SSL reconnect issues)
        http = _wavespeed_provider._http_client
        resp = await http.post(
            endpoint,
            json=payload,
            headers={"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"},
        )
        if resp.status_code != 200:
            error_text = resp.text[:500]
            logger.error("Kling Motion API error: %s", error_text)
            job["status"] = "failed"
            job["error"] = f"API error: {error_text[:200]}"
            return

        result = resp.json()
        data = result.get("data", result)
        logger.info("Kling Motion API response: %s", str(result)[:300])

        # Poll if async
        outputs = data.get("outputs") or []
        urls_data = data.get("urls") or {}
        if not outputs and urls_data.get("get"):
            job["message"] = "Waiting for Kling Motion to complete..."
            video_url_out = await _poll_wavespeed_video(
                urls_data["get"], api_key, job_id, max_attempts=300, interval=5.0
            )
        elif outputs:
            video_url_out = outputs[0] if isinstance(outputs[0], str) else outputs[0].get("url")
        else:
            job["status"] = "failed"
            job["error"] = "No output URL in response"
            return

        if not video_url_out:
            # The poller records API-reported failures, but a timeout or an
            # empty URL could otherwise leave the job "running" forever.
            if job.get("status") != "failed":
                job["status"] = "failed"
                job["error"] = "Kling Motion produced no output URL (timed out or empty result)"
            return

        # Download and save
        job["message"] = "Downloading video..."
        from content_engine.config import settings

        output_dir = settings.paths.output_dir / "videos"
        output_dir.mkdir(parents=True, exist_ok=True)
        filename = f"kling_motion_{job_id}.mp4"
        output_path = output_dir / filename

        r = await http.get(video_url_out)
        if r.status_code != 200:
            # Never persist an error page as if it were the video.
            job["status"] = "failed"
            job["error"] = "Failed to download video"
            return
        output_path.write_bytes(r.content)

        job["status"] = "completed"
        job["filename"] = filename
        # Same bookkeeping fields the other backends record on completion.
        job["output_path"] = str(output_path)
        job["completed_at"] = time.time()
        job["message"] = "Done"
        logger.info("Kling Motion video saved: %s", filename)
    except Exception as e:
        logger.exception("Kling Motion generation failed: %s", e)
        job["status"] = "failed"
        job["error"] = str(e)
async def _upload_temp_video(video_bytes: bytes) -> str:
    """Upload a video to litterbox.catbox.moe and return its public URL.

    The file is posted as ``driving.mp4`` with a ``time`` field of ``1h``.

    Raises:
        RuntimeError: If the host does not answer 200 with an ``http...`` URL.
    """
    import aiohttp

    form = aiohttp.FormData()
    form.add_field("reqtype", "fileupload")
    form.add_field("time", "1h")
    form.add_field("fileToUpload", video_bytes, filename="driving.mp4", content_type="video/mp4")

    api_url = "https://litterbox.catbox.moe/resources/internals/api.php"
    async with aiohttp.ClientSession() as session, session.post(api_url, data=form) as resp:
        if resp.status == 200:
            # On success the response body is the URL itself.
            hosted = (await resp.text()).strip()
            if hosted.startswith("http"):
                return hosted

    raise RuntimeError("Failed to upload driving video to litterbox.catbox.moe")
async def _generate_higgsfield_video(
    job_id: str,
    image_bytes: bytes,
    prompt: str,
    model: str,
    duration: int,
    resolution: str,
    enable_audio: bool,
    seed: int,
):
    """Background task to generate video via Higgsfield API.

    Uploads the image (or falls back to a base64 data URL), asks the provider
    for a video, downloads it into ``<output_dir>/videos`` and records the
    outcome in ``_video_jobs[job_id]``. Any exception marks the job as failed.

    Args:
        job_id: Key of the job record to update.
        image_bytes: Source image.
        prompt: Text prompt.
        model: Provider model name.
        duration: Clip duration, forwarded as-is.
        resolution: Output resolution, forwarded as-is.
        enable_audio: Forwarded as-is.
        seed: Accepted for signature parity; not sent to the provider here.
    """
    job = _video_jobs[job_id]
    try:
        import httpx
        from urllib.parse import urlsplit

        job["message"] = "Uploading image to Higgsfield..."
        # Upload image to temp URL when the provider offers that helper.
        image_url = None
        if hasattr(_higgsfield_provider, "_upload_temp_image"):
            image_url = await _higgsfield_provider._upload_temp_image(image_bytes)
        if not image_url:
            # Fall back to base64 data URL
            image_b64 = base64.b64encode(image_bytes).decode("utf-8")
            image_url = f"data:image/png;base64,{image_b64}"

        job["message"] = f"Generating video with {model}..."
        result = await _higgsfield_provider.generate_video(
            prompt=prompt,
            model=model,
            duration=duration,
            resolution=resolution,
            enable_audio=enable_audio,
            image_url=image_url,
        )

        video_url = result.get("video_url")
        if not video_url:
            job["status"] = "failed"
            job["error"] = "No video URL in response"
            return

        # Download the video
        job["message"] = "Downloading generated video..."
        async with httpx.AsyncClient(timeout=120) as client:
            video_resp = await client.get(video_url)
        if video_resp.status_code != 200:
            job["status"] = "failed"
            job["error"] = "Failed to download video"
            return

        # Save to local output directory
        from content_engine.config import settings

        output_dir = settings.paths.output_dir / "videos"
        output_dir.mkdir(parents=True, exist_ok=True)

        # Look at the URL *path* only, so a query string or fragment
        # (e.g. a signed URL) does not hide the real extension.
        ext = ".webm" if urlsplit(video_url).path.endswith(".webm") else ".mp4"
        local_path = output_dir / f"video_{job_id}{ext}"
        local_path.write_bytes(video_resp.content)

        job["status"] = "completed"
        job["output_path"] = str(local_path)
        job["completed_at"] = time.time()
        job["filename"] = local_path.name
        job["message"] = "Video generated successfully!"
        logger.info("Higgsfield video saved: %s", local_path)
    except Exception as e:
        # logger.exception keeps the traceback, matching the Kling Motion task.
        logger.exception("Higgsfield video generation failed: %s", e)
        job["status"] = "failed"
        job["error"] = str(e)
async def _poll_wavespeed_video(poll_url: str, api_key: str, job_id: str, max_attempts: int = 120, interval: float = 3.0) -> str | None:
    """Poll the WaveSpeed async video job URL until outputs are ready.

    Args:
        poll_url: Status URL returned by the submit call.
        api_key: Bearer token for the API.
        job_id: Key into ``_video_jobs``; progress and failures are written there.
        max_attempts: Maximum number of polls before giving up.
        interval: Seconds to sleep between polls.

    Returns:
        The first output URL when available, or ``None`` on failure. Whenever
        ``None`` is returned because the API reported a failure or polling
        timed out, the job has already been marked ``failed``.
    """
    import httpx

    def _as_url(item):
        # Outputs are normally URL strings; accept {"url": ...} objects too,
        # the same shape the Kling Motion task handles on direct responses.
        return item.get("url") if isinstance(item, dict) else item

    headers = {"Authorization": f"Bearer {api_key}"}
    async with httpx.AsyncClient(timeout=60) as client:
        for attempt in range(max_attempts):
            try:
                resp = await client.get(poll_url, headers=headers)
                resp.raise_for_status()
                result = resp.json()
                data = result.get("data", result)

                if data.get("status", "") == "failed":
                    error_msg = data.get("error", "Unknown error")
                    logger.error("WaveSpeed video job failed: %s", error_msg)
                    _video_jobs[job_id]["status"] = "failed"
                    _video_jobs[job_id]["error"] = error_msg
                    return None

                outputs = data.get("outputs", [])
                if outputs:
                    logger.info("WaveSpeed video job completed after %d polls", attempt + 1)
                    return _as_url(outputs[0])

                # Also check for 'output' field
                out = data.get("output")
                if isinstance(out, list) and out:
                    return _as_url(out[0])
                if isinstance(out, str):
                    return out

                # Update job status with progress
                _video_jobs[job_id]["message"] = f"Generating video... (poll {attempt + 1}/{max_attempts})"
                logger.debug("WaveSpeed video job pending (attempt %d/%d)", attempt + 1, max_attempts)
            except Exception as e:
                # Best effort: a failed poll is retried on the next iteration.
                logger.warning("Video poll request failed: %s", e)
            await asyncio.sleep(interval)

    logger.error("WaveSpeed video job timed out after %d attempts", max_attempts)
    # Record the timeout so callers that simply return on None do not leave
    # the job stuck in "running".
    _video_jobs[job_id]["status"] = "failed"
    _video_jobs[job_id]["error"] = f"Timed out waiting for video after {max_attempts} polls"
    return None
async def _generate_cloud_video(
    job_id: str,
    image_bytes: bytes,
    prompt: str,
    negative_prompt: str,
    model: str,
    seed: int,
):
    """Background task to generate video via WaveSpeed cloud API.

    Uploads the image to temporary hosting, calls the model endpoint, polls
    when the API answers asynchronously, downloads the result into
    ``<output_dir>/videos`` and records the outcome in ``_video_jobs[job_id]``.
    Any exception marks the job as failed.

    Args:
        job_id: Key of the job record to update.
        image_bytes: Source image.
        prompt: Positive text prompt.
        negative_prompt: Sent only when non-empty.
        model: Model key looked up in ``VIDEO_MODEL_MAP``.
        seed: NOTE(review): accepted but never added to the payload — confirm
            whether the API should receive it.
    """
    logger.info("Starting cloud video generation: job=%s, model=%s, image_size=%d bytes", job_id, model, len(image_bytes))
    job = _video_jobs[job_id]
    job["message"] = "Uploading image..."
    try:
        import httpx
        from urllib.parse import urlsplit

        # Upload image to temporary hosting (WaveSpeed needs URL)
        logger.info("Uploading image to temp host...")
        image_url = await _wavespeed_provider._upload_temp_image(image_bytes)
        logger.info("Image uploaded: %s", image_url[:80] if image_url else "FAILED")
        if not image_url:
            # Do not call the API without an image; fail with a clear reason.
            job["status"] = "failed"
            job["error"] = "Failed to upload image"
            return

        # Resolve model to WaveSpeed model ID
        from content_engine.services.cloud_providers.wavespeed_provider import VIDEO_MODEL_MAP

        wavespeed_model = VIDEO_MODEL_MAP.get(model, VIDEO_MODEL_MAP.get("default", "alibaba/wan-2.6-i2v-720p"))

        # Call WaveSpeed video API
        api_key = _wavespeed_provider._api_key
        endpoint = f"https://api.wavespeed.ai/api/v3/{wavespeed_model}"
        payload = {
            "image": image_url,
            "prompt": prompt,
            "enable_sync_mode": True,
        }
        if negative_prompt:
            payload["negative_prompt"] = negative_prompt
        # Grok Imagine Video uses duration (6 or 10s) instead of frame counts
        if model == "grok-imagine-i2v":
            num_frames = job.get("num_frames", 81)
            payload["duration"] = 10 if num_frames > 150 else 6

        job["message"] = f"Calling WaveSpeed API ({wavespeed_model})..."
        logger.info("Calling WaveSpeed video API: %s", endpoint)

        async with httpx.AsyncClient(timeout=300) as client:
            resp = await client.post(
                endpoint,
                json=payload,
                headers={
                    "Authorization": f"Bearer {api_key}",
                    "Content-Type": "application/json",
                },
            )
            if resp.status_code != 200:
                error_text = resp.text[:500]
                logger.error("WaveSpeed video API error: %s", error_text)
                job["status"] = "failed"
                job["error"] = f"API error: {error_text[:200]}"
                return

            result = resp.json()
            logger.info("WaveSpeed video API response: %s", str(result)[:500])
            data = result.get("data", result)

            # Check for failed status
            if data.get("status") == "failed":
                job["status"] = "failed"
                job["error"] = data.get("error", "Unknown error")
                return

            # Extract video URL - handle async response (outputs empty, urls.get present)
            video_url = None
            outputs = data.get("outputs") or []
            urls_data = data.get("urls") or {}
            if not outputs and urls_data.get("get"):
                poll_url = urls_data["get"]
                logger.info("WaveSpeed video returned async job, polling: %s", poll_url[:80])
                job["message"] = "Polling for video result..."
                video_url = await _poll_wavespeed_video(poll_url, api_key, job_id)
            elif outputs:
                video_url = outputs[0]
            elif "output" in data:
                out = data["output"]
                if isinstance(out, list) and out:
                    video_url = out[0]
                elif isinstance(out, str):
                    video_url = out

            if not video_url:
                # Keep a more specific reason if the poller already recorded one.
                if job.get("status") != "failed":
                    job["status"] = "failed"
                    job["error"] = f"No video URL in response: {data}"
                return

            # Download the video
            logger.info("Downloading cloud video: %s", video_url[:80])
            video_resp = await client.get(video_url)
            if video_resp.status_code != 200:
                job["status"] = "failed"
                job["error"] = "Failed to download video"
                return

        # Save to local output directory
        from content_engine.config import settings

        output_dir = settings.paths.output_dir / "videos"
        output_dir.mkdir(parents=True, exist_ok=True)

        # Determine extension from the URL *path* (ignoring any query string
        # or fragment), defaulting to mp4.
        url_path = urlsplit(video_url).path
        ext = ".mp4"
        for candidate in (".webm", ".webp"):
            if url_path.endswith(candidate):
                ext = candidate
                break
        local_path = output_dir / f"video_{job_id}{ext}"
        local_path.write_bytes(video_resp.content)

        job["status"] = "completed"
        job["output_path"] = str(local_path)
        job["completed_at"] = time.time()
        job["filename"] = local_path.name
        logger.info("Cloud video saved: %s", local_path)
    except Exception as e:
        # logger.exception keeps the traceback, matching the Kling Motion task.
        logger.exception("Cloud video generation failed: %s", e)
        job["status"] = "failed"
        job["error"] = str(e)
async def _poll_video_job(job_id: str, prompt_id: str):
    """Poll ComfyUI for video job completion.

    Checks ``/history/{prompt_id}`` every 3 seconds. As soon as an output
    node exposes a ``gifs`` entry, or an ``images`` entry of type ``output``,
    the file is downloaded via ``_download_video`` and polling stops. After
    30 minutes without a result the job is marked failed.

    Args:
        job_id: Key of the job record in ``_video_jobs``.
        prompt_id: ComfyUI prompt ID returned when the workflow was queued.
    """
    import httpx

    start = time.time()
    timeout = 1800  # 30 minutes for video (WAN 2.2 needs time to load 14B model first run)
    comfyui_url = _get_comfyui_url()
    # Previously ``pod_state`` was used below without ever being defined; the
    # resulting NameError was swallowed by the broad except, so finished jobs
    # were never downloaded and always ran into the timeout.
    pod_state = _get_pod_state()

    async with httpx.AsyncClient(timeout=60) as client:
        while time.time() - start < timeout:
            try:
                resp = await client.get(f"{comfyui_url}/history/{prompt_id}")
                if resp.status_code == 200:
                    data = resp.json()
                    if prompt_id in data:
                        outputs = data[prompt_id].get("outputs", {})
                        # Find video output (SaveAnimatedWEBP or VHS_VideoCombine)
                        for node_output in outputs.values():
                            # Check for gifs/videos
                            gifs = node_output.get("gifs")
                            if gifs:
                                await _download_video(client, job_id, gifs[0], pod_state)
                                return
                            # Check for images (animated); only files of type
                            # "output" are taken.
                            images = node_output.get("images")
                            if images and images[0].get("type") == "output":
                                await _download_video(client, job_id, images[0], pod_state)
                                return
            except Exception as e:
                # Best effort: any error in one round is retried on the next.
                logger.debug("Polling video job: %s", e)
            await asyncio.sleep(3)

    _video_jobs[job_id]["status"] = "failed"
    _video_jobs[job_id]["error"] = "Timeout waiting for video generation"
    logger.error("Video generation timed out: %s", job_id)
async def _download_video(client, job_id: str, video_info: dict, pod_state: dict):
    """Fetch a finished file from ComfyUI's ``/view`` endpoint and store it.

    On HTTP 200 the bytes are written to ``<output_dir>/videos/video_<job_id><ext>``
    and the job is marked completed; any other status marks it failed.

    Args:
        client: HTTP client exposing an awaitable ``get(url, params=...)``.
        job_id: Key of the job record in ``_video_jobs``.
        video_info: Output descriptor with ``filename`` and optional
            ``subfolder`` / ``type`` entries.
        pod_state: Not used by this function.
    """
    source_name = video_info.get("filename")
    folder = video_info.get("subfolder", "")

    query = {"filename": source_name, "type": video_info.get("type", "output")}
    if folder:
        query["subfolder"] = folder

    response = await client.get(f"{_get_comfyui_url()}/view", params=query)
    job = _video_jobs[job_id]

    if response.status_code != 200:
        job["status"] = "failed"
        job["error"] = "Failed to download video"
        return

    # Save to local output directory
    from content_engine.config import settings

    target_dir = settings.paths.output_dir / "videos"
    target_dir.mkdir(parents=True, exist_ok=True)

    # Keep the source extension; fall back to .webp when there is none.
    suffix = Path(source_name).suffix or ".webp"
    target = target_dir / f"video_{job_id}{suffix}"
    target.write_bytes(response.content)

    job["status"] = "completed"
    job["output_path"] = str(target)
    job["completed_at"] = time.time()
    job["filename"] = target.name
    logger.info("Video saved: %s", target)
@router.get("/jobs")
async def list_video_jobs():
    """List all video generation jobs.

    Returns:
        One dict per job: the stored job record plus its ``job_id``, so an
        entry can be matched to ``GET /jobs/{job_id}``. The stored records do
        not contain the ID themselves; all original fields are unchanged.
    """
    return [{"job_id": job_id, **job} for job_id, job in _video_jobs.items()]
@router.get("/jobs/{job_id}")
async def get_video_job(job_id: str):
    """Return the stored record of a single video generation job.

    Raises:
        HTTPException: 404 when no (non-empty) record exists for ``job_id``.
    """
    record = _video_jobs.get(job_id)
    if record:
        return record
    raise HTTPException(404, "Job not found")
@router.get("/{filename}")
async def get_video_file(filename: str):
    """Serve a generated video file.

    Args:
        filename: Bare file name inside ``<output_dir>/videos``.

    Returns:
        A ``FileResponse`` with ``video/webm``, ``video/mp4`` or (for any
        other extension) ``image/webp`` as media type.

    Raises:
        HTTPException: 404 if the name is not a plain file name or no such
            file exists.
    """
    from fastapi.responses import FileResponse

    from content_engine.config import settings

    # The name comes straight from the URL: accept only a bare file name so
    # the path cannot leave the videos directory (covers "..", "/" and "\").
    if filename in ("", ".", "..") or "/" in filename or "\\" in filename:
        raise HTTPException(404, "Video not found")

    video_path = settings.paths.output_dir / "videos" / filename
    # is_file() rather than exists(): a directory is not a servable video.
    if not video_path.is_file():
        raise HTTPException(404, "Video not found")

    if filename.endswith(".webm"):
        media_type = "video/webm"
    elif filename.endswith(".mp4"):
        media_type = "video/mp4"
    else:
        media_type = "image/webp"
    return FileResponse(video_path, media_type=media_type)
@router.post("/animate")
async def generate_video_animate(
    image: UploadFile = File(...),
    driving_video: UploadFile = File(...),
    prompt: str = Form("a person dancing, smooth motion, high quality"),
    negative_prompt: str = Form(""),
    width: int = Form(832),
    height: int = Form(480),
    num_frames: int = Form(81),
    fps: int = Form(16),
    seed: int = Form(-1),
    steps: int = Form(20),
    cfg: float = Form(6.0),
    bg_mode: str = Form("keep"),  # keep | driving_video | auto
):
    """Generate a dance animation via WAN 2.2 Animate on the RunPod ComfyUI pod.

    Uploads the character image and the driving video to ComfyUI, queues the
    Animate workflow, registers the job and starts the background poller.

    Pod requirements, as recorded by the original author:
    - models/diffusion_models/Wan2_2-Animate-14B_fp8_e4m3fn_scaled_KJ.safetensors
    - models/vae/wan_2.1_vae.safetensors
    - models/clip_vision/clip_vision_h.safetensors
    - models/text_encoders/umt5-xxl-enc-bf16.safetensors
    - Custom nodes: ComfyUI-WanVideoWrapper, ComfyUI-VideoHelperSuite, comfyui_controlnet_aux

    NOTE(review): ``_build_wan_animate_workflow`` in this file references
    ``wan2.2_animate_14B_bf16.safetensors`` and
    ``umt5-xxl-enc-fp8_e4m3fn.safetensors`` instead — confirm which names are
    actually present on the pod.

    Raises:
        HTTPException: 400 if the pod is not running; 500 if an upload or the
            workflow submission fails.
    """
    import random

    import httpx

    if _get_pod_state()["status"] != "running":
        raise HTTPException(400, "Pod not running — start it first in Status page")

    job_id = uuid.uuid4().hex[:8]
    if seed < 0:
        seed = random.randint(0, 2**32 - 1)

    character_bytes = await image.read()
    motion_bytes = await driving_video.read()

    try:
        base_url = _get_comfyui_url()
        upload_endpoint = f"{base_url}/upload/image"
        async with httpx.AsyncClient(timeout=60) as client:
            # Upload character reference image
            ref_name = f"ref_{job_id}.png"
            img_resp = await client.post(
                upload_endpoint,
                files={"image": (ref_name, character_bytes, "image/png")},
            )
            if img_resp.status_code != 200:
                raise HTTPException(500, f"Failed to upload character image: {img_resp.text[:200]}")
            img_filename = img_resp.json().get("name", ref_name)
            logger.info("Uploaded character image: %s", img_filename)

            # Upload driving video, keeping the client's extension when it has one.
            original_name = driving_video.filename
            if original_name and "." in original_name:
                vid_ext = original_name.rpartition(".")[2].lower()
            else:
                vid_ext = "mp4"
            drive_name = f"drive_{job_id}.{vid_ext}"
            # The video goes through the same /upload/image endpoint.
            vid_resp = await client.post(
                upload_endpoint,
                files={"image": (drive_name, motion_bytes, "video/mp4")},
            )
            if vid_resp.status_code != 200:
                raise HTTPException(500, f"Failed to upload driving video: {vid_resp.text[:200]}")
            vid_filename = vid_resp.json().get("name", drive_name)
            logger.info("Uploaded driving video: %s", vid_filename)

            graph = _build_wan_animate_workflow(
                ref_image_filename=img_filename,
                driving_video_filename=vid_filename,
                prompt=prompt,
                negative_prompt=negative_prompt,
                width=width,
                height=height,
                num_frames=num_frames,
                fps=fps,
                seed=seed,
                steps=steps,
                cfg=cfg,
                bg_mode=bg_mode,
            )

            queued = await client.post(f"{base_url}/prompt", json={"prompt": graph})
            if queued.status_code != 200:
                # Log the rejection body before raising; it explains what was wrong.
                logger.error("ComfyUI /prompt rejected workflow: %s", queued.text[:2000])
            queued.raise_for_status()
            prompt_id = queued.json()["prompt_id"]

            _video_jobs[job_id] = {
                "prompt_id": prompt_id,
                "status": "running",
                "seed": seed,
                "started_at": time.time(),
                "num_frames": num_frames,
                "fps": fps,
                "mode": "animate",
                "message": "WAN 2.2 Animate submitted...",
            }
            logger.info("WAN Animate job started: %s -> %s", job_id, prompt_id)

            asyncio.create_task(_poll_video_job(job_id, prompt_id))

            eta_seconds = num_frames * 3
            return {
                "job_id": job_id,
                "status": "running",
                "seed": seed,
                "estimated_time": f"~{eta_seconds} seconds",
            }
    except httpx.HTTPError as e:
        logger.error("WAN Animate generation failed: %s", e)
        raise HTTPException(500, f"Generation failed: {e}")
def _build_wan_animate_workflow(
ref_image_filename: str,
driving_video_filename: str,
prompt: str = "a person dancing, smooth motion",
negative_prompt: str = "",
width: int = 832,
height: int = 480,
num_frames: int = 81,
fps: int = 16,
seed: int = 42,
steps: int = 20,
cfg: float = 6.0,
bg_mode: str = "auto",
) -> dict:
"""Build ComfyUI API workflow for WAN 2.2 Animate (motion transfer from driving video).
Pipeline:
reference image -> CLIP encode + resize
driving video -> DWPreprocessor (pose skeleton)
both -> WanVideoAnimateEmbeds -> WanVideoSampler -> decode -> MP4
bg_mode options:
"keep" - use reference image as background (character's original background)
"driving_video" - use driving video frames as background
"auto" - no bg hint, model generates its own background
"""
neg = negative_prompt or "blurry, static, low quality, watermark, text"
workflow = {
# VAE
"1": {
"class_type": "WanVideoVAELoader",
"inputs": {
"model_name": "wan_2.1_vae.safetensors",
"precision": "bf16",
},
},
# CLIP Vision
"2": {
"class_type": "CLIPVisionLoader",
"inputs": {"clip_name": "clip_vision_h.safetensors"},
},
# Diffusion model
"3": {
"class_type": "WanVideoModelLoader",
"inputs": {
"model": "wan2.2_animate_14B_bf16.safetensors",
"base_precision": "bf16",
"quantization": "fp8_e4m3fn",
"load_device": "offload_device",
"attention_mode": "sdpa",
},
},
# Load T5 text encoder
"4": {
"class_type": "LoadWanVideoT5TextEncoder",
"inputs": {
"model_name": "umt5-xxl-enc-fp8_e4m3fn.safetensors",
"precision": "bf16",
},
},
# Encode text prompts
"16": {
"class_type": "WanVideoTextEncode",
"inputs": {
"positive_prompt": prompt,
"negative_prompt": neg,
"t5": ["4", 0],
"force_offload": True,
},
},
# Load reference character image
"5": {
"class_type": "LoadImage",
"inputs": {"image": ref_image_filename},
},
# Resize to target resolution
"6": {
"class_type": "ImageResizeKJv2",
"inputs": {
"image": ["5", 0],
"width": width,
"height": height,
"upscale_method": "lanczos",
"keep_proportion": "pad_edge_pixel",
"pad_color": "0, 0, 0",
"crop_position": "top",
"divisible_by": 16,
},
},
# CLIP Vision encode reference
"7": {
"class_type": "WanVideoClipVisionEncode",
"inputs": {
"clip_vision": ["2", 0],
"image_1": ["6", 0],
"strength_1": 1.0,
"strength_2": 1.0,
"crop": "center",
"combine_embeds": "average",
"force_offload": True,
},
},
# Load driving video (dance moves)
"8": {
"class_type": "VHS_LoadVideo",
"inputs": {
"video": driving_video_filename,
"force_rate": fps,
"custom_width": 0,
"custom_height": 0,
"frame_load_cap": num_frames if num_frames > 0 else 0,
"skip_first_frames": 0,
"select_every_nth": 1,
},
},
# Extract pose skeleton from driving video
"9": {
"class_type": "DWPreprocessor",
"inputs": {
"image": ["8", 0],
"detect_hand": "disable",
"detect_body": "enable",
"detect_face": "disable",
"resolution": max(width, height),
"bbox_detector": "yolox_l.torchscript.pt",
"pose_estimator": "dw-ll_ucoco_384_bs5.torchscript.pt",
"scale_stick_for_xinsr_cn": "disable",
},
},
# Animate embeddings: combine ref image + pose + optional background
"10": {
"class_type": "WanVideoAnimateEmbeds",
"inputs": {
"vae": ["1", 0],
"clip_embeds": ["7", 0],
"ref_images": ["6", 0],
"pose_images": ["9", 0],
# bg_mode: "keep" = ref image bg, "driving_video" = video frames bg, "auto" = model decides
**({} if bg_mode == "auto" else {
"bg_images": ["6", 0] if bg_mode == "keep" else ["8", 0],
}),
"width": width,
"height": height,
# When num_frames==0 ("Match video"), link to GetImageSizeAndCount output slot 3
"num_frames": ["15", 3] if num_frames == 0 else num_frames,
"force_offload": True,
"frame_window_size": 77,
"colormatch": "disabled",
"pose_strength": 1.0,
"face_strength": 1.0,
},
},
# Diffusion sampler (no context_options — WanAnim handles looping internally)
"12": {
"class_type": "WanVideoSampler",
"inputs": {
"model": ["3", 0],
"image_embeds": ["10", 0],
"text_embeds": ["16", 0],
"steps": steps,
"cfg": cfg,
"shift": 5.0,
"seed": seed,
"force_offload": True,
"scheduler": "dpm++_sde",
"riflex_freq_index": 0,
"denoise_strength": 1.0,
},
},
# Decode latents to frames
"13": {
"class_type": "WanVideoDecode",
"inputs": {
"vae": ["1", 0],
"samples": ["12", 0],
"enable_vae_tiling": True,
"tile_x": 272,
"tile_y": 272,
"tile_stride_x": 144,
"tile_stride_y": 128,
},
},
# Combine frames into MP4
"14": {
"class_type": "VHS_VideoCombine",
"inputs": {
"images": ["13", 0],
"frame_rate": fps,
"loop_count": 0,
"filename_prefix": "WanAnimate",
"format": "video/h264-mp4",
"pix_fmt": "yuv420p",
"crf": 19,
"save_metadata": True,
"trim_to_audio": False,
"pingpong": False,
"save_output": True,
},
},
}
# "Match video" mode (num_frames=0): detect actual frame count from posed video
# GetImageSizeAndCount outputs: (IMAGE, width, height, count) — slot 3 = frame count
if num_frames == 0:
workflow["15"] = {
"class_type": "GetImageSizeAndCount",
"inputs": {"image": ["9", 0]},
}
return workflow
def _build_wan_i2v_workflow(
uploaded_filename: str = None,
image_b64: str = None,
prompt: str = "",
negative_prompt: str = "",
num_frames: int = 81,
fps: int = 24,
seed: int = -1,
) -> dict:
"""Build ComfyUI workflow for WAN 2.2 Image-to-Video."""
# WAN 2.2 I2V workflow
# This assumes the WAN 2.2 nodes are installed on the pod
workflow = {
# Load the input image
"1": {
"class_type": "LoadImage",
"inputs": {
"image": uploaded_filename or "input.png",
},
},
# WAN 2.2 model loader
"2": {
"class_type": "DownloadAndLoadWanModel",
"inputs": {
"model": "Wan2.2-I2V-14B-480P",
},
},
# Text encoder
"3": {
"class_type": "WanTextEncode",
"inputs": {
"prompt": prompt,
"negative_prompt": negative_prompt,
"wan_model": ["2", 0],
},
},
# Image-to-Video generation
"4": {
"class_type": "WanImageToVideo",
"inputs": {
"image": ["1", 0],
"wan_model": ["2", 0],
"conditioning": ["3", 0],
"num_frames": num_frames,
"seed": seed,
"steps": 30,
"cfg": 5.0,
},
},
# Decode to frames
"5": {
"class_type": "WanDecode",
"inputs": {
"samples": ["4", 0],
"wan_model": ["2", 0],
},
},
# Save as animated WEBP
"6": {
"class_type": "SaveAnimatedWEBP",
"inputs": {
"images": ["5", 0],
"filename_prefix": "wan_video",
"fps": fps,
"lossless": False,
"quality": 85,
},
},
}
return workflow