ImageStudio

Runtime error

ImageStudio Maintainer committed 10 days ago

Commit

acd22d2

1 Parent(s): 0510568

feat: image_to_video_assets endpoint (image -> motion prompt + image-context check)

One GPU window moderates a user-uploaded image and writes the i2v motion
prompt; refactor vlm_chat to a reusable _vlm_chat_core. Bound via a hidden
trigger as api_name=image_to_video_assets (outputs: prompt, status{moderation},
progress). Used by the generator's image-to-video workflows.

Files changed (1) hide show

app.py +122 -11

app.py CHANGED Viewed

@@ -592,18 +592,13 @@ def _generate_image_inner(
 # =============================================================================
 # Prompt Assistant (Qwen3.5-4B) — single-turn chat, optional image
 # =============================================================================
-@spaces.GPU
-def vlm_chat(message, image, reasoning, max_new_tokens, progress=gr.Progress(track_tqdm=True)):
-    """Answer a single user message, optionally grounded on an uploaded image.
-    Generator: yields ``("progress", produced, budget)`` as tokens stream in and a
-    final ``("text", answer)`` tuple. Token streaming (TextIteratorStreamer + a
-    worker thread) is the canonical ZeroGPU pattern and lets the downstream
-    orchestrator track this node's progress over SSE.
-    ``reasoning`` ("On"/"Off") drives Qwen's ``enable_thinking`` switch: Off skips
-    the <think> trace for a direct answer (best for prompt rewriting); On lets the
-    model reason step-by-step first (slower, needs more max_new_tokens).
     """
     message = (message or "").strip()
     if not message and image is None:
@@ -674,6 +669,36 @@ def vlm_chat(message, image, reasoning, max_new_tokens, progress=gr.Progress(tra
     yield ("text", text)
 def generate_and_upload(
     model_name,
     prompt,
@@ -943,6 +968,72 @@ def prompt_to_video_assets(
     yield frame("done", 1.0, 0.0, label="Done")
 # Recommended defaults per model: (steps, guidance, height, width)
 MODEL_DEFAULTS = {
     MODEL_ZIMAGE: dict(steps=9, guidance=0.0, height=1024, width=1024),
@@ -1218,6 +1309,26 @@ with gr.Blocks(fill_height=True) as demo:
         fn=assistant_chat, inputs=vlm_inputs, outputs=[vlm_output, vlm_progress],
     )
     # UI-less combined endpoint: text -> first-frame image (R2) + video prompt.
     # `gr.api` derives its schema from the function's type hints and registers no
     # components, so it adds an API route without touching the visible UI.

 # =============================================================================
 # Prompt Assistant (Qwen3.5-4B) — single-turn chat, optional image
 # =============================================================================
+def _vlm_chat_core(message, image, reasoning, max_new_tokens):
+    """Token-streaming VLM chat body (no GPU decorator).
+    Yields ``("progress", produced, budget, partial)`` per token and a final
+    ``("text", answer)`` tuple. Kept decorator-free so it can be reused inside a
+    *single* ``@spaces.GPU`` window by other endpoints (e.g. the image→video
+    check), avoiding a second ZeroGPU acquisition. See :func:`vlm_chat`.
     """
     message = (message or "").strip()
     if not message and image is None:
     yield ("text", text)
+@spaces.GPU
+def vlm_chat(message, image, reasoning, max_new_tokens, progress=gr.Progress(track_tqdm=True)):
+    """Answer a single user message, optionally grounded on an uploaded image.
+    Thin ``@spaces.GPU`` wrapper around :func:`_vlm_chat_core` (which holds the
+    token-streaming logic). ``reasoning`` ("On"/"Off") drives Qwen's
+    ``enable_thinking`` switch: Off skips the <think> trace for a direct answer
+    (best for prompt rewriting); On lets the model reason first (slower).
+    """
+    yield from _vlm_chat_core(message, image, reasoning, max_new_tokens)
+@spaces.GPU
+def _image_video_check_gpu(image, message, reasoning, max_new_tokens):
+    """One GPU window: moderate ``image``, then stream a video prompt from it.
+    Yields ``("moderation", dict)`` once the image-context check is done, then
+    relays :func:`_vlm_chat_core`'s ``("progress", …)`` / ``("text", …)`` tuples.
+    Fusing both into a single ZeroGPU allocation (rather than two) matches the
+    pattern used by :func:`generate_image`.
+    """
+    try:
+        moderation = _moderate_image_inner(image)
+    except Exception as exc:  # noqa: BLE001 - never let moderation break the call
+        moderation = {"ok": False, "rating": None, "confidence": None,
+                      "flags": None, "raw": "", "error": f"{type(exc).__name__}: {exc}"}
+    yield ("moderation", moderation)
+    yield from _vlm_chat_core(message, image, reasoning, max_new_tokens)
 def generate_and_upload(
     model_name,
     prompt,
     yield frame("done", 1.0, 0.0, label="Done")
+# =============================================================================
+# Image -> video prompt + image moderation check (no first frame; UI-less)
+# =============================================================================
+# The image-to-video counterpart of `prompt_to_video_assets`: the caller already
+# HAS the first frame (a user-uploaded image), so this endpoint just (1) screens
+# that image with the VLM image-context check and (2) writes the motion prompt
+# the video model will use — both inside one GPU window. The generator calls this
+# before the LTX/Wan video node, records the check against the produced asset,
+# and feeds the prompt forward.
+#
+# Outputs (positional, matching the structured-progress contract):
+#   0: video_prompt  (str)         — the motion prompt for the video model
+#   1: status        (JSON dict)   — { "moderation": <image-context check> }
+#   2: progress      (JSON dict)   — live 0..1 progress for SSE consumers
+#
+# Fallback instruction: `image_to_video_assets` passes this text to the VLM as
+# the chat message when the caller's `video_instruction` is empty or blank.
+_I2V_DEFAULT_INSTRUCTION = (
+    "Look at this image. Write a concise image-to-video motion prompt focusing on "
+    "the characters' actions, expressions and the scene, describing how it should "
+    "animate: camera movement plus subject motion. Respond with ONE single line of "
+    "one or two sentences containing ONLY the motion prompt. No headings, no "
+    "markdown, no bullet points, no preamble."
+)
+def image_to_video_assets(
+    image,
+    video_instruction,
+    reasoning,
+    max_new_tokens,
+    progress=gr.Progress(track_tqdm=True),
+):
+    """User-uploaded image -> motion prompt + image-context (moderation) check.
+    Generator yielding ``(video_prompt, status, progress)`` per frame so the
+    motion prompt streams and a downstream consumer reading the progress index
+    sees live progress over SSE. ``status`` carries the moderation check once
+    available; the final ``complete`` frame holds the clean prompt at index 0.
+    Bound with ``api_name="image_to_video_assets"`` (hidden trigger, no UI).
+    """
+    instruction = (video_instruction or "").strip() or _I2V_DEFAULT_INSTRUCTION
+    budget = int(max_new_tokens) if max_new_tokens else 256
+    if image is None:
+        yield "", {"moderation": None, "error": "image is required"}, \
+            _progress("done", 1.0, label="No image")
+        return
+    status = {"moderation": None}
+    video_prompt = ""
+    yield video_prompt, status, _progress("check", 0.02, label="Screening image")
+    for ev in _image_video_check_gpu(image, instruction, reasoning, budget):
+        if ev[0] == "moderation":
+            status = {"moderation": ev[1]}
+            yield video_prompt, status, _progress("check", 0.2, label="Image checked")
+        elif ev[0] == "progress":
+            _, produced, budget_, partial = ev
+            video_prompt = partial
+            frac = 0.2 + 0.79 * min(1.0, produced / max(budget_, 1))
+            yield video_prompt, status, _progress(
+                "video_prompt", frac, produced, budget_, "Writing video prompt")
+        else:  # ("text", final)
+            video_prompt = ev[1]
+    yield video_prompt.strip(), status, _progress("done", 1.0, label="Done")
 # Recommended defaults per model: (steps, guidance, height, width)
 MODEL_DEFAULTS = {
     MODEL_ZIMAGE: dict(steps=9, guidance=0.0, height=1024, width=1024),
         fn=assistant_chat, inputs=vlm_inputs, outputs=[vlm_output, vlm_progress],
     )
+    # UI-less endpoint: user image -> motion video prompt + image-context check.
+    # Bound via a hidden trigger so the image input deserializes the uploaded
+    # FileData into a PIL image (same proven path as the Prompt Assistant), then
+    # exposed to the generator as api_name="image_to_video_assets".
+    # The row is created with visible=False, so none of these components are
+    # shown in the UI.
+    with gr.Row(visible=False):
+        # Inputs, declared in the positional order image_to_video_assets takes
+        # them: image, video_instruction, reasoning, max_new_tokens.
+        i2v_image = gr.Image(label="i2v image", type="pil")
+        i2v_instruction = gr.Textbox(label="i2v instruction", value="")
+        i2v_reasoning = gr.Radio(choices=["Off", "On"], value="Off", label="i2v reasoning")
+        i2v_max_tokens = gr.Slider(minimum=64, maximum=1024, value=256, step=64, label="i2v max tokens")
+        # Hidden button whose click event carries the API route.
+        i2v_btn = gr.Button("i2v", visible=False)
+        # Outputs: one component per element of each yielded
+        # (video_prompt, status, progress) frame.
+        i2v_prompt_out = gr.Textbox(label="i2v prompt")
+        i2v_status_out = gr.JSON(label="i2v status")
+        i2v_progress_out = gr.JSON(label="i2v progress")
+    i2v_btn.click(
+        fn=image_to_video_assets,
+        inputs=[i2v_image, i2v_instruction, i2v_reasoning, i2v_max_tokens],
+        outputs=[i2v_prompt_out, i2v_status_out, i2v_progress_out],
+        api_name="image_to_video_assets",
+    )
     # UI-less combined endpoint: text -> first-frame image (R2) + video prompt.
     # `gr.api` derives its schema from the function's type hints and registers no
     # components, so it adds an API route without touching the visible UI.