ImageStudio Maintainer committed on
Commit
acd22d2
·
1 Parent(s): 0510568

feat: image_to_video_assets endpoint (image -> motion prompt + image-context check)

Browse files

One GPU window moderates a user-uploaded image and writes the i2v motion
prompt; refactor vlm_chat to a reusable _vlm_chat_core. Bound via a hidden
trigger as api_name=image_to_video_assets (outputs: prompt, status{moderation},
progress). Used by the generator's image-to-video workflows.

Files changed (1) hide show
  1. app.py +122 -11
app.py CHANGED
@@ -592,18 +592,13 @@ def _generate_image_inner(
592
  # =============================================================================
593
  # Prompt Assistant (Qwen3.5-4B) — single-turn chat, optional image
594
  # =============================================================================
595
- @spaces.GPU
596
- def vlm_chat(message, image, reasoning, max_new_tokens, progress=gr.Progress(track_tqdm=True)):
597
- """Answer a single user message, optionally grounded on an uploaded image.
598
 
599
- Generator: yields ``("progress", produced, budget)`` as tokens stream in and a
600
- final ``("text", answer)`` tuple. Token streaming (TextIteratorStreamer + a
601
- worker thread) is the canonical ZeroGPU pattern and lets the downstream
602
- orchestrator track this node's progress over SSE.
603
-
604
- ``reasoning`` ("On"/"Off") drives Qwen's ``enable_thinking`` switch: Off skips
605
- the <think> trace for a direct answer (best for prompt rewriting); On lets the
606
- model reason step-by-step first (slower, needs more max_new_tokens).
607
  """
608
  message = (message or "").strip()
609
  if not message and image is None:
@@ -674,6 +669,36 @@ def vlm_chat(message, image, reasoning, max_new_tokens, progress=gr.Progress(tra
674
  yield ("text", text)
675
 
676
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
677
  def generate_and_upload(
678
  model_name,
679
  prompt,
@@ -943,6 +968,72 @@ def prompt_to_video_assets(
943
  yield frame("done", 1.0, 0.0, label="Done")
944
 
945
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
946
  # Recommended defaults per model: (steps, guidance, height, width)
947
  MODEL_DEFAULTS = {
948
  MODEL_ZIMAGE: dict(steps=9, guidance=0.0, height=1024, width=1024),
@@ -1218,6 +1309,26 @@ with gr.Blocks(fill_height=True) as demo:
1218
  fn=assistant_chat, inputs=vlm_inputs, outputs=[vlm_output, vlm_progress],
1219
  )
1220
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1221
  # UI-less combined endpoint: text -> first-frame image (R2) + video prompt.
1222
  # `gr.api` derives its schema from the function's type hints and registers no
1223
  # components, so it adds an API route without touching the visible UI.
 
592
  # =============================================================================
593
  # Prompt Assistant (Qwen3.5-4B) — single-turn chat, optional image
594
  # =============================================================================
595
+ def _vlm_chat_core(message, image, reasoning, max_new_tokens):
596
+ """Token-streaming VLM chat body (no GPU decorator).
 
597
 
598
+ Yields ``("progress", produced, budget, partial)`` per token and a final
599
+ ``("text", answer)`` tuple. Kept decorator-free so it can be reused inside a
600
+ *single* ``@spaces.GPU`` window by other endpoints (e.g. the image→video
601
+ check), avoiding a second ZeroGPU acquisition. See :func:`vlm_chat`.
 
 
 
 
602
  """
603
  message = (message or "").strip()
604
  if not message and image is None:
 
669
  yield ("text", text)
670
 
671
 
672
@spaces.GPU
def vlm_chat(message, image, reasoning, max_new_tokens, progress=gr.Progress(track_tqdm=True)):
    """Stream a single-turn assistant reply, optionally grounded on an image.

    GPU-decorated entry point: all of the work is delegated to
    :func:`_vlm_chat_core`, and every tuple that generator yields is relayed
    unchanged to the caller.

    ``reasoning`` ("On"/"Off") selects Qwen's ``enable_thinking`` mode: "Off"
    skips the <think> trace and answers directly (best for prompt rewriting),
    "On" lets the model reason first (slower).

    ``progress`` is accepted but not referenced in this body.
    """
    events = _vlm_chat_core(message, image, reasoning, max_new_tokens)
    yield from events
682
+
683
+
684
@spaces.GPU
def _image_video_check_gpu(image, message, reasoning, max_new_tokens):
    """Moderate ``image`` and then stream a video prompt, under one GPU decorator.

    Yields, in order:

    * ``("moderation", dict)`` - the result of :func:`_moderate_image_inner`,
      or a fallback dict with ``ok=False`` and an ``error`` string when that
      call raises;
    * every tuple produced by :func:`_vlm_chat_core` for ``message`` and
      ``image``, relayed unchanged.

    Both steps run inside this single ``@spaces.GPU`` function so the caller
    needs one GPU allocation instead of two.
    """
    try:
        verdict = _moderate_image_inner(image)
    except Exception as err:  # noqa: BLE001 - never let moderation break the call
        # Best-effort: report the failure in the payload and keep going.
        failure = f"{type(err).__name__}: {err}"
        verdict = dict(
            ok=False,
            rating=None,
            confidence=None,
            flags=None,
            raw="",
            error=failure,
        )

    yield "moderation", verdict

    chat_events = _vlm_chat_core(message, image, reasoning, max_new_tokens)
    yield from chat_events
700
+
701
+
702
  def generate_and_upload(
703
  model_name,
704
  prompt,
 
968
  yield frame("done", 1.0, 0.0, label="Done")
969
 
970
 
971
+ # =============================================================================
972
+ # Image -> video prompt + image moderation check (no first frame; UI-less)
973
+ # =============================================================================
974
+ # The image-to-video counterpart of `prompt_to_video_assets`: the caller already
975
+ # HAS the first frame (a user-uploaded image), so this endpoint just (1) screens
976
+ # that image with the VLM image-context check and (2) writes the motion prompt
977
+ # the video model will use — both inside one GPU window. The generator calls this
978
+ # before the LTX/Wan video node, records the check against the produced asset,
979
+ # and feeds the prompt forward.
980
+ #
981
+ # Outputs (positional, matching the structured-progress contract):
982
+ # 0: video_prompt (str) — the motion prompt for the video model
983
+ # 1: status (JSON dict) — { "moderation": <image-context check> }
984
+ # 2: progress (JSON dict) — live 0..1 progress for SSE consumers
985
+ _I2V_DEFAULT_INSTRUCTION = (
986
+ "Look at this image. Write a concise image-to-video motion prompt focusing on "
987
+ "the characters' actions, expressions and the scene, describing how it should "
988
+ "animate: camera movement plus subject motion. Respond with ONE single line of "
989
+ "one or two sentences containing ONLY the motion prompt. No headings, no "
990
+ "markdown, no bullet points, no preamble."
991
+ )
992
+
993
+
994
def image_to_video_assets(
    image,
    video_instruction,
    reasoning,
    max_new_tokens,
    progress=gr.Progress(track_tqdm=True),
):
    """Turn a user-uploaded image into a motion prompt plus a moderation check.

    Generator; every frame is a ``(video_prompt, status, progress)`` triple:

    * ``video_prompt`` - the partial prompt while tokens stream in, and the
      stripped final prompt on the last frame;
    * ``status`` - ``{"moderation": None}`` until the image check reports,
      then ``{"moderation": <check result>}``;
    * ``progress`` - whatever :func:`_progress` builds for the current stage.

    A blank ``video_instruction`` falls back to ``_I2V_DEFAULT_INSTRUCTION``
    and a falsy ``max_new_tokens`` falls back to 256. When ``image`` is
    ``None`` a single error frame is yielded and the generator ends.

    ``progress`` is accepted but not referenced in this body.
    """
    prompt_request = (video_instruction or "").strip()
    if not prompt_request:
        prompt_request = _I2V_DEFAULT_INSTRUCTION
    token_budget = int(max_new_tokens) if max_new_tokens else 256

    # Guard clause: nothing to screen or describe without an image.
    if image is None:
        missing = {"moderation": None, "error": "image is required"}
        yield "", missing, _progress("done", 1.0, label="No image")
        return

    status = {"moderation": None}
    video_prompt = ""
    yield video_prompt, status, _progress("check", 0.02, label="Screening image")

    gpu_events = _image_video_check_gpu(image, prompt_request, reasoning, token_budget)
    for event in gpu_events:
        kind = event[0]
        if kind == "moderation":
            # Rebind rather than mutate so frames already yielded keep their dict.
            status = {"moderation": event[1]}
            yield video_prompt, status, _progress("check", 0.2, label="Image checked")
        elif kind == "progress":
            produced, cap, partial = event[1:]
            video_prompt = partial
            # Token streaming is mapped onto the 0.2 .. 0.99 progress range.
            ratio = min(1.0, produced / max(cap, 1))
            frac = 0.2 + 0.79 * ratio
            yield video_prompt, status, _progress(
                "video_prompt", frac, produced, cap, "Writing video prompt")
        else:
            # Anything else is treated as the final ("text", answer) event.
            video_prompt = event[1]

    yield video_prompt.strip(), status, _progress("done", 1.0, label="Done")
1035
+
1036
+
1037
  # Recommended defaults per model: (steps, guidance, height, width)
1038
  MODEL_DEFAULTS = {
1039
  MODEL_ZIMAGE: dict(steps=9, guidance=0.0, height=1024, width=1024),
 
1309
  fn=assistant_chat, inputs=vlm_inputs, outputs=[vlm_output, vlm_progress],
1310
  )
1311
 
1312
+ # UI-less endpoint: user image -> motion video prompt + image-context check.
1313
+ # Bound via a hidden trigger so the image input deserializes the uploaded
1314
+ # FileData into a PIL image (same proven path as the Prompt Assistant), then
1315
+ # exposed to the generator as api_name="image_to_video_assets".
1316
+ with gr.Row(visible=False):
1317
+ i2v_image = gr.Image(label="i2v image", type="pil")
1318
+ i2v_instruction = gr.Textbox(label="i2v instruction", value="")
1319
+ i2v_reasoning = gr.Radio(choices=["Off", "On"], value="Off", label="i2v reasoning")
1320
+ i2v_max_tokens = gr.Slider(minimum=64, maximum=1024, value=256, step=64, label="i2v max tokens")
1321
+ i2v_btn = gr.Button("i2v", visible=False)
1322
+ i2v_prompt_out = gr.Textbox(label="i2v prompt")
1323
+ i2v_status_out = gr.JSON(label="i2v status")
1324
+ i2v_progress_out = gr.JSON(label="i2v progress")
1325
+ i2v_btn.click(
1326
+ fn=image_to_video_assets,
1327
+ inputs=[i2v_image, i2v_instruction, i2v_reasoning, i2v_max_tokens],
1328
+ outputs=[i2v_prompt_out, i2v_status_out, i2v_progress_out],
1329
+ api_name="image_to_video_assets",
1330
+ )
1331
+
1332
  # UI-less combined endpoint: text -> first-frame image (R2) + video prompt.
1333
  # `gr.api` derives its schema from the function's type hints and registers no
1334
  # components, so it adds an API route without touching the visible UI.