Spaces:
Runtime error
Runtime error
ImageStudio Maintainer committed on
Commit ·
acd22d2
1
Parent(s): 0510568
feat: image_to_video_assets endpoint (image -> motion prompt + image-context check)
Browse files
One GPU window moderates a user-uploaded image and writes the i2v motion
prompt; refactor vlm_chat to a reusable _vlm_chat_core. Bound via a hidden
trigger as api_name=image_to_video_assets (outputs: prompt, status{moderation},
progress). Used by the generator's image-to-video workflows.
app.py
CHANGED
|
@@ -592,18 +592,13 @@ def _generate_image_inner(
|
|
| 592 |
# =============================================================================
|
| 593 |
# Prompt Assistant (Qwen3.5-4B) — single-turn chat, optional image
|
| 594 |
# =============================================================================
|
| 595 |
-
|
| 596 |
-
|
| 597 |
-
"""Answer a single user message, optionally grounded on an uploaded image.
|
| 598 |
|
| 599 |
-
|
| 600 |
-
|
| 601 |
-
|
| 602 |
-
|
| 603 |
-
|
| 604 |
-
``reasoning`` ("On"/"Off") drives Qwen's ``enable_thinking`` switch: Off skips
|
| 605 |
-
the <think> trace for a direct answer (best for prompt rewriting); On lets the
|
| 606 |
-
model reason step-by-step first (slower, needs more max_new_tokens).
|
| 607 |
"""
|
| 608 |
message = (message or "").strip()
|
| 609 |
if not message and image is None:
|
|
@@ -674,6 +669,36 @@ def vlm_chat(message, image, reasoning, max_new_tokens, progress=gr.Progress(tra
|
|
| 674 |
yield ("text", text)
|
| 675 |
|
| 676 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 677 |
def generate_and_upload(
|
| 678 |
model_name,
|
| 679 |
prompt,
|
|
@@ -943,6 +968,72 @@ def prompt_to_video_assets(
|
|
| 943 |
yield frame("done", 1.0, 0.0, label="Done")
|
| 944 |
|
| 945 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 946 |
# Recommended defaults per model: (steps, guidance, height, width)
|
| 947 |
MODEL_DEFAULTS = {
|
| 948 |
MODEL_ZIMAGE: dict(steps=9, guidance=0.0, height=1024, width=1024),
|
|
@@ -1218,6 +1309,26 @@ with gr.Blocks(fill_height=True) as demo:
|
|
| 1218 |
fn=assistant_chat, inputs=vlm_inputs, outputs=[vlm_output, vlm_progress],
|
| 1219 |
)
|
| 1220 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1221 |
# UI-less combined endpoint: text -> first-frame image (R2) + video prompt.
|
| 1222 |
# `gr.api` derives its schema from the function's type hints and registers no
|
| 1223 |
# components, so it adds an API route without touching the visible UI.
|
|
|
|
| 592 |
# =============================================================================
|
| 593 |
# Prompt Assistant (Qwen3.5-4B) — single-turn chat, optional image
|
| 594 |
# =============================================================================
|
| 595 |
+
def _vlm_chat_core(message, image, reasoning, max_new_tokens):
|
| 596 |
+
"""Token-streaming VLM chat body (no GPU decorator).
|
|
|
|
| 597 |
|
| 598 |
+
Yields ``("progress", produced, budget, partial)`` per token and a final
|
| 599 |
+
``("text", answer)`` tuple. Kept decorator-free so it can be reused inside a
|
| 600 |
+
*single* ``@spaces.GPU`` window by other endpoints (e.g. the image→video
|
| 601 |
+
check), avoiding a second ZeroGPU acquisition. See :func:`vlm_chat`.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 602 |
"""
|
| 603 |
message = (message or "").strip()
|
| 604 |
if not message and image is None:
|
|
|
|
| 669 |
yield ("text", text)
|
| 670 |
|
| 671 |
|
| 672 |
+
@spaces.GPU
def vlm_chat(
    message,
    image,
    reasoning,
    max_new_tokens,
    progress=gr.Progress(track_tqdm=True),
):
    """Stream the assistant's reply to one message, optionally about an image.

    This is only the ``@spaces.GPU`` entry point; all of the token-streaming
    work is delegated to :func:`_vlm_chat_core`, and every tuple that helper
    yields is passed through unchanged.

    ``reasoning`` is the "On"/"Off" switch for Qwen's ``enable_thinking``:
    "Off" skips the <think> trace and answers directly (best for prompt
    rewriting), "On" lets the model reason first (slower).

    ``progress`` is not referenced in the body.
    NOTE(review): presumably declared so Gradio attaches progress tracking to
    the call - confirm before removing.
    """
    token_stream = _vlm_chat_core(message, image, reasoning, max_new_tokens)
    # `yield from` (not a manual loop) so close()/throw() reach the helper.
    yield from token_stream
|
| 682 |
+
|
| 683 |
+
|
| 684 |
+
@spaces.GPU
def _image_video_check_gpu(image, message, reasoning, max_new_tokens):
    """Moderate ``image`` and stream a video prompt in one GPU window.

    Event order:

    1. ``("moderation", dict)`` - the result of ``_moderate_image_inner``, or
       a failure record (``ok`` False, ``error`` set) if that call raised.
    2. Every tuple produced by :func:`_vlm_chat_core`, i.e. its
       ``("progress", …)`` and ``("text", …)`` events, relayed unchanged.

    Both steps share a single ZeroGPU allocation instead of acquiring two,
    the same pattern :func:`generate_image` uses.
    """
    try:
        verdict = _moderate_image_inner(image)
    except Exception as err:  # noqa: BLE001 - never let moderation break the call
        # Best effort: report the failure in-band and still write the prompt.
        verdict = {
            "ok": False,
            "rating": None,
            "confidence": None,
            "flags": None,
            "raw": "",
            "error": f"{type(err).__name__}: {err}",
        }
    yield ("moderation", verdict)

    prompt_stream = _vlm_chat_core(message, image, reasoning, max_new_tokens)
    yield from prompt_stream
|
| 700 |
+
|
| 701 |
+
|
| 702 |
def generate_and_upload(
|
| 703 |
model_name,
|
| 704 |
prompt,
|
|
|
|
| 968 |
yield frame("done", 1.0, 0.0, label="Done")
|
| 969 |
|
| 970 |
|
| 971 |
+
# =============================================================================
|
| 972 |
+
# Image -> video prompt + image moderation check (no first frame; UI-less)
|
| 973 |
+
# =============================================================================
|
| 974 |
+
# The image-to-video counterpart of `prompt_to_video_assets`: the caller already
|
| 975 |
+
# HAS the first frame (a user-uploaded image), so this endpoint just (1) screens
|
| 976 |
+
# that image with the VLM image-context check and (2) writes the motion prompt
|
| 977 |
+
# the video model will use — both inside one GPU window. The generator calls this
|
| 978 |
+
# before the LTX/Wan video node, records the check against the produced asset,
|
| 979 |
+
# and feeds the prompt forward.
|
| 980 |
+
#
|
| 981 |
+
# Outputs (positional, matching the structured-progress contract):
|
| 982 |
+
# 0: video_prompt (str) — the motion prompt for the video model
|
| 983 |
+
# 1: status (JSON dict) — { "moderation": <image-context check> }
|
| 984 |
+
# 2: progress (JSON dict) — live 0..1 progress for SSE consumers
|
| 985 |
+
_I2V_DEFAULT_INSTRUCTION = (
|
| 986 |
+
"Look at this image. Write a concise image-to-video motion prompt focusing on "
|
| 987 |
+
"the characters' actions, expressions and the scene, describing how it should "
|
| 988 |
+
"animate: camera movement plus subject motion. Respond with ONE single line of "
|
| 989 |
+
"one or two sentences containing ONLY the motion prompt. No headings, no "
|
| 990 |
+
"markdown, no bullet points, no preamble."
|
| 991 |
+
)
|
| 992 |
+
|
| 993 |
+
|
| 994 |
+
def image_to_video_assets(
    image,
    video_instruction,
    reasoning,
    max_new_tokens,
    progress=gr.Progress(track_tqdm=True),
):
    """Turn an uploaded image into a motion prompt plus a moderation record.

    A generator: each frame is a ``(video_prompt, status, progress)`` tuple.

    * ``video_prompt`` - the text produced so far; the last frame carries the
      final answer with surrounding whitespace stripped.
    * ``status`` - ``{"moderation": None}`` until the image check reports,
      then ``{"moderation": <check result>}``. When ``image`` is ``None`` a
      single frame with an extra ``"error": "image is required"`` key is
      emitted and the generator stops.
    * ``progress`` - whatever ``_progress(...)`` returns for the current
      stage ("check", "video_prompt", then "done").

    A blank ``video_instruction`` falls back to ``_I2V_DEFAULT_INSTRUCTION``;
    a falsy ``max_new_tokens`` falls back to a budget of 256.

    ``progress`` (the parameter) is not referenced in the body.
    NOTE(review): presumably declared so Gradio attaches progress tracking to
    the call - confirm before removing.
    """
    instruction = (video_instruction or "").strip()
    if not instruction:
        instruction = _I2V_DEFAULT_INSTRUCTION

    if max_new_tokens:
        budget = int(max_new_tokens)
    else:
        budget = 256

    if image is None:
        missing_status = {"moderation": None, "error": "image is required"}
        yield "", missing_status, _progress("done", 1.0, label="No image")
        return

    video_prompt = ""
    status = {"moderation": None}
    yield video_prompt, status, _progress("check", 0.02, label="Screening image")

    events = _image_video_check_gpu(image, instruction, reasoning, budget)
    for event in events:
        kind = event[0]

        if kind == "moderation":
            status = {"moderation": event[1]}
            yield video_prompt, status, _progress("check", 0.2, label="Image checked")
            continue

        if kind == "progress":
            _tag, produced, token_budget, partial = event
            video_prompt = partial
            # Token progress is mapped onto 0.2..0.99, leaving 1.0 for "done".
            ratio = min(1.0, produced / max(token_budget, 1))
            fraction = 0.2 + 0.79 * ratio
            yield video_prompt, status, _progress(
                "video_prompt", fraction, produced, token_budget, "Writing video prompt"
            )
            continue

        # Anything else is treated as the closing ("text", final) event.
        video_prompt = event[1]

    yield video_prompt.strip(), status, _progress("done", 1.0, label="Done")
|
| 1035 |
+
|
| 1036 |
+
|
| 1037 |
# Recommended defaults per model: (steps, guidance, height, width)
|
| 1038 |
MODEL_DEFAULTS = {
|
| 1039 |
MODEL_ZIMAGE: dict(steps=9, guidance=0.0, height=1024, width=1024),
|
|
|
|
| 1309 |
fn=assistant_chat, inputs=vlm_inputs, outputs=[vlm_output, vlm_progress],
|
| 1310 |
)
|
| 1311 |
|
| 1312 |
+
# UI-less endpoint: user image -> motion video prompt + image-context check.
|
| 1313 |
+
# Bound via a hidden trigger so the image input deserializes the uploaded
|
| 1314 |
+
# FileData into a PIL image (same proven path as the Prompt Assistant), then
|
| 1315 |
+
# exposed to the generator as api_name="image_to_video_assets".
|
| 1316 |
+
with gr.Row(visible=False):
|
| 1317 |
+
i2v_image = gr.Image(label="i2v image", type="pil")
|
| 1318 |
+
i2v_instruction = gr.Textbox(label="i2v instruction", value="")
|
| 1319 |
+
i2v_reasoning = gr.Radio(choices=["Off", "On"], value="Off", label="i2v reasoning")
|
| 1320 |
+
i2v_max_tokens = gr.Slider(minimum=64, maximum=1024, value=256, step=64, label="i2v max tokens")
|
| 1321 |
+
i2v_btn = gr.Button("i2v", visible=False)
|
| 1322 |
+
i2v_prompt_out = gr.Textbox(label="i2v prompt")
|
| 1323 |
+
i2v_status_out = gr.JSON(label="i2v status")
|
| 1324 |
+
i2v_progress_out = gr.JSON(label="i2v progress")
|
| 1325 |
+
i2v_btn.click(
|
| 1326 |
+
fn=image_to_video_assets,
|
| 1327 |
+
inputs=[i2v_image, i2v_instruction, i2v_reasoning, i2v_max_tokens],
|
| 1328 |
+
outputs=[i2v_prompt_out, i2v_status_out, i2v_progress_out],
|
| 1329 |
+
api_name="image_to_video_assets",
|
| 1330 |
+
)
|
| 1331 |
+
|
| 1332 |
# UI-less combined endpoint: text -> first-frame image (R2) + video prompt.
|
| 1333 |
# `gr.api` derives its schema from the function's type hints and registers no
|
| 1334 |
# components, so it adds an API route without touching the visible UI.
|