multiinputgenerativevideo

No application file

App Files Files

linoyts HF Staff committed on Mar 17

Commit

227286a

1 Parent(s): cc68406

Update app.py (#5)

Browse files

- Update app.py (526dd8d82abe509955445dc2923c692b062515b0)

Files changed (1) hide show

app.py +120 -15

app.py CHANGED Viewed

@@ -11,7 +11,8 @@ subprocess.run([sys.executable, "-m", "pip", "install", "xformers==0.0.32.post2"
 # Install video preprocessing dependencies
 subprocess.run([sys.executable, "-m", "pip", "install",
-                "dwpose", "onnxruntime-gpu", "imageio[ffmpeg]", "scikit-image", "opencv-python-headless"], check=False)
 # Reinstall torchaudio to match the torch CUDA version on this space.
 # controlnet_aux or other deps can pull in a CPU-only torchaudio that conflicts
@@ -1023,7 +1024,107 @@ def generate_video(
 # ─────────────────────────────────────────────────────────────────────────────
-# Gradio UI — LTX 2.3 Move
 # ─────────────────────────────────────────────────────────────────────────────
 css = """
 .main-title { text-align: center; margin-bottom: 0.5em; }
@@ -1038,7 +1139,7 @@ purple_citrus = gr.themes.Citrus(
     neutral_hue=gr.themes.colors.gray,
 )
-with gr.Blocks(title="LTX 2.3 Move", css=css, theme=purple_citrus) as demo:
     gr.Markdown("""
 # LTX 2.3 Sync 🕺
 #### Character Animation with LTX 2.3
@@ -1061,27 +1162,27 @@ using [Lightricks/LTX-2.3-22b-IC-LoRA-Union-Control](https://huggingface.co/Ligh
                     type="filepath",
                 )
             with gr.Row():
                 prompt = gr.Textbox(
-                label="Prompt",
-                info="tip: describe the motion, body posture, facial expressions of the ref video",
-                lines=2,
-                placeholder="the person talks to the camera, making hand gestures",
-            )
-                duration = gr.Slider(
-                    label="Duration (s)", minimum=1.0, maximum=15.0, value=3.0, step=0.5,
                 )
             generate_btn = gr.Button(
                 "Generate", variant="primary", size="lg", elem_classes=["generate-btn"],
             )
             with gr.Accordion("Advanced Settings", open=False):
                 enhance_prompt = gr.Checkbox(label="Enhance Prompt", value=True)
                 conditioning_strength = gr.Slider(
                     label="V2V Conditioning Strength",
@@ -1119,6 +1220,10 @@ using [Lightricks/LTX-2.3-22b-IC-LoRA-Union-Control](https://huggingface.co/Ligh
         fn=on_video_upload,
         inputs=[input_video, input_image, high_res],
         outputs=[width, height, duration],
     )
     high_res.change(
         fn=on_highres_toggle,

 # Install video preprocessing dependencies
 subprocess.run([sys.executable, "-m", "pip", "install",
+                "dwpose", "onnxruntime-gpu", "imageio[ffmpeg]", "scikit-image",
+                "opencv-python-headless", "decord", "num2words"], check=False)
 # Reinstall torchaudio to match the torch CUDA version on this space.
 # controlnet_aux or other deps can pull in a CPU-only torchaudio that conflicts
 # ─────────────────────────────────────────────────────────────────────────────
+# SmolVLM2 — Auto-describe motion from reference video
+# ─────────────────────────────────────────────────────────────────────────────
+# Model id handed to `from_pretrained` for both the processor and the model
+# in `_load_vlm`.
+SMOLVLM_MODEL_ID = "HuggingFaceTB/SmolVLM2-500M-Video-Instruct"
+# Module-level cache: both stay None until the first `_load_vlm()` call fills
+# them, so the model is only loaded when a description is actually requested.
+_vlm_model = None
+_vlm_processor = None
+# Instruction text sent alongside the video in `describe_video_motion`.
+# The backslashes after the opening quotes and before the closing quotes are
+# line continuations, so the string has no leading or trailing newline.
+MOTION_PROMPT = """\
+Watch this video carefully. Describe ONLY the following:
+1. The body movements and gestures (walking, dancing, waving, turning, etc.)
+2. Facial expressions and head movements (smiling, nodding, looking around, etc.)
+3. The rhythm, speed, and energy of the motion (slow, fast, smooth, jerky, etc.)
+4. The overall mood and tone conveyed by the movement
+Do NOT describe:
+- What the person/subject looks like (clothing, hair, skin, age, gender)
+- The background, setting, or environment
+- Colors, lighting, or visual style
+- Any objects or props
+Write a concise, single-paragraph description focused purely on motion and expression.\
+"""
+def _load_vlm():
+    """Load the SmolVLM2 model and processor on first use and cache them.
+
+    The pair is stored in the module globals `_vlm_model` / `_vlm_processor`;
+    later calls skip loading and return the cached objects.
+
+    Returns:
+        A `(model, processor)` tuple. The model is loaded in bfloat16 and
+        moved to "cuda".
+
+    Raises:
+        Whatever the second (fallback) `from_pretrained(...).to("cuda")` call
+        raises; it is not caught here. In that case `_vlm_model` stays None,
+        so the next call attempts the load again.
+    """
+    global _vlm_model, _vlm_processor
+    # Only the model is checked; the processor is assigned in the same branch.
+    if _vlm_model is None:
+        # Imported here rather than at module top, so the import cost is only
+        # paid when the model is first needed.
+        from transformers import AutoProcessor, AutoModelForImageTextToText
+        print(f"[SmolVLM] Loading {SMOLVLM_MODEL_ID}...")
+        _vlm_processor = AutoProcessor.from_pretrained(SMOLVLM_MODEL_ID)
+        try:
+            # First attempt: request the flash-attention-2 implementation.
+            _vlm_model = AutoModelForImageTextToText.from_pretrained(
+                SMOLVLM_MODEL_ID,
+                torch_dtype=torch.bfloat16,
+                _attn_implementation="flash_attention_2",
+            ).to("cuda")
+        except Exception:
+            # Any failure above (presumably flash-attn being unavailable, but
+            # every Exception is caught) falls back to the default attention.
+            # NOTE(review): the original error is discarded without logging.
+            _vlm_model = AutoModelForImageTextToText.from_pretrained(
+                SMOLVLM_MODEL_ID,
+                torch_dtype=torch.bfloat16,
+            ).to("cuda")
+        print("[SmolVLM] Model loaded!")
+    return _vlm_model, _vlm_processor
+@spaces.GPU(duration=60)
+@torch.inference_mode()
+def describe_video_motion(video_path, auto_describe=True):
+    """Use SmolVLM2 to generate a motion-only description of a video.
+
+    Args:
+        video_path: Path of the reference video; converted with `str()` before
+            being placed in the chat message. `None` short-circuits the call.
+        auto_describe: When falsy, the model is not run at all.
+
+    Returns:
+        `gr.update(value=<description>)` when a non-empty description was
+        produced; otherwise a bare `gr.update()` (skipped, empty result, or
+        any exception), which presumably leaves the bound component unchanged.
+
+    Errors are never raised to the caller: every `Exception` is printed with a
+    `[SmolVLM]` prefix and turned into a bare `gr.update()`.
+    """
+    # Guard: nothing to describe, or the user switched the feature off.
+    if video_path is None or not auto_describe:
+        return gr.update()
+    try:
+        model, processor = _load_vlm()
+        # Single user turn: the video followed by the fixed MOTION_PROMPT text.
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "video", "path": str(video_path)},
+                    {"type": "text", "text": MOTION_PROMPT},
+                ],
+            },
+        ]
+        # Tokenize via the processor's chat template and move the result to
+        # the model's device, requesting bfloat16 to match the model dtype.
+        inputs = processor.apply_chat_template(
+            messages,
+            add_generation_prompt=True,
+            tokenize=True,
+            return_dict=True,
+            return_tensors="pt",
+        ).to(model.device, dtype=torch.bfloat16)
+        # Greedy decoding (do_sample=False), capped at 200 new tokens.
+        generated_ids = model.generate(**inputs, do_sample=False, max_new_tokens=200)
+        # Only the first decoded sequence is used.
+        generated_text = processor.batch_decode(
+            generated_ids, skip_special_tokens=True
+        )[0]
+        # Extract only the assistant's response (after the prompt)
+        # NOTE(review): presumably the decoded text still contains the prompt
+        # turn, hence the split on the "Assistant:" marker — confirm against
+        # the model's chat template.
+        if "Assistant:" in generated_text:
+            motion_desc = generated_text.split("Assistant:")[-1].strip()
+        else:
+            motion_desc = generated_text.strip()
+        # Clean up any leftover prompt fragments
+        # Everything from the first occurrence of a marker onward is dropped,
+        # so a marker at the very start leaves an empty description.
+        for marker in [MOTION_PROMPT[:40], "Watch this video", "Do NOT describe"]:
+            if marker in motion_desc:
+                motion_desc = motion_desc.split(marker)[0].strip()
+        if motion_desc:
+            # Log only the first 100 characters of the description.
+            print(f"[SmolVLM] Motion description: {motion_desc[:100]}...")
+            return gr.update(value=motion_desc)
+        else:
+            return gr.update()
+    except Exception as e:
+        # Best-effort feature: report the failure and fall back to a bare
+        # update instead of propagating the error.
+        print(f"[SmolVLM] Error: {e}")
+        return gr.update()
+# ─────────────────────────────────────────────────────────────────────────────
+# Gradio UI — LTX 2.3 Sync
 # ─────────────────────────────────────────────────────────────────────────────
 css = """
 .main-title { text-align: center; margin-bottom: 0.5em; }
     neutral_hue=gr.themes.colors.gray,
 )
+with gr.Blocks(title="LTX 2.3 Sync", css=css, theme=purple_citrus) as demo:
     gr.Markdown("""
 # LTX 2.3 Sync 🕺
 #### Character Animation with LTX 2.3
                     type="filepath",
                 )
             with gr.Row():
                 prompt = gr.Textbox(
+                    label="Prompt",
+                    info="tip: describe the motion, body posture, facial expressions of the ref video",
+                    lines=2,
+                    placeholder="the person talks to the camera, making hand gestures",
                 )
+                with gr.Column(min_width=160):
+                    duration = gr.Slider(
+                        label="Duration (s)", minimum=1.0, maximum=15.0, value=3.0, step=0.5,
+                    )
+                    auto_describe = gr.Checkbox(
+                        label="Auto-describe motion", value=True,
+                        info="Use AI to describe the video's motion as a prompt",
+                    )
             generate_btn = gr.Button(
                 "Generate", variant="primary", size="lg", elem_classes=["generate-btn"],
             )
             with gr.Accordion("Advanced Settings", open=False):
                 enhance_prompt = gr.Checkbox(label="Enhance Prompt", value=True)
                 conditioning_strength = gr.Slider(
                     label="V2V Conditioning Strength",
         fn=on_video_upload,
         inputs=[input_video, input_image, high_res],
         outputs=[width, height, duration],
+    ).then(
+        fn=describe_video_motion,
+        inputs=[input_video, auto_describe],
+        outputs=[prompt],
     )
     high_res.change(
         fn=on_highres_toggle,