linoyts HF Staff committed on
Commit
227286a
·
1 Parent(s): cc68406

Update app.py (#5)

Browse files

- Update app.py (526dd8d82abe509955445dc2923c692b062515b0)

Files changed (1) hide show
  1. app.py +120 -15
app.py CHANGED
@@ -11,7 +11,8 @@ subprocess.run([sys.executable, "-m", "pip", "install", "xformers==0.0.32.post2"
11
 
12
  # Install video preprocessing dependencies
13
  subprocess.run([sys.executable, "-m", "pip", "install",
14
- "dwpose", "onnxruntime-gpu", "imageio[ffmpeg]", "scikit-image", "opencv-python-headless"], check=False)
 
15
 
16
  # Reinstall torchaudio to match the torch CUDA version on this space.
17
  # controlnet_aux or other deps can pull in a CPU-only torchaudio that conflicts
@@ -1023,7 +1024,107 @@ def generate_video(
1023
 
1024
 
1025
  # ─────────────────────────────────────────────────────────────────────────────
1026
- # Gradio UI — LTX 2.3 Move
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1027
  # ─────────────────────────────────────────────────────────────────────────────
1028
  css = """
1029
  .main-title { text-align: center; margin-bottom: 0.5em; }
@@ -1038,7 +1139,7 @@ purple_citrus = gr.themes.Citrus(
1038
  neutral_hue=gr.themes.colors.gray,
1039
  )
1040
 
1041
- with gr.Blocks(title="LTX 2.3 Move", css=css, theme=purple_citrus) as demo:
1042
  gr.Markdown("""
1043
  # LTX 2.3 Sync 🕺
1044
  #### Character Animation with LTX 2.3
@@ -1061,27 +1162,27 @@ using [Lightricks/LTX-2.3-22b-IC-LoRA-Union-Control](https://huggingface.co/Ligh
1061
  type="filepath",
1062
  )
1063
 
1064
-
1065
-
1066
  with gr.Row():
1067
  prompt = gr.Textbox(
1068
- label="Prompt",
1069
- info="tip: describe the motion, body posture, facial expressions of the ref video",
1070
- lines=2,
1071
- placeholder="the person talks to the camera, making hand gestures",
1072
-
1073
- )
1074
- duration = gr.Slider(
1075
- label="Duration (s)", minimum=1.0, maximum=15.0, value=3.0, step=0.5,
1076
  )
1077
-
 
 
 
 
 
 
 
1078
 
1079
  generate_btn = gr.Button(
1080
  "Generate", variant="primary", size="lg", elem_classes=["generate-btn"],
1081
  )
1082
 
1083
  with gr.Accordion("Advanced Settings", open=False):
1084
-
1085
  enhance_prompt = gr.Checkbox(label="Enhance Prompt", value=True)
1086
  conditioning_strength = gr.Slider(
1087
  label="V2V Conditioning Strength",
@@ -1119,6 +1220,10 @@ using [Lightricks/LTX-2.3-22b-IC-LoRA-Union-Control](https://huggingface.co/Ligh
1119
  fn=on_video_upload,
1120
  inputs=[input_video, input_image, high_res],
1121
  outputs=[width, height, duration],
 
 
 
 
1122
  )
1123
  high_res.change(
1124
  fn=on_highres_toggle,
 
11
 
12
  # Install video preprocessing dependencies
13
  subprocess.run([sys.executable, "-m", "pip", "install",
14
+ "dwpose", "onnxruntime-gpu", "imageio[ffmpeg]", "scikit-image",
15
+ "opencv-python-headless", "decord", "num2words"], check=False)
16
 
17
  # Reinstall torchaudio to match the torch CUDA version on this space.
18
  # controlnet_aux or other deps can pull in a CPU-only torchaudio that conflicts
 
1024
 
1025
 
1026
  # ─────────────────────────────────────────────────────────────────────────────
1027
+ # SmolVLM2 — Auto-describe motion from reference video
1028
+ # ─────────────────────────────────────────────────────────────────────────────
1029
+ SMOLVLM_MODEL_ID = "HuggingFaceTB/SmolVLM2-500M-Video-Instruct"
1030
+ _vlm_model = None
1031
+ _vlm_processor = None
1032
+
1033
+ MOTION_PROMPT = """\
1034
+ Watch this video carefully. Describe ONLY the following:
1035
+ 1. The body movements and gestures (walking, dancing, waving, turning, etc.)
1036
+ 2. Facial expressions and head movements (smiling, nodding, looking around, etc.)
1037
+ 3. The rhythm, speed, and energy of the motion (slow, fast, smooth, jerky, etc.)
1038
+ 4. The overall mood and tone conveyed by the movement
1039
+
1040
+ Do NOT describe:
1041
+ - What the person/subject looks like (clothing, hair, skin, age, gender)
1042
+ - The background, setting, or environment
1043
+ - Colors, lighting, or visual style
1044
+ - Any objects or props
1045
+
1046
+ Write a concise, single-paragraph description focused purely on motion and expression.\
1047
+ """
1048
+
1049
+
1050
def _load_vlm():
    """Load the SmolVLM2 model and processor once and return the cached pair.

    The loaded objects are stored in the module-level ``_vlm_model`` and
    ``_vlm_processor`` globals; later calls return those cached objects
    without reloading.

    Returns:
        tuple: ``(model, processor)``.
    """
    global _vlm_model, _vlm_processor
    if _vlm_model is None:
        # Function-scope import: transformers is only imported on first use.
        from transformers import AutoProcessor, AutoModelForImageTextToText

        print(f"[SmolVLM] Loading {SMOLVLM_MODEL_ID}...")
        processor = AutoProcessor.from_pretrained(SMOLVLM_MODEL_ID)
        try:
            # Preferred path: request the flash-attention implementation.
            model = AutoModelForImageTextToText.from_pretrained(
                SMOLVLM_MODEL_ID,
                torch_dtype=torch.bfloat16,
                _attn_implementation="flash_attention_2",
            ).to("cuda")
        except Exception as exc:
            # Deliberate best-effort fallback, but say why it happened instead
            # of swallowing the error silently.
            print(
                f"[SmolVLM] flash_attention_2 load failed ({exc}); "
                "retrying with the default attention implementation."
            )
            model = AutoModelForImageTextToText.from_pretrained(
                SMOLVLM_MODEL_ID,
                torch_dtype=torch.bfloat16,
            ).to("cuda")

        # Publish both objects together, only after both loads succeeded, so a
        # failed load never leaves the cache half-populated.
        _vlm_model, _vlm_processor = model, processor
        print("[SmolVLM] Model loaded!")
    return _vlm_model, _vlm_processor
1069
+
1070
+
1071
def _clean_motion_description(generated_text):
    """Strip chat-template and prompt remnants from decoded model output."""
    # Keep only what follows the last "Assistant:" tag, if one is present.
    if "Assistant:" in generated_text:
        motion_desc = generated_text.split("Assistant:")[-1].strip()
    else:
        motion_desc = generated_text.strip()

    # Cut the text at the first echoed fragment of the instruction prompt.
    for marker in [MOTION_PROMPT[:40], "Watch this video", "Do NOT describe"]:
        if marker in motion_desc:
            motion_desc = motion_desc.split(marker)[0].strip()
    return motion_desc


@spaces.GPU(duration=60)
@torch.inference_mode()
def describe_video_motion(video_path, auto_describe=True):
    """Use SmolVLM2 to generate a motion-only description of a video.

    Args:
        video_path: Path of the reference video, or ``None`` when no video
            is set.
        auto_describe: When falsy, the function does nothing.

    Returns:
        ``gr.update(value=<description>)`` when a non-empty description was
        produced; otherwise a no-op ``gr.update()`` (no video, feature
        disabled, empty output, or any error during inference).
    """
    if video_path is None or not auto_describe:
        return gr.update()

    try:
        model, processor = _load_vlm()

        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "video", "path": str(video_path)},
                    {"type": "text", "text": MOTION_PROMPT},
                ],
            },
        ]

        inputs = processor.apply_chat_template(
            messages,
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
            return_tensors="pt",
        ).to(model.device, dtype=torch.bfloat16)

        generated_ids = model.generate(**inputs, do_sample=False, max_new_tokens=200)

        # The generated sequence begins with the prompt tokens. Decode only
        # the newly generated tail rather than decoding everything and
        # recovering the answer by string matching.
        prompt_len = inputs["input_ids"].shape[1]
        generated_text = processor.batch_decode(
            generated_ids[:, prompt_len:], skip_special_tokens=True
        )[0]

        # Safety net: still remove any role tag or echoed prompt text.
        motion_desc = _clean_motion_description(generated_text)

        if motion_desc:
            print(f"[SmolVLM] Motion description: {motion_desc[:100]}...")
            return gr.update(value=motion_desc)
        return gr.update()

    except Exception as e:
        # Best-effort feature: log the failure and leave the prompt untouched.
        print(f"[SmolVLM] Error: {e}")
        return gr.update()
1124
+
1125
+
1126
+ # ─────────────────────────────────────────────────────────────────────────────
1127
+ # Gradio UI — LTX 2.3 Sync
1128
  # ─────────────────────────────────────────────────────────────────────────────
1129
  css = """
1130
  .main-title { text-align: center; margin-bottom: 0.5em; }
 
1139
  neutral_hue=gr.themes.colors.gray,
1140
  )
1141
 
1142
+ with gr.Blocks(title="LTX 2.3 Sync", css=css, theme=purple_citrus) as demo:
1143
  gr.Markdown("""
1144
  # LTX 2.3 Sync 🕺
1145
  #### Character Animation with LTX 2.3
 
1162
  type="filepath",
1163
  )
1164
 
 
 
1165
  with gr.Row():
1166
  prompt = gr.Textbox(
1167
+ label="Prompt",
1168
+ info="tip: describe the motion, body posture, facial expressions of the ref video",
1169
+ lines=2,
1170
+ placeholder="the person talks to the camera, making hand gestures",
 
 
 
 
1171
  )
1172
+ with gr.Column(min_width=160):
1173
+ duration = gr.Slider(
1174
+ label="Duration (s)", minimum=1.0, maximum=15.0, value=3.0, step=0.5,
1175
+ )
1176
+ auto_describe = gr.Checkbox(
1177
+ label="Auto-describe motion", value=True,
1178
+ info="Use AI to describe the video's motion as a prompt",
1179
+ )
1180
 
1181
  generate_btn = gr.Button(
1182
  "Generate", variant="primary", size="lg", elem_classes=["generate-btn"],
1183
  )
1184
 
1185
  with gr.Accordion("Advanced Settings", open=False):
 
1186
  enhance_prompt = gr.Checkbox(label="Enhance Prompt", value=True)
1187
  conditioning_strength = gr.Slider(
1188
  label="V2V Conditioning Strength",
 
1220
  fn=on_video_upload,
1221
  inputs=[input_video, input_image, high_res],
1222
  outputs=[width, height, duration],
1223
+ ).then(
1224
+ fn=describe_video_motion,
1225
+ inputs=[input_video, auto_describe],
1226
+ outputs=[prompt],
1227
  )
1228
  high_res.change(
1229
  fn=on_highres_toggle,