Spaces:
No application file
No application file
Update app.py (#5)
Browse files- Update app.py (526dd8d82abe509955445dc2923c692b062515b0)
app.py
CHANGED
|
@@ -11,7 +11,8 @@ subprocess.run([sys.executable, "-m", "pip", "install", "xformers==0.0.32.post2"
|
|
| 11 |
|
| 12 |
# Install video preprocessing dependencies
|
| 13 |
subprocess.run([sys.executable, "-m", "pip", "install",
|
| 14 |
-
"dwpose", "onnxruntime-gpu", "imageio[ffmpeg]", "scikit-image",
|
|
|
|
| 15 |
|
| 16 |
# Reinstall torchaudio to match the torch CUDA version on this space.
|
| 17 |
# controlnet_aux or other deps can pull in a CPU-only torchaudio that conflicts
|
|
@@ -1023,7 +1024,107 @@ def generate_video(
|
|
| 1023 |
|
| 1024 |
|
| 1025 |
# ─────────────────────────────────────────────────────────────────────────────
|
| 1026 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1027 |
# ─────────────────────────────────────────────────────────────────────────────
|
| 1028 |
css = """
|
| 1029 |
.main-title { text-align: center; margin-bottom: 0.5em; }
|
|
@@ -1038,7 +1139,7 @@ purple_citrus = gr.themes.Citrus(
|
|
| 1038 |
neutral_hue=gr.themes.colors.gray,
|
| 1039 |
)
|
| 1040 |
|
| 1041 |
-
with gr.Blocks(title="LTX 2.3
|
| 1042 |
gr.Markdown("""
|
| 1043 |
# LTX 2.3 Sync πΊ
|
| 1044 |
#### Character Animation with LTX 2.3
|
|
@@ -1061,27 +1162,27 @@ using [Lightricks/LTX-2.3-22b-IC-LoRA-Union-Control](https://huggingface.co/Ligh
|
|
| 1061 |
type="filepath",
|
| 1062 |
)
|
| 1063 |
|
| 1064 |
-
|
| 1065 |
-
|
| 1066 |
with gr.Row():
|
| 1067 |
prompt = gr.Textbox(
|
| 1068 |
-
|
| 1069 |
-
|
| 1070 |
-
|
| 1071 |
-
|
| 1072 |
-
|
| 1073 |
-
)
|
| 1074 |
-
duration = gr.Slider(
|
| 1075 |
-
label="Duration (s)", minimum=1.0, maximum=15.0, value=3.0, step=0.5,
|
| 1076 |
)
|
| 1077 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1078 |
|
| 1079 |
generate_btn = gr.Button(
|
| 1080 |
"Generate", variant="primary", size="lg", elem_classes=["generate-btn"],
|
| 1081 |
)
|
| 1082 |
|
| 1083 |
with gr.Accordion("Advanced Settings", open=False):
|
| 1084 |
-
|
| 1085 |
enhance_prompt = gr.Checkbox(label="Enhance Prompt", value=True)
|
| 1086 |
conditioning_strength = gr.Slider(
|
| 1087 |
label="V2V Conditioning Strength",
|
|
@@ -1119,6 +1220,10 @@ using [Lightricks/LTX-2.3-22b-IC-LoRA-Union-Control](https://huggingface.co/Ligh
|
|
| 1119 |
fn=on_video_upload,
|
| 1120 |
inputs=[input_video, input_image, high_res],
|
| 1121 |
outputs=[width, height, duration],
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1122 |
)
|
| 1123 |
high_res.change(
|
| 1124 |
fn=on_highres_toggle,
|
|
|
|
| 11 |
|
| 12 |
# Install video preprocessing dependencies
|
| 13 |
subprocess.run([sys.executable, "-m", "pip", "install",
|
| 14 |
+
"dwpose", "onnxruntime-gpu", "imageio[ffmpeg]", "scikit-image",
|
| 15 |
+
"opencv-python-headless", "decord", "num2words"], check=False)
|
| 16 |
|
| 17 |
# Reinstall torchaudio to match the torch CUDA version on this space.
|
| 18 |
# controlnet_aux or other deps can pull in a CPU-only torchaudio that conflicts
|
|
|
|
| 1024 |
|
| 1025 |
|
| 1026 |
# ─────────────────────────────────────────────────────────────────────────────
|
| 1027 |
+
# SmolVLM2 — Auto-describe motion from reference video
|
| 1028 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 1029 |
+
# Hugging Face Hub id of the vision-language model used to caption motion.
SMOLVLM_MODEL_ID = "HuggingFaceTB/SmolVLM2-500M-Video-Instruct"

# Lazily-populated singletons. Both stay None until _load_vlm() has loaded the
# processor AND the model successfully; they are then published together.
_vlm_model = None
_vlm_processor = None

# Instruction sent to the model together with the reference video. The
# trailing backslashes keep the literal free of a leading/trailing newline.
MOTION_PROMPT = """\
Watch this video carefully. Describe ONLY the following:
1. The body movements and gestures (walking, dancing, waving, turning, etc.)
2. Facial expressions and head movements (smiling, nodding, looking around, etc.)
3. The rhythm, speed, and energy of the motion (slow, fast, smooth, jerky, etc.)
4. The overall mood and tone conveyed by the movement

Do NOT describe:
- What the person/subject looks like (clothing, hair, skin, age, gender)
- The background, setting, or environment
- Colors, lighting, or visual style
- Any objects or props

Write a concise, single-paragraph description focused purely on motion and expression.\
"""


def _load_vlm():
    """Return the cached ``(model, processor)`` pair, loading it on first use.

    The model is loaded in bfloat16 and moved to ``"cuda"``. Loading is first
    attempted with ``flash_attention_2``; if that raises for any reason, the
    reason is logged and the load is retried with the library's default
    attention implementation.

    Returns:
        A ``(model, processor)`` tuple. Subsequent calls return the same
        objects without touching the Hub again.

    Raises:
        Whatever ``from_pretrained`` / ``.to("cuda")`` raises when the
        fallback load also fails. In that case the module-level cache is left
        untouched (both globals remain ``None``) so a later call retries.
    """
    global _vlm_model, _vlm_processor
    if _vlm_model is not None:
        return _vlm_model, _vlm_processor

    # Imported lazily so the module can be imported without paying for
    # transformers' import cost until a description is actually requested.
    from transformers import AutoProcessor, AutoModelForImageTextToText

    print(f"[SmolVLM] Loading {SMOLVLM_MODEL_ID}...")
    processor = AutoProcessor.from_pretrained(SMOLVLM_MODEL_ID)
    try:
        model = AutoModelForImageTextToText.from_pretrained(
            SMOLVLM_MODEL_ID,
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2",
        ).to("cuda")
    except Exception as exc:
        # Best-effort: flash-attn may be missing or unsupported on this GPU.
        # Say why we are falling back instead of hiding the cause.
        print(
            f"[SmolVLM] flash_attention_2 unavailable ({exc}); "
            "falling back to default attention"
        )
        model = AutoModelForImageTextToText.from_pretrained(
            SMOLVLM_MODEL_ID,
            torch_dtype=torch.bfloat16,
        ).to("cuda")

    # Publish both halves only once both loaded, so the cache is never
    # observed half-initialised.
    _vlm_model, _vlm_processor = model, processor
    print("[SmolVLM] Model loaded!")
    return _vlm_model, _vlm_processor
|
| 1069 |
+
|
| 1070 |
+
|
| 1071 |
+
def _clean_motion_description(text):
    """Strip chat-template and prompt echoes from decoded model output.

    Args:
        text: Decoded completion text.

    Returns:
        The text after the last ``"Assistant:"`` tag (or the whole text when
        no tag is present), truncated at the first echoed prompt fragment and
        stripped of surrounding whitespace. May be an empty string.
    """
    # rpartition yields the full string in slot 2 when the tag is absent.
    description = text.rpartition("Assistant:")[2].strip()
    # Anything from an echoed instruction onwards is not part of the answer.
    # "Watch this video" also covers the opening of MOTION_PROMPT.
    for marker in ("Watch this video", "Do NOT describe"):
        description = description.split(marker)[0].strip()
    return description


@spaces.GPU(duration=60)
@torch.inference_mode()
def _run_motion_vlm(video_path):
    """Run SmolVLM2 on ``video_path`` and return the decoded completion text.

    Only this helper carries the GPU decorator, so the caller's cheap
    "nothing to do" guard runs before this decorated function is invoked.
    """
    model, processor = _load_vlm()

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "video", "path": str(video_path)},
                {"type": "text", "text": MOTION_PROMPT},
            ],
        },
    ]

    inputs = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    ).to(model.device, dtype=torch.bfloat16)

    generated_ids = model.generate(**inputs, do_sample=False, max_new_tokens=200)

    # generate() is expected to return prompt + completion; decode only the
    # newly generated tail so the prompt is not echoed into the description.
    prompt_len = inputs["input_ids"].shape[1]
    completion_ids = generated_ids[:, prompt_len:]
    return processor.batch_decode(completion_ids, skip_special_tokens=True)[0]


def describe_video_motion(video_path, auto_describe=True):
    """Use SmolVLM2 to generate a motion-only description of a video.

    Args:
        video_path: Path of the reference video, or ``None`` when no video is
            set.
        auto_describe: When false, the function does nothing.

    Returns:
        ``gr.update(value=<description>)`` when a non-empty description was
        produced; otherwise a bare ``gr.update()``. A bare update is
        returned when there is no video, when auto-describe is off, when the
        model produced nothing usable, and when any error occurred (the error
        is logged, never raised).
    """
    if video_path is None or not auto_describe:
        return gr.update()

    try:
        motion_desc = _clean_motion_description(_run_motion_vlm(video_path))
    except Exception as e:
        # Deliberately best-effort: a failed caption must not break the
        # upload flow.
        print(f"[SmolVLM] Error: {e}")
        return gr.update()

    if not motion_desc:
        return gr.update()

    print(f"[SmolVLM] Motion description: {motion_desc[:100]}...")
    return gr.update(value=motion_desc)
|
| 1124 |
+
|
| 1125 |
+
|
| 1126 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 1127 |
+
# Gradio UI — LTX 2.3 Sync
|
| 1128 |
# ─────────────────────────────────────────────────────────────────────────────
|
| 1129 |
css = """
|
| 1130 |
.main-title { text-align: center; margin-bottom: 0.5em; }
|
|
|
|
| 1139 |
neutral_hue=gr.themes.colors.gray,
|
| 1140 |
)
|
| 1141 |
|
| 1142 |
+
with gr.Blocks(title="LTX 2.3 Sync", css=css, theme=purple_citrus) as demo:
|
| 1143 |
gr.Markdown("""
|
| 1144 |
# LTX 2.3 Sync πΊ
|
| 1145 |
#### Character Animation with LTX 2.3
|
|
|
|
| 1162 |
type="filepath",
|
| 1163 |
)
|
| 1164 |
|
|
|
|
|
|
|
| 1165 |
with gr.Row():
|
| 1166 |
prompt = gr.Textbox(
|
| 1167 |
+
label="Prompt",
|
| 1168 |
+
info="tip: describe the motion, body posture, facial expressions of the ref video",
|
| 1169 |
+
lines=2,
|
| 1170 |
+
placeholder="the person talks to the camera, making hand gestures",
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1171 |
)
|
| 1172 |
+
with gr.Column(min_width=160):
|
| 1173 |
+
duration = gr.Slider(
|
| 1174 |
+
label="Duration (s)", minimum=1.0, maximum=15.0, value=3.0, step=0.5,
|
| 1175 |
+
)
|
| 1176 |
+
auto_describe = gr.Checkbox(
|
| 1177 |
+
label="Auto-describe motion", value=True,
|
| 1178 |
+
info="Use AI to describe the video's motion as a prompt",
|
| 1179 |
+
)
|
| 1180 |
|
| 1181 |
generate_btn = gr.Button(
|
| 1182 |
"Generate", variant="primary", size="lg", elem_classes=["generate-btn"],
|
| 1183 |
)
|
| 1184 |
|
| 1185 |
with gr.Accordion("Advanced Settings", open=False):
|
|
|
|
| 1186 |
enhance_prompt = gr.Checkbox(label="Enhance Prompt", value=True)
|
| 1187 |
conditioning_strength = gr.Slider(
|
| 1188 |
label="V2V Conditioning Strength",
|
|
|
|
| 1220 |
fn=on_video_upload,
|
| 1221 |
inputs=[input_video, input_image, high_res],
|
| 1222 |
outputs=[width, height, duration],
|
| 1223 |
+
).then(
|
| 1224 |
+
fn=describe_video_motion,
|
| 1225 |
+
inputs=[input_video, auto_describe],
|
| 1226 |
+
outputs=[prompt],
|
| 1227 |
)
|
| 1228 |
high_res.change(
|
| 1229 |
fn=on_highres_toggle,
|