Spaces:
Paused
Paused
Update app.py
Browse files
app.py
CHANGED
|
@@ -9,8 +9,9 @@ os.environ["TORCHDYNAMO_DISABLE"] = "1"
|
|
| 9 |
# Install xformers for memory-efficient attention
|
| 10 |
subprocess.run([sys.executable, "-m", "pip", "install", "xformers==0.0.32.post2", "--no-build-isolation"], check=False)
|
| 11 |
|
| 12 |
-
# Install video preprocessing dependencies
|
| 13 |
-
subprocess.run([sys.executable, "-m", "pip", "install",
|
|
|
|
| 14 |
|
| 15 |
# Reinstall torchaudio to match the torch CUDA version on this space.
|
| 16 |
# controlnet_aux or other deps can pull in a CPU-only torchaudio that conflicts
|
|
@@ -126,18 +127,18 @@ _depth_processor = None
|
|
| 126 |
def _get_pose_processor():
|
| 127 |
global _pose_processor
|
| 128 |
if _pose_processor is None:
|
| 129 |
-
from
|
| 130 |
-
_pose_processor =
|
| 131 |
-
print("[Preprocess]
|
| 132 |
return _pose_processor
|
| 133 |
|
| 134 |
|
| 135 |
def _get_depth_processor():
|
|
|
|
| 136 |
global _depth_processor
|
| 137 |
if _depth_processor is None:
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
print("[Preprocess] MiDaS depth processor loaded")
|
| 141 |
return _depth_processor
|
| 142 |
|
| 143 |
|
|
@@ -170,12 +171,12 @@ def extract_first_frame(video_path: str) -> str:
|
|
| 170 |
|
| 171 |
|
| 172 |
def preprocess_video_pose(frames: list[np.ndarray], width: int, height: int) -> list[np.ndarray]:
|
| 173 |
-
"""Extract
|
| 174 |
processor = _get_pose_processor()
|
| 175 |
result = []
|
| 176 |
for frame in frames:
|
| 177 |
pil = Image.fromarray(frame.astype(np.uint8)).convert("RGB")
|
| 178 |
-
pose_img = processor(pil,
|
| 179 |
if not isinstance(pose_img, Image.Image):
|
| 180 |
pose_img = Image.fromarray(np.array(pose_img).astype(np.uint8))
|
| 181 |
pose_img = pose_img.convert("RGB").resize((width, height), Image.BILINEAR)
|
|
@@ -199,19 +200,18 @@ def preprocess_video_canny(frames: list[np.ndarray], width: int, height: int,
|
|
| 199 |
|
| 200 |
|
| 201 |
def preprocess_video_depth(frames: list[np.ndarray], width: int, height: int) -> list[np.ndarray]:
|
| 202 |
-
"""
|
| 203 |
-
|
| 204 |
-
detect_res = max(frames[0].shape[0], frames[0].shape[1])
|
| 205 |
-
image_res = max(width, height)
|
| 206 |
result = []
|
| 207 |
for frame in frames:
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
|
|
|
| 215 |
return result
|
| 216 |
|
| 217 |
|
|
@@ -241,11 +241,11 @@ def preprocess_conditioning_video(
|
|
| 241 |
Image.fromarray(frames[0]).save(first_png)
|
| 242 |
|
| 243 |
# Process based on mode
|
| 244 |
-
if mode == "Pose (
|
| 245 |
processed = preprocess_video_pose(frames, width, height)
|
| 246 |
elif mode == "Canny Edge":
|
| 247 |
processed = preprocess_video_canny(frames, width, height)
|
| 248 |
-
elif mode == "Depth (
|
| 249 |
processed = preprocess_video_depth(frames, width, height)
|
| 250 |
else:
|
| 251 |
# "Raw" mode — no preprocessing
|
|
@@ -713,7 +713,7 @@ pipeline = LTX23UnifiedPipeline(
|
|
| 713 |
distilled_checkpoint_path=checkpoint_path,
|
| 714 |
spatial_upsampler_path=spatial_upsampler_path,
|
| 715 |
gemma_root=gemma_root,
|
| 716 |
-
|
| 717 |
quantization=QuantizationPolicy.fp8_cast(),
|
| 718 |
)
|
| 719 |
|
|
@@ -1013,12 +1013,12 @@ with gr.Blocks(title="LTX-2.3 Unified: V2V + I2V + A2V") as demo:
|
|
| 1013 |
video_preprocess = gr.Dropdown(
|
| 1014 |
label="Video Preprocessing",
|
| 1015 |
choices=[
|
| 1016 |
-
"Pose (
|
| 1017 |
"Canny Edge",
|
| 1018 |
-
"Depth (
|
| 1019 |
"Raw (no preprocessing)",
|
| 1020 |
],
|
| 1021 |
-
value="Pose (
|
| 1022 |
info="Strips appearance from video → style comes from image/prompt instead",
|
| 1023 |
)
|
| 1024 |
input_audio = gr.Audio(
|
|
|
|
# Install xformers for memory-efficient attention.
subprocess.run([sys.executable, "-m", "pip", "install", "xformers==0.0.32.post2", "--no-build-isolation"], check=False)

# Install video preprocessing dependencies (DWPose skeleton extraction plus
# video/frame I-O helpers). check=False: if an install fails we continue
# startup and let the corresponding feature fail lazily instead of crashing
# the whole Space.
subprocess.run([sys.executable, "-m", "pip", "install",
                "dwpose", "onnxruntime-gpu", "imageio[ffmpeg]", "scikit-image"], check=False)

# Reinstall torchaudio to match the torch CUDA version on this space.
# controlnet_aux or other deps can pull in a CPU-only torchaudio that conflicts
|
|
|
|
| 127 |
def _get_pose_processor():
    """Return the module-wide DWPose detector, creating it on first use.

    Lazy singleton: the import and model construction are deferred to the
    first call so that startup does not pay the load cost (and so the app
    can come up even before the optional ``dwpose`` install finishes).

    Returns:
        The cached ``DwposeDetector`` instance stored in ``_pose_processor``.
    """
    global _pose_processor
    if _pose_processor is None:
        # Local import: dwpose is installed at runtime via pip (best-effort),
        # so importing at module top level could fail before the install ran.
        from dwpose import DwposeDetector
        _pose_processor = DwposeDetector.from_pretrained_default()
        print("[Preprocess] DWPose processor loaded")
    return _pose_processor
|
| 134 |
|
| 135 |
|
| 136 |
def _get_depth_processor():
    """Return the (placeholder) depth backend marker, initializing it once.

    Placeholder — depth maps are approximated with an OpenCV Laplacian pass
    (see ``preprocess_video_depth``), so there is no model to load; the
    cached value is just the sentinel string ``"cv2"``.

    Returns:
        The sentinel stored in the module global ``_depth_processor``.
    """
    global _depth_processor
    if _depth_processor is None:
        _depth_processor = "cv2"  # sentinel — we use cv2 directly
        print("[Preprocess] CV2-based depth processor loaded")
    return _depth_processor
|
| 143 |
|
| 144 |
|
|
|
|
| 171 |
|
| 172 |
|
| 173 |
def preprocess_video_pose(frames: list[np.ndarray], width: int, height: int) -> list[np.ndarray]:
|
| 174 |
+
"""Extract DWPose skeletons from each frame. Returns float [0,1] frames."""
|
| 175 |
processor = _get_pose_processor()
|
| 176 |
result = []
|
| 177 |
for frame in frames:
|
| 178 |
pil = Image.fromarray(frame.astype(np.uint8)).convert("RGB")
|
| 179 |
+
pose_img = processor(pil, include_body=True, include_hand=True, include_face=True)
|
| 180 |
if not isinstance(pose_img, Image.Image):
|
| 181 |
pose_img = Image.fromarray(np.array(pose_img).astype(np.uint8))
|
| 182 |
pose_img = pose_img.convert("RGB").resize((width, height), Image.BILINEAR)
|
|
|
|
| 200 |
|
| 201 |
|
| 202 |
def preprocess_video_depth(frames: list[np.ndarray], width: int, height: int) -> list[np.ndarray]:
    """Estimate depth-like maps from each frame using Laplacian gradient magnitude.

    This is a fast approximation — for true depth, use MiDaS externally.

    Args:
        frames: video frames as (H, W, 3) arrays; assumed RGB uint8 as
            produced by the caller — TODO confirm against
            ``preprocess_conditioning_video``.
        width: output map width in pixels.
        height: output map height in pixels.

    Returns:
        One float32 ``(height, width, 3)`` array per input frame with values
        normalized to [0, 1]. Empty input yields an empty list.
    """
    result = []
    for frame in frames:
        resized = cv2.resize(frame, (width, height), interpolation=cv2.INTER_AREA)
        gray = cv2.cvtColor(resized, cv2.COLOR_RGB2GRAY).astype(np.float32)
        # Laplacian gives edge/gradient info that approximates depth discontinuities.
        lap = np.abs(cv2.Laplacian(gray, cv2.CV_32F, ksize=5))
        # Normalize to [0, 1]; the epsilon guards a perfectly flat frame (max == 0).
        lap = lap / (lap.max() + 1e-8)
        # Replicate the single channel to RGB so downstream code sees 3 channels.
        result.append(np.stack([lap, lap, lap], axis=-1))
    return result
|
| 216 |
|
| 217 |
|
|
|
|
| 241 |
Image.fromarray(frames[0]).save(first_png)
|
| 242 |
|
| 243 |
# Process based on mode
|
| 244 |
+
if mode == "Pose (DWPose)":
|
| 245 |
processed = preprocess_video_pose(frames, width, height)
|
| 246 |
elif mode == "Canny Edge":
|
| 247 |
processed = preprocess_video_canny(frames, width, height)
|
| 248 |
+
elif mode == "Depth (Laplacian)":
|
| 249 |
processed = preprocess_video_depth(frames, width, height)
|
| 250 |
else:
|
| 251 |
# "Raw" mode — no preprocessing
|
|
|
|
| 713 |
distilled_checkpoint_path=checkpoint_path,
|
| 714 |
spatial_upsampler_path=spatial_upsampler_path,
|
| 715 |
gemma_root=gemma_root,
|
| 716 |
+
ic_loras=ic_loras,
|
| 717 |
quantization=QuantizationPolicy.fp8_cast(),
|
| 718 |
)
|
| 719 |
|
|
|
|
| 1013 |
video_preprocess = gr.Dropdown(
|
| 1014 |
label="Video Preprocessing",
|
| 1015 |
choices=[
|
| 1016 |
+
"Pose (DWPose)",
|
| 1017 |
"Canny Edge",
|
| 1018 |
+
"Depth (Laplacian)",
|
| 1019 |
"Raw (no preprocessing)",
|
| 1020 |
],
|
| 1021 |
+
value="Pose (DWPose)",
|
| 1022 |
info="Strips appearance from video → style comes from image/prompt instead",
|
| 1023 |
)
|
| 1024 |
input_audio = gr.Audio(
|