linoyts HF Staff committed on
Commit
b4fa358
·
verified ·
1 Parent(s): bd479f8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +27 -27
app.py CHANGED
@@ -9,8 +9,9 @@ os.environ["TORCHDYNAMO_DISABLE"] = "1"
9
  # Install xformers for memory-efficient attention
10
  subprocess.run([sys.executable, "-m", "pip", "install", "xformers==0.0.32.post2", "--no-build-isolation"], check=False)
11
 
12
- # Install video preprocessing dependencies (pose/canny/depth extraction)
13
- subprocess.run([sys.executable, "-m", "pip", "install", "controlnet_aux", "imageio[ffmpeg]"], check=False)
 
14
 
15
  # Reinstall torchaudio to match the torch CUDA version on this space.
16
  # controlnet_aux or other deps can pull in a CPU-only torchaudio that conflicts
@@ -126,18 +127,18 @@ _depth_processor = None
126
  def _get_pose_processor():
127
  global _pose_processor
128
  if _pose_processor is None:
129
- from controlnet_aux import OpenposeDetector
130
- _pose_processor = OpenposeDetector.from_pretrained("lllyasviel/Annotators")
131
- print("[Preprocess] OpenPose processor loaded")
132
  return _pose_processor
133
 
134
 
135
  def _get_depth_processor():
 
136
  global _depth_processor
137
  if _depth_processor is None:
138
- from controlnet_aux import MidasDetector
139
- _depth_processor = MidasDetector.from_pretrained("lllyasviel/Annotators")
140
- print("[Preprocess] MiDaS depth processor loaded")
141
  return _depth_processor
142
 
143
 
@@ -170,12 +171,12 @@ def extract_first_frame(video_path: str) -> str:
170
 
171
 
172
  def preprocess_video_pose(frames: list[np.ndarray], width: int, height: int) -> list[np.ndarray]:
173
- """Extract OpenPose skeletons from each frame. Returns float [0,1] frames."""
174
  processor = _get_pose_processor()
175
  result = []
176
  for frame in frames:
177
  pil = Image.fromarray(frame.astype(np.uint8)).convert("RGB")
178
- pose_img = processor(pil, hand_and_face=False)
179
  if not isinstance(pose_img, Image.Image):
180
  pose_img = Image.fromarray(np.array(pose_img).astype(np.uint8))
181
  pose_img = pose_img.convert("RGB").resize((width, height), Image.BILINEAR)
@@ -199,19 +200,18 @@ def preprocess_video_canny(frames: list[np.ndarray], width: int, height: int,
199
 
200
 
201
  def preprocess_video_depth(frames: list[np.ndarray], width: int, height: int) -> list[np.ndarray]:
202
- """Extract MiDaS depth maps from each frame. Returns float [0,1] frames."""
203
- processor = _get_depth_processor()
204
- detect_res = max(frames[0].shape[0], frames[0].shape[1])
205
- image_res = max(width, height)
206
  result = []
207
  for frame in frames:
208
- depth = processor(frame, detect_resolution=detect_res,
209
- image_resolution=image_res, output_type="np")
210
- if depth.ndim == 2:
211
- depth = np.stack([depth, depth, depth], axis=-1)
212
- elif depth.shape[-1] == 1:
213
- depth = np.repeat(depth, 3, axis=-1)
214
- result.append(depth)
 
215
  return result
216
 
217
 
@@ -241,11 +241,11 @@ def preprocess_conditioning_video(
241
  Image.fromarray(frames[0]).save(first_png)
242
 
243
  # Process based on mode
244
- if mode == "Pose (OpenPose)":
245
  processed = preprocess_video_pose(frames, width, height)
246
  elif mode == "Canny Edge":
247
  processed = preprocess_video_canny(frames, width, height)
248
- elif mode == "Depth (MiDaS)":
249
  processed = preprocess_video_depth(frames, width, height)
250
  else:
251
  # "Raw" mode — no preprocessing
@@ -713,7 +713,7 @@ pipeline = LTX23UnifiedPipeline(
713
  distilled_checkpoint_path=checkpoint_path,
714
  spatial_upsampler_path=spatial_upsampler_path,
715
  gemma_root=gemma_root,
716
- # ic_loras=ic_loras,
717
  quantization=QuantizationPolicy.fp8_cast(),
718
  )
719
 
@@ -1013,12 +1013,12 @@ with gr.Blocks(title="LTX-2.3 Unified: V2V + I2V + A2V") as demo:
1013
  video_preprocess = gr.Dropdown(
1014
  label="Video Preprocessing",
1015
  choices=[
1016
- "Pose (OpenPose)",
1017
  "Canny Edge",
1018
- "Depth (MiDaS)",
1019
  "Raw (no preprocessing)",
1020
  ],
1021
- value="Pose (OpenPose)",
1022
  info="Strips appearance from video → style comes from image/prompt instead",
1023
  )
1024
  input_audio = gr.Audio(
 
9
  # Install xformers for memory-efficient attention
10
  subprocess.run([sys.executable, "-m", "pip", "install", "xformers==0.0.32.post2", "--no-build-isolation"], check=False)
11
 
12
+ # Install video preprocessing dependencies
13
+ subprocess.run([sys.executable, "-m", "pip", "install",
14
+ "dwpose", "onnxruntime-gpu", "imageio[ffmpeg]", "scikit-image"], check=False)
15
 
16
  # Reinstall torchaudio to match the torch CUDA version on this space.
17
  # controlnet_aux or other deps can pull in a CPU-only torchaudio that conflicts
 
127
  def _get_pose_processor():
128
  global _pose_processor
129
  if _pose_processor is None:
130
+ from dwpose import DwposeDetector
131
+ _pose_processor = DwposeDetector.from_pretrained_default()
132
+ print("[Preprocess] DWPose processor loaded")
133
  return _pose_processor
134
 
135
 
136
  def _get_depth_processor():
137
+ """Placeholder — uses simple Laplacian edge-based depth approximation via OpenCV."""
138
  global _depth_processor
139
  if _depth_processor is None:
140
+ _depth_processor = "cv2" # sentinel — we use cv2 directly
141
+ print("[Preprocess] CV2-based depth processor loaded")
 
142
  return _depth_processor
143
 
144
 
 
171
 
172
 
173
  def preprocess_video_pose(frames: list[np.ndarray], width: int, height: int) -> list[np.ndarray]:
174
+ """Extract DWPose skeletons from each frame. Returns float [0,1] frames."""
175
  processor = _get_pose_processor()
176
  result = []
177
  for frame in frames:
178
  pil = Image.fromarray(frame.astype(np.uint8)).convert("RGB")
179
+ pose_img = processor(pil, include_body=True, include_hand=True, include_face=True)
180
  if not isinstance(pose_img, Image.Image):
181
  pose_img = Image.fromarray(np.array(pose_img).astype(np.uint8))
182
  pose_img = pose_img.convert("RGB").resize((width, height), Image.BILINEAR)
 
200
 
201
 
202
  def preprocess_video_depth(frames: list[np.ndarray], width: int, height: int) -> list[np.ndarray]:
203
+ """Estimate depth-like maps from each frame using Laplacian gradient magnitude.
204
+ This is a fast approximation — for true depth, use MiDaS externally."""
 
 
205
  result = []
206
  for frame in frames:
207
+ resized = cv2.resize(frame, (width, height), interpolation=cv2.INTER_AREA)
208
+ gray = cv2.cvtColor(resized, cv2.COLOR_RGB2GRAY).astype(np.float32)
209
+ # Laplacian gives edge/gradient info that approximates depth discontinuities
210
+ lap = np.abs(cv2.Laplacian(gray, cv2.CV_32F, ksize=5))
211
+ # Normalize to [0, 1]
212
+ lap = lap / (lap.max() + 1e-8)
213
+ depth_3ch = np.stack([lap, lap, lap], axis=-1)
214
+ result.append(depth_3ch)
215
  return result
216
 
217
 
 
241
  Image.fromarray(frames[0]).save(first_png)
242
 
243
  # Process based on mode
244
+ if mode == "Pose (DWPose)":
245
  processed = preprocess_video_pose(frames, width, height)
246
  elif mode == "Canny Edge":
247
  processed = preprocess_video_canny(frames, width, height)
248
+ elif mode == "Depth (Laplacian)":
249
  processed = preprocess_video_depth(frames, width, height)
250
  else:
251
  # "Raw" mode — no preprocessing
 
713
  distilled_checkpoint_path=checkpoint_path,
714
  spatial_upsampler_path=spatial_upsampler_path,
715
  gemma_root=gemma_root,
716
+ ic_loras=ic_loras,
717
  quantization=QuantizationPolicy.fp8_cast(),
718
  )
719
 
 
1013
  video_preprocess = gr.Dropdown(
1014
  label="Video Preprocessing",
1015
  choices=[
1016
+ "Pose (DWPose)",
1017
  "Canny Edge",
1018
+ "Depth (Laplacian)",
1019
  "Raw (no preprocessing)",
1020
  ],
1021
+ value="Pose (DWPose)",
1022
  info="Strips appearance from video → style comes from image/prompt instead",
1023
  )
1024
  input_audio = gr.Audio(