Mateo committed on
Commit
065d662
·
1 Parent(s): 53889b2

optimize time

Browse files
Files changed (2) hide show
  1. app.py +44 -24
  2. vision.py +67 -24
app.py CHANGED
@@ -20,7 +20,7 @@ LOGGER = logging.getLogger(__name__)
20
 
21
 
22
  DEFAULT_SPLIT_CFG = {
23
- "n_samples": 30,
24
  "max_w": 400,
25
  "crop_y": (0.25, 0.90),
26
  "dx_threshold_px": 1.5,
@@ -34,6 +34,7 @@ DEFAULT_SPLIT_CFG = {
34
  "jump_meanabs_threshold": 18.0,
35
  "progress_every": 0,
36
  }
 
37
 
38
 
39
  def _log_timing_summary(label, stats, wall_time=None, max_items=12):
@@ -285,23 +286,27 @@ def timer(name, stats):
285
  stats[name] = stats.get(name, 0.0) + (time.perf_counter() - t0)
286
 
287
 
288
- def _iter_sampled_frames(video_path, n_samples):
289
  timing = {}
290
  wall_t0 = time.perf_counter()
291
- with timer("extract_bgr_with_ffmpeg", timing):
292
- frames = _extract_bgr_with_ffmpeg(video_path, int(n_samples))
 
 
 
 
293
  timing["wall"] = time.perf_counter() - wall_t0
294
  _log_timing_summary("Iter sampled frames", timing, wall_time=timing["wall"])
295
  for out_idx, frame in enumerate(frames):
296
  yield out_idx, frame
297
 
298
 
299
- def iter_frames(video_path, n_samples, max_w, crop_y):
300
  timing = {"resize": 0.0, "crop": 0.0}
301
  wall_t0 = time.perf_counter()
302
  frame_count = 0
303
  try:
304
- for out_idx, frame in _iter_sampled_frames(video_path, n_samples):
305
  frame_count += 1
306
  proc = frame
307
  if max_w > 0 and proc.shape[1] != max_w:
@@ -402,7 +407,7 @@ def estimate_dx_orb_affine(prev_gray, gray, orb, bf, min_matches, keep_ratio, ti
402
 
403
  def split_video_into_stable_segments_fast(
404
  video_path,
405
- n_samples=30,
406
  max_w=400,
407
  crop_y=(0.25, 0.90),
408
  dx_threshold_px=1.5,
@@ -415,6 +420,7 @@ def split_video_into_stable_segments_fast(
415
  keep_ratio=0.4,
416
  jump_meanabs_threshold=18.0,
417
  progress_every=200,
 
418
  ):
419
  wall_t0 = time.perf_counter()
420
  timing_total = {}
@@ -429,7 +435,13 @@ def split_video_into_stable_segments_fast(
429
  frame_count = 0
430
 
431
  with timer("loop_total", timing_total):
432
- for _, frame in iter_frames(video_path, n_samples=n_samples, max_w=max_w, crop_y=crop_y):
 
 
 
 
 
 
433
  frame_count += 1
434
 
435
  with timer("to_gray", timing_total):
@@ -559,7 +571,7 @@ def _bgr_to_pil(frame):
559
  return Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
560
 
561
 
562
- def extract_segment_frames(video_path, segments, n_samples):
563
  timing = {}
564
  wall_t0 = time.perf_counter()
565
 
@@ -585,7 +597,7 @@ def extract_segment_frames(video_path, segments, n_samples):
585
  # Detection runs on original sampled frames (no resize / no crop).
586
  to_pil_time = 0.0
587
  with timer("assign_frames_to_segments", timing):
588
- for frame_idx, frame in _iter_sampled_frames(video_path, n_samples=n_samples):
589
  while segment_idx < len(normalized_segments) and frame_idx > normalized_segments[segment_idx][1]:
590
  segment_idx += 1
591
 
@@ -623,7 +635,7 @@ def extract_segment_frames(video_path, segments, n_samples):
623
  return [frames for frames in grouped_frames if frames]
624
 
625
 
626
- def split_video_stable(video_path, split_cfg=None, fallback_n=30):
627
  if not video_path or not os.path.exists(video_path):
628
  return []
629
 
@@ -635,13 +647,17 @@ def split_video_stable(video_path, split_cfg=None, fallback_n=30):
635
 
636
  LOGGER.info("Split config | %s", cfg)
637
 
 
 
 
638
  with timer("split_video_into_stable_segments_fast", timing):
639
- segments, _, _, _ = split_video_into_stable_segments_fast(video_path, **cfg)
640
  with timer("extract_segment_frames", timing):
641
  frame_groups = extract_segment_frames(
642
  video_path,
643
  segments,
644
  n_samples=cfg["n_samples"],
 
645
  )
646
 
647
  if frame_groups:
@@ -655,8 +671,12 @@ def split_video_stable(video_path, split_cfg=None, fallback_n=30):
655
  return frame_groups
656
 
657
  LOGGER.info("Split result | no stable segment, using fallback sampling n=%d", fallback_n)
658
- with timer("fallback_split_video", timing):
659
- fallback_frames = split_video(video_path, n=fallback_n)
 
 
 
 
660
  LOGGER.info("Fallback frame count | %d", len(fallback_frames))
661
  timing["wall"] = time.perf_counter() - wall_t0
662
  _log_timing_summary("split_video_stable", timing, wall_time=timing["wall"])
@@ -743,6 +763,7 @@ def infer(video_file):
743
  with timer("resolve_video_path", timing):
744
  video_path = _resolve_video_path(video_file)
745
  LOGGER.info("Inference start | video=%s", video_path)
 
746
  with timer("split_video_stable", timing):
747
  split_frames = split_video_stable(video_path)
748
  if not split_frames:
@@ -760,19 +781,18 @@ def infer(video_file):
760
  for split_idx, frames in enumerate(split_frames):
761
  split_t0 = time.perf_counter()
762
  LOGGER.info("Inference split %d | frames=%d", split_idx + 1, len(frames))
763
- frame_preds = []
764
- split_model = 0.0
 
 
 
 
 
 
 
765
  split_iou = 0.0
766
  split_draw = 0.0
767
 
768
- for frame in frames:
769
- t_model = time.perf_counter()
770
- bbox = np.asarray(model(frame), dtype=np.float64).reshape(-1, 5)
771
- dt_model = time.perf_counter() - t_model
772
- split_model += dt_model
773
- infer_model += dt_model
774
- frame_preds.append(bbox)
775
-
776
  t_combine = time.perf_counter()
777
  kept_main = _combine_predictions_per_split(frame_preds)
778
  dt_combine = time.perf_counter() - t_combine
 
20
 
21
 
22
  DEFAULT_SPLIT_CFG = {
23
+ "n_samples": 16,
24
  "max_w": 400,
25
  "crop_y": (0.25, 0.90),
26
  "dx_threshold_px": 1.5,
 
34
  "jump_meanabs_threshold": 18.0,
35
  "progress_every": 0,
36
  }
37
+ INFER_BATCH_SIZE = max(1, int(os.getenv("INFER_BATCH_SIZE", "8")))
38
 
39
 
40
  def _log_timing_summary(label, stats, wall_time=None, max_items=12):
 
286
  stats[name] = stats.get(name, 0.0) + (time.perf_counter() - t0)
287
 
288
 
289
+ def _iter_sampled_frames(video_path, n_samples, sampled_frames=None):
290
  timing = {}
291
  wall_t0 = time.perf_counter()
292
+ if sampled_frames is None:
293
+ with timer("extract_bgr_with_ffmpeg", timing):
294
+ frames = _extract_bgr_with_ffmpeg(video_path, int(n_samples))
295
+ else:
296
+ with timer("reuse_sampled_frames", timing):
297
+ frames = sampled_frames
298
  timing["wall"] = time.perf_counter() - wall_t0
299
  _log_timing_summary("Iter sampled frames", timing, wall_time=timing["wall"])
300
  for out_idx, frame in enumerate(frames):
301
  yield out_idx, frame
302
 
303
 
304
+ def iter_frames(video_path, n_samples, max_w, crop_y, sampled_frames=None):
305
  timing = {"resize": 0.0, "crop": 0.0}
306
  wall_t0 = time.perf_counter()
307
  frame_count = 0
308
  try:
309
+ for out_idx, frame in _iter_sampled_frames(video_path, n_samples, sampled_frames=sampled_frames):
310
  frame_count += 1
311
  proc = frame
312
  if max_w > 0 and proc.shape[1] != max_w:
 
407
 
408
  def split_video_into_stable_segments_fast(
409
  video_path,
410
+ n_samples=16,
411
  max_w=400,
412
  crop_y=(0.25, 0.90),
413
  dx_threshold_px=1.5,
 
420
  keep_ratio=0.4,
421
  jump_meanabs_threshold=18.0,
422
  progress_every=200,
423
+ sampled_frames=None,
424
  ):
425
  wall_t0 = time.perf_counter()
426
  timing_total = {}
 
435
  frame_count = 0
436
 
437
  with timer("loop_total", timing_total):
438
+ for _, frame in iter_frames(
439
+ video_path,
440
+ n_samples=n_samples,
441
+ max_w=max_w,
442
+ crop_y=crop_y,
443
+ sampled_frames=sampled_frames,
444
+ ):
445
  frame_count += 1
446
 
447
  with timer("to_gray", timing_total):
 
571
  return Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
572
 
573
 
574
+ def extract_segment_frames(video_path, segments, n_samples, sampled_frames=None):
575
  timing = {}
576
  wall_t0 = time.perf_counter()
577
 
 
597
  # Detection runs on original sampled frames (no resize / no crop).
598
  to_pil_time = 0.0
599
  with timer("assign_frames_to_segments", timing):
600
+ for frame_idx, frame in _iter_sampled_frames(video_path, n_samples=n_samples, sampled_frames=sampled_frames):
601
  while segment_idx < len(normalized_segments) and frame_idx > normalized_segments[segment_idx][1]:
602
  segment_idx += 1
603
 
 
635
  return [frames for frames in grouped_frames if frames]
636
 
637
 
638
+ def split_video_stable(video_path, split_cfg=None, fallback_n=16):
639
  if not video_path or not os.path.exists(video_path):
640
  return []
641
 
 
647
 
648
  LOGGER.info("Split config | %s", cfg)
649
 
650
+ with timer("extract_sampled_frames", timing):
651
+ sampled_frames = _extract_bgr_with_ffmpeg(video_path, int(cfg["n_samples"]))
652
+
653
  with timer("split_video_into_stable_segments_fast", timing):
654
+ segments, _, _, _ = split_video_into_stable_segments_fast(video_path, sampled_frames=sampled_frames, **cfg)
655
  with timer("extract_segment_frames", timing):
656
  frame_groups = extract_segment_frames(
657
  video_path,
658
  segments,
659
  n_samples=cfg["n_samples"],
660
+ sampled_frames=sampled_frames,
661
  )
662
 
663
  if frame_groups:
 
671
  return frame_groups
672
 
673
  LOGGER.info("Split result | no stable segment, using fallback sampling n=%d", fallback_n)
674
+ if int(fallback_n) == int(cfg["n_samples"]):
675
+ with timer("fallback_reuse_sampled_frames", timing):
676
+ fallback_frames = [_bgr_to_pil(frame) for frame in sampled_frames]
677
+ else:
678
+ with timer("fallback_split_video", timing):
679
+ fallback_frames = split_video(video_path, n=fallback_n)
680
  LOGGER.info("Fallback frame count | %d", len(fallback_frames))
681
  timing["wall"] = time.perf_counter() - wall_t0
682
  _log_timing_summary("split_video_stable", timing, wall_time=timing["wall"])
 
763
  with timer("resolve_video_path", timing):
764
  video_path = _resolve_video_path(video_file)
765
  LOGGER.info("Inference start | video=%s", video_path)
766
+ LOGGER.info("Inference config | batch_size=%d", INFER_BATCH_SIZE)
767
  with timer("split_video_stable", timing):
768
  split_frames = split_video_stable(video_path)
769
  if not split_frames:
 
781
  for split_idx, frames in enumerate(split_frames):
782
  split_t0 = time.perf_counter()
783
  LOGGER.info("Inference split %d | frames=%d", split_idx + 1, len(frames))
784
+ t_model = time.perf_counter()
785
+ if hasattr(model, "infer_batch"):
786
+ frame_preds = model.infer_batch(frames, batch_size=INFER_BATCH_SIZE)
787
+ else:
788
+ frame_preds = [model(frame) for frame in frames]
789
+ frame_preds = [np.asarray(bbox, dtype=np.float64).reshape(-1, 5) for bbox in frame_preds]
790
+
791
+ split_model = time.perf_counter() - t_model
792
+ infer_model += split_model
793
  split_iou = 0.0
794
  split_draw = 0.0
795
 
 
 
 
 
 
 
 
 
796
  t_combine = time.perf_counter()
797
  kept_main = _combine_predictions_per_split(frame_preds)
798
  dt_combine = time.perf_counter() - t_combine
vision.py CHANGED
@@ -7,7 +7,7 @@ import logging
7
  import os
8
  import platform
9
  import tarfile
10
- from typing import Tuple
11
  from urllib.request import urlretrieve
12
 
13
  import numpy as np
@@ -181,28 +181,7 @@ class Classifier:
181
 
182
  return pred
183
 
184
- def __call__(self, pil_img: Image.Image, occlusion_bboxes: dict = {}) -> np.ndarray:
185
- """Run the classifier on an input image.
186
-
187
- Args:
188
- pil_img: The input PIL image.
189
- occlusion_mask: Optional occlusion mask to exclude certain areas.
190
-
191
- Returns:
192
- Processed predictions.
193
- """
194
- np_img, pad = self.prep_process(pil_img)
195
-
196
- if self.format == "ncnn":
197
- extractor = self.model.create_extractor()
198
- extractor.set_light_mode(True)
199
- extractor.input("in0", np_img)
200
- pred = ncnn.Mat()
201
- extractor.extract("out0", pred)
202
- pred = np.asarray(pred)
203
- else:
204
- pred = self.ort_session.run(["output0"], {"images": np_img})[0][0]
205
-
206
  # Convert pad to a tuple if required
207
  if isinstance(pad, list):
208
  pad = tuple(pad)
@@ -214,7 +193,7 @@ class Classifier:
214
  pred = pred[(pred[:, 2] - pred[:, 0]) < self.max_bbox_size, :]
215
  pred = np.reshape(pred, (-1, 5))
216
 
217
- logging.info(f"Model original pred : {pred}")
218
 
219
  # Remove prediction in bbox occlusion mask
220
  if len(occlusion_bboxes):
@@ -227,3 +206,67 @@ class Classifier:
227
  pred = pred[keep]
228
 
229
  return pred
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  import os
8
  import platform
9
  import tarfile
10
+ from typing import Sequence, Tuple
11
  from urllib.request import urlretrieve
12
 
13
  import numpy as np
 
181
 
182
  return pred
183
 
184
+ def _finalize_prediction(self, pred: np.ndarray, pad: Tuple[int, int], occlusion_bboxes: dict) -> np.ndarray:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
185
  # Convert pad to a tuple if required
186
  if isinstance(pad, list):
187
  pad = tuple(pad)
 
193
  pred = pred[(pred[:, 2] - pred[:, 0]) < self.max_bbox_size, :]
194
  pred = np.reshape(pred, (-1, 5))
195
 
196
+ logging.debug("Model original pred : %s", pred)
197
 
198
  # Remove prediction in bbox occlusion mask
199
  if len(occlusion_bboxes):
 
206
  pred = pred[keep]
207
 
208
  return pred
209
+
210
+ def infer_batch(self, pil_imgs: Sequence[Image.Image], occlusion_bboxes: dict = None, batch_size: int = 8):
211
+ if not pil_imgs:
212
+ return []
213
+
214
+ if occlusion_bboxes is None:
215
+ occlusion_bboxes = {}
216
+
217
+ # NCNN path stays single-image.
218
+ if self.format != "onnx":
219
+ return [self(pil_img, occlusion_bboxes=occlusion_bboxes) for pil_img in pil_imgs]
220
+
221
+ batch_size = max(1, int(batch_size))
222
+ outputs = []
223
+
224
+ for start in range(0, len(pil_imgs), batch_size):
225
+ chunk = pil_imgs[start : start + batch_size]
226
+ batch_imgs = []
227
+ pads = []
228
+ for pil_img in chunk:
229
+ np_img, pad = self.prep_process(pil_img)
230
+ batch_imgs.append(np_img)
231
+ pads.append(pad)
232
+
233
+ np_batch = np.concatenate(batch_imgs, axis=0)
234
+ raw = self.ort_session.run(["output0"], {"images": np_batch})[0]
235
+
236
+ if raw.ndim >= 3 and raw.shape[0] == len(chunk):
237
+ raw_preds = [raw[i] for i in range(len(chunk))]
238
+ elif len(chunk) == 1 and raw.ndim >= 3:
239
+ raw_preds = [raw[0]]
240
+ elif len(chunk) == 1:
241
+ raw_preds = [raw]
242
+ else:
243
+ # Fallback for unexpected output shapes.
244
+ raw_preds = [self.ort_session.run(["output0"], {"images": arr})[0][0] for arr in batch_imgs]
245
+
246
+ for raw_pred, pad in zip(raw_preds, pads):
247
+ outputs.append(self._finalize_prediction(raw_pred, pad, occlusion_bboxes))
248
+
249
+ return outputs
250
+
251
+ def __call__(self, pil_img: Image.Image, occlusion_bboxes: dict = {}) -> np.ndarray:
252
+ """Run the classifier on an input image.
253
+
254
+ Args:
255
+ pil_img: The input PIL image.
256
+ occlusion_bboxes: Optional occlusion bounding boxes used to exclude certain areas.
257
+
258
+ Returns:
259
+ Processed predictions.
260
+ """
261
+ np_img, pad = self.prep_process(pil_img)
262
+
263
+ if self.format == "ncnn":
264
+ extractor = self.model.create_extractor()
265
+ extractor.set_light_mode(True)
266
+ extractor.input("in0", np_img)
267
+ pred = ncnn.Mat()
268
+ extractor.extract("out0", pred)
269
+ pred = np.asarray(pred)
270
+ else:
271
+ pred = self.ort_session.run(["output0"], {"images": np_img})[0][0]
272
+ return self._finalize_prediction(pred, pad, occlusion_bboxes)