smashfix-v1 / src /utils.py
uncertainrods's picture
v1-try-deploy
0d0412d
"""
Utility Functions for Pose Processing
======================================
Core utility functions supporting video preprocessing and pose normalization
across both the pose and hybrid pipelines. Handles geometric transformations,
video segment extraction, and crop configuration resolution.
Key Components:
1. Pose Normalization (normalize_pose, normalize_sequence)
- Person-centric coordinate system transformation
- Three-step process: Centering → Alignment → Scaling
- Hip-centered with spine-aligned Y-axis
- Scale-invariant via spine length normalization
2. Video Segment Extraction (get_segment_bounds)
- Extracts relevant portions of shot videos
- Supports tail-based, middle-based, or full extraction
- Configurable via segment_rules in params.yaml
3. Crop Configuration (resolve_crop_config_for_video)
- Per-video crop overrides based on video number
- Supports wildcard patterns for batch configuration
- Handles pre-cropped files with "(N)" naming pattern
4. File Pattern Detection (should_skip_crop, extract_video_number)
- Detects pre-cropped files that should skip cropping
- Extracts video numbers for per-video configuration
Normalization Algorithm:
1. Center pose at hip midpoint (joints 23, 24)
2. Align Y-axis with spine direction (hip → shoulder center)
3. Compute orthogonal X-axis from shoulder vector
4. Z-axis via cross product for right-handed system
5. Scale by spine length for size invariance
Dependencies:
External: numpy. Standard library: re, os, copy.
Configuration (params.yaml):
segment_rules: Video segment extraction settings
crop_overrides: Per-video crop configuration
Author: IPD Research Team
Version: 1.0.0
"""
import numpy as np
import re
def normalize_pose(keypoints_3d):
    """
    Map one 3D pose of shape (33, 3) into a body-relative reference frame.

    The pose is translated so the hip midpoint becomes the origin, rotated so
    the hip-to-shoulder direction becomes the Y-axis, and finally divided by
    the spine length (distance from hip midpoint to shoulder midpoint).

    An input whose shape is not (33, 3) is returned untouched. When the spine
    length is below 1e-6 the centered (unrotated, unscaled) pose is returned.
    """
    if keypoints_3d.shape != (33, 3):
        return keypoints_3d

    l_shoulder, r_shoulder = 11, 12
    l_hip, r_hip = 23, 24
    eps = 1e-6

    # Step 1 (centering): move the hip midpoint to the origin.
    origin = (keypoints_3d[l_hip] + keypoints_3d[r_hip]) / 2.0
    pose = keypoints_3d - origin

    # Step 2 (alignment): build an orthonormal basis from spine and shoulders.
    spine = (pose[l_shoulder] + pose[r_shoulder]) / 2.0
    spine_norm = np.linalg.norm(spine)
    if spine_norm < eps:
        # No usable spine direction: rotation and scaling are skipped.
        return pose
    y_axis = spine / spine_norm

    # Remove the spine-parallel part of the shoulder line to get the X-axis.
    across = pose[r_shoulder] - pose[l_shoulder]
    x_axis = across - np.dot(across, y_axis) * y_axis
    if np.linalg.norm(x_axis) < eps:
        # Shoulder line gives no sideways direction; cross the spine with a
        # world axis it cannot be parallel to.
        helper = [0, 1, 0] if abs(y_axis[0]) > 0.5 else [1, 0, 0]
        x_axis = np.cross(y_axis, helper)
    x_axis /= np.linalg.norm(x_axis)
    z_axis = np.cross(x_axis, y_axis)

    basis = np.array([x_axis, y_axis, z_axis])
    rotated = np.dot(pose, basis.T)

    # Step 3 (scaling): express every coordinate in spine-length units.
    return rotated / spine_norm
def normalize_sequence(keypoints_sequence):
    """Run normalize_pose on each frame and collect the results in one array."""
    normalized_frames = []
    for pose in keypoints_sequence:
        normalized_frames.append(normalize_pose(pose))
    return np.array(normalized_frames)
def should_skip_crop(video_path_or_name: str) -> bool:
    """Tell whether a file is named like "name (N).ext".

    Only the final path component is inspected, so a full path and a bare
    filename behave the same. The name must end with a whitespace character,
    a parenthesized run of digits, a dot, and a dot-free extension, e.g.
    "backhand_drive (1).mp4".
    """
    import os
    import re

    filename = os.path.basename(video_path_or_name)
    # whitespace + "(digits)" + ".ext" anchored at the end of the name
    indexed_name = re.compile(r"\s\(\d+\)\.[^.]+$")
    if indexed_name.search(filename):
        return True
    return False
def extract_video_number(video_path_or_name: str) -> int | None:
"""Extract the leading numeric id from a filename/path.
Examples:
- "001.mp4" -> 1
- "006_win_3.npz" -> 6
- "12_some_name.mov" -> 12
Returns None if no leading number exists.
"""
import os
import re
name = os.path.basename(video_path_or_name)
m = re.match(r"^(\d+)", name)
if not m:
return None
try:
return int(m.group(1))
except Exception:
return None
def resolve_crop_config_for_video(
video_path: str,
base_crop_config: dict,
crop_overrides: dict | None = None,
) -> dict:
"""Return crop_config with per-shot per-video overrides applied.
- Only overrides keys present in the matching rule (currently just 'bottom').
- Matches shot name by checking path components against crop_overrides keys.
- Matches by leading video number in filename.
- If multiple ranges match, the last matching range wins.
"""
import copy
import os
effective = copy.deepcopy(base_crop_config) if base_crop_config is not None else {
'top': 0.0,
'bottom': 0.0,
'left': 0.0,
'right': 0.0,
}
if not crop_overrides:
return effective
video_num = extract_video_number(video_path)
if video_num is None:
return effective
comps = set(os.path.normpath(video_path).split(os.path.sep))
matched_shot = None
for shot in crop_overrides.keys():
if shot in comps:
matched_shot = shot
break
if not matched_shot:
return effective
rules = crop_overrides.get(matched_shot) or []
for rule in rules:
try:
start = int(rule.get('start'))
end = int(rule.get('end'))
except Exception:
continue
if start <= video_num <= end:
for k, v in rule.items():
if k in {'start', 'end'}:
continue
effective[k] = v
return effective
def get_tail_seconds_for_video(video_path: str, default: float = 1.75) -> float:
    """Pick the tail window length (seconds) for a video from its path.

    Returns 2.0 when any component of the normalized path is exactly
    'forehand_lift' or 'forehand_clear'; otherwise returns `default`.
    Matching whole path components means absolute and relative paths are
    handled alike.
    """
    import os

    two_second_shots = {'forehand_lift', 'forehand_clear'}
    for part in os.path.normpath(video_path).split(os.path.sep):
        if part in two_second_shots:
            return 2.0
    return default
def get_segment_bounds(
video_path: str,
fps: float,
total_frames: int,
default_seconds: float = 1.75,
segment_cfg: dict | None = None,
):
"""Return (start_frame, frame_count) for the segment to process for a given video.
Config-driven rules (segment_cfg):
- default_seconds: fallback window length (tail).
- tail_seconds: window length for tail_shots.
- tail_shots: list of shot folder names to use tail_seconds.
- middle_shots: mapping of shot folder name -> seconds for middle window.
If no config is provided, uses hardcoded defaults (tail, 1.75s).
"""
import os
if segment_cfg is None:
segment_cfg = {}
default_seconds = segment_cfg.get('default_seconds', default_seconds)
tail_seconds = segment_cfg.get('tail_seconds', default_seconds)
tail_shots = set(segment_cfg.get('tail_shots', []))
middle_shots = segment_cfg.get('middle_shots', {})
p = os.path.normpath(video_path)
comps = p.split(os.path.sep)
# Middle-shot rule
for shot, secs in middle_shots.items():
if shot in comps:
frames = int(float(secs) * fps)
center = total_frames // 2
start = max(0, center - frames // 2)
return start, frames
# Tail-shot rule
if any(shot in comps for shot in tail_shots):
frames = int(float(tail_seconds) * fps)
start = max(0, total_frames - frames)
return start, frames
# Default tail
frames = int(float(default_seconds) * fps)
start = max(0, total_frames - frames)
return start, frames