# Source: Hugging Face Space "uncertainrods/smashfix-v1" (revision 0d0412d).

"""
Utility Functions for Pose Processing
======================================

Core utility functions supporting video preprocessing and pose normalization
across both the pose and hybrid pipelines. Handles geometric transformations,
video segment extraction, and crop configuration resolution.

Key Components:
    1. Pose Normalization (normalize_pose, normalize_sequence)
       - Person-centric coordinate system transformation
       - Three-step process: Centering → Alignment → Scaling
       - Hip-centered with spine-aligned Y-axis
       - Scale-invariant via spine length normalization
       
    2. Video Segment Extraction (get_segment_bounds)
       - Extracts relevant portions of shot videos
       - Supports tail-based, middle-based, or full extraction
       - Configurable via segment_rules in params.yaml
       
    3. Crop Configuration (resolve_crop_config_for_video)
       - Per-shot, per-video crop overrides keyed by the leading video number
       - Shot is identified by matching a path component against crop_overrides keys
       - Rules are inclusive start/end number ranges; the last matching rule wins
       
    4. File Pattern Detection (should_skip_crop, extract_video_number)
       - Detects pre-cropped files that should skip cropping
       - Extracts video numbers for per-video configuration

Normalization Algorithm:
    1. Center pose at hip midpoint (joints 23, 24)
    2. Align Y-axis with spine direction (hip → shoulder center)
    3. Compute orthogonal X-axis from shoulder vector
    4. Z-axis via cross product for right-handed system
    5. Scale by spine length for size invariance

Dependencies:
    External: numpy
    Standard library: re, os, copy
    
Configuration (params.yaml):
    segment_rules: Video segment extraction settings
    crop_overrides: Per-video crop configuration

Author: IPD Research Team
Version: 1.0.0
"""

import numpy as np
import re


def normalize_pose(keypoints_3d):
    """
    Map one 3D pose of shape (33, 3) into a body-centred coordinate frame.

    Steps:
        1. Centering: translate so the midpoint of joints 23 and 24 (hips)
           sits at the origin.
        2. Alignment: rotate so the hip-to-shoulder-midpoint direction becomes
           the Y axis, the left-to-right shoulder direction (joints 11 -> 12,
           with its Y component removed) becomes the X axis, and Z = X x Y.
        3. Scaling: divide by the hip-to-shoulder-midpoint distance.

    Args:
        keypoints_3d: numpy array of joint coordinates.
            # NOTE(review): the 33-joint layout and the indices used here look
            # like the MediaPipe Pose landmark ordering - confirm with caller.

    Returns:
        - The input object itself, untouched, if its shape is not (33, 3).
        - The centred (but neither rotated nor scaled) pose if the spine
          length is below 1e-6.
        - Otherwise the fully normalized (33, 3) array.
    """
    if keypoints_3d.shape != (33, 3):
        return keypoints_3d

    l_shoulder, r_shoulder = 11, 12
    l_hip, r_hip = 23, 24

    # Step 1: move the hip midpoint to the origin.
    origin = (keypoints_3d[l_hip] + keypoints_3d[r_hip]) / 2.0
    points = keypoints_3d - origin

    # Step 2: build an orthonormal basis from the torso.
    spine = (points[l_shoulder] + points[r_shoulder]) / 2.0
    scale = np.linalg.norm(spine)
    if scale < 1e-6:
        # Degenerate torso: no usable direction to align or scale against.
        return points

    y_axis = spine / scale
    across = points[r_shoulder] - points[l_shoulder]
    # Remove the component along the spine so X is orthogonal to Y.
    x_axis = across - np.dot(across, y_axis) * y_axis

    if np.linalg.norm(x_axis) < 1e-6:
        # Shoulders are (nearly) parallel to the spine; pick a reference
        # axis that cannot be parallel to y_axis and derive X from it.
        reference = [0, 1, 0] if abs(y_axis[0]) > 0.5 else [1, 0, 0]
        x_axis = np.cross(y_axis, reference)

    x_axis = x_axis / np.linalg.norm(x_axis)
    z_axis = np.cross(x_axis, y_axis)

    # Rows of `basis` are the new axes, so multiplying by its transpose
    # expresses every joint in the new frame.
    basis = np.array([x_axis, y_axis, z_axis])
    rotated = np.dot(points, basis.T)

    # Step 3: divide by spine length for size invariance.
    return rotated / scale

def normalize_sequence(keypoints_sequence):
    """
    Normalize every frame of a pose sequence with normalize_pose.

    Args:
        keypoints_sequence: iterable of per-frame pose arrays.

    Returns:
        A numpy array built from the per-frame results, in input order.
    """
    normalized_frames = []
    for frame_pose in keypoints_sequence:
        normalized_frames.append(normalize_pose(frame_pose))
    return np.array(normalized_frames)


def should_skip_crop(video_path_or_name: str) -> bool:
    """Tell whether a file is named like "name (N).ext".

    Only the final path component is inspected, so a full path and a bare
    filename give the same answer. The name must end with whitespace, a
    parenthesized run of digits, a dot, and an extension containing no
    further dots - for example "backhand_drive (1).mp4".

    Returns:
        True when the filename matches that pattern, False otherwise.
    """
    import os
    import re

    filename = os.path.basename(video_path_or_name)
    # whitespace, "(digits)", then a single dot-free extension at the end
    indexed_copy = re.search(r"\s\(\d+\)\.[^.]+$", filename)
    return bool(indexed_copy)


def extract_video_number(video_path_or_name: str) -> int | None:
    """Extract the leading numeric id from a filename/path.

    Examples:
      - "001.mp4" -> 1
      - "006_win_3.npz" -> 6
      - "12_some_name.mov" -> 12

    Returns None if no leading number exists.
    """
    import os
    import re

    name = os.path.basename(video_path_or_name)
    m = re.match(r"^(\d+)", name)
    if not m:
        return None
    try:
        return int(m.group(1))
    except Exception:
        return None


def resolve_crop_config_for_video(
    video_path: str,
    base_crop_config: dict,
    crop_overrides: dict | None = None,
) -> dict:
    """Return crop_config with per-shot per-video overrides applied.

    - Only overrides keys present in the matching rule (currently just 'bottom').
    - Matches shot name by checking path components against crop_overrides keys.
    - Matches by leading video number in filename.
    - If multiple ranges match, the last matching range wins.
    """
    import copy
    import os

    effective = copy.deepcopy(base_crop_config) if base_crop_config is not None else {
        'top': 0.0,
        'bottom': 0.0,
        'left': 0.0,
        'right': 0.0,
    }

    if not crop_overrides:
        return effective

    video_num = extract_video_number(video_path)
    if video_num is None:
        return effective

    comps = set(os.path.normpath(video_path).split(os.path.sep))
    matched_shot = None
    for shot in crop_overrides.keys():
        if shot in comps:
            matched_shot = shot
            break

    if not matched_shot:
        return effective

    rules = crop_overrides.get(matched_shot) or []
    for rule in rules:
        try:
            start = int(rule.get('start'))
            end = int(rule.get('end'))
        except Exception:
            continue

        if start <= video_num <= end:
            for k, v in rule.items():
                if k in {'start', 'end'}:
                    continue
                effective[k] = v

    return effective


def get_tail_seconds_for_video(video_path: str, default: float = 1.75) -> float:
    """Pick the tail window length (in seconds) for a video.

    Videos located under a directory named exactly 'forehand_lift' or
    'forehand_clear' get a 2.0 second window; every other video gets
    `default`. The path is normalized and compared component by component,
    so absolute and relative paths behave the same and partial name matches
    do not count.
    """
    import os

    long_tail_shots = ('forehand_lift', 'forehand_clear')
    path_parts = os.path.normpath(video_path).split(os.path.sep)

    for shot_name in long_tail_shots:
        if shot_name in path_parts:
            return 2.0
    return default


def get_segment_bounds(
    video_path: str,
    fps: float,
    total_frames: int,
    default_seconds: float = 1.75,
    segment_cfg: dict | None = None,
):
    """Return (start_frame, frame_count) for the segment to process for a given video.

    Config-driven rules (segment_cfg):
      - default_seconds: fallback window length (tail).
      - tail_seconds: window length for tail_shots.
      - tail_shots: list of shot folder names to use tail_seconds.
      - middle_shots: mapping of shot folder name -> seconds for middle window.

    If no config is provided, uses hardcoded defaults (tail, 1.75s).
    """
    import os

    if segment_cfg is None:
        segment_cfg = {}

    default_seconds = segment_cfg.get('default_seconds', default_seconds)
    tail_seconds = segment_cfg.get('tail_seconds', default_seconds)
    tail_shots = set(segment_cfg.get('tail_shots', []))
    middle_shots = segment_cfg.get('middle_shots', {})

    p = os.path.normpath(video_path)
    comps = p.split(os.path.sep)

    # Middle-shot rule
    for shot, secs in middle_shots.items():
        if shot in comps:
            frames = int(float(secs) * fps)
            center = total_frames // 2
            start = max(0, center - frames // 2)
            return start, frames

    # Tail-shot rule
    if any(shot in comps for shot in tail_shots):
        frames = int(float(tail_seconds) * fps)
        start = max(0, total_frames - frames)
        return start, frames

    # Default tail
    frames = int(float(default_seconds) * fps)
    start = max(0, total_frames - frames)
    return start, frames