poolay2 committed on
Commit bbc0514 · verified · 1 Parent(s): 1c3b35d

Upload folder using huggingface_hub

basketball_analysis/__init__.py ADDED
@@ -0,0 +1,13 @@
+ from .matcherBeta import Matcher
+ from .tracking import Tracker
+ from .utils import (
+     get_crops_from_masks,
+     toRGB,
+     xywhn_to_xywh,
+     mask_nms,
+     mask_iou,
+     matcher_probs_custom_argmax,
+     show_annotations,
+     annotate_frame,
+     COURT_KEYPOINT_COORDINATES,
+ )
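
For orientation, the re-exports above define the package's public surface. A minimal usage sketch (assuming the package is importable and that a local DINOv2 weights folder exists; the "DINOv2_small" path is a placeholder):

    from basketball_analysis import Matcher, Tracker, COURT_KEYPOINT_COORDINATES

    matcher = Matcher(max_candidates=10, num_layers=8, dino_dir="DINOv2_small")  # placeholder weights dir
    print(COURT_KEYPOINT_COORDINATES.shape)  # (33, 2): court landmarks in feet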
basketball_analysis/matcherBeta.py ADDED
@@ -0,0 +1,197 @@
+ import torch
+ from torch import nn, Tensor
+ import torch.nn.functional as F
+ from transformers import Dinov2Model, Dinov2Config
+ from torchvision.transforms import v2
+ from code import interact
+ import json
+ import os
+ from PIL import Image
+ import numpy as np
+ from typing import Union
+
+ # Standard ImageNet preprocessing for the DINOv2 encoder.
+ transforms = v2.Compose([
+     v2.ToImage(),
+     v2.ToDtype(torch.float32, scale=True),
+     v2.Resize((224, 224)),
+     v2.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+ ])
+
+ class CrossAttention(nn.Module):
+
+     def __init__(self, d_model: int, *args, **kwargs):
+         super().__init__(*args, **kwargs)
+         self.Wq = nn.Linear(d_model, d_model)
+         self.Wk = nn.Linear(d_model, d_model)
+         self.Wv = nn.Linear(d_model, d_model)
+
+     def forward(self, queries, candidates):
+         # The candidates act as the attention queries here (the output is one
+         # vector per candidate) while the query tokens supply keys and values.
+         # Note that Wk projects the candidates and Wq the queries; the naming is
+         # unconventional but is kept as-is for compatibility with trained checkpoints.
+         Q = self.Wk(candidates)  # (B, num_candidates, d_model)
+         K = self.Wq(queries)     # (B, num_queries, d_model)
+         V = self.Wv(queries)     # (B, num_queries, d_model)
+         attn_out = F.scaled_dot_product_attention(Q, K, V)  # (B, num_candidates, d_model)
+
+         return attn_out
+
+ class JointTransformer(nn.Module):
+
+     def __init__(
+         self,
+         d_model=384,
+         nhead=4,
+         num_layers=4,
+         *args, **kwargs
+     ):
+         super().__init__(*args, **kwargs)
+
+         # Transformer encoder
+         encoder_layer = nn.TransformerEncoderLayer(
+             d_model=d_model,
+             nhead=nhead,
+             dim_feedforward=4 * d_model,
+             batch_first=True,
+             dropout=0.0
+         )
+
+         self.transformer = nn.TransformerEncoder(encoder_layer, num_layers)
+
+     def forward(self, query: Tensor, candidates: Tensor) -> tuple[Tensor, Tensor]:
+         Q = query.size(1)
+         assert Q == 1
+
+         x = torch.cat((query, candidates), dim=1)  # (B, Q+C, D)
+         x = self.transformer(x)                    # (B, Q+C, D)
+         query = x[:, :Q, :]                        # (B, Q, D)
+         candidates = x[:, Q:, :]                   # (B, C, D)
+
+         return query, candidates
+
+ class MLP(nn.Module):
+
+     def __init__(self, emb_dim, expand_factor, *args, **kwargs):
+         super().__init__(*args, **kwargs)
+         self.lin1 = nn.Linear(emb_dim, emb_dim * expand_factor)
+         self.gelu = nn.GELU("tanh")
+         self.lin2 = nn.Linear(emb_dim * expand_factor, emb_dim)
+
+     def forward(self, x: Tensor) -> Tensor:
+         x = self.lin1(x)
+         x = self.gelu(x)
+         x = self.lin2(x)
+         return x
+
+ class Matcher(nn.Module):
+     """Scores each candidate crop (plus a learned null candidate) against a query crop."""
+
+     def __init__(self, max_candidates, num_layers, dino_dir, *args, **kwargs):
+         super().__init__(*args, **kwargs)
+
+         # -------------- Pre-trained encoder (frozen) -----------------
+         assert isinstance(dino_dir, str)
+         with open(os.path.join(dino_dir, "config.json"), "r") as f:
+             dino_cfg = json.load(f)
+
+         self.encoder = Dinov2Model.from_pretrained(dino_dir, config=Dinov2Config(**dino_cfg))
+         self.freeze_encoder()
+
+         # -------- Embeddings to distinguish queries and candidates ---------
+         self.query_image_embed = nn.Parameter(torch.randn(1, 1, dino_cfg["hidden_size"]))
+         self.candidates_image_embed = nn.Embedding(max_candidates, dino_cfg["hidden_size"])
+         self.null_candidate = nn.Parameter(torch.randn(1, 1, dino_cfg["hidden_size"]))  # null candidate embedding
+
+         # ---------------- Joint transformer (trained) ----------------------
+         self.max_candidates = max_candidates
+         self.num_layers = num_layers
+         self.joint_transformer = JointTransformer(
+             d_model=dino_cfg["hidden_size"],
+             nhead=dino_cfg["num_attention_heads"],
+             num_layers=num_layers,
+         )
+         self.lnormq = nn.LayerNorm(dino_cfg["hidden_size"])
+         self.lnormc = nn.LayerNorm(dino_cfg["hidden_size"])
+
+         # ------------------------ Final operation ---------------------------
+         self.cross_attn = CrossAttention(dino_cfg["hidden_size"])
+         self.lnormc2 = nn.LayerNorm(dino_cfg["hidden_size"])
+         self.classification_layer = nn.Linear(dino_cfg["hidden_size"], 1)
+
+     def freeze_encoder(self) -> None:
+         for p in self.encoder.parameters():
+             p.requires_grad_(False)
+
+     def pre_process_img(self, image: Union[Image.Image, np.ndarray, str]):
+         if isinstance(image, str):
+             image = Image.open(image)
+         return transforms(image)
+
+     @torch.inference_mode()
+     def predict(self, query_crop: np.ndarray, candidate_crops: list[np.ndarray]):
+         query = transforms(query_crop)[None, None, ...]  # (1, 1, 3, 224, 224)
+         candidates = torch.stack([transforms(candidate_crop) for candidate_crop in candidate_crops]).unsqueeze(0)  # (1, C, 3, 224, 224)
+         probs = self.forward(query, candidates).softmax(dim=-1)  # (1, C+1)
+         return probs.numpy()
+
+     def forward(self, query: Tensor, candidates: Tensor) -> Tensor:
+         # query (B, 1, 3, H, W), candidates (B, C, 3, H, W)
+         B, C, _, H, W = candidates.shape
+
+         query = self.encoder(
+             query.view(B, 3, H, W)
+         )['last_hidden_state']  # (B, T, D)
+
+         # pick the CLS token
+         query = query[:, 0, :].view(B, 1, -1)  # (B, 1, D)
+
+         candidates = self.encoder(
+             candidates.view(B * C, 3, H, W)
+         )['last_hidden_state']  # (B*C, T, D)
+
+         # pick the CLS token
+         candidates = candidates[:, 0, :].view(B, C, -1)  # (B, C, D)
+
+         # Add embeddings
+         query = query + self.query_image_embed.repeat(B, 1, 1)  # (B, 1, D)
+         candidate_ids = torch.arange(C, device=query.device).view(1, C)
+         candidates = candidates + self.candidates_image_embed(candidate_ids)  # (B, C, D)
+         candidates = torch.cat(
+             (
+                 candidates,
+                 self.null_candidate.repeat(B, 1, 1)
+             ),
+             dim=1)  # (B, C+1, D)
+
+         # Joint transformer: candidate and query tokens attend to each other
+         q, c = self.joint_transformer(query, candidates)
+         # skip connections
+         query = self.lnormq(query + q)
+         candidates = self.lnormc(candidates + c)
+
+         # Cross attention: each candidate attends to the query token
+         c = self.cross_attn(query, candidates)  # (B, C+1, D)
+         candidates = self.lnormc2(candidates + c)
+         candidates = candidates + c  # second residual after the norm, kept as trained
+         logits = self.classification_layer(candidates)  # (B, C+1, 1)
+
+         return logits.squeeze(-1)
+
+ if __name__ == "__main__":
+
+     import random
+
+     B, H, W = 1, 224, 224
+     max_candidates = 10
+     num_layers = 4
+
+     query = torch.randn((B, 1, 3, H, W))
+     candidates = torch.randn((B, random.randint(2, max_candidates), 3, H, W))
+
+     matcher = Matcher(max_candidates, num_layers, "DINOv2_base")
+     out = matcher(query, candidates)
+
+     interact(local=locals())
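
To make the inference contract concrete: Matcher.predict takes one query crop and a list of candidate crops (RGB uint8 arrays of any size; the transforms resize them to 224x224) and returns a (1, C+1) probability row whose last entry is the learned null candidate. A minimal sketch with random crops in place of real player crops ("DINOv2_base" stands for a local DINOv2 directory containing config.json and the weights):

    import numpy as np

    query_crop = np.random.randint(0, 255, (64, 32, 3), dtype=np.uint8)
    candidate_crops = [np.random.randint(0, 255, (64, 32, 3), dtype=np.uint8) for _ in range(3)]

    matcher = Matcher(max_candidates=10, num_layers=4, dino_dir="DINOv2_base")
    probs = matcher.predict(query_crop, candidate_crops)  # shape (1, 4): 3 candidates + 1 null
    print(probs.argmax(axis=-1))                          # index 3 would mean "none of the candidates"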
basketball_analysis/tracking.py ADDED
@@ -0,0 +1,270 @@
+ import supervision as sv
+ import torch
+ import numpy as np
+ from collections import defaultdict
+ from rfdetr import RFDETRSeg2XLarge
+ from PIL import Image
+ from scipy.optimize import linear_sum_assignment
+ from .utils import (
+     mask_nms,
+     toRGB,
+     matcher_probs_custom_argmax,
+     get_distance_cost_matrix,
+     mask_iou,
+     get_crops_from_masks
+ )
+ from .view_transformer import (
+     get_players_court_xy
+ )
+ from tqdm import tqdm
+ from code import interact
+
+ np.set_printoptions(suppress=True, precision=4)
+ torch.set_printoptions(sci_mode=False)
+
+ def indices_to_matches(
+     cost_matrix, indices, thresh: float
+ ):
+     matched_cost = cost_matrix[tuple(zip(*indices))]
+     matched_mask = matched_cost <= thresh
+
+     matches = indices[matched_mask]
+     unmatched_a = list(set(range(cost_matrix.shape[0])) - set(matches[:, 0]))
+     unmatched_b = list(set(range(cost_matrix.shape[1])) - set(matches[:, 1]))
+     return matches, unmatched_a, unmatched_b
+
+ def linear_assignment(
+     cost_matrix, thresh
+ ):
+     row_ind, col_ind = linear_sum_assignment(cost_matrix)
+     indices = np.column_stack((row_ind, col_ind))
+
+     return indices_to_matches(cost_matrix, indices, thresh)
+
+ class Tracker:
+
+     def __init__(
+         self,
+         initial_detections: sv.Detections,
+         initial_xy: np.ndarray,
+         initial_frame: np.ndarray,
+         matcher,
+         hungarian_mask_threshold: float,
+         hungarian_pos_threshold: float
+     ):
+
+         self.frame_id = 0
+         self.track_ids = list(range(len(initial_detections)))
+         self.previous_detections = initial_detections
+         self.previous_xy = initial_xy
+         self.hungarian_mask_threshold = hungarian_mask_threshold
+         self.hungarian_pos_threshold = hungarian_pos_threshold
+         self.matcher = matcher
+
+         '''Initialize the track_ids of the initially detected players (ideally all 10)'''
+         self.all_players_detected = len(initial_detections) == 10
+         initial_detections.tracker_id = np.array(self.track_ids)
+         self.frame_id_to_xy = {
+             self.frame_id: dict(zip(initial_detections.tracker_id, initial_xy))
+         }
+
+         # Keep one "base selfie" and one "latest selfie" of each player in memory.
+         self.track_id_to_crop = defaultdict(list)
+         for track_id, crop in zip(initial_detections.tracker_id, get_crops_from_masks(initial_frame, initial_detections.mask)):
+             for _ in range(2):
+                 self.track_id_to_crop[track_id].append(crop)
+
+         self.stats = {
+             self.frame_id: {
+                 "detected_players": len(initial_detections),
+                 "new_detections": None,
+                 "all_players_detected": self.all_players_detected,
+                 "mask_based_matches": None,
+                 "position_based_matches": None,
+                 "appearance_based_matches": None,
+                 "unmatched": None
+             }
+         }
+
+     def update_tracks_with_new_detections(self, detections: sv.Detections, xy: np.ndarray, frame: np.ndarray):
+
+         detections.tracker_id = -np.ones(shape=(len(detections)), dtype=np.int64)
+         masks = detections.mask
+
+         '''First layer | Mask-based tracking:
+         Safely track players based on the overlap of their masks. When in doubt, leave the detection untracked.'''
+         # cost_matrix_ij = 1 - IoU(mask_i, mask_j)
+         null_track = self.previous_detections.tracker_id == -1
+         mask_cost_matrix = 1.0 - mask_iou(masks, self.previous_detections[~null_track].mask)
+         matches, unmatched_rows_t, _ = linear_assignment(mask_cost_matrix, self.hungarian_mask_threshold)
+
+         # Apply the results
+         detections.tracker_id[matches[:, 0]] = self.previous_detections[~null_track].tracker_id[matches[:, 1]]
+
+         # Remainder
+         unmatched_track_ids_t_1 = list(set(self.track_ids) - set(detections.tracker_id[detections.tracker_id != -1]))
+         mask_based_matches = len(matches)
+
+         if len(unmatched_rows_t) == 0:
+             self.save_statistics(detections, xy, frame, mask_based_matches)
+             return
+
+         '''Second layer | Court-position-based tracking:
+         Safely track the remaining unmatched players based on their court (x, y) coordinates.
+         '''
+         pos_based_matches = 0
+         dist_cost_matrix = get_distance_cost_matrix(
+             xy,
+             self.previous_xy[~null_track],
+             ord=2,  # Euclidean distance
+         )
+         # Exclude pairs already matched in the first layer
+         dist_cost_matrix[matches[:, 0], :] = 1e3
+         dist_cost_matrix[:, matches[:, 1]] = 1e3
+
+         matches_, _, _ = linear_assignment(dist_cost_matrix, self.hungarian_pos_threshold)
+
+         # Apply the results
+         for match_ in matches_:
+             if match_[0] in matches[:, 0]:
+                 continue
+             detections.tracker_id[match_[0]] = self.previous_detections[~null_track].tracker_id[match_[1]]
+             pos_based_matches += 1
+
+         # Remainder
+         unmatched_rows_t = [i for i in range(len(detections)) if detections.tracker_id[i] == -1]
+         unmatched_track_ids_t_1 = list(set(self.track_ids) - set(detections.tracker_id[detections.tracker_id != -1]))
+
+         if len(unmatched_rows_t) == 0:
+             self.save_statistics(detections, xy, frame, mask_based_matches, pos_based_matches)
+             return
+
+         '''Third layer | Appearance-based tracking:
+         Use a vision model to match the remaining player crops to their corresponding crops at t-1.
+         '''
+
+         unmatched = 0
+         appearance_based_matches = 0
+         new_detections = 0
+
+         while len(unmatched_rows_t) > 0:
+
+             unmatched_row_t = unmatched_rows_t.pop(0)
+
+             # If there is exactly one unmatched mask at t-1 and at t, they must belong to the same player
+             # (assuming all players have been detected once, so no new player can appear)
+             if self.all_players_detected and len(unmatched_track_ids_t_1) == 1 and len(unmatched_rows_t) == 0:
+                 detections.tracker_id[unmatched_row_t] = unmatched_track_ids_t_1[0]
+                 unmatched_track_ids_t_1.pop(0)
+                 break
+
+             '''Appearance-based tracking: track the remaining unmatched players'''
+             if len(unmatched_track_ids_t_1) > 0:
+                 query_crop = get_crops_from_masks(frame, detections[unmatched_row_t].mask)[0]  # crop of the unmatched player at time t
+                 base_candidate_crops = [self.track_id_to_crop[t_id][0] for t_id in unmatched_track_ids_t_1]    # base crops of the unmatched tracks
+                 latest_candidate_crops = [self.track_id_to_crop[t_id][1] for t_id in unmatched_track_ids_t_1]  # latest crops of the unmatched tracks
+
+                 probs = self.matcher.predict(query_crop, base_candidate_crops)
+                 probs = (probs + self.matcher.predict(query_crop, latest_candidate_crops)) / 2
+                 prediction = matcher_probs_custom_argmax(probs)
+             else:
+                 # No candidate tracks left: force the "null candidate" outcome below
+                 base_candidate_crops = []
+                 prediction = 0
+
+             if prediction != len(base_candidate_crops):
+                 pred_track_id = unmatched_track_ids_t_1[prediction]
+                 detections.tracker_id[unmatched_row_t] = pred_track_id
+
+                 unmatched_track_ids_t_1.pop(prediction)
+                 appearance_based_matches += 1
+
+             # still unmatched -> (likely) a new player
+             elif not self.all_players_detected:
+                 new_track_id = max(self.track_ids) + 1
+                 detections.tracker_id[unmatched_row_t] = new_track_id
+
+                 new_detections += 1
+                 self.track_ids.append(new_track_id)
+                 self.all_players_detected = len(self.track_ids) == 10
+
+             else:
+                 unmatched += 1
+
+         self.save_statistics(detections, xy, frame, mask_based_matches, pos_based_matches, appearance_based_matches, new_detections, unmatched)
+
+     def save_statistics(self, detections, xy, frame, mask_based_matches, pos_based_matches=0, appearance_based_matches=0, new_detections=0, unmatched=0):
+         '''Update the tracking statistics and per-track crops, then roll the state forward'''
+         self.frame_id += 1
+         self.stats[self.frame_id] = {
+             "detected_players": len(detections),
+             "all_players_detected": self.all_players_detected,
+             "mask_based_matches": mask_based_matches,
+             "position_based_matches": pos_based_matches,
+             "appearance_based_matches": appearance_based_matches,
+             "new_detections": new_detections,
+             "unmatched": unmatched
+         }
+
+         for i in range(len(detections)):
+             track_id = detections.tracker_id[i]
+             if track_id != -1:
+                 crop = get_crops_from_masks(frame, detections[i].mask)[0]
+                 if len(self.track_id_to_crop[track_id]) < 2:
+                     # First appearance of this track: use the crop as both base and latest selfie
+                     self.track_id_to_crop[track_id] = [crop, crop]
+                 else:
+                     self.track_id_to_crop[track_id][1] = crop
+         self.previous_detections = detections
+         self.previous_xy = xy
+
+ if __name__ == "__main__":
+
+     from basketball_analysis import Matcher
+     from utils import show_annotations, annotate_frame
+     from inference import get_model
+
+     VIDEO_PATH = "DEN_SAC_1_2025.mp4"
+     HUNGARIAN_MASK_THRESHOLD = 0.6
+     HUNGARIAN_POS_THRESHOLD = 2.0
+
+     SEGMENTATION_CONFIDENCE_THRESHOLD = 0.4
+     SEG_MODEL = RFDETRSeg2XLarge(resolution=1008, pretrain_weights="checkpoint_best_ema.pth")
+     SEG_MODEL.optimize_for_inference()
+
+     ROBOFLOW_API_KEY = "PUNfWgLHrHDufisOOaZp"
+     KEYPOINT_DETECTION_MODEL_ID = "basketball-court-detection-2/14"
+     KEYPOINT_MODEL = get_model(model_id=KEYPOINT_DETECTION_MODEL_ID, api_key=ROBOFLOW_API_KEY)
+     KEYPOINT_COLOR = sv.Color.from_hex('#FF1493')
+
+     matcher = Matcher(10, 8, "DINOv2_small")
+     sd = torch.load("matcher_tuned.pt")
+     matcher.load_state_dict(sd)
+
+     for p in matcher.parameters():
+         p.requires_grad_(False)
+     matcher.eval()
+
+     def get_models_predictions(frame):
+
+         # Segmentation
+         detections = SEG_MODEL.predict(frame, threshold=SEGMENTATION_CONFIDENCE_THRESHOLD)
+         keep = mask_nms(detections.mask, detections.confidence, iou_thresh=0.2)
+         detections = detections[keep]
+         if len(detections) > 10:
+             # keep the 10 highest-confidence detections
+             detections = detections[:10]
+
+         # (x, y) court-coordinate retrieval
+         court_xy = get_players_court_xy(frame, detections, KEYPOINT_MODEL)
+
+         return detections, court_xy
+
+     video_iterator = sv.get_video_frames_generator(VIDEO_PATH)
+     frame = toRGB(next(video_iterator))
+     initial_detections, initial_xy = get_models_predictions(frame)
+
+     history = []
+     tracker = Tracker(initial_detections, initial_xy, frame, matcher, HUNGARIAN_MASK_THRESHOLD, HUNGARIAN_POS_THRESHOLD)
+     history.append(annotate_frame(frame, initial_detections))
+
+     for frame_id, frame in tqdm(enumerate(video_iterator, start=1)):
+
+         frame = toRGB(frame)
+         detections, xy = get_models_predictions(frame)
+         tracker.update_tracks_with_new_detections(detections, xy, frame)
+         history.append(annotate_frame(frame, detections))
+         if frame_id == 150:
+             Image.fromarray(history[-1]).save("-1.png")
+             Image.fromarray(history[0]).save("0.png")
+             interact(local=locals())
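
To illustrate the thresholded Hungarian matching that all three tracking layers rely on, here is a toy run of linear_assignment (a sketch, assuming the function above is in scope): rows are detections at time t, columns are tracks at t-1, and any assigned pair whose cost exceeds the threshold is dropped back to unmatched.

    import numpy as np

    cost = np.array([
        [0.1, 0.9, 0.8],   # detection 0 clearly matches track 0
        [0.9, 0.2, 0.8],   # detection 1 clearly matches track 1
        [0.9, 0.9, 0.7],   # detection 2: its best cost (0.7) exceeds the threshold
    ])
    matches, unmatched_rows, unmatched_cols = linear_assignment(cost, thresh=0.6)
    print(matches)         # [[0 0] [1 1]]
    print(unmatched_rows)  # [2]
    print(unmatched_cols)  # [2]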
basketball_analysis/utils.py ADDED
@@ -0,0 +1,347 @@
+ from __future__ import annotations
+ import torch
+ import numpy as np
+ import supervision as sv
+ from pycocotools import mask as mask_utils
+ import cv2
+ import ffmpeg
+ from PIL import Image
+ from typing import Iterable
+ from matplotlib import pyplot as plt
+
+ class SAM2Tracker:
+     def __init__(self, predictor) -> None:
+         self.predictor = predictor
+         self._prompted = False
+
+     def prompt_first_frame(self, frame: np.ndarray, detections: sv.Detections) -> None:
+         if len(detections) == 0:
+             raise ValueError("detections must contain at least one box")
+
+         if detections.tracker_id is None:
+             detections.tracker_id = np.arange(1, len(detections) + 1)
+
+         with torch.inference_mode(), torch.autocast("cuda", dtype=torch.bfloat16):
+             self.predictor.load_first_frame(frame)
+             for xyxy, obj_id in zip(detections.xyxy, detections.tracker_id):
+                 bbox = np.asarray([xyxy], dtype=np.float32)
+                 self.predictor.add_new_prompt(
+                     frame_idx=0,
+                     obj_id=int(obj_id),
+                     bbox=bbox,
+                 )
+
+         self._prompted = True
+
+     def propagate(self, frame: np.ndarray) -> sv.Detections:
+         if not self._prompted:
+             raise RuntimeError("Call prompt_first_frame before propagate")
+
+         with torch.inference_mode(), torch.autocast("cuda", dtype=torch.bfloat16):
+             tracker_ids, mask_logits = self.predictor.track(frame)
+
+         tracker_ids = np.asarray(tracker_ids, dtype=np.int32)
+         masks = (mask_logits > 0.0).cpu().numpy()
+         masks = np.squeeze(masks).astype(bool)
+
+         if masks.ndim == 2:
+             masks = masks[None, ...]
+
+         masks = np.array([
+             sv.filter_segments_by_distance(mask, relative_distance=0.03, mode="edge")
+             for mask in masks
+         ])
+
+         xyxy = sv.mask_to_xyxy(masks=masks)
+         detections = sv.Detections(xyxy=xyxy, mask=masks, tracker_id=tracker_ids)
+         return detections
+
+     def reset(self) -> None:
+         self._prompted = False
+
+ def get_crops_from_masks(frame: np.ndarray, masks: np.ndarray) -> list[np.ndarray]:
+     """
+     Args:
+         frame: (H, W, 3) image
+         masks: (N, H, W) binary masks
+
+     Returns:
+         List of cropped images, one per mask. Each crop is a rectangular
+         bounding box around the mask, with black pixels outside the mask.
+     """
+     crops = []
+
+     for mask in masks:
+
+         # Find the bounding box of the mask
+         ys, xs = np.where(mask)
+         if len(xs) == 0 or len(ys) == 0:
+             # Empty mask → return an empty crop
+             crops.append(np.zeros((0, 0, 3), dtype=frame.dtype))
+             continue
+
+         y_min, y_max = ys.min(), ys.max() + 1
+         x_min, x_max = xs.min(), xs.max() + 1
+
+         # Crop the frame and the mask
+         frame_crop = frame[y_min:y_max, x_min:x_max]
+         mask_crop = mask[y_min:y_max, x_min:x_max]
+
+         # Apply the mask: keep pixels where the mask is True, else black
+         crop = np.zeros_like(frame_crop)
+         crop[mask_crop] = frame_crop[mask_crop]
+
+         crops.append(crop)
+
+     return crops
+
+ def f(detections: sv.Detections, track_history: dict, frame_index):
+     '''Append each detection's RLE-encoded mask to its track's history'''
+     for i in range(len(detections)):
+         mask = detections.mask[i]
+         rle = mask_utils.encode(np.asfortranarray(mask))
+         track_history[int(detections.tracker_id[i])].append((frame_index, rle['counts']))
+
+ def toRGB(img: np.ndarray):
+     return cv2.cvtColor(img, code=cv2.COLOR_BGR2RGB)
+
+ def read_frame_from_video(in_filename, frame_num):
+     raw_bytes, err = (
+         ffmpeg
+         .input(in_filename)
+         .filter('select', 'gte(n,{})'.format(frame_num))
+         .output('pipe:', vframes=1, format='rawvideo', pix_fmt='rgb24')
+         .global_args('-loglevel', 'error')
+         .run(capture_stdout=True)
+     )
+     # Assumes a 1920x1080 video
+     assert len(raw_bytes) == 1080 * 1920 * 3
+     return np.frombuffer(raw_bytes, np.uint8).reshape(1, 1080, 1920, 3).copy()
+
+ def read_consecutive_frames_from_video(in_filename, start_frame, num_frames) -> np.ndarray:
+
+     out, err = ffmpeg.input(in_filename)\
+         .output(
+             'pipe:1',
+             vf=f'select=between(n\\,{start_frame}\\,{start_frame + num_frames - 1})',
+             vsync=0,
+             vframes=num_frames,
+             format='rawvideo',
+             pix_fmt='rgb24'
+         ).global_args('-loglevel', 'error')\
+         .run(capture_stdout=True, capture_stderr=True)
+
+     W, H = 1920, 1080
+     frame_size = W * H * 3
+     frames = np.frombuffer(out, np.uint8)
+
+     if frames.size != num_frames * frame_size:
+         raise RuntimeError(
+             f'Expected {num_frames * frame_size} bytes, got {frames.size}\n'
+             f'ffmpeg stderr:\n{err.decode()}'
+         )
+
+     return frames.reshape(num_frames, H, W, 3).copy()
+
+ def xywhn_to_xywh(xywhn: list, height: int, width: int):
+     x, y, w, h = xywhn
+     return [int(x * width), int(y * height), int(w * width), int(h * height)]
+
+ def crop_frame_at_mask_from_bbox(frame: np.ndarray, mask: np.ndarray, bbox: list) -> np.ndarray:
+     x, y, w, h = bbox
+     # Copy so that blacking out the background does not modify the original frame
+     crop = frame[y: y + h, x: x + w].copy()
+     cropped_mask = mask[y: y + h, x: x + w]
+     crop[~cropped_mask] = np.array([0, 0, 0], dtype=np.uint8)
+     return crop
+
+ def find_consecutive_streaks(nums: list | Iterable):
+
+     nums = list(nums)
+     if not nums:
+         return []
+
+     streaks = []
+     start = nums[0]
+     for i in range(1, len(nums)):
+         if nums[i] != nums[i - 1] + 1:
+             stop = nums[i - 1]
+             streaks.append(range(start, stop + 1))
+             start = nums[i]
+
+     streaks.append(range(start, nums[-1] + 1))
+     return streaks
+
+ def save_loss_history(fpath, loss: float):
+     with open(fpath, "a+") as f:
+         f.write(f"{loss:.6f}\n")
+
+ def save_loss_history_plot(loss_history: list[float], fpath):
+     plt.plot(loss_history)
+     plt.savefig(fpath)
+
+ def save_checkpoint(
+     path,
+     model,
+     optimizer,
+     epoch,
+     step,
+ ):
+     ckpt = {
+         "model": model.state_dict(),
+         "optimizer": optimizer.state_dict(),
+         "epoch": epoch,
+         "step": step,
+     }
+     torch.save(ckpt, path)
+
+ def load_checkpoint(
+     path,
+     model,
+     optimizer,
+     device="cuda"
+ ):
+     ckpt = torch.load(path, map_location=device)
+
+     model.load_state_dict(ckpt["model"])
+     optimizer.load_state_dict(ckpt["optimizer"])
+
+     epoch = ckpt.get("epoch", 0)
+     step = ckpt.get("step", 0)
+
+     return epoch, step
+
+ def mask_iou_pair(m1, m2):
+     inter = np.logical_and(m1, m2).sum()
+     if inter == 0:
+         return 0.0
+     union = m1.sum() + m2.sum() - inter
+     return inter / (union + 1e-6)
+
+ def mask_nms(masks, scores, iou_thresh=0.6):
+     # Greedy NMS over masks, highest score first
+     order = np.argsort(-scores)
+     keep = []
+     suppressed = np.zeros(len(masks), dtype=bool)
+
+     for pos, i in enumerate(order):
+         if suppressed[i]:
+             continue
+
+         keep.append(i)
+
+         # Only lower-scored masks (later in `order`) can be suppressed by mask i
+         for j in order[pos + 1:]:
+             if suppressed[j]:
+                 continue
+
+             iou = mask_iou_pair(masks[i], masks[j])
+             if iou > iou_thresh:
+                 suppressed[j] = True
+
+     return keep
+
+ def mask_iou(masks_t: np.ndarray, masks_t1):
+     # Flatten
+     N, H, W = masks_t.shape
+     M = masks_t1.shape[0]
+
+     masks_t = masks_t.reshape(N, -1).astype(float)    # (N, HW)
+     masks_t1 = masks_t1.reshape(M, -1).astype(float)  # (M, HW)
+
+     # Intersection: (N, M)
+     intersection = masks_t @ masks_t1.T
+
+     # Areas
+     area_t = masks_t.sum(1, keepdims=True)    # (N, 1)
+     area_t1 = masks_t1.sum(1, keepdims=True)  # (M, 1)
+
+     # Union
+     union = area_t + area_t1.T - intersection
+
+     iou = intersection / (union + 1e-6)
+     return iou  # (N, M)
+
+ # NBA court landmarks in feet (94 ft x 50 ft court)
+ COURT_KEYPOINT_COORDINATES = np.array([
+     (0.0, 0.0),
+     (0.0, 2.99),
+     (0.0, 17.0),
+     (0.0, 33.01),
+     (0.0, 47.02),
+     (0.0, 50.0),
+     (5.25, 25.0),
+     (13.92, 2.99),
+     (13.92, 47.02),
+     (19.0, 17.0),
+     (19.0, 25.0),
+     (19.0, 33.01),
+     (27.4, 0.0),
+     (29.01, 25.0),
+     (27.4, 50.0),
+     (46.99, 0.0),
+     (46.99, 25.0),
+     (46.99, 50.0),
+     (66.61, 0.0),
+     (65.0, 25.0),
+     (66.61, 50.0),
+     (75.0, 17.0),
+     (75.0, 25.0),
+     (75.0, 33.01),
+     (80.09, 2.99),
+     (80.09, 47.02),
+     (88.75, 25.0),
+     (94.0, 0.0),
+     (94.0, 2.99),
+     (94.0, 17.0),
+     (94.0, 33.01),
+     (94.0, 47.02),
+     (94.0, 50.0)
+ ])
+
+ def get_distance_cost_matrix(arr1: np.ndarray, arr2: np.ndarray, ord=1):
+
+     cost_matrix = np.empty(shape=(len(arr1), len(arr2)), dtype=np.float64)
+
+     for i in range(len(arr1)):
+         cost_matrix[i] = np.linalg.norm(arr1[i] - arr2, ord=ord, axis=-1)
+
+     return cost_matrix
+
+ def matcher_probs_custom_argmax(probs: np.ndarray, confidence_threshold=0.7):
+     probs = probs.squeeze(0)
+     pred = probs.argmax()
+     # if the matcher predicts the null candidate but is not confident about it
+     if pred == len(probs) - 1 and probs[pred] < confidence_threshold:
+         # fall back to the best real candidate if it carries enough weight
+         second_best = probs[:-1].argmax()
+         if probs[second_best] > 1.0 - confidence_threshold - 0.05:
+             pred = second_best
+
+     return pred
+
+ def annotate_frame(frame_, detections_):
+     annotated_frame = frame_.copy()
+     annotated_frame = sv.MaskAnnotator(color_lookup=sv.ColorLookup.TRACK).annotate(annotated_frame, detections_)
+     annotated_frame = sv.LabelAnnotator(smart_position=True).annotate(annotated_frame, detections_, labels=list(str(i) for i in detections_.tracker_id))
+     return annotated_frame
+
+ def show_annotations(frame_, detections_):
+     return Image.fromarray(annotate_frame(frame_, detections_))
+
+ if __name__ == "__main__":
+     from code import interact
+     frames = read_consecutive_frames_from_video("nba_sample_videos/batch2/SAC_LAL_1.mp4", 199, 1)
+     interact(local=locals())
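
As a quick sanity check of the mask utilities above (a self-contained sketch with synthetic masks, assuming mask_iou and mask_nms are in scope):

    import numpy as np

    masks = np.zeros((3, 10, 10), dtype=bool)
    masks[0, 0:4, 0:4] = True   # mask 0
    masks[1, 0:4, 2:6] = True   # mask 1 overlaps mask 0 (IoU = 8/24 ≈ 0.33)
    masks[2, 6:9, 6:9] = True   # mask 2 is disjoint

    print(np.round(mask_iou(masks, masks), 2))                                 # ~1.0 diagonal, ~0.33 for the pair
    print(mask_nms(masks, scores=np.array([0.9, 0.8, 0.7]), iou_thresh=0.25))  # keeps masks 0 and 2, suppresses 1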
basketball_analysis/view_transformer.py ADDED
@@ -0,0 +1,61 @@
+ from sports import MeasurementUnit
+ from sports.basketball import CourtConfiguration, League, draw_court, draw_points_on_court
+ import numpy as np
+ import supervision as sv
+ import cv2
+
+ CONFIG = CourtConfiguration(league=League.NBA, measurement_unit=MeasurementUnit.FEET).vertices
+
+ def frame_xy_to_court_xy(frame_xy: np.ndarray, H: np.ndarray):
+     '''Map pixel coordinates to court coordinates using the homography H'''
+     assert frame_xy.shape[1] == 2
+     n_points = frame_xy.shape[0]
+
+     # Lift to homogeneous coordinates, apply H, then divide by the last component
+     court_xy = np.hstack((frame_xy, np.ones(shape=(n_points, 1)))) @ H.T
+     court_xy_norm = court_xy[:, :2] / court_xy[:, [-1]]
+     return court_xy_norm
+
+ def get_players_court_xy(frame, detections, model, use_bottom_center=True, normalize=False):
+     KEYPOINT_DETECTION_MODEL_CONFIDENCE = 0.3
+     KEYPOINT_DETECTION_MODEL_ANCHOR_CONFIDENCE = 0.5
+
+     # Locate the court keypoints (reference points)
+     result = model.infer(frame, confidence=KEYPOINT_DETECTION_MODEL_CONFIDENCE)[0]
+     key_points = sv.KeyPoints.from_inference(result)
+     filter_mask = key_points.confidence[0] > KEYPOINT_DETECTION_MODEL_ANCHOR_CONFIDENCE
+
+     # Compute the homography matrix H
+     court_landmarks = np.array(CONFIG)[filter_mask]
+     frame_landmarks = key_points[:, filter_mask].xy[0]
+     H, _ = cv2.findHomography(frame_landmarks, court_landmarks)
+
+     # From the player detections, retrieve their positions on the court
+     x1 = detections.xyxy[:, 0]
+     x2 = detections.xyxy[:, 2]
+     y1 = detections.xyxy[:, 1]
+     y2 = detections.xyxy[:, 3]
+     if use_bottom_center:
+         # Take the bottom center of the bounding box as the (x, y) coordinate
+         frame_xy = np.vstack(
+             (x1 + (x2 - x1) / 2, y2)
+         ).T
+     else:
+         frame_xy = np.vstack(
+             (x1 + (x2 - x1) / 2, y1 + (y2 - y1) / 2)
+         ).T
+     # Apply the homographic transformation
+     court_xy = frame_xy_to_court_xy(frame_xy, H)
+
+     if normalize:
+         # Normalize by the NBA court dimensions (94 ft x 50 ft)
+         court_xy = court_xy / np.array([94.0, 50.0])
+
+     return court_xy
+
+ def show_positions_on_court(court_xy):
+     court = draw_court(config=CONFIG)
+     court = draw_points_on_court(
+         config=CONFIG,
+         xy=court_xy,
+         court=court
+     )
+     sv.plot_image(court)
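
The projective step in frame_xy_to_court_xy is worth spelling out: a pixel (u, v) is lifted to (u, v, 1), multiplied by H, and divided by the resulting last component. A minimal sketch with a toy scaling homography, no keypoint model needed:

    import numpy as np

    H = np.diag([0.5, 0.5, 1.0])              # toy homography: scale pixel coordinates by 0.5
    frame_xy = np.array([[100.0, 200.0],
                         [300.0, 400.0]])
    print(frame_xy_to_court_xy(frame_xy, H))  # [[ 50. 100.] [150. 200.]]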