tarto2 committed on
Commit
779104d
·
verified ·
1 Parent(s): 8fa61d8

Upload folder using huggingface_hub

Browse files
Files changed (2) hide show
  1. keypoint_helper.py +116 -0
  2. miner.py +158 -364
keypoint_helper.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import numpy as np
3
+ from tqdm import tqdm
4
+ from typing import List, Tuple, Sequence, Any
5
+
6
+ FOOTBALL_KEYPOINTS: list[tuple[int, int]] = [
7
+ (0, 0), # 1
8
+ (0, 0), # 2
9
+ (0, 0), # 3
10
+ (0, 0), # 4
11
+ (0, 0), # 5
12
+ (0, 0), # 6
13
+
14
+ (0, 0), # 7
15
+ (0, 0), # 8
16
+ (0, 0), # 9
17
+
18
+ (0, 0), # 10
19
+ (0, 0), # 11
20
+ (0, 0), # 12
21
+ (0, 0), # 13
22
+
23
+ (0, 0), # 14
24
+ (527, 283), # 15
25
+ (527, 403), # 16
26
+ (0, 0), # 17
27
+
28
+ (0, 0), # 18
29
+ (0, 0), # 19
30
+ (0, 0), # 20
31
+ (0, 0), # 21
32
+
33
+ (0, 0), # 22
34
+
35
+ (0, 0), # 23
36
+ (0, 0), # 24
37
+
38
+ (0, 0), # 25
39
+ (0, 0), # 26
40
+ (0, 0), # 27
41
+ (0, 0), # 28
42
+ (0, 0), # 29
43
+ (0, 0), # 30
44
+
45
+ (405, 340), # 31
46
+ (645, 340), # 32
47
+ ]
48
+
49
+ def convert_keypoints_to_val_format(keypoints):
50
+ return [tuple(int(x) for x in pair) for pair in keypoints]
51
+
52
+ def predict_failed_indices(results_frames: Sequence[Any]) -> list[int]:
53
+
54
+ max_frames = len(results_frames)
55
+ if max_frames == 0:
56
+ return []
57
+
58
+ failed_indices: list[int] = []
59
+ for frame_index, frame_result in enumerate(results_frames):
60
+ frame_keypoints = getattr(frame_result, "keypoints", []) or []
61
+ non_zero_count = sum(1 for (x, y) in frame_keypoints if int(x) != 0 and int(y) != 0)
62
+ if non_zero_count <= 4:
63
+ failed_indices.append(frame_index)
64
+ return failed_indices
65
+
66
+ def _generate_sparse_template_keypoints(frame_width: int, frame_height: int) -> list[tuple[int, int]]:
67
+ template_max_x, template_max_y = (1045, 675)
68
+ sx = float(frame_width) / float(template_max_x if template_max_x != 0 else 1)
69
+ sy = float(frame_height) / float(template_max_y if template_max_y != 0 else 1)
70
+ scaled: list[tuple[int, int]] = []
71
+ for i in range(32):
72
+ tx, ty = FOOTBALL_KEYPOINTS[i]
73
+ x_scaled = int(round(tx * sx))
74
+ y_scaled = int(round(ty * sy))
75
+ scaled.append((x_scaled, y_scaled))
76
+ return scaled
77
+
78
+ def fix_keypoints(
79
+ results_frames: Sequence[Any],
80
+ failed_indices: Sequence[int],
81
+ frame_width: int,
82
+ frame_height: int,
83
+ ) -> list[Any]:
84
+ max_frames = len(results_frames)
85
+ if max_frames == 0:
86
+ return list(results_frames)
87
+
88
+ failed_set = set(int(i) for i in failed_indices)
89
+ all_indices = list(range(max_frames))
90
+ successful_indices = [i for i in all_indices if i not in failed_set]
91
+
92
+ if len(successful_indices) == 0:
93
+ sparse_template = _generate_sparse_template_keypoints(frame_width, frame_height)
94
+ for frame_result in results_frames:
95
+ setattr(frame_result, "keypoints", list(convert_keypoints_to_val_format(sparse_template)))
96
+ return list(results_frames)
97
+
98
+ seed_index = successful_indices[0]
99
+ seed_kps_raw = getattr(results_frames[seed_index], "keypoints", []) or []
100
+ last_success_kps = convert_keypoints_to_val_format(seed_kps_raw)
101
+
102
+ for frame_index in range(max_frames):
103
+ frame_result = results_frames[frame_index]
104
+ if frame_index in failed_set:
105
+ setattr(frame_result, "keypoints", list(last_success_kps))
106
+ else:
107
+ current_kps_raw = getattr(frame_result, "keypoints", []) or []
108
+ current_kps = convert_keypoints_to_val_format(current_kps_raw)
109
+ setattr(frame_result, "keypoints", list(current_kps))
110
+ last_success_kps = current_kps
111
+
112
+ return list(results_frames)
113
+
114
+ def run_keypoints_post_processing(results_frames: Sequence[Any], frame_width: int, frame_height: int) -> list[Any]:
115
+ failed_indices = predict_failed_indices(results_frames)
116
+ return fix_keypoints(results_frames, failed_indices, frame_width, frame_height)
miner.py CHANGED
@@ -1,26 +1,23 @@
1
  from pathlib import Path
2
- from typing import List, Tuple, Dict, Optional
3
  import sys
4
  import os
5
 
6
  from numpy import ndarray
7
  from pydantic import BaseModel
8
  sys.path.append(os.path.dirname(os.path.abspath(__file__)))
 
9
 
10
  from ultralytics import YOLO
11
  from team_cluster import TeamClassifier
12
  from utils import (
13
  BoundingBox,
14
  Constants,
15
- classify_teams_batch,
16
  )
17
 
18
  import time
19
  import torch
20
  import gc
21
- import cv2
22
- import numpy as np
23
- from collections import defaultdict
24
  from pitch import process_batch_input, get_cls_net
25
  import yaml
26
 
@@ -49,7 +46,7 @@ class Miner:
49
  CORNER_CONFIDENCE = Constants.CORNER_CONFIDENCE
50
  GOALKEEPER_POSITION_MARGIN = Constants.GOALKEEPER_POSITION_MARGIN
51
  MIN_SAMPLES_FOR_FIT = 16 # Minimum player crops needed before fitting TeamClassifier
52
- MAX_SAMPLES_FOR_FIT = 1000 # Maximum samples to avoid overfitting
53
 
54
  def __init__(self, path_hf_repo: Path) -> None:
55
  try:
@@ -57,7 +54,7 @@ class Miner:
57
  model_path = path_hf_repo / "football_object_detection.onnx"
58
  self.bbox_model = YOLO(model_path)
59
 
60
- print(f"BBox Model Loaded: class name {self.bbox_model.names}")
61
 
62
  team_model_path = path_hf_repo / "osnet_model.pth.tar-100"
63
  self.team_classifier = TeamClassifier(
@@ -71,8 +68,6 @@ class Miner:
71
  self.team_classifier_fitted = False
72
  self.player_crops_for_fit = []
73
 
74
- # self.keypoints_model = YOLO(path_hf_repo / "keypoint.pt")
75
-
76
  model_kp_path = path_hf_repo / 'keypoint'
77
  config_kp_path = path_hf_repo / 'hrnetv2_w48.yaml'
78
  cfg_kp = yaml.safe_load(open(config_kp_path, 'r'))
@@ -84,8 +79,6 @@ class Miner:
84
  model.eval()
85
 
86
  self.keypoints_model = model
87
- print("Keypoints Model (keypoint.pt) Loaded")
88
-
89
  self.kp_threshold = 0.1
90
  self.pitch_batch_size = 4
91
  self.health = "healthy"
@@ -138,109 +131,6 @@ class Miner:
138
 
139
  return intersection_area / union_area
140
 
141
- def _extract_jersey_region(self, crop: ndarray) -> ndarray:
142
- """
143
- Extract jersey region (upper body) from player crop.
144
- For close-ups, focuses on upper 60%, for distant shots uses full crop.
145
- """
146
- if crop is None or crop.size == 0:
147
- return crop
148
-
149
- h, w = crop.shape[:2]
150
- if h < 10 or w < 10:
151
- return crop
152
-
153
- # For close-up shots, extract upper body (jersey region)
154
- is_closeup = h > 100 or (h * w) > 12000
155
- if is_closeup:
156
- # Upper 60% of the crop (jersey area, avoiding shorts)
157
- jersey_top = 0
158
- jersey_bottom = int(h * 0.60)
159
- jersey_left = max(0, int(w * 0.05))
160
- jersey_right = min(w, int(w * 0.95))
161
- return crop[jersey_top:jersey_bottom, jersey_left:jersey_right]
162
- return crop
163
-
164
- def _extract_color_signature(self, crop: ndarray) -> Optional[np.ndarray]:
165
- """
166
- Extract color signature from jersey region using HSV and LAB color spaces.
167
- Returns a feature vector with dominant colors and color statistics.
168
- """
169
- if crop is None or crop.size == 0:
170
- return None
171
-
172
- jersey_region = self._extract_jersey_region(crop)
173
- if jersey_region.size == 0:
174
- return None
175
-
176
- try:
177
- # Convert to HSV and LAB color spaces
178
- hsv = cv2.cvtColor(jersey_region, cv2.COLOR_BGR2HSV)
179
- lab = cv2.cvtColor(jersey_region, cv2.COLOR_BGR2LAB)
180
-
181
- # Reshape for processing
182
- hsv_flat = hsv.reshape(-1, 3).astype(np.float32)
183
- lab_flat = lab.reshape(-1, 3).astype(np.float32)
184
-
185
- # Compute statistics for HSV
186
- hsv_mean = np.mean(hsv_flat, axis=0) / 255.0
187
- hsv_std = np.std(hsv_flat, axis=0) / 255.0
188
-
189
- # Compute statistics for LAB
190
- lab_mean = np.mean(lab_flat, axis=0) / 255.0
191
- lab_std = np.std(lab_flat, axis=0) / 255.0
192
-
193
- # Dominant color (most frequent hue)
194
- hue_hist, _ = np.histogram(hsv_flat[:, 0], bins=36, range=(0, 180))
195
- dominant_hue = np.argmax(hue_hist) * 5 # Convert to hue value
196
-
197
- # Combine features
198
- color_features = np.concatenate([
199
- hsv_mean,
200
- hsv_std,
201
- lab_mean[:2], # L and A channels (B is less informative)
202
- lab_std[:2],
203
- [dominant_hue / 180.0] # Normalized dominant hue
204
- ])
205
-
206
- return color_features
207
- except Exception as e:
208
- print(f"Error extracting color signature: {e}")
209
- return None
210
-
211
- def _get_spatial_position(self, bbox: Tuple[float, float, float, float],
212
- frame_width: int, frame_height: int) -> Tuple[float, float]:
213
- """
214
- Get normalized spatial position of player on the pitch.
215
- Returns (x_normalized, y_normalized) where 0,0 is top-left.
216
- """
217
- x1, y1, x2, y2 = bbox
218
- center_x = (x1 + x2) / 2.0
219
- center_y = (y1 + y2) / 2.0
220
-
221
- # Normalize to [0, 1]
222
- x_norm = center_x / frame_width if frame_width > 0 else 0.5
223
- y_norm = center_y / frame_height if frame_height > 0 else 0.5
224
-
225
- return (x_norm, y_norm)
226
-
227
- def _find_best_match(self, target_box: Tuple[float, float, float, float],
228
- predicted_frame_data: Dict[int, Tuple[Tuple, str]],
229
- iou_threshold: float) -> Tuple[Optional[str], float]:
230
- """
231
- Find best matching box in predicted frame data using IoU.
232
- """
233
- best_iou = 0.0
234
- best_team_id = None
235
-
236
- for idx, (bbox, team_cls_id) in predicted_frame_data.items():
237
- iou = self._calculate_iou(target_box, bbox)
238
- if iou > best_iou and iou >= iou_threshold:
239
- best_iou = iou
240
- best_team_id = team_cls_id
241
-
242
- return (best_team_id, best_iou)
243
-
244
  def _detect_objects_batch(self, decoded_images: List[ndarray]) -> Dict[int, List[BoundingBox]]:
245
  batch_size = 16
246
  detection_results = []
@@ -253,203 +143,175 @@ class Miner:
253
  return detection_results
254
 
255
  def _team_classify(self, detection_results, decoded_images, offset):
256
- """
257
- Hybrid team classification combining:
258
- 1. Appearance features (OSNet)
259
- 2. Color signatures (HSV/LAB)
260
- 3. Spatial priors (left/right side of pitch)
261
- 4. Temporal tracking (same player = same team)
262
- """
263
  start = time.time()
264
-
265
- # Phase 1: Collect samples and fit appearance-based classifier
266
- fit_sample_size = min(self.MAX_SAMPLES_FOR_FIT, len(detection_results) * 10)
267
  player_crops_for_fit = []
268
-
269
  for frame_id in range(len(detection_results)):
270
  detection_box = detection_results[frame_id].boxes.data
271
  if len(detection_box) < 4:
272
  continue
273
-
274
  if len(player_crops_for_fit) < fit_sample_size:
275
  frame_image = decoded_images[frame_id]
276
  for box in detection_box:
277
  x1, y1, x2, y2, conf, cls_id = box.tolist()
278
- if conf < 0.5 or cls_id != 2:
279
  continue
280
- crop = frame_image[int(y1):int(y2), int(x1):int(x2)]
281
- if crop.size > 0:
282
- player_crops_for_fit.append(crop)
283
-
 
 
 
 
284
  if self.team_classifier and not self.team_classifier_fitted and len(player_crops_for_fit) >= fit_sample_size:
285
- print(f"Fitting TeamClassifier (OSNet) with {len(player_crops_for_fit)} player crops")
286
  self.team_classifier.fit(player_crops_for_fit)
287
  self.team_classifier_fitted = True
288
  break
289
-
290
- if not self.team_classifier_fitted and len(player_crops_for_fit) >= self.MIN_SAMPLES_FOR_FIT:
291
  print(f"Fallback: Fitting TeamClassifier with {len(player_crops_for_fit)} player crops")
292
  self.team_classifier.fit(player_crops_for_fit)
293
  self.team_classifier_fitted = True
294
-
295
- print(f"Fitting time: {time.time() - start:.2f}s")
296
-
297
- # Phase 2: Hybrid classification for all frames
298
  start = time.time()
299
- bboxes: dict[int, list[BoundingBox]] = {}
300
-
301
- # Temporal tracking: {track_id: (team_id, confidence, last_frame)}
302
- player_tracks: Dict[Tuple, Tuple[int, float, int]] = {}
303
-
304
- # Spatial team assignment: track which team is on which side
305
- left_side_team = None
306
- right_side_team = None
307
-
 
 
 
308
  for frame_id in range(len(detection_results)):
 
 
 
 
 
 
 
309
  detection_box = detection_results[frame_id].boxes.data
310
  frame_image = decoded_images[frame_id]
311
- frame_h, frame_w = frame_image.shape[:2]
312
- boxes = []
313
-
314
- # Collect all players in this frame
315
- player_data = [] # (idx, crop, bbox, spatial_pos, color_sig)
316
-
317
  for idx, box in enumerate(detection_box):
318
  x1, y1, x2, y2, conf, cls_id = box.tolist()
319
- if cls_id != 2 or conf < 0.6:
320
- continue
321
-
322
- crop = frame_image[int(y1):int(y2), int(x1):int(x2)]
323
- if crop.size == 0:
324
  continue
325
-
326
- bbox = (x1, y1, x2, y2)
327
- spatial_pos = self._get_spatial_position(bbox, frame_w, frame_h)
328
- color_sig = self._extract_color_signature(crop)
329
-
330
- player_data.append((idx, crop, bbox, spatial_pos, color_sig))
331
-
332
- if len(player_data) == 0:
333
- bboxes[offset + frame_id] = []
334
- continue
335
-
336
- # Step 1: Get appearance-based predictions (OSNet)
337
- appearance_predictions = {}
338
- if self.team_classifier and self.team_classifier_fitted:
339
- crops = [data[1] for data in player_data]
340
- appearance_team_ids = self.team_classifier.predict(crops)
341
- for (idx, _, _, _, _), team_id in zip(player_data, appearance_team_ids):
342
- appearance_predictions[idx] = team_id
343
-
344
- # Step 2: Extract color signatures and cluster
345
- color_signatures = []
346
- color_indices = []
347
- for idx, _, _, _, color_sig in player_data:
348
- if color_sig is not None:
349
- color_signatures.append(color_sig)
350
- color_indices.append(idx)
351
-
352
- color_predictions = {}
353
- if len(color_signatures) >= 4:
354
- try:
355
- from sklearn.cluster import KMeans
356
- color_kmeans = KMeans(n_clusters=2, random_state=42, n_init=10)
357
- color_clusters = color_kmeans.fit_predict(color_signatures)
358
- for idx, cluster_id in zip(color_indices, color_clusters):
359
- color_predictions[idx] = cluster_id
360
- except Exception as e:
361
- print(f"Color clustering failed: {e}")
362
-
363
- # Step 3: Apply spatial priors
364
- # Determine which team is on which side based on majority
365
- if left_side_team is None or right_side_team is None:
366
- left_side_players = [p for p in player_data if p[3][0] < 0.5] # x < 0.5
367
- right_side_players = [p for p in player_data if p[3][0] >= 0.5] # x >= 0.5
368
-
369
- if len(left_side_players) >= 2 and len(right_side_players) >= 2:
370
- # Use appearance predictions to determine sides
371
- left_teams = [appearance_predictions.get(p[0]) for p in left_side_players
372
- if p[0] in appearance_predictions]
373
- right_teams = [appearance_predictions.get(p[0]) for p in right_side_players
374
- if p[0] in appearance_predictions]
375
-
376
- if left_teams and right_teams:
377
- left_team_mode = max(set(left_teams), key=left_teams.count)
378
- right_team_mode = max(set(right_teams), key=right_teams.count)
379
-
380
- if left_team_mode != right_team_mode:
381
- left_side_team = left_team_mode
382
- right_side_team = right_team_mode
383
-
384
- # Step 4: Combine predictions with voting
385
- final_predictions = {}
386
- for idx, _, bbox, spatial_pos, _ in player_data:
387
- votes = []
388
- weights = []
389
-
390
- # Appearance vote (weight: 0.4)
391
- if idx in appearance_predictions:
392
- votes.append(appearance_predictions[idx])
393
- weights.append(0.4)
394
-
395
- # Color vote (weight: 0.3)
396
- if idx in color_predictions:
397
- votes.append(color_predictions[idx])
398
- weights.append(0.3)
399
-
400
- # Spatial vote (weight: 0.3)
401
- if left_side_team is not None and right_side_team is not None:
402
- x_pos, _ = spatial_pos
403
- if x_pos < 0.5:
404
- spatial_team = left_side_team
405
- else:
406
- spatial_team = right_side_team
407
- votes.append(spatial_team)
408
- weights.append(0.3)
409
-
410
- # Temporal vote (weight: 0.2) - match with previous frames
411
- if len(votes) > 0:
412
- # Simple temporal matching: find similar bbox in previous frames
413
- best_track_match = None
414
- best_track_iou = 0.0
415
- for track_key, (track_team, track_conf, track_frame) in player_tracks.items():
416
- if abs(track_frame - frame_id) <= 5: # Within 5 frames
417
- track_bbox = track_key
418
- iou = self._calculate_iou(bbox, track_bbox)
419
- if iou > best_track_iou and iou > 0.3:
420
- best_track_iou = iou
421
- best_track_match = track_team
422
-
423
- if best_track_match is not None:
424
- votes.append(best_track_match)
425
- weights.append(0.2)
426
-
427
- # Weighted voting
428
- if len(votes) > 0:
429
- team_0_score = sum(w for v, w in zip(votes, weights) if v == 0)
430
- team_1_score = sum(w for v, w in zip(votes, weights) if v == 1)
431
-
432
- if team_0_score > team_1_score:
433
- final_team = 0
434
- elif team_1_score > team_0_score:
435
- final_team = 1
436
- else:
437
- # Tie: use appearance prediction or first vote
438
- final_team = votes[0] if votes else 0
439
-
440
- final_predictions[idx] = final_team
441
-
442
- # Update tracking
443
- track_key = bbox
444
- player_tracks[track_key] = (final_team, max(team_0_score, team_1_score), frame_id)
445
-
446
- # Step 5: Generate output boxes
447
  for idx, box in enumerate(detection_box):
448
  x1, y1, x2, y2, conf, cls_id = box.tolist()
449
  if cls_id == 2 and conf < 0.6:
450
  continue
451
-
452
- # Check overlap with staff
453
  overlap_staff = False
454
  for idy, boxy in enumerate(detection_box):
455
  s_x1, s_y1, s_x2, s_y2, s_conf, s_cls_id = boxy.tolist()
@@ -460,13 +322,12 @@ class Miner:
460
  break
461
  if overlap_staff:
462
  continue
463
-
464
  mapped_cls_id = str(int(cls_id))
465
-
466
- # Override with team prediction
467
- if idx in final_predictions:
468
- mapped_cls_id = str(6 + int(final_predictions[idx]))
469
-
470
  if mapped_cls_id != '4':
471
  if int(mapped_cls_id) == 3 and conf < 0.5:
472
  continue
@@ -480,17 +341,14 @@ class Miner:
480
  conf=float(conf),
481
  )
482
  )
483
-
484
  # Handle footballs - keep only the best one
485
  footballs = [bb for bb in boxes if int(bb.cls_id) == 0]
486
  if len(footballs) > 1:
487
  best_ball = max(footballs, key=lambda b: b.conf)
488
  boxes = [bb for bb in boxes if int(bb.cls_id) != 0]
489
  boxes.append(best_ball)
490
-
491
- bboxes[offset + frame_id] = boxes
492
 
493
- print(f"Hybrid team classification time: {time.time() - start:.2f}s")
494
  return bboxes
495
 
496
 
@@ -499,19 +357,11 @@ class Miner:
499
  detection_results = self._detect_objects_batch(batch_images)
500
  end = time.time()
501
  print(f"Detection time: {end - start}")
502
-
503
- # Use hybrid team classification
504
  start = time.time()
505
  bboxes = self._team_classify(detection_results, batch_images, offset)
506
  end = time.time()
507
  print(f"Team classify time: {end - start}")
508
 
509
- # Phase 3: Keypoint Detection
510
- # keypoints: Dict[int, List[Tuple[int, int]]] = {}
511
-
512
- # keypoints = self._detect_keypoints_batch(batch_images, offset, n_keypoints)
513
-
514
-
515
  pitch_batch_size = min(self.pitch_batch_size, len(batch_images))
516
  keypoints: Dict[int, List[Tuple[int, int]]] = {}
517
 
@@ -560,81 +410,25 @@ class Miner:
560
  end = time.time()
561
  print(f"Keypoint time: {end - start}")
562
 
 
563
  results: List[TVFrameResult] = []
564
  for frame_number in range(offset, offset + len(batch_images)):
565
  frame_boxes = bboxes.get(frame_number, [])
 
566
  result = TVFrameResult(
567
  frame_id=frame_number,
568
  boxes=frame_boxes,
569
- keypoints=keypoints.get(
570
- frame_number,
571
- [(0, 0) for _ in range(n_keypoints)],
572
- ),
573
  )
574
  results.append(result)
575
 
 
 
 
 
576
  gc.collect()
577
  if torch.cuda.is_available():
578
  torch.cuda.empty_cache()
579
  torch.cuda.synchronize()
580
 
581
- return results
582
-
583
- def _detect_keypoints_batch(self, batch_images: List[ndarray],
584
- offset: int, n_keypoints: int) -> Dict[int, List[Tuple[int, int]]]:
585
- """
586
- Phase 3: Keypoint detection for all frames in batch.
587
-
588
- Args:
589
- batch_images: List of images to process
590
- offset: Frame offset for numbering
591
- n_keypoints: Number of keypoints expected
592
-
593
- Returns:
594
- Dictionary mapping frame_id to list of keypoint coordinates
595
- """
596
- keypoints: Dict[int, List[Tuple[int, int]]] = {}
597
- keypoints_model_results = self.keypoints_model.predict(batch_images)
598
-
599
- if keypoints_model_results is None:
600
- return keypoints
601
-
602
- for frame_idx_in_batch, detection in enumerate(keypoints_model_results):
603
- if not hasattr(detection, "keypoints") or detection.keypoints is None:
604
- continue
605
-
606
- # Extract keypoints with confidence
607
- frame_keypoints_with_conf: List[Tuple[int, int, float]] = []
608
- for i, part_points in enumerate(detection.keypoints.data):
609
- for k_id, (x, y, _) in enumerate(part_points):
610
- confidence = float(detection.keypoints.conf[i][k_id])
611
- frame_keypoints_with_conf.append((int(x), int(y), confidence))
612
-
613
- # Pad or truncate to expected number of keypoints
614
- if len(frame_keypoints_with_conf) < n_keypoints:
615
- frame_keypoints_with_conf.extend(
616
- [(0, 0, 0.0)] * (n_keypoints - len(frame_keypoints_with_conf))
617
- )
618
- else:
619
- frame_keypoints_with_conf = frame_keypoints_with_conf[:n_keypoints]
620
-
621
- # Filter keypoints based on confidence thresholds
622
- filtered_keypoints: List[Tuple[int, int]] = []
623
- for idx, (x, y, confidence) in enumerate(frame_keypoints_with_conf):
624
- if idx in self.CORNER_INDICES:
625
- # Corner keypoints have lower confidence threshold
626
- if confidence < 0.3:
627
- filtered_keypoints.append((0, 0))
628
- else:
629
- filtered_keypoints.append((int(x), int(y)))
630
- else:
631
- # Regular keypoints
632
- if confidence < 0.5:
633
- filtered_keypoints.append((0, 0))
634
- else:
635
- filtered_keypoints.append((int(x), int(y)))
636
-
637
- frame_id = offset + frame_idx_in_batch
638
- keypoints[frame_id] = filtered_keypoints
639
-
640
- return keypoints
 
1
  from pathlib import Path
2
+ from typing import List, Tuple, Dict
3
  import sys
4
  import os
5
 
6
  from numpy import ndarray
7
  from pydantic import BaseModel
8
  sys.path.append(os.path.dirname(os.path.abspath(__file__)))
9
+ from keypoint_helper import run_keypoints_post_processing
10
 
11
  from ultralytics import YOLO
12
  from team_cluster import TeamClassifier
13
  from utils import (
14
  BoundingBox,
15
  Constants,
 
16
  )
17
 
18
  import time
19
  import torch
20
  import gc
 
 
 
21
  from pitch import process_batch_input, get_cls_net
22
  import yaml
23
 
 
46
  CORNER_CONFIDENCE = Constants.CORNER_CONFIDENCE
47
  GOALKEEPER_POSITION_MARGIN = Constants.GOALKEEPER_POSITION_MARGIN
48
  MIN_SAMPLES_FOR_FIT = 16 # Minimum player crops needed before fitting TeamClassifier
49
+ MAX_SAMPLES_FOR_FIT = 600 # Maximum samples to avoid overfitting
50
 
51
  def __init__(self, path_hf_repo: Path) -> None:
52
  try:
 
54
  model_path = path_hf_repo / "football_object_detection.onnx"
55
  self.bbox_model = YOLO(model_path)
56
 
57
+ print("BBox Model Loaded")
58
 
59
  team_model_path = path_hf_repo / "osnet_model.pth.tar-100"
60
  self.team_classifier = TeamClassifier(
 
68
  self.team_classifier_fitted = False
69
  self.player_crops_for_fit = []
70
 
 
 
71
  model_kp_path = path_hf_repo / 'keypoint'
72
  config_kp_path = path_hf_repo / 'hrnetv2_w48.yaml'
73
  cfg_kp = yaml.safe_load(open(config_kp_path, 'r'))
 
79
  model.eval()
80
 
81
  self.keypoints_model = model
 
 
82
  self.kp_threshold = 0.1
83
  self.pitch_batch_size = 4
84
  self.health = "healthy"
 
131
 
132
  return intersection_area / union_area
133
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
  def _detect_objects_batch(self, decoded_images: List[ndarray]) -> Dict[int, List[BoundingBox]]:
135
  batch_size = 16
136
  detection_results = []
 
143
  return detection_results
144
 
145
  def _team_classify(self, detection_results, decoded_images, offset):
146
+ self.team_classifier_fitted = False
 
 
 
 
 
 
147
  start = time.time()
148
+ # Collect player crops from first batch for fitting
149
+ fit_sample_size = 600
 
150
  player_crops_for_fit = []
151
+
152
  for frame_id in range(len(detection_results)):
153
  detection_box = detection_results[frame_id].boxes.data
154
  if len(detection_box) < 4:
155
  continue
156
+ # Collect player boxes for team classification fitting (first batch only)
157
  if len(player_crops_for_fit) < fit_sample_size:
158
  frame_image = decoded_images[frame_id]
159
  for box in detection_box:
160
  x1, y1, x2, y2, conf, cls_id = box.tolist()
161
+ if conf < 0.5:
162
  continue
163
+ mapped_cls_id = str(int(cls_id))
164
+ # Only collect player crops (cls_id = 2)
165
+ if mapped_cls_id == '2':
166
+ crop = frame_image[int(y1):int(y2), int(x1):int(x2)]
167
+ if crop.size > 0:
168
+ player_crops_for_fit.append(crop)
169
+
170
+ # Fit team classifier after collecting samples
171
  if self.team_classifier and not self.team_classifier_fitted and len(player_crops_for_fit) >= fit_sample_size:
172
+ print(f"Fitting TeamClassifier with {len(player_crops_for_fit)} player crops")
173
  self.team_classifier.fit(player_crops_for_fit)
174
  self.team_classifier_fitted = True
175
  break
176
+ if not self.team_classifier_fitted and len(player_crops_for_fit) >= 16:
 
177
  print(f"Fallback: Fitting TeamClassifier with {len(player_crops_for_fit)} player crops")
178
  self.team_classifier.fit(player_crops_for_fit)
179
  self.team_classifier_fitted = True
180
+ end = time.time()
181
+ print(f"Fitting Kmeans time: {end - start}")
182
+
183
+ # Second pass: predict teams with configurable frame skipping optimization
184
  start = time.time()
185
+
186
+ # Get configuration for frame skipping
187
+ prediction_interval = 1 # Default: predict every 2 frames
188
+ iou_threshold = 0.3
189
+
190
+ print(f"Team classification - prediction_interval: {prediction_interval}, iou_threshold: {iou_threshold}")
191
+
192
+ # Storage for predicted frame results: {frame_id: {box_idx: (bbox, team_id)}}
193
+ predicted_frame_data = {}
194
+
195
+ # Step 1: Predict for frames at prediction_interval only
196
+ frames_to_predict = []
197
  for frame_id in range(len(detection_results)):
198
+ if frame_id % prediction_interval == 0:
199
+ frames_to_predict.append(frame_id)
200
+
201
+ print(f"Predicting teams for {len(frames_to_predict)}/{len(detection_results)} frames "
202
+ f"(saving {100 - (len(frames_to_predict) * 100 // len(detection_results))}% compute)")
203
+
204
+ for frame_id in frames_to_predict:
205
  detection_box = detection_results[frame_id].boxes.data
206
  frame_image = decoded_images[frame_id]
207
+
208
+ # Collect player crops for this frame
209
+ frame_player_crops = []
210
+ frame_player_indices = []
211
+ frame_player_boxes = []
212
+
213
  for idx, box in enumerate(detection_box):
214
  x1, y1, x2, y2, conf, cls_id = box.tolist()
215
+ if cls_id == 2 and conf < 0.6:
 
 
 
 
216
  continue
217
+ mapped_cls_id = str(int(cls_id))
218
+
219
+ # Collect player crops for prediction
220
+ if self.team_classifier and self.team_classifier_fitted and mapped_cls_id == '2':
221
+ crop = frame_image[int(y1):int(y2), int(x1):int(x2)]
222
+ if crop.size > 0:
223
+ frame_player_crops.append(crop)
224
+ frame_player_indices.append(idx)
225
+ frame_player_boxes.append((x1, y1, x2, y2))
226
+
227
+ # Predict teams for all players in this frame
228
+ if len(frame_player_crops) > 0:
229
+ team_ids = self.team_classifier.predict(frame_player_crops)
230
+ predicted_frame_data[frame_id] = {}
231
+ for idx, bbox, team_id in zip(frame_player_indices, frame_player_boxes, team_ids):
232
+ # Map team_id (0,1) to cls_id (6,7)
233
+ team_cls_id = str(6 + int(team_id))
234
+ predicted_frame_data[frame_id][idx] = (bbox, team_cls_id)
235
+
236
+ # Step 2: Process all frames (interpolate skipped frames)
237
+ fallback_count = 0
238
+ interpolated_count = 0
239
+ bboxes: dict[int, list[BoundingBox]] = {}
240
+ for frame_id in range(len(detection_results)):
241
+ detection_box = detection_results[frame_id].boxes.data
242
+ frame_image = decoded_images[frame_id]
243
+ boxes = []
244
+
245
+ team_predictions = {}
246
+
247
+ if frame_id % prediction_interval == 0:
248
+ # Predicted frame: use pre-computed predictions
249
+ if frame_id in predicted_frame_data:
250
+ for idx, (bbox, team_cls_id) in predicted_frame_data[frame_id].items():
251
+ team_predictions[idx] = team_cls_id
252
+ else:
253
+ # Skipped frame: interpolate from neighboring predicted frames
254
+ # Find nearest predicted frames
255
+ prev_predicted_frame = (frame_id // prediction_interval) * prediction_interval
256
+ next_predicted_frame = prev_predicted_frame + prediction_interval
257
+
258
+ # Collect current frame player boxes
259
+ for idx, box in enumerate(detection_box):
260
+ x1, y1, x2, y2, conf, cls_id = box.tolist()
261
+ if cls_id == 2 and conf < 0.6:
262
+ continue
263
+ mapped_cls_id = str(int(cls_id))
264
+
265
+ if self.team_classifier and self.team_classifier_fitted and mapped_cls_id == '2':
266
+ target_box = (x1, y1, x2, y2)
267
+
268
+ # Try to match with previous predicted frame
269
+ best_team_id = None
270
+ best_iou = 0.0
271
+
272
+ if prev_predicted_frame in predicted_frame_data:
273
+ team_id, iou = self._find_best_match(
274
+ target_box,
275
+ predicted_frame_data[prev_predicted_frame],
276
+ iou_threshold
277
+ )
278
+ if team_id is not None:
279
+ best_team_id = team_id
280
+ best_iou = iou
281
+
282
+ # Try to match with next predicted frame if available and no good match yet
283
+ if best_team_id is None and next_predicted_frame < len(detection_results):
284
+ if next_predicted_frame in predicted_frame_data:
285
+ team_id, iou = self._find_best_match(
286
+ target_box,
287
+ predicted_frame_data[next_predicted_frame],
288
+ iou_threshold
289
+ )
290
+ if team_id is not None and iou > best_iou:
291
+ best_team_id = team_id
292
+ best_iou = iou
293
+
294
+ # Track interpolation success
295
+ if best_team_id is not None:
296
+ interpolated_count += 1
297
+ else:
298
+ # Fallback: if no match found, predict individually
299
+ crop = frame_image[int(y1):int(y2), int(x1):int(x2)]
300
+ if crop.size > 0:
301
+ team_id = self.team_classifier.predict([crop])[0]
302
+ best_team_id = str(6 + int(team_id))
303
+ fallback_count += 1
304
+
305
+ if best_team_id is not None:
306
+ team_predictions[idx] = best_team_id
307
+
308
+ # Parse boxes with team classification
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
309
  for idx, box in enumerate(detection_box):
310
  x1, y1, x2, y2, conf, cls_id = box.tolist()
311
  if cls_id == 2 and conf < 0.6:
312
  continue
313
+
314
+ # Check overlap with staff box
315
  overlap_staff = False
316
  for idy, boxy in enumerate(detection_box):
317
  s_x1, s_y1, s_x2, s_y2, s_conf, s_cls_id = boxy.tolist()
 
322
  break
323
  if overlap_staff:
324
  continue
325
+
326
  mapped_cls_id = str(int(cls_id))
327
+
328
+ # Override cls_id for players with team prediction
329
+ if idx in team_predictions:
330
+ mapped_cls_id = team_predictions[idx]
 
331
  if mapped_cls_id != '4':
332
  if int(mapped_cls_id) == 3 and conf < 0.5:
333
  continue
 
341
  conf=float(conf),
342
  )
343
  )
 
344
  # Handle footballs - keep only the best one
345
  footballs = [bb for bb in boxes if int(bb.cls_id) == 0]
346
  if len(footballs) > 1:
347
  best_ball = max(footballs, key=lambda b: b.conf)
348
  boxes = [bb for bb in boxes if int(bb.cls_id) != 0]
349
  boxes.append(best_ball)
 
 
350
 
351
+ bboxes[offset + frame_id] = boxes
352
  return bboxes
353
 
354
 
 
357
  detection_results = self._detect_objects_batch(batch_images)
358
  end = time.time()
359
  print(f"Detection time: {end - start}")
 
 
360
  start = time.time()
361
  bboxes = self._team_classify(detection_results, batch_images, offset)
362
  end = time.time()
363
  print(f"Team classify time: {end - start}")
364
 
 
 
 
 
 
 
365
  pitch_batch_size = min(self.pitch_batch_size, len(batch_images))
366
  keypoints: Dict[int, List[Tuple[int, int]]] = {}
367
 
 
410
  end = time.time()
411
  print(f"Keypoint time: {end - start}")
412
 
413
+
414
  results: List[TVFrameResult] = []
415
  for frame_number in range(offset, offset + len(batch_images)):
416
  frame_boxes = bboxes.get(frame_number, [])
417
+ frame_keypoints = keypoints.get(frame_number, [(0, 0) for _ in range(n_keypoints)])
418
  result = TVFrameResult(
419
  frame_id=frame_number,
420
  boxes=frame_boxes,
421
+ keypoints=frame_keypoints,
 
 
 
422
  )
423
  results.append(result)
424
 
425
+ if len(batch_images) > 0:
426
+ h, w = batch_images[0].shape[:2]
427
+ results = run_keypoints_post_processing(results, w, h)
428
+
429
  gc.collect()
430
  if torch.cuda.is_available():
431
  torch.cuda.empty_cache()
432
  torch.cuda.synchronize()
433
 
434
+ return results