ayushsaun committed on
Commit
8cb54e4
·
1 Parent(s): 0fc551f

updated inference.py

Browse files
Files changed (1) hide show
  1. inference.py +226 -47
inference.py CHANGED
@@ -1,30 +1,23 @@
1
- import os
 
 
 
 
2
  import cv2
3
  import joblib
 
4
  import numpy as np
5
- from pathlib import Path
6
 
7
 
8
- class ObjectTrackerInference:
9
- def __init__(self, model_dir='models'):
10
- self.model_dir = model_dir
11
-
12
- print("Loading pre-trained models...")
13
- self.position_model = joblib.load(os.path.join(model_dir, 'position_model.joblib'))
14
- self.size_model = joblib.load(os.path.join(model_dir, 'size_model.joblib'))
15
- self.position_scaler = joblib.load(os.path.join(model_dir, 'position_scaler.joblib'))
16
- self.size_scaler = joblib.load(os.path.join(model_dir, 'size_scaler.joblib'))
17
- print("Models loaded successfully!")
18
-
19
- self.sift = cv2.SIFT_create(nfeatures=2000)
20
-
21
- self.orb = cv2.ORB_create(nfeatures=1000)
22
- self.matcher = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=True)
23
  self.prev_frame = None
24
  self.prev_kp = None
25
  self.prev_desc = None
26
-
27
- def estimate_camera_motion(self, frame):
 
 
28
  if frame is None:
29
  return np.eye(2, 3, dtype=np.float32)
30
 
@@ -61,7 +54,117 @@ class ObjectTrackerInference:
61
  self.prev_desc = desc
62
 
63
  return transform_matrix
64
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
  def local_binary_pattern(self, image, n_points=8, radius=1):
66
  rows, cols = image.shape
67
  output = np.zeros((rows, cols))
@@ -97,30 +200,63 @@ class ObjectTrackerInference:
97
 
98
  return output
99
 
100
- def extract_features(self, frame, bbox, transform_matrix=None):
101
  if frame is None:
102
- return None
103
 
104
  gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
105
- x, y, w, h = map(int, bbox)
106
 
107
- x = max(0, min(x, gray.shape[1] - w))
108
- y = max(0, min(y, gray.shape[0] - h))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
  w = min(w, gray.shape[1] - x)
110
  h = min(h, gray.shape[0] - y)
 
 
111
 
112
  roi = gray[y:y+h, x:x+w]
113
- if roi.size == 0:
114
- roi = gray
115
-
116
  roi = cv2.resize(roi, (64, 64))
117
 
118
  features = []
119
 
 
120
  hog = cv2.HOGDescriptor((64,64), (16,16), (8,8), (8,8), 9)
121
  hog_features = hog.compute(roi)
122
  features.extend(hog_features.flatten()[:64])
123
 
 
124
  lbp = self.local_binary_pattern(roi, n_points=8, radius=1)
125
  features.extend([
126
  np.mean(lbp),
@@ -128,19 +264,18 @@ class ObjectTrackerInference:
128
  *np.percentile(lbp, [25, 50, 75])
129
  ])
130
 
131
- if transform_matrix is not None:
132
- features.extend([
133
- transform_matrix[0,0],
134
- transform_matrix[1,1],
135
- transform_matrix[0,2],
136
- transform_matrix[1,2]
137
- ])
138
- else:
139
- features.extend([1, 1, 0, 0])
140
-
141
  features.extend([x, y, w, h])
142
 
143
- return np.array(features).reshape(1, -1)
144
 
145
  def predict_bbox(self, features):
146
  features_position = self.position_scaler.transform(features)
@@ -153,13 +288,32 @@ class ObjectTrackerInference:
153
 
154
  return bbox
155
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
156
  def track_video(self, video_path, initial_bbox, output_path='output_tracked.mp4', fps=30):
157
  print(f"Processing video: {video_path}")
158
 
159
  cap = cv2.VideoCapture(video_path)
160
  if not cap.isOpened():
161
  raise ValueError(f"Could not open video: {video_path}")
162
-
163
  frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
164
  frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
165
  total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
@@ -169,12 +323,15 @@ class ObjectTrackerInference:
169
  fourcc = cv2.VideoWriter_fourcc(*'mp4v')
170
  out = cv2.VideoWriter(output_path, fourcc, fps, (frame_width, frame_height))
171
 
172
- self.prev_frame = None
173
- self.prev_kp = None
174
- self.prev_desc = None
 
175
 
176
  current_bbox = initial_bbox
177
  frame_idx = 0
 
 
178
 
179
  print("Tracking object...")
180
 
@@ -183,14 +340,36 @@ class ObjectTrackerInference:
183
  if not ret:
184
  break
185
 
186
- transform_matrix = self.estimate_camera_motion(frame)
 
 
187
 
188
- features = self.extract_features(frame, current_bbox, transform_matrix)
189
-
190
  if features is not None:
191
  predicted_bbox = self.predict_bbox(features)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
192
  current_bbox = predicted_bbox
 
193
 
 
194
  x, y, w, h = map(int, current_bbox)
195
  cv2.rectangle(frame, (x, y), (x+w, y+h), (0, 255, 0), 2)
196
  cv2.putText(frame, f'Frame: {frame_idx}', (10, 30),
@@ -213,7 +392,7 @@ def main():
213
  tracker = ObjectTrackerInference(model_dir='models')
214
 
215
  video_path = 'input_video.mp4'
216
- initial_bbox = [100, 100, 50, 50]
217
  output_path = 'tracked_output.mp4'
218
 
219
  result = tracker.track_video(video_path, initial_bbox, output_path)
 
1
+ """
2
+ UAV Object Tracker - Inference Script (FIXED)
3
+ Properly uses sliding window search and template matching during inference.
4
+ """
5
+
6
  import cv2
7
  import joblib
8
+ import os
9
  import numpy as np
 
10
 
11
 
12
+ class CameraMotionCompensator:
13
+ def __init__(self):
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  self.prev_frame = None
15
  self.prev_kp = None
16
  self.prev_desc = None
17
+ self.orb = cv2.ORB_create(nfeatures=1000)
18
+ self.matcher = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=True)
19
+
20
+ def estimate_motion(self, frame):
21
  if frame is None:
22
  return np.eye(2, 3, dtype=np.float32)
23
 
 
54
  self.prev_desc = desc
55
 
56
  return transform_matrix
57
+
58
+
59
class ImprovedSlidingWindowTracker:
    """Multi-scale sliding-window search scored by SIFT descriptor matching.

    A search region around the previous bounding box is tiled with windows at
    several scales; each candidate window is scored against a fixed-size
    template using FLANN-matched SIFT descriptors filtered by Lowe's ratio
    test.
    """

    def __init__(self, scale_factor=2.0, overlap=0.3):
        # Search-window size relative to the previous bbox, and the fraction
        # of overlap between neighbouring windows.
        self.scale_factor = scale_factor
        self.overlap = overlap
        self.sift = cv2.SIFT_create(nfeatures=2000)

        # FLANN KD-tree matcher: appropriate for float (SIFT) descriptors.
        FLANN_INDEX_KDTREE = 1
        index_params = dict(algorithm=FLANN_INDEX_KDTREE, trees=5)
        search_params = dict(checks=50)
        self.flann = cv2.FlannBasedMatcher(index_params, search_params)

        # Number of scales tried per frame and the ratio between extremes.
        self.scale_levels = 3
        self.scale_step = 1.2

    def generate_multiscale_windows(self, img_shape, prev_bbox, transform_matrix=None):
        """Return candidate ``(x, y, w, h)`` windows around ``prev_bbox``.

        Parameters
        ----------
        img_shape : tuple
            Image shape as ``(height, width, ...)``.
        prev_bbox : sequence
            Previous bounding box ``(x, y, w, h)``.
        transform_matrix : ndarray or None
            Optional 2x3 affine camera-motion matrix; when given, the search
            centre is shifted by the estimated camera motion first.
        """
        x, y, w, h = map(int, prev_bbox)

        if transform_matrix is not None:
            # Shift the bbox centre according to the estimated camera motion.
            center = np.array([[x + w/2, y + h/2, 1]], dtype=np.float32).T
            transformed_center = np.dot(transform_matrix, center)
            # Index the 2x1 result explicitly instead of calling int() on a
            # size-1 ndarray (deprecated scalar conversion in NumPy >= 1.25).
            x = int(float(transformed_center[0, 0]) - w/2)
            y = int(float(transformed_center[1, 0]) - h/2)

        windows = []

        for scale in np.linspace(1/self.scale_step, self.scale_step, self.scale_levels):
            window_w = int(w * self.scale_factor * scale)
            window_h = int(h * self.scale_factor * scale)

            center_x = x + w // 2
            center_y = y + h // 2

            step_x = int(window_w * (1 - self.overlap))
            step_y = int(window_h * (1 - self.overlap))

            for dy in range(-step_y, step_y + 1, max(1, step_y // 2)):
                for dx in range(-step_x, step_x + 1, max(1, step_x // 2)):
                    win_x = max(0, min(center_x - window_w // 2 + dx, img_shape[1] - window_w))
                    win_y = max(0, min(center_y - window_h // 2 + dy, img_shape[0] - window_h))

                    # Clip to image bounds using LOCAL copies. The original
                    # code shrank window_w/window_h in place, which silently
                    # shrank every subsequent window at this scale level.
                    clipped_w = min(window_w, img_shape[1] - win_x)
                    clipped_h = min(window_h, img_shape[0] - win_y)

                    if clipped_w > 10 and clipped_h > 10:
                        windows.append((win_x, win_y, clipped_w, clipped_h))

        return windows

    def score_window(self, img, window, template, template_desc):
        """Score ``window`` against the template; higher means a better match.

        Returns 0 for out-of-bounds or degenerate windows, when either side
        has no SIFT descriptors, or when no matches survive the ratio test.
        """
        x, y, w, h = map(int, window)

        if x < 0 or y < 0 or x + w > img.shape[1] or y + h > img.shape[0]:
            return 0

        roi = img[y:y+h, x:x+w]

        min_size = 20
        if roi.shape[0] < min_size or roi.shape[1] < min_size:
            return 0

        # Normalise to the template size so descriptor geometry is comparable.
        roi = cv2.resize(roi, (template.shape[1], template.shape[0]))

        kp, desc = self.sift.detectAndCompute(roi, None)

        if desc is None or template_desc is None or len(desc) == 0 or len(template_desc) == 0:
            return 0

        try:
            matches = self.flann.knnMatch(template_desc, desc, k=2)

            # Lowe's ratio test: keep matches clearly better than the runner-up.
            good_matches = []
            for match_group in matches:
                if len(match_group) == 2:
                    m, n = match_group
                    if m.distance < 0.7 * n.distance:
                        good_matches.append(m)

            if len(good_matches) == 0:
                return 0

            avg_distance = np.mean([m.distance for m in good_matches])
            score = len(good_matches) * (1 - avg_distance/512)

            # SIFT L2 distances can exceed 512, which would make the score
            # negative; a non-match should never beat "no match" (0).
            return max(0.0, score)

        except Exception:
            # knnMatch can raise when either descriptor set has < k entries.
            return 0
149
+
150
+
151
+ class ObjectTrackerInference:
152
+ def __init__(self, model_dir='models'):
153
+ self.model_dir = model_dir
154
+
155
+ print("Loading pre-trained models...")
156
+ self.position_model = joblib.load(os.path.join(model_dir, 'position_model.joblib'))
157
+ self.size_model = joblib.load(os.path.join(model_dir, 'size_model.joblib'))
158
+ self.position_scaler = joblib.load(os.path.join(model_dir, 'position_scaler.joblib'))
159
+ self.size_scaler = joblib.load(os.path.join(model_dir, 'size_scaler.joblib'))
160
+ print("Models loaded successfully!")
161
+
162
+ self.window_tracker = ImprovedSlidingWindowTracker()
163
+ self.motion_compensator = CameraMotionCompensator()
164
+
165
+ self.template = None
166
+ self.template_descriptors = None
167
+
168
  def local_binary_pattern(self, image, n_points=8, radius=1):
169
  rows, cols = image.shape
170
  output = np.zeros((rows, cols))
 
200
 
201
  return output
202
 
203
+ def extract_features(self, frame, prev_bbox, transform_matrix):
204
  if frame is None:
205
+ return None, prev_bbox
206
 
207
  gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
 
208
 
209
+ # Use sliding window to find best match
210
+ windows = self.window_tracker.generate_multiscale_windows(
211
+ frame.shape, prev_bbox, transform_matrix
212
+ )
213
+
214
+ # Initialize template on first frame
215
+ if self.template is None:
216
+ x, y, w, h = map(int, prev_bbox)
217
+ x = max(0, min(x, gray.shape[1] - w))
218
+ y = max(0, min(y, gray.shape[0] - h))
219
+ w = min(w, gray.shape[1] - x)
220
+ h = min(h, gray.shape[0] - y)
221
+
222
+ self.template = gray[y:y+h, x:x+w].copy()
223
+ _, self.template_descriptors = self.window_tracker.sift.detectAndCompute(self.template, None)
224
+
225
+ # Find best matching window
226
+ best_score = -1
227
+ best_window = prev_bbox
228
+
229
+ for window in windows:
230
+ score = self.window_tracker.score_window(
231
+ gray, window, self.template, self.template_descriptors
232
+ )
233
+
234
+ if score > best_score:
235
+ best_score = score
236
+ best_window = window
237
+
238
+ # Use best window for feature extraction
239
+ x, y, w, h = map(int, best_window)
240
+
241
+ # Ensure bbox is within bounds
242
+ x = max(0, min(x, gray.shape[1] - 10))
243
+ y = max(0, min(y, gray.shape[0] - 10))
244
  w = min(w, gray.shape[1] - x)
245
  h = min(h, gray.shape[0] - y)
246
+ w = max(10, w)
247
+ h = max(10, h)
248
 
249
  roi = gray[y:y+h, x:x+w]
 
 
 
250
  roi = cv2.resize(roi, (64, 64))
251
 
252
  features = []
253
 
254
+ # HOG features
255
  hog = cv2.HOGDescriptor((64,64), (16,16), (8,8), (8,8), 9)
256
  hog_features = hog.compute(roi)
257
  features.extend(hog_features.flatten()[:64])
258
 
259
+ # LBP features
260
  lbp = self.local_binary_pattern(roi, n_points=8, radius=1)
261
  features.extend([
262
  np.mean(lbp),
 
264
  *np.percentile(lbp, [25, 50, 75])
265
  ])
266
 
267
+ # Motion features
268
+ features.extend([
269
+ transform_matrix[0,0],
270
+ transform_matrix[1,1],
271
+ transform_matrix[0,2],
272
+ transform_matrix[1,2]
273
+ ])
274
+
275
+ # Position and size
 
276
  features.extend([x, y, w, h])
277
 
278
+ return np.array(features).reshape(1, -1), (x, y, w, h)
279
 
280
  def predict_bbox(self, features):
281
  features_position = self.position_scaler.transform(features)
 
288
 
289
  return bbox
290
 
291
+ def calculate_iou(self, bbox1, bbox2):
292
+ x1, y1, w1, h1 = bbox1
293
+ x2, y2, w2, h2 = bbox2
294
+
295
+ x_left = max(x1, x2)
296
+ y_top = max(y1, y2)
297
+ x_right = min(x1 + w1, x2 + w2)
298
+ y_bottom = min(y1 + h1, y2 + h2)
299
+
300
+ if x_right < x_left or y_bottom < y_top:
301
+ return 0.0
302
+
303
+ intersection_area = (x_right - x_left) * (y_bottom - y_top)
304
+ bbox1_area = w1 * h1
305
+ bbox2_area = w2 * h2
306
+
307
+ iou = intersection_area / float(bbox1_area + bbox2_area - intersection_area)
308
+ return max(0.0, min(1.0, iou))
309
+
310
  def track_video(self, video_path, initial_bbox, output_path='output_tracked.mp4', fps=30):
311
  print(f"Processing video: {video_path}")
312
 
313
  cap = cv2.VideoCapture(video_path)
314
  if not cap.isOpened():
315
  raise ValueError(f"Could not open video: {video_path}")
316
+
317
  frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
318
  frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
319
  total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
 
323
  fourcc = cv2.VideoWriter_fourcc(*'mp4v')
324
  out = cv2.VideoWriter(output_path, fourcc, fps, (frame_width, frame_height))
325
 
326
+ # Reset state
327
+ self.motion_compensator.prev_frame = None
328
+ self.template = None
329
+ self.template_descriptors = None
330
 
331
  current_bbox = initial_bbox
332
  frame_idx = 0
333
+ template_update_counter = 0
334
+ prev_predicted_bbox = None
335
 
336
  print("Tracking object...")
337
 
 
340
  if not ret:
341
  break
342
 
343
+ transform_matrix = self.motion_compensator.estimate_motion(frame)
344
+
345
+ features, search_bbox = self.extract_features(frame, current_bbox, transform_matrix)
346
 
 
 
347
  if features is not None:
348
  predicted_bbox = self.predict_bbox(features)
349
+
350
+ # Clamp bbox to frame bounds
351
+ x, y, w, h = predicted_bbox
352
+ x = max(0, min(int(x), frame_width - 10))
353
+ y = max(0, min(int(y), frame_height - 10))
354
+ w = max(10, min(int(w), frame_width - x))
355
+ h = max(10, min(int(h), frame_height - y))
356
+ predicted_bbox = [x, y, w, h]
357
+
358
+ # Adaptive template update
359
+ template_update_counter += 1
360
+ if template_update_counter >= 5 and prev_predicted_bbox is not None:
361
+ iou = self.calculate_iou(prev_predicted_bbox, predicted_bbox)
362
+ if iou > 0.6:
363
+ gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
364
+ x, y, w, h = map(int, predicted_bbox)
365
+ self.template = gray[y:y+h, x:x+w].copy()
366
+ _, self.template_descriptors = self.window_tracker.sift.detectAndCompute(self.template, None)
367
+ template_update_counter = 0
368
+
369
  current_bbox = predicted_bbox
370
+ prev_predicted_bbox = predicted_bbox
371
 
372
+ # Draw bounding box
373
  x, y, w, h = map(int, current_bbox)
374
  cv2.rectangle(frame, (x, y), (x+w, y+h), (0, 255, 0), 2)
375
  cv2.putText(frame, f'Frame: {frame_idx}', (10, 30),
 
392
  tracker = ObjectTrackerInference(model_dir='models')
393
 
394
  video_path = 'input_video.mp4'
395
+ initial_bbox = [100, 100, 50, 50] # [x, y, width, height]
396
  output_path = 'tracked_output.mp4'
397
 
398
  result = tracker.track_video(video_path, initial_bbox, output_path)