Spaces:
Sleeping
Sleeping
added inference changes
Browse files- inference.py +138 -32
inference.py
CHANGED
|
@@ -4,6 +4,109 @@ import joblib
|
|
| 4 |
import numpy as np
|
| 5 |
from pathlib import Path
|
| 6 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
|
| 8 |
class ObjectTrackerInference:
|
| 9 |
def __init__(self, model_dir='models'):
|
|
@@ -23,6 +126,10 @@ class ObjectTrackerInference:
|
|
| 23 |
self.prev_frame = None
|
| 24 |
self.prev_kp = None
|
| 25 |
self.prev_desc = None
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
|
| 27 |
def estimate_camera_motion(self, frame):
|
| 28 |
if frame is None:
|
|
@@ -164,8 +271,6 @@ class ObjectTrackerInference:
|
|
| 164 |
frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
|
| 165 |
total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
|
| 166 |
|
| 167 |
-
print(f"Video: {frame_width}x{frame_height}, {total_frames} frames")
|
| 168 |
-
|
| 169 |
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
|
| 170 |
out = cv2.VideoWriter(output_path, fourcc, fps, (frame_width, frame_height))
|
| 171 |
|
|
@@ -176,49 +281,50 @@ class ObjectTrackerInference:
|
|
| 176 |
current_bbox = initial_bbox
|
| 177 |
frame_idx = 0
|
| 178 |
|
| 179 |
-
print("Tracking object...")
|
| 180 |
-
|
| 181 |
while True:
|
| 182 |
ret, frame = cap.read()
|
| 183 |
if not ret:
|
| 184 |
break
|
| 185 |
|
| 186 |
transform_matrix = self.estimate_camera_motion(frame)
|
| 187 |
-
|
| 188 |
-
features = self.extract_features(frame, current_bbox, transform_matrix)
|
| 189 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 190 |
if features is not None:
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
|
|
|
| 194 |
x, y, w, h = map(int, current_bbox)
|
| 195 |
-
cv2.rectangle(frame, (x, y), (x+w, y+h), (0,
|
| 196 |
cv2.putText(frame, f'Frame: {frame_idx}', (10, 30),
|
| 197 |
-
cv2.FONT_HERSHEY_SIMPLEX, 1, (
|
| 198 |
-
|
| 199 |
out.write(frame)
|
| 200 |
frame_idx += 1
|
| 201 |
-
|
| 202 |
-
if frame_idx % 30 == 0:
|
| 203 |
-
print(f"Processed {frame_idx}/{total_frames} frames")
|
| 204 |
|
| 205 |
cap.release()
|
| 206 |
out.release()
|
| 207 |
-
|
| 208 |
-
print(f"Tracking complete! Video saved to: {output_path}")
|
| 209 |
return output_path
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
def main():
|
| 213 |
-
tracker = ObjectTrackerInference(model_dir='models')
|
| 214 |
-
|
| 215 |
-
video_path = 'input_video.mp4'
|
| 216 |
-
initial_bbox = [100, 100, 50, 50]
|
| 217 |
-
output_path = 'tracked_output.mp4'
|
| 218 |
-
|
| 219 |
-
result = tracker.track_video(video_path, initial_bbox, output_path)
|
| 220 |
-
print(f"Done! Output: {result}")
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
if __name__ == "__main__":
|
| 224 |
-
main()
|
|
|
|
| 4 |
import numpy as np
|
| 5 |
from pathlib import Path
|
| 6 |
|
| 7 |
+
class CameraMotionVisualizer:
    """Debug overlay that visualizes estimated camera motion on a frame."""

    @staticmethod
    def draw_motion_grid(frame, transform_matrix, grid_size=32):
        """Draw green arrows on a regular grid showing where each grid point
        maps under *transform_matrix*.

        Points that move by at most 1 px in both axes are left unmarked.
        Returns *frame* unchanged (and untouched) when the matrix is None.
        NOTE(review): assumes a 2x3 affine matrix — confirm against the
        producer of transform_matrix.
        """
        if transform_matrix is None:
            return frame

        height, width = frame.shape[:2]
        for gy in range(0, height, grid_size):
            for gx in range(0, width, grid_size):
                src = np.array([gx, gy, 1])
                dst = np.dot(transform_matrix, src)
                # Only draw arrows for grid points that actually moved.
                moved = abs(dst[0] - gx) > 1 or abs(dst[1] - gy) > 1
                if moved:
                    cv2.arrowedLine(
                        frame,
                        (int(gx), int(gy)),
                        (int(dst[0]), int(dst[1])),
                        (0, 255, 0),
                        1,
                        tipLength=0.2,
                    )
        return frame
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
class SlidingWindowRefiner:
|
| 31 |
+
def __init__(self):
|
| 32 |
+
self.sift = cv2.SIFT_create(nfeatures=2000)
|
| 33 |
+
|
| 34 |
+
FLANN_INDEX_KDTREE = 1
|
| 35 |
+
index_params = dict(algorithm=FLANN_INDEX_KDTREE, trees=5)
|
| 36 |
+
search_params = dict(checks=50)
|
| 37 |
+
self.flann = cv2.FlannBasedMatcher(index_params, search_params)
|
| 38 |
+
|
| 39 |
+
self.scale_levels = 3
|
| 40 |
+
self.scale_step = 1.2
|
| 41 |
+
self.scale_factor = 2.0
|
| 42 |
+
self.overlap = 0.3
|
| 43 |
+
|
| 44 |
+
self.template = None
|
| 45 |
+
self.template_kp = None
|
| 46 |
+
self.template_desc = None
|
| 47 |
+
|
| 48 |
+
def initialize_template(self, gray, bbox):
|
| 49 |
+
x, y, w, h = map(int, bbox)
|
| 50 |
+
self.template = gray[y:y+h, x:x+w].copy()
|
| 51 |
+
self.template_kp, self.template_desc = self.sift.detectAndCompute(
|
| 52 |
+
self.template, None
|
| 53 |
+
)
|
| 54 |
+
|
| 55 |
+
def generate_windows(self, img_shape, prev_bbox, transform_matrix=None):
|
| 56 |
+
x, y, w, h = map(int, prev_bbox)
|
| 57 |
+
|
| 58 |
+
if transform_matrix is not None:
|
| 59 |
+
center = np.array([[x + w/2, y + h/2, 1]]).T
|
| 60 |
+
transformed = np.dot(transform_matrix, center)
|
| 61 |
+
x = int(transformed[0] - w/2)
|
| 62 |
+
y = int(transformed[1] - h/2)
|
| 63 |
+
|
| 64 |
+
windows = []
|
| 65 |
+
for scale in np.linspace(1/self.scale_step, self.scale_step, self.scale_levels):
|
| 66 |
+
ww = int(w * self.scale_factor * scale)
|
| 67 |
+
hh = int(h * self.scale_factor * scale)
|
| 68 |
+
|
| 69 |
+
step_x = int(ww * (1 - self.overlap))
|
| 70 |
+
step_y = int(hh * (1 - self.overlap))
|
| 71 |
+
|
| 72 |
+
cx, cy = x + w // 2, y + h // 2
|
| 73 |
+
|
| 74 |
+
for dy in range(-step_y, step_y + 1, max(1, step_y // 2)):
|
| 75 |
+
for dx in range(-step_x, step_x + 1, max(1, step_x // 2)):
|
| 76 |
+
wx = max(0, min(cx - ww // 2 + dx, img_shape[1] - ww))
|
| 77 |
+
wy = max(0, min(cy - hh // 2 + dy, img_shape[0] - hh))
|
| 78 |
+
windows.append((wx, wy, ww, hh))
|
| 79 |
+
return windows
|
| 80 |
+
|
| 81 |
+
def score_window(self, gray, window):
|
| 82 |
+
if self.template_desc is None:
|
| 83 |
+
return 0
|
| 84 |
+
|
| 85 |
+
x, y, w, h = map(int, window)
|
| 86 |
+
roi = gray[y:y+h, x:x+w]
|
| 87 |
+
|
| 88 |
+
if roi.shape[0] < 20 or roi.shape[1] < 20:
|
| 89 |
+
return 0
|
| 90 |
+
|
| 91 |
+
roi = cv2.resize(roi, self.template.shape[::-1])
|
| 92 |
+
kp, desc = self.sift.detectAndCompute(roi, None)
|
| 93 |
+
|
| 94 |
+
if desc is None:
|
| 95 |
+
return 0
|
| 96 |
+
|
| 97 |
+
matches = self.flann.knnMatch(self.template_desc, desc, k=2)
|
| 98 |
+
good = [m for m, n in matches if m.distance < 0.7 * n.distance]
|
| 99 |
+
|
| 100 |
+
if not good:
|
| 101 |
+
return 0
|
| 102 |
+
|
| 103 |
+
avg_dist = np.mean([m.distance for m in good])
|
| 104 |
+
return len(good) * (1 - avg_dist / 512)
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
# ================================
|
| 108 |
+
# 🔹 ORIGINAL INFERENCE CLASS
|
| 109 |
+
# ================================
|
| 110 |
|
| 111 |
class ObjectTrackerInference:
|
| 112 |
def __init__(self, model_dir='models'):
|
|
|
|
| 126 |
self.prev_frame = None
|
| 127 |
self.prev_kp = None
|
| 128 |
self.prev_desc = None
|
| 129 |
+
|
| 130 |
+
# 🔹 ADDITIVE
|
| 131 |
+
self.window_refiner = SlidingWindowRefiner()
|
| 132 |
+
self.template_initialized = False
|
| 133 |
|
| 134 |
def estimate_camera_motion(self, frame):
|
| 135 |
if frame is None:
|
|
|
|
| 271 |
frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
|
| 272 |
total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
|
| 273 |
|
|
|
|
|
|
|
| 274 |
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
|
| 275 |
out = cv2.VideoWriter(output_path, fourcc, fps, (frame_width, frame_height))
|
| 276 |
|
|
|
|
| 281 |
current_bbox = initial_bbox
|
| 282 |
frame_idx = 0
|
| 283 |
|
|
|
|
|
|
|
| 284 |
while True:
|
| 285 |
ret, frame = cap.read()
|
| 286 |
if not ret:
|
| 287 |
break
|
| 288 |
|
| 289 |
transform_matrix = self.estimate_camera_motion(frame)
|
| 290 |
+
gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
|
|
|
|
| 291 |
|
| 292 |
+
if not self.template_initialized:
|
| 293 |
+
self.window_refiner.initialize_template(gray, current_bbox)
|
| 294 |
+
self.template_initialized = True
|
| 295 |
+
|
| 296 |
+
windows = self.window_refiner.generate_windows(
|
| 297 |
+
frame.shape, current_bbox, transform_matrix
|
| 298 |
+
)
|
| 299 |
+
|
| 300 |
+
best_score = -1
|
| 301 |
+
best_window = None
|
| 302 |
+
|
| 303 |
+
for win in windows:
|
| 304 |
+
score = self.window_refiner.score_window(gray, win)
|
| 305 |
+
xw, yw, ww, hh = map(int, win)
|
| 306 |
+
cv2.rectangle(frame, (xw, yw), (xw+ww, yw+hh), (0, 255, 255), 1)
|
| 307 |
+
if score > best_score:
|
| 308 |
+
best_score = score
|
| 309 |
+
best_window = win
|
| 310 |
+
|
| 311 |
+
if best_window is not None:
|
| 312 |
+
current_bbox = best_window
|
| 313 |
+
|
| 314 |
+
features = self.extract_features(frame, current_bbox, transform_matrix)
|
| 315 |
if features is not None:
|
| 316 |
+
current_bbox = self.predict_bbox(features)
|
| 317 |
+
|
| 318 |
+
frame = CameraMotionVisualizer.draw_motion_grid(frame, transform_matrix)
|
| 319 |
+
|
| 320 |
x, y, w, h = map(int, current_bbox)
|
| 321 |
+
cv2.rectangle(frame, (x, y), (x+w, y+h), (0, 0, 255), 2)
|
| 322 |
cv2.putText(frame, f'Frame: {frame_idx}', (10, 30),
|
| 323 |
+
cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)
|
| 324 |
+
|
| 325 |
out.write(frame)
|
| 326 |
frame_idx += 1
|
|
|
|
|
|
|
|
|
|
| 327 |
|
| 328 |
cap.release()
|
| 329 |
out.release()
|
|
|
|
|
|
|
| 330 |
return output_path
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|