ayushsaun committed on
Commit
0ac3063
·
1 Parent(s): 8cb54e4

updated inference.py

Browse files
Files changed (1) hide show
  1. inference.py +163 -346
inference.py CHANGED
@@ -1,14 +1,8 @@
1
- """
2
- UAV Object Tracker - Inference Script (FIXED)
3
- Properly uses sliding window search and template matching during inference.
4
- """
5
-
6
  import cv2
7
  import joblib
8
  import os
9
  import numpy as np
10
 
11
-
12
  class CameraMotionCompensator:
13
  def __init__(self):
14
  self.prev_frame = None
@@ -16,388 +10,211 @@ class CameraMotionCompensator:
16
  self.prev_desc = None
17
  self.orb = cv2.ORB_create(nfeatures=1000)
18
  self.matcher = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=True)
19
-
20
  def estimate_motion(self, frame):
21
  if frame is None:
22
  return np.eye(2, 3, dtype=np.float32)
23
-
24
  gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
25
  kp, desc = self.orb.detectAndCompute(gray, None)
26
-
27
- if self.prev_frame is None:
28
  self.prev_frame = gray
29
  self.prev_kp = kp
30
  self.prev_desc = desc
31
  return np.eye(2, 3, dtype=np.float32)
32
-
33
- if desc is None or self.prev_desc is None or len(desc) < 4 or len(self.prev_desc) < 4:
34
- return np.eye(2, 3, dtype=np.float32)
35
-
36
  matches = self.matcher.match(self.prev_desc, desc)
37
-
38
  if len(matches) < 4:
39
  return np.eye(2, 3, dtype=np.float32)
40
-
41
- matches = sorted(matches, key=lambda x: x.distance)
42
- good_matches = matches[:min(len(matches), 50)]
43
-
44
- src_pts = np.float32([self.prev_kp[m.queryIdx].pt for m in good_matches]).reshape(-1, 1, 2)
45
- dst_pts = np.float32([kp[m.trainIdx].pt for m in good_matches]).reshape(-1, 1, 2)
46
-
47
- transform_matrix, _ = cv2.estimateAffinePartial2D(src_pts, dst_pts)
48
-
49
- if transform_matrix is None:
50
- transform_matrix = np.eye(2, 3, dtype=np.float32)
51
-
52
  self.prev_frame = gray
53
  self.prev_kp = kp
54
  self.prev_desc = desc
55
-
56
- return transform_matrix
57
-
58
 
59
  class ImprovedSlidingWindowTracker:
60
  def __init__(self, scale_factor=2.0, overlap=0.3):
61
  self.scale_factor = scale_factor
62
  self.overlap = overlap
63
  self.sift = cv2.SIFT_create(nfeatures=2000)
64
-
65
- FLANN_INDEX_KDTREE = 1
66
- index_params = dict(algorithm=FLANN_INDEX_KDTREE, trees=5)
67
- search_params = dict(checks=50)
68
- self.flann = cv2.FlannBasedMatcher(index_params, search_params)
69
-
70
  self.scale_levels = 3
71
  self.scale_step = 1.2
72
-
73
- def generate_multiscale_windows(self, img_shape, prev_bbox, transform_matrix=None):
74
- x, y, w, h = map(int, prev_bbox)
75
-
76
- if transform_matrix is not None:
77
- center = np.array([[x + w/2, y + h/2, 1]], dtype=np.float32).T
78
- transformed_center = np.dot(transform_matrix, center)
79
- x = int(transformed_center[0] - w/2)
80
- y = int(transformed_center[1] - h/2)
81
-
82
- windows = []
83
-
84
- for scale in np.linspace(1/self.scale_step, self.scale_step, self.scale_levels):
85
- window_w = int(w * self.scale_factor * scale)
86
- window_h = int(h * self.scale_factor * scale)
87
-
88
- center_x = x + w // 2
89
- center_y = y + h // 2
90
-
91
- step_x = int(window_w * (1 - self.overlap))
92
- step_y = int(window_h * (1 - self.overlap))
93
-
94
- for dy in range(-step_y, step_y + 1, max(1, step_y // 2)):
95
- for dx in range(-step_x, step_x + 1, max(1, step_x // 2)):
96
- win_x = max(0, min(center_x - window_w // 2 + dx, img_shape[1] - window_w))
97
- win_y = max(0, min(center_y - window_h // 2 + dy, img_shape[0] - window_h))
98
-
99
- # Ensure window is within bounds
100
- if win_x + window_w > img_shape[1]:
101
- window_w = img_shape[1] - win_x
102
- if win_y + window_h > img_shape[0]:
103
- window_h = img_shape[0] - win_y
104
-
105
- if window_w > 10 and window_h > 10:
106
- windows.append((win_x, win_y, window_w, window_h))
107
-
108
  return windows
109
 
110
- def score_window(self, img, window, template, template_desc):
111
- x, y, w, h = map(int, window)
112
-
113
- if x < 0 or y < 0 or x + w > img.shape[1] or y + h > img.shape[0]:
114
  return 0
115
-
116
- roi = img[y:y+h, x:x+w]
117
-
118
- min_size = 20
119
- if roi.shape[0] < min_size or roi.shape[1] < min_size:
120
  return 0
121
-
122
- roi = cv2.resize(roi, (template.shape[1], template.shape[0]))
123
-
124
- kp, desc = self.sift.detectAndCompute(roi, None)
125
-
126
- if desc is None or template_desc is None or len(desc) == 0 or len(template_desc) == 0:
127
  return 0
128
-
129
- try:
130
- matches = self.flann.knnMatch(template_desc, desc, k=2)
131
-
132
- good_matches = []
133
- for match_group in matches:
134
- if len(match_group) == 2:
135
- m, n = match_group
136
- if m.distance < 0.7 * n.distance:
137
- good_matches.append(m)
138
-
139
- if len(good_matches) == 0:
140
- return 0
141
-
142
- avg_distance = np.mean([m.distance for m in good_matches])
143
- score = len(good_matches) * (1 - avg_distance/512)
144
-
145
- return score
146
-
147
- except Exception:
148
- return 0
149
-
150
 
151
  class ObjectTrackerInference:
152
- def __init__(self, model_dir='models'):
153
- self.model_dir = model_dir
154
-
155
- print("Loading pre-trained models...")
156
- self.position_model = joblib.load(os.path.join(model_dir, 'position_model.joblib'))
157
- self.size_model = joblib.load(os.path.join(model_dir, 'size_model.joblib'))
158
- self.position_scaler = joblib.load(os.path.join(model_dir, 'position_scaler.joblib'))
159
- self.size_scaler = joblib.load(os.path.join(model_dir, 'size_scaler.joblib'))
160
- print("Models loaded successfully!")
161
-
162
  self.window_tracker = ImprovedSlidingWindowTracker()
163
- self.motion_compensator = CameraMotionCompensator()
164
-
165
  self.template = None
166
- self.template_descriptors = None
167
-
168
- def local_binary_pattern(self, image, n_points=8, radius=1):
169
- rows, cols = image.shape
170
- output = np.zeros((rows, cols))
171
-
172
- for i in range(radius, rows-radius):
173
- for j in range(radius, cols-radius):
174
- center = image[i, j]
175
- pattern = 0
176
-
177
- for k in range(n_points):
178
- angle = 2 * np.pi * k / n_points
179
- x = j + radius * np.cos(angle)
180
- y = i - radius * np.sin(angle)
181
- x1, x2 = int(np.floor(x)), int(np.ceil(x))
182
- y1, y2 = int(np.floor(y)), int(np.ceil(y))
183
-
184
- f11 = image[y1, x1]
185
- f12 = image[y1, x2]
186
- f21 = image[y2, x1]
187
- f22 = image[y2, x2]
188
-
189
- x_weight = x - x1
190
- y_weight = y - y1
191
-
192
- pixel_value = (f11 * (1-x_weight) * (1-y_weight) +
193
- f21 * (1-x_weight) * y_weight +
194
- f12 * x_weight * (1-y_weight) +
195
- f22 * x_weight * y_weight)
196
-
197
- pattern |= (pixel_value > center) << k
198
-
199
- output[i, j] = pattern
200
-
201
- return output
202
-
203
- def extract_features(self, frame, prev_bbox, transform_matrix):
204
- if frame is None:
205
- return None, prev_bbox
206
-
207
  gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
208
-
209
- # Use sliding window to find best match
210
- windows = self.window_tracker.generate_multiscale_windows(
211
- frame.shape, prev_bbox, transform_matrix
212
- )
213
-
214
- # Initialize template on first frame
215
  if self.template is None:
216
- x, y, w, h = map(int, prev_bbox)
217
- x = max(0, min(x, gray.shape[1] - w))
218
- y = max(0, min(y, gray.shape[0] - h))
219
- w = min(w, gray.shape[1] - x)
220
- h = min(h, gray.shape[0] - y)
221
-
222
- self.template = gray[y:y+h, x:x+w].copy()
223
- _, self.template_descriptors = self.window_tracker.sift.detectAndCompute(self.template, None)
224
-
225
- # Find best matching window
226
  best_score = -1
227
- best_window = prev_bbox
228
-
229
- for window in windows:
230
- score = self.window_tracker.score_window(
231
- gray, window, self.template, self.template_descriptors
232
- )
233
-
234
- if score > best_score:
235
- best_score = score
236
- best_window = window
237
-
238
- # Use best window for feature extraction
239
- x, y, w, h = map(int, best_window)
240
-
241
- # Ensure bbox is within bounds
242
- x = max(0, min(x, gray.shape[1] - 10))
243
- y = max(0, min(y, gray.shape[0] - 10))
244
- w = min(w, gray.shape[1] - x)
245
- h = min(h, gray.shape[0] - y)
246
- w = max(10, w)
247
- h = max(10, h)
248
-
249
- roi = gray[y:y+h, x:x+w]
250
- roi = cv2.resize(roi, (64, 64))
251
-
252
- features = []
253
-
254
- # HOG features
255
- hog = cv2.HOGDescriptor((64,64), (16,16), (8,8), (8,8), 9)
256
- hog_features = hog.compute(roi)
257
- features.extend(hog_features.flatten()[:64])
258
-
259
- # LBP features
260
- lbp = self.local_binary_pattern(roi, n_points=8, radius=1)
261
- features.extend([
262
- np.mean(lbp),
263
- np.std(lbp),
264
- *np.percentile(lbp, [25, 50, 75])
265
- ])
266
-
267
- # Motion features
268
- features.extend([
269
- transform_matrix[0,0],
270
- transform_matrix[1,1],
271
- transform_matrix[0,2],
272
- transform_matrix[1,2]
273
- ])
274
-
275
- # Position and size
276
- features.extend([x, y, w, h])
277
-
278
- return np.array(features).reshape(1, -1), (x, y, w, h)
279
-
280
- def predict_bbox(self, features):
281
- features_position = self.position_scaler.transform(features)
282
- features_size = self.size_scaler.transform(features)
283
-
284
- position_pred = self.position_model.predict(features_position)
285
- size_pred = self.size_model.predict(features_size)
286
-
287
- bbox = np.hstack([position_pred, size_pred])[0]
288
-
289
- return bbox
290
-
291
- def calculate_iou(self, bbox1, bbox2):
292
- x1, y1, w1, h1 = bbox1
293
- x2, y2, w2, h2 = bbox2
294
-
295
- x_left = max(x1, x2)
296
- y_top = max(y1, y2)
297
- x_right = min(x1 + w1, x2 + w2)
298
- y_bottom = min(y1 + h1, y2 + h2)
299
-
300
- if x_right < x_left or y_bottom < y_top:
301
- return 0.0
302
-
303
- intersection_area = (x_right - x_left) * (y_bottom - y_top)
304
- bbox1_area = w1 * h1
305
- bbox2_area = w2 * h2
306
-
307
- iou = intersection_area / float(bbox1_area + bbox2_area - intersection_area)
308
- return max(0.0, min(1.0, iou))
309
-
310
- def track_video(self, video_path, initial_bbox, output_path='output_tracked.mp4', fps=30):
311
- print(f"Processing video: {video_path}")
312
-
313
- cap = cv2.VideoCapture(video_path)
314
- if not cap.isOpened():
315
- raise ValueError(f"Could not open video: {video_path}")
316
-
317
- frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
318
- frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
319
- total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
320
-
321
- print(f"Video: {frame_width}x{frame_height}, {total_frames} frames")
322
-
323
- fourcc = cv2.VideoWriter_fourcc(*'mp4v')
324
- out = cv2.VideoWriter(output_path, fourcc, fps, (frame_width, frame_height))
325
-
326
- # Reset state
327
- self.motion_compensator.prev_frame = None
328
- self.template = None
329
- self.template_descriptors = None
330
-
331
- current_bbox = initial_bbox
332
- frame_idx = 0
333
- template_update_counter = 0
334
- prev_predicted_bbox = None
335
-
336
- print("Tracking object...")
337
-
338
  while True:
339
- ret, frame = cap.read()
340
  if not ret:
341
  break
342
-
343
- transform_matrix = self.motion_compensator.estimate_motion(frame)
344
-
345
- features, search_bbox = self.extract_features(frame, current_bbox, transform_matrix)
346
-
347
- if features is not None:
348
- predicted_bbox = self.predict_bbox(features)
349
-
350
- # Clamp bbox to frame bounds
351
- x, y, w, h = predicted_bbox
352
- x = max(0, min(int(x), frame_width - 10))
353
- y = max(0, min(int(y), frame_height - 10))
354
- w = max(10, min(int(w), frame_width - x))
355
- h = max(10, min(int(h), frame_height - y))
356
- predicted_bbox = [x, y, w, h]
357
-
358
- # Adaptive template update
359
- template_update_counter += 1
360
- if template_update_counter >= 5 and prev_predicted_bbox is not None:
361
- iou = self.calculate_iou(prev_predicted_bbox, predicted_bbox)
362
- if iou > 0.6:
363
- gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
364
- x, y, w, h = map(int, predicted_bbox)
365
- self.template = gray[y:y+h, x:x+w].copy()
366
- _, self.template_descriptors = self.window_tracker.sift.detectAndCompute(self.template, None)
367
- template_update_counter = 0
368
-
369
- current_bbox = predicted_bbox
370
- prev_predicted_bbox = predicted_bbox
371
-
372
- # Draw bounding box
373
- x, y, w, h = map(int, current_bbox)
374
- cv2.rectangle(frame, (x, y), (x+w, y+h), (0, 255, 0), 2)
375
- cv2.putText(frame, f'Frame: {frame_idx}', (10, 30),
376
- cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
377
-
378
  out.write(frame)
379
- frame_idx += 1
380
-
381
- if frame_idx % 30 == 0:
382
- print(f"Processed {frame_idx}/{total_frames} frames")
383
-
384
  cap.release()
385
  out.release()
386
-
387
- print(f"Tracking complete! Video saved to: {output_path}")
388
- return output_path
389
-
390
 
391
  def main():
392
- tracker = ObjectTrackerInference(model_dir='models')
393
-
394
- video_path = 'input_video.mp4'
395
- initial_bbox = [100, 100, 50, 50] # [x, y, width, height]
396
- output_path = 'tracked_output.mp4'
397
-
398
- result = tracker.track_video(video_path, initial_bbox, output_path)
399
- print(f"Done! Output: {result}")
400
-
401
 
402
- if __name__ == "__main__":
403
  main()
 
 
 
 
 
 
1
  import cv2
2
  import joblib
3
  import os
4
  import numpy as np
5
 
 
6
class CameraMotionCompensator:
    """Estimate global camera (ego) motion between consecutive frames using
    ORB features, so the tracker can compensate for UAV camera movement."""

    def __init__(self):
        self.prev_frame = None
        self.prev_kp = None
        self.prev_desc = None
        self.orb = cv2.ORB_create(nfeatures=1000)
        # Hamming distance is the correct metric for ORB's binary descriptors.
        self.matcher = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=True)

    def estimate_motion(self, frame):
        """Return a 2x3 affine matrix mapping the previous frame onto *frame*.

        Falls back to the identity transform whenever motion cannot be
        estimated (missing frame, first frame, too few features or matches).
        """
        identity = np.eye(2, 3, dtype=np.float32)
        if frame is None:
            return identity

        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        kp, desc = self.orb.detectAndCompute(gray, None)

        # First frame, or not enough descriptors on either side to match.
        if (self.prev_frame is None or desc is None or self.prev_desc is None
                or len(desc) < 4 or len(self.prev_desc) < 4):
            self._remember(gray, kp, desc)
            return identity

        matches = self.matcher.match(self.prev_desc, desc)
        if len(matches) < 4:
            # BUGFIX: still advance the reference frame here; previously the
            # state was left untouched, so every later call kept matching
            # against an increasingly stale frame.
            self._remember(gray, kp, desc)
            return identity

        # Keep only the 50 best matches (smallest descriptor distance).
        matches = sorted(matches, key=lambda m: m.distance)[:50]
        src = np.float32([self.prev_kp[m.queryIdx].pt for m in matches]).reshape(-1, 1, 2)
        dst = np.float32([kp[m.trainIdx].pt for m in matches]).reshape(-1, 1, 2)

        M, _ = cv2.estimateAffinePartial2D(src, dst)
        if M is None:  # robust estimation can fail even with enough matches
            M = identity

        self._remember(gray, kp, desc)
        return M

    def _remember(self, gray, kp, desc):
        # Store the current frame's data as the reference for the next call.
        self.prev_frame = gray
        self.prev_kp = kp
        self.prev_desc = desc
 
 
43
 
44
class ImprovedSlidingWindowTracker:
    """Multi-scale sliding-window search with SIFT template matching."""

    def __init__(self, scale_factor=2.0, overlap=0.3):
        self.scale_factor = scale_factor  # search-window size relative to bbox
        self.overlap = overlap            # fractional overlap between windows
        self.sift = cv2.SIFT_create(nfeatures=2000)
        self.scale_levels = 3
        self.scale_step = 1.2
        # FLANN KD-tree matcher (algorithm=1) for float SIFT descriptors.
        index_params = dict(algorithm=1, trees=5)
        search_params = dict(checks=50)
        self.flann = cv2.FlannBasedMatcher(index_params, search_params)

    def generate_multiscale_windows(self, img_shape, prev_bbox, transform_matrix):
        """Return candidate (x, y, w, h) windows around the motion-compensated
        previous bbox at several scales, all clamped inside the image."""
        x, y, w, h = map(int, prev_bbox)

        # Project the bbox centre through the camera-motion transform.
        center = np.dot(transform_matrix,
                        np.array([[x + w / 2, y + h / 2, 1]], dtype=np.float32).T)
        cx, cy = int(center[0]), int(center[1])

        img_h, img_w = img_shape[0], img_shape[1]
        windows = []
        for s in np.linspace(1 / self.scale_step, self.scale_step, self.scale_levels):
            ww = int(w * self.scale_factor * s)
            hh = int(h * self.scale_factor * s)
            # BUGFIX: clamp the window size to the image so no window can
            # extend past the right/bottom edge when the search region is
            # larger than the frame.
            ww = min(ww, img_w)
            hh = min(hh, img_h)
            if ww <= 10 or hh <= 10:
                continue
            step_x = max(1, int(ww * (1 - self.overlap) // 2))
            step_y = max(1, int(hh * (1 - self.overlap) // 2))
            for dy in range(-step_y, step_y + 1, step_y):
                for dx in range(-step_x, step_x + 1, step_x):
                    wx = max(0, min(cx - ww // 2 + dx, img_w - ww))
                    wy = max(0, min(cy - hh // 2 + dy, img_h - hh))
                    windows.append((wx, wy, ww, hh))
        return windows

    def score_window(self, gray, window, template, template_desc):
        """Score how well *window* matches the template via Lowe-ratio-filtered
        SIFT matches. Higher is better; 0 means no evidence of a match."""
        x, y, w, h = map(int, window)
        # BUGFIX: reject windows that fall (partly) outside the image; this
        # guard was dropped in the rewrite.
        if x < 0 or y < 0 or x + w > gray.shape[1] or y + h > gray.shape[0]:
            return 0
        roi = gray[y:y + h, x:x + w]
        if roi.shape[0] < 20 or roi.shape[1] < 20:
            return 0
        roi = cv2.resize(roi, (template.shape[1], template.shape[0]))
        _, desc = self.sift.detectAndCompute(roi, None)
        if desc is None or template_desc is None:
            return 0
        try:
            matches = self.flann.knnMatch(template_desc, desc, k=2)
        except cv2.error:
            # FLANN can raise on degenerate descriptor sets; treat as no match.
            return 0
        # BUGFIX: knnMatch may return groups with fewer than 2 entries; the
        # previous `for m, n in matches` unpacking crashed on those.
        good = [g[0] for g in matches
                if len(g) == 2 and g[0].distance < 0.7 * g[1].distance]
        if not good:
            return 0
        # More matches and smaller distances -> higher score
        # (512 approximates the maximum SIFT descriptor distance).
        return len(good) * (1 - np.mean([m.distance for m in good]) / 512)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
 
90
class ObjectTrackerInference:
    """Run the pre-trained position/size regressors over a video, combining
    camera-motion compensation, sliding-window template search and HOG/LBP
    features."""

    def __init__(self, model_dir='models'):
        # BUGFIX: restore the default of 'models' that the previous version
        # exposed; existing callers may rely on it.
        self.position_model = joblib.load(os.path.join(model_dir, 'position_model.joblib'))
        self.size_model = joblib.load(os.path.join(model_dir, 'size_model.joblib'))
        self.position_scaler = joblib.load(os.path.join(model_dir, 'position_scaler.joblib'))
        self.size_scaler = joblib.load(os.path.join(model_dir, 'size_scaler.joblib'))

        self.window_tracker = ImprovedSlidingWindowTracker()
        self.motion = CameraMotionCompensator()
        self.template = None
        self.template_desc = None
        self.prev_bbox = None
        self.template_update_counter = 0
        # Built once instead of being re-created on every frame.
        self._hog = cv2.HOGDescriptor((64, 64), (16, 16), (8, 8), (8, 8), 9)

    @staticmethod
    def _clamp_bbox(bbox, shape, min_size=10):
        """Clamp an (x, y, w, h) box inside *shape* = (rows, cols, ...) while
        enforcing a minimum size, so slicing/drawing never goes out of range."""
        x, y, w, h = map(int, bbox)
        x = max(0, min(x, shape[1] - min_size))
        y = max(0, min(y, shape[0] - min_size))
        w = max(min_size, min(w, shape[1] - x))
        h = max(min_size, min(h, shape[0] - y))
        return x, y, w, h

    def local_binary_pattern(self, image):
        """Compute a coarse LBP map (8 neighbours, radius 1).

        NOTE(review): the neighbour intensity is approximated by the mean of
        the 4 surrounding integer pixels rather than true bilinear
        interpolation — confirm this approximation matches training.
        """
        radius, n_points = 1, 8
        out = np.zeros(image.shape)
        for i in range(radius, image.shape[0] - radius):
            for j in range(radius, image.shape[1] - radius):
                center = image[i, j]
                code = 0
                for k in range(n_points):
                    angle = 2 * np.pi * k / n_points
                    x = j + radius * np.cos(angle)
                    y = i - radius * np.sin(angle)
                    x1, x2 = int(np.floor(x)), int(np.ceil(x))
                    y1, y2 = int(np.floor(y)), int(np.ceil(y))
                    # BUGFIX: cast to int before summing — adding four uint8
                    # pixels overflowed and wrapped modulo 256.
                    val = (int(image[y1, x1]) + int(image[y1, x2]) +
                           int(image[y2, x1]) + int(image[y2, x2])) / 4
                    code |= (val > center) << k
                out[i, j] = code
        return out

    def extract_features(self, frame, prev_bbox, M):
        """Build the feature vector for the current frame.

        Returns (features, search_bbox, windows): *search_bbox* is the
        best-scoring candidate window and *windows* the full candidate list
        (used by the caller for visualisation).
        """
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        windows = self.window_tracker.generate_multiscale_windows(frame.shape, prev_bbox, M)

        # Initialise the appearance template from the first bbox.
        if self.template is None:
            x, y, w, h = self._clamp_bbox(prev_bbox, gray.shape)
            self.template = gray[y:y + h, x:x + w].copy()
            _, self.template_desc = self.window_tracker.sift.detectAndCompute(self.template, None)

        # Pick the candidate window that best matches the template.
        best_score = -1
        best_window = None
        for win in windows:
            score = self.window_tracker.score_window(gray, win, self.template, self.template_desc)
            if score > best_score:
                best_score = score
                best_window = win

        # BUGFIX: clamp the chosen box before slicing — an unclamped box
        # could produce an empty ROI and crash cv2.resize.
        x, y, w, h = self._clamp_bbox(best_window if best_window is not None else prev_bbox,
                                      gray.shape)

        roi = cv2.resize(gray[y:y + h, x:x + w], (64, 64))
        hog = self._hog.compute(roi).flatten()[:64]
        lbp = self.local_binary_pattern(roi)

        feat = list(hog) + [
            np.mean(lbp), np.std(lbp),
            *np.percentile(lbp, [25, 50, 75]),
            M[0, 0], M[1, 1], M[0, 2], M[1, 2],  # camera-motion features
            x, y, w, h,                          # position/size features
        ]
        return np.array(feat).reshape(1, -1), (x, y, w, h), windows

    def calculate_iou(self, a, b):
        """Intersection-over-union of two (x, y, w, h) boxes, in [0, 1]."""
        x1, y1, w1, h1 = a
        x2, y2, w2, h2 = b
        left, top = max(x1, x2), max(y1, y2)
        right, bottom = min(x1 + w1, x2 + w2), min(y1 + h1, y2 + h2)
        if right < left or bottom < top:
            return 0
        inter = (right - left) * (bottom - top)
        return inter / (w1 * h1 + w2 * h2 - inter)

    def track_video(self, video_path, init_bbox, output):
        """Track *init_bbox* through *video_path*, writing an annotated video
        to *output*.

        Raises:
            ValueError: if the input video cannot be opened.
        """
        cap = cv2.VideoCapture(video_path)
        # BUGFIX: fail fast instead of silently writing an empty video.
        if not cap.isOpened():
            raise ValueError(f"Could not open video: {video_path}")
        frame_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        frame_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        out = cv2.VideoWriter(output, cv2.VideoWriter_fourcc(*'mp4v'), 30, (frame_w, frame_h))

        # BUGFIX: reset per-video state so one tracker instance can be
        # reused on a second video.
        self.motion.prev_frame = None
        self.template = None
        self.template_desc = None
        self.prev_bbox = None
        self.template_update_counter = 0

        cur = init_bbox
        frame_idx = 0
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            M = self.motion.estimate_motion(frame)
            feats, search_bbox, windows = self.extract_features(frame, cur, M)

            pos = self.position_model.predict(self.position_scaler.transform(feats))
            size = self.size_model.predict(self.size_scaler.transform(feats))
            # BUGFIX: clamp the regressed bbox to the frame so drawing and
            # template cropping never index outside the image.
            pred = list(self._clamp_bbox((pos[0, 0], pos[0, 1], size[0, 0], size[0, 1]),
                                         (frame_h, frame_w)))

            # Adaptive template update: refresh the appearance template every
            # >= 5 frames when consecutive predictions agree (IoU > 0.6).
            self.template_update_counter += 1
            if self.template_update_counter >= 5 and self.prev_bbox is not None:
                if self.calculate_iou(self.prev_bbox, pred) > 0.6:
                    g = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
                    x, y, bw, bh = pred
                    self.template = g[y:y + bh, x:x + bw].copy()
                    _, self.template_desc = self.window_tracker.sift.detectAndCompute(self.template, None)
                    self.template_update_counter = 0

            # Visualise candidate search windows (yellow) ...
            for wx, wy, ww, wh in windows:
                cv2.rectangle(frame, (wx, wy), (wx + ww, wy + wh), (0, 255, 255), 1)
            # ... and the estimated camera-motion field (green arrows).
            for yy in range(0, frame_h, 32):
                for xx in range(0, frame_w, 32):
                    ep = np.dot(M, np.array([xx, yy, 1]))
                    if abs(ep[0] - xx) > 1 or abs(ep[1] - yy) > 1:
                        cv2.arrowedLine(frame, (xx, yy), (int(ep[0]), int(ep[1])),
                                        (0, 255, 0), 1, tipLength=0.2)

            # Draw the tracked bbox and the frame counter.
            x, y, bw, bh = pred
            cv2.rectangle(frame, (x, y), (x + bw, y + bh), (0, 255, 0), 2)
            cv2.putText(frame, f'Frame: {frame_idx}', (10, 30),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)

            out.write(frame)
            self.prev_bbox = pred
            cur = pred
            frame_idx += 1

        cap.release()
        out.release()
 
 
 
 
214
 
215
def main():
    """Entry point: run the tracker on a hard-coded demo video."""
    model_directory = 'models'
    input_video = 'input_video.mp4'
    initial_box = [100, 100, 50, 50]  # x, y, width, height
    result_video = 'tracked_output.mp4'

    tracker = ObjectTrackerInference(model_directory)
    tracker.track_video(input_video, initial_box, result_video)


if __name__ == "__main__":
    main()