Spaces:

opencv
/

object_tracking_vittrack

Running

App Files Files Community

Abhishek Gola committed on Jun 24, 2025

Commit

89138dc

1 Parent(s): 71905ee

Added vit tracker to opencv spaces

Browse files

Files changed (4) hide show

README.md +6 -0
app.py +185 -0
requirements.txt +4 -0
vittrack.py +39 -0

README.md CHANGED Viewed

@@ -7,6 +7,12 @@ sdk: gradio
 sdk_version: 5.34.2
 app_file: app.py
 pinned: false
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 sdk_version: 5.34.2
 app_file: app.py
 pinned: false
+short_description: Object tracking with ViTtracker using OpenCV
+tags:
+  - opencv
+  - object-tracking
+  - vit
+  - vittracker
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py ADDED Viewed

	@@ -0,0 +1,185 @@

+import cv2 as cv
+import numpy as np
+import gradio as gr
+from vittrack import VitTrack
+from huggingface_hub import hf_hub_download
+import os
+import tempfile
+# Download ONNX model at startup (runs at import time). The value returned by
+# hf_hub_download is stored as MODEL_PATH and later passed to VitTrack as
+# `model_path` inside track_video().
+MODEL_PATH = hf_hub_download(
+    repo_id="opencv/object_tracking_vittrack",
+    filename="object_tracking_vittrack_2023sep.onnx"
+)
+# DNN backend / target handed to VitTrack in track_video():
+# OpenCV's own backend running on the CPU target.
+backend_id = cv.dnn.DNN_BACKEND_OPENCV
+target_id  = cv.dnn.DNN_TARGET_CPU
+# Global state read and mutated by every callback in this module.
+# NOTE(review): this is a single module-level dict, so it is presumably shared
+# by all concurrent users of the app - confirm whether per-session state
+# (e.g. gr.State) is needed.
+state = {
+    # (x, y) pixel coordinates clicked on the first frame; capped at 4 entries
+    "points": [],
+    # (x, y, w, h) bounding rect of the 4 points, or None until 4 are chosen
+    "bbox": None,
+    # path of the currently loaded video, or None
+    "video_path": None,
+    # BGR copy of the video's first frame, or None
+    "first_frame": None
+}
+def load_first_frame(video_path):
+    """Load video, grab first frame, reset state."""
+    state["video_path"] = video_path
+    cap = cv.VideoCapture(video_path)
+    has_frame, frame = cap.read()
+    cap.release()
+    if not has_frame:
+        return None
+    state["first_frame"] = frame.copy()
+    state["points"].clear()
+    state["bbox"] = None
+    return cv.cvtColor(frame, cv.COLOR_BGR2RGB)
+def select_point(img, evt: gr.SelectData):
+    """Accumulate up to 4 clicks, draw polygon + bounding box."""
+    if state["first_frame"] is None:
+        return None
+    x, y = int(evt.index[0]), int(evt.index[1])
+    if len(state["points"]) < 4:
+        state["points"].append((x, y))
+    vis = state["first_frame"].copy()
+    # draw each point
+    for pt in state["points"]:
+        cv.circle(vis, pt, 5, (0, 255, 0), -1)
+    # draw connecting polygon
+    if len(state["points"]) > 1:
+        pts = np.array(state["points"], dtype=np.int32)
+        cv.polylines(vis, [pts], isClosed=False, color=(255, 255, 0), thickness=2)
+    # once we have exactly 4, compute & draw bounding rect
+    if len(state["points"]) == 4:
+        pts = np.array(state["points"], dtype=np.int32)
+        x0, y0, w, h = cv.boundingRect(pts)
+        state["bbox"] = (x0, y0, w, h)
+        cv.rectangle(vis, (x0, y0), (x0 + w, y0 + h), (0, 0, 255), 2)
+    return cv.cvtColor(vis, cv.COLOR_BGR2RGB)
+def clear_points():
+    """Reset selected points only."""
+    state["points"].clear()
+    state["bbox"] = None
+    if state["first_frame"] is None:
+        return None
+    return cv.cvtColor(state["first_frame"], cv.COLOR_BGR2RGB)
+def clear_all():
+    """Reset everything."""
+    state["points"].clear()
+    state["bbox"] = None
+    state["video_path"] = None
+    state["first_frame"] = None
+    return None, None, None
+def track_video():
+    """Init VitTrack and process entire video, return output path."""
+    if state["video_path"] is None or state["bbox"] is None:
+        return None
+    # instantiate VitTrack
+    model = VitTrack(
+        model_path=MODEL_PATH,
+        backend_id=backend_id,
+        target_id= target_id
+    )
+    cap = cv.VideoCapture(state["video_path"])
+    fps = cap.get(cv.CAP_PROP_FPS)
+    w   = int(cap.get(cv.CAP_PROP_FRAME_WIDTH))
+    h   = int(cap.get(cv.CAP_PROP_FRAME_HEIGHT))
+    # prepare temporary output file
+    tmpdir = tempfile.gettempdir()
+    out_path = os.path.join(tmpdir, "vittrack_output.mp4")
+    writer = cv.VideoWriter(
+        out_path,
+        cv.VideoWriter_fourcc(*"mp4v"),
+        fps,
+        (w, h)
+    )
+    # read & init on first frame
+    _, first_frame = cap.read()
+    model.init(first_frame, state["bbox"])
+    tm = cv.TickMeter()
+    while True:
+        has_frame, frame = cap.read()
+        if not has_frame:
+            break
+        tm.start()
+        isLocated, bbox, score = model.infer(frame)
+        tm.stop()
+        vis = frame.copy()
+        # overlay FPS
+        cv.putText(vis, f"FPS:{tm.getFPS():.2f}", (w//4, 30),
+                   cv.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)
+        # draw tracking box or loss message
+        if isLocated and score >= 0.3:
+            x, y, w_, h_ = bbox
+            cv.rectangle(vis, (x, y), (x + w_, y + h_), (0, 255, 0), 2)
+            cv.putText(vis, f"{score:.2f}", (x, y - 10),
+                       cv.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)
+        else:
+            cv.putText(vis, "Target lost!",
+                       (w // 2, h//4),
+                       cv.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 3)
+        writer.write(vis)
+        tm.reset()
+    cap.release()
+    writer.release()
+    return out_path
+# Build the Gradio UI: usage instructions, a row of three components
+# (upload, first-frame canvas, result video), a row of buttons, and the
+# event wiring that connects them to the callbacks defined in this module.
+with gr.Blocks() as demo:
+    gr.Markdown("## VitTrack: Interactive Video Object Tracking")
+    gr.Markdown(
+        """
+        **How to use this tool:**
+        1. **Upload a video** file (e.g., `.mp4` or `.avi`).
+        2. The **first frame** of the video will appear.
+        3. **Click exactly 4 points** on the object you want to track. These points should outline the object as closely as possible.
+        4. A **bounding box** will be drawn around the selected region automatically.
+        5. Click the **Track** button to start object tracking across the entire video.
+        6. The output video with tracking overlay will appear below.
+        You can also use:
+        - 🧹 **Clear Points** to reset the 4-point selection on the first frame.
+        - 🔄 **Clear All** to reset the uploaded video, frame, and selections.
+        """
+    )
+    # Input upload, clickable first-frame preview, and tracking result.
+    with gr.Row():
+        video_in     = gr.File(label="Upload Video", file_types=[".mp4", ".avi"])
+        first_frame  = gr.Image(label="First Frame", interactive=True)
+        output_video = gr.Video(label="Tracking Result")
+    # Action buttons.
+    with gr.Row():
+        track_btn     = gr.Button("Track", variant="primary")
+        clear_pts_btn = gr.Button("Clear Points")
+        clear_all_btn = gr.Button("Clear All")
+    # A new (or cleared) upload refreshes the first-frame preview.
+    video_in.change(fn=load_first_frame, inputs=video_in, outputs=first_frame)
+    # Each click on the preview is passed to select_point, which redraws it.
+    first_frame.select(fn=select_point, inputs=first_frame, outputs=first_frame)
+    # Reset only the point selection.
+    clear_pts_btn.click(fn=clear_points, outputs=first_frame)
+    # Reset all three components; clear_all returns one value per output.
+    clear_all_btn.click(fn=clear_all, outputs=[video_in, first_frame, output_video])
+    # Run tracking; the returned file path is shown in the result player.
+    track_btn.click(fn=track_video, outputs=output_video)
+# Launch the app only when this file is executed as a script.
+if __name__ == "__main__":
+    demo.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+opencv-python
+gradio
+numpy
+huggingface_hub

vittrack.py ADDED Viewed

	@@ -0,0 +1,39 @@

+# This file is part of OpenCV Zoo project.
+# It is subject to the license terms in the LICENSE file found in the same directory.
+import numpy as np
+import cv2 as cv
+class VitTrack:
+    def __init__(self, model_path, backend_id=0, target_id=0):
+        self.model_path = model_path
+        self.backend_id = backend_id
+        self.target_id = target_id
+        self.params = cv.TrackerVit_Params()
+        self.params.net = self.model_path
+        self.params.backend = self.backend_id
+        self.params.target = self.target_id
+        self.model = cv.TrackerVit_create(self.params)
+    @property
+    def name(self):
+        return self.__class__.__name__
+    def setBackendAndTarget(self, backend_id, target_id):
+        self.backend_id = backend_id
+        self.target_id = target_id
+        self.params.backend = self.backend_id
+        self.params.target = self.target_id
+        self.model = cv.TrackerVit_create(self.params)
+    def init(self, image, roi):
+        self.model.init(image, roi)
+    def infer(self, image):
+        is_located, bbox = self.model.update(image)
+        score = self.model.getTrackingScore()
+        return is_located, bbox, score