Spaces: Running on Zero

Commit · 715f79d · 0 Parent(s)
Initial commit

Files changed:
- .gitattributes +37 -0
- .gitignore +42 -0
- .gitmodules +6 -0
- CODE_OF_CONDUCT.md +80 -0
- CONTRIBUTING.md +31 -0
- LICENSE +124 -0
- README.md +48 -0
- app.py +591 -0
- cowtracker/__init__.py +23 -0
- cowtracker/heads/__init__.py +14 -0
- cowtracker/heads/feature_extractor.py +100 -0
- cowtracker/heads/tracking_head.py +243 -0
- cowtracker/inference/__init__.py +12 -0
- cowtracker/inference/windowed.py +144 -0
- cowtracker/layers/__init__.py +26 -0
- cowtracker/layers/dpt_head.py +173 -0
- cowtracker/layers/patch_embed.py +90 -0
- cowtracker/layers/resnet_deconv.py +61 -0
- cowtracker/layers/temporal_attention.py +307 -0
- cowtracker/layers/video_transformer.py +411 -0
- cowtracker/models/__init__.py +23 -0
- cowtracker/models/cowtracker.py +228 -0
- cowtracker/models/cowtracker_windowed.py +218 -0
- cowtracker/thirdparty/DepthAnythingV2 +1 -0
- cowtracker/thirdparty/__init__.py +19 -0
- cowtracker/thirdparty/vggt +1 -0
- cowtracker/utils/__init__.py +32 -0
- cowtracker/utils/ops.py +151 -0
- cowtracker/utils/padding.py +168 -0
- cowtracker/utils/visualization.py +229 -0
- demo.py +179 -0
- docs/logo.jpg +3 -0
- docs/teaser.jpg +3 -0
- environments.yml +36 -0
- output.mp4 +3 -0
- packages.txt +2 -0
- requirements.txt +26 -0
- videos/apple.mp4 +3 -0
- videos/bear.mp4 +3 -0
- videos/bmx-bumps.mp4 +3 -0
- videos/cows.mp4 +3 -0
- videos/lab-coat.mp4 +3 -0
- videos/longboard.mp4 +3 -0
- videos/motocross-jump.mp4 +3 -0
.gitattributes ADDED
@@ -0,0 +1,37 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+*.mp4 filter=lfs diff=lfs merge=lfs -text
+*.jpg filter=lfs diff=lfs merge=lfs -text
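
Note: the `*.mp4` and `*.jpg` rules route the demo videos and docs images through Git LFS, which is why those binary files all show as +3 -0 in the stats above — an LFS pointer file is exactly three lines of text (version, oid, size).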

.gitignore ADDED
@@ -0,0 +1,42 @@
+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+*.egg
+*.egg-info/
+dist/
+build/
+eggs/
+.eggs/
+
+# Temporary files
+tmp/
+*.tmp
+*.temp
+
+# OS files
+.DS_Store
+.DS_Store?
+._*
+Thumbs.db
+
+# IDE
+.vscode/
+.idea/
+*.swp
+*.swo
+
+# Jupyter
+.ipynb_checkpoints/
+
+# Model checkpoints (use LFS for large files)
+# *.pth
+# *.pt
+
+# Environment
+.env
+.venv/
+venv/
+ENV/
+
.gitmodules ADDED
@@ -0,0 +1,6 @@
+[submodule "cowtracker/thirdparty/DepthAnythingV2"]
+	path = cowtracker/thirdparty/DepthAnythingV2
+	url = https://github.com/DepthAnything/Depth-Anything-V2.git
+[submodule "cowtracker/thirdparty/vggt"]
+	path = cowtracker/thirdparty/vggt
+	url = https://github.com/facebookresearch/vggt.git
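
Both third-party dependencies are vendored as git submodules, so a fresh clone needs them initialized (e.g. `git clone --recurse-submodules`, or `git submodule update --init --recursive` after the fact) before the `vggt` imports in cowtracker/heads below can resolve.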

CODE_OF_CONDUCT.md ADDED
@@ -0,0 +1,80 @@
+# Code of Conduct
+
+## Our Pledge
+
+In the interest of fostering an open and welcoming environment, we as
+contributors and maintainers pledge to make participation in our project and
+our community a harassment-free experience for everyone, regardless of age, body
+size, disability, ethnicity, sex characteristics, gender identity and expression,
+level of experience, education, socio-economic status, nationality, personal
+appearance, race, religion, or sexual identity and orientation.
+
+## Our Standards
+
+Examples of behavior that contributes to creating a positive environment
+include:
+
+* Using welcoming and inclusive language
+* Being respectful of differing viewpoints and experiences
+* Gracefully accepting constructive criticism
+* Focusing on what is best for the community
+* Showing empathy towards other community members
+
+Examples of unacceptable behavior by participants include:
+
+* The use of sexualized language or imagery and unwelcome sexual attention or
+  advances
+* Trolling, insulting/derogatory comments, and personal or political attacks
+* Public or private harassment
+* Publishing others' private information, such as a physical or electronic
+  address, without explicit permission
+* Other conduct which could reasonably be considered inappropriate in a
+  professional setting
+
+## Our Responsibilities
+
+Project maintainers are responsible for clarifying the standards of acceptable
+behavior and are expected to take appropriate and fair corrective action in
+response to any instances of unacceptable behavior.
+
+Project maintainers have the right and responsibility to remove, edit, or
+reject comments, commits, code, wiki edits, issues, and other contributions
+that are not aligned to this Code of Conduct, or to ban temporarily or
+permanently any contributor for other behaviors that they deem inappropriate,
+threatening, offensive, or harmful.
+
+## Scope
+
+This Code of Conduct applies within all project spaces, and it also applies when
+an individual is representing the project or its community in public spaces.
+Examples of representing a project or community include using an official
+project e-mail address, posting via an official social media account, or acting
+as an appointed representative at an online or offline event. Representation of
+a project may be further defined and clarified by project maintainers.
+
+This Code of Conduct also applies outside the project spaces when there is a
+reasonable belief that an individual's behavior may have a negative impact on
+the project or its community.
+
+## Enforcement
+
+Instances of abusive, harassing, or otherwise unacceptable behavior may be
+reported by contacting the project team at <opensource-conduct@meta.com>. All
+complaints will be reviewed and investigated and will result in a response that
+is deemed necessary and appropriate to the circumstances. The project team is
+obligated to maintain confidentiality with regard to the reporter of an incident.
+Further details of specific enforcement policies may be posted separately.
+
+Project maintainers who do not follow or enforce the Code of Conduct in good
+faith may face temporary or permanent repercussions as determined by other
+members of the project's leadership.
+
+## Attribution
+
+This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
+available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
+
+[homepage]: https://www.contributor-covenant.org
+
+For answers to common questions about this code of conduct, see
+https://www.contributor-covenant.org/faq

CONTRIBUTING.md ADDED
@@ -0,0 +1,31 @@
+# Contributing to cowtracker
+We want to make contributing to this project as easy and transparent as
+possible.
+
+## Pull Requests
+We actively welcome your pull requests.
+
+1. Fork the repo and create your branch from `main`.
+2. If you've added code that should be tested, add tests.
+3. If you've changed APIs, update the documentation.
+4. Ensure the test suite passes.
+5. Make sure your code lints.
+6. If you haven't already, complete the Contributor License Agreement ("CLA").
+
+## Contributor License Agreement ("CLA")
+In order to accept your pull request, we need you to submit a CLA. You only need
+to do this once to work on any of Facebook's open source projects.
+
+Complete your CLA here: <https://code.facebook.com/cla>
+
+## Issues
+We use GitHub issues to track public bugs. Please ensure your description is
+clear and has sufficient instructions to be able to reproduce the issue.
+
+Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe
+disclosure of security bugs. In those cases, please go through the process
+outlined on that page and do not file a public issue.
+
+## License
+By contributing to cowtracker, you agree that your contributions will be licensed
+under the LICENSE file in the root directory of this source tree.

LICENSE ADDED
@@ -0,0 +1,124 @@
+FAIR Noncommercial Research License
+v1 Last Updated: December 22, 2025
+
+“Acceptable Use Policy” means the FAIR Acceptable Use Policy, applicable to Research Materials, that is incorporated into this Agreement.
+
+“Agreement” means the terms and conditions for use, reproduction, distribution and modification of the Research Materials set forth herein.
+
+
+“Documentation” means the specifications, manuals and documentation accompanying
+Research Materials distributed by Meta.
+
+
+“Licensee” or “you” means you, or your employer or any other person or entity (if you are entering into this Agreement on such person or entity’s behalf), of the age required under applicable laws, rules or regulations to provide legal consent and that has legal authority to bind your employer or such other person or entity if you are entering in this Agreement on their behalf.
+
+
+“Meta” or “we” means Meta Platforms Ireland Limited (if you are located in or, if you are an entity, your principal place of business is in the EEA or Switzerland) and Meta Platforms, Inc. (if you are located outside of the EEA or Switzerland).
+
+“Noncommercial Research Uses” means noncommercial research use cases related to research, development, education, processing, or analysis and in each case, is not primarily intended for commercial advantage or monetary compensation to you or others.
+
+“Research Materials” means, collectively, Documentation and the models, software and algorithms, including machine-learning model code, trained model weights, inference-enabling code, training-enabling code, fine-tuning enabling code, demonstration materials and other elements of the foregoing distributed by Meta and made available under this Agreement.
+
+By clicking “I Accept” below or by using or distributing any portion or element of the Research Materials, you agree to be bound by this Agreement.
+
+
+1. License Rights and Redistribution.
+
+
+a. Grant of Rights. You are granted a non-exclusive, worldwide, non-transferable and royalty-free limited license under Meta’s intellectual property or other rights owned by Meta embodied in the Research Materials to use, reproduce, distribute, copy, create derivative works of, and make modifications to the Research Materials.
+
+b. Redistribution and Use.
+i. You will not use the Research Materials or any outputs or results of the Research Materials in connection with any commercial uses or for any uses other than Noncommercial Research Uses;
+
+
+ii. Distribution of Research Materials, and any derivative works thereof, are subject to the terms of this Agreement. If you distribute or make the Research Materials, or any derivative works thereof, available to a third party, you may only do so under the terms of this Agreement. You shall also provide a copy of this Agreement to such third party.
+
+
+iii. If you submit for publication the results of research you perform on, using, or otherwise in connection with Research Materials, you must acknowledge the use of Research Materials in your publication.
+
+
+iv. Your use of the Research Materials must comply with applicable laws and regulations (including Trade Control Laws) and adhere to the FAIR Acceptable Use Policy, which is hereby incorporated by reference into this Agreement.
+2. User Support. Your Noncommercial Research Use of the Research Materials is done at your own discretion; Meta does not process any information nor provide any service in relation to such use. Meta is under no obligation to provide any support services for the Research Materials. Any support provided is “as is”, “with all faults”, and without warranty of any kind.
+
+
+3. Disclaimer of Warranty. UNLESS REQUIRED BY APPLICABLE LAW, THE RESEARCH MATERIALS AND ANY OUTPUT AND RESULTS THEREFROM ARE PROVIDED ON AN “AS IS” BASIS, WITHOUT WARRANTIES OF ANY KIND, AND META DISCLAIMS ALL WARRANTIES OF ANY KIND, BOTH EXPRESS AND IMPLIED, INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. YOU ARE SOLELY RESPONSIBLE FOR DETERMINING THE APPROPRIATENESS OF USING OR REDISTRIBUTING THE RESEARCH MATERIALS AND ASSUME ANY RISKS ASSOCIATED WITH YOUR USE OF THE RESEARCH MATERIALS AND ANY OUTPUT AND RESULTS.
+
+4. Limitation of Liability. IN NO EVENT WILL META OR ITS AFFILIATES BE LIABLE UNDER ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, TORT, NEGLIGENCE, PRODUCTS LIABILITY, OR OTHERWISE, ARISING OUT OF THIS AGREEMENT, FOR ANY LOST PROFITS OR ANY DIRECT OR INDIRECT, SPECIAL, CONSEQUENTIAL, INCIDENTAL, EXEMPLARY OR PUNITIVE DAMAGES, EVEN IF META OR ITS AFFILIATES HAVE BEEN ADVISED OF THE POSSIBILITY OF ANY OF THE FOREGOING.
+
+5. Intellectual Property.
+
+
+a. Subject to Meta’s ownership of Research Materials and derivatives made by or for Meta, with respect to any derivative works and modifications of the Research Materials that are made by you, as between you and Meta, you are and will be the owner of such derivative works and modifications.
+
+b. If you institute litigation or other proceedings against Meta or any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Research Materials, outputs or results, or any portion of any of the foregoing, constitutes infringement of intellectual property or other rights owned or licensable by you, then any licenses granted to you under this Agreement shall terminate as of the date such litigation or claim is filed or instituted. You will indemnify and hold harmless Meta from and against any claim by any third party arising out of or related to your use or distribution of the Research Materials.
+
+6. Term and Termination. The term of this Agreement will commence upon your acceptance of this Agreement or access to the Research Materials and will continue in full force and effect until terminated in accordance with the terms and conditions herein. Meta may terminate this Agreement if you are in breach of any term or condition of this Agreement. Upon termination of this Agreement, you shall delete and cease use of the Research Materials. Sections 3, 4 and 7 shall survive the termination of this Agreement.
+
+7. Governing Law and Jurisdiction. This Agreement will be governed and construed under the laws of the State of California without regard to choice of law principles, and the UN Convention on Contracts for the International Sale of Goods does not apply to this Agreement. The courts of California shall have exclusive jurisdiction of any dispute arising out of this Agreement.
+
+
+8. Modifications and Amendments. Meta may modify this Agreement from time to time; provided that they are similar in spirit to the current version of the Agreement, but may differ in detail to address new problems or concerns. All such changes will be effective immediately. Your continued use of the Research Materials after any modification to this Agreement constitutes your agreement to such modification. Except as provided in this Agreement, no modification or addition to any provision of this Agreement will be binding unless it is in writing and signed by an authorized representative of both you and Meta.
+
+
+FAIR Acceptable Use Policy
+
+The Fundamental AI Research (FAIR) team at Meta seeks to further understanding of new and existing research domains with the mission of advancing the state-of-the-art in artificial intelligence through open research for the benefit of all.
+
+As part of this mission, Meta makes certain research materials available for noncommercial research use. Meta is committed to promoting the safe and responsible use of such research materials.
+
+Prohibited Uses
+
+You agree you will not use, or allow others to use, Research Materials to:
+
+Violate the law or others’ rights, including to:
+Engage in, promote, generate, contribute to, encourage, plan, incite, or further illegal or unlawful activity or content, such as:
+Violence or terrorism
+Exploitation or harm to children, including the solicitation, creation, acquisition, or dissemination of child exploitative content or failure to report Child Sexual Abuse Material
+Human trafficking, exploitation, and sexual violence
+The illegal distribution of information or materials to minors, including obscene materials, or failure to employ legally required age-gating in connection with such information or materials.
+Sexual solicitation
+Any other criminal activity
+
+Engage in, promote, incite, or facilitate the harassment, abuse, threatening, or bullying of individuals or groups of individuals
+
+Engage in, promote, incite, or facilitate discrimination or other unlawful or harmful conduct in the provision of employment, employment benefits, credit, housing, other economic benefits, or other essential goods and services
+
+Engage in the unauthorized or unlicensed practice of any profession including, but not limited to, financial, legal, medical/health, or related professional practices
+
+Collect, process, disclose, generate, or infer health, demographic, or other sensitive personal or private information about individuals without rights and consents required by applicable laws
+
+Engage in or facilitate any action or generate any content that infringes, misappropriates, or otherwise violates any third-party rights, including the outputs or results of any technology using FAIR research materials
+
+Create, generate, or facilitate the creation of malicious code, malware, computer viruses or do anything else that could disable, overburden, interfere with or impair the proper working, integrity, operation or appearance of a website or computer system
+
+2. Engage in, promote, incite, facilitate, or assist in the planning or development of activities that present a risk of death or bodily harm to individuals, including use of research artifacts related to the following:
+
+Military, warfare, nuclear industries or applications, espionage, use for materials or activities that are subject to the International Traffic Arms Regulations (ITAR) maintained by the United States Department of State
+
+Guns and illegal weapons (including weapon development)
+
+Illegal drugs and regulated/controlled substances
+
+Operation of critical infrastructure, transportation technologies, or heavy machinery
+
+Self-harm or harm to others, including suicide, cutting, and eating disorders
+
+Any content intended to incite or promote violence, abuse, or any infliction of bodily harm to an individual
+
+3. Intentionally deceive or mislead others, including use of FAIR Research Materials related to the following:
+
+Generating, promoting, or furthering fraud or the creation or promotion of disinformation
+
+Generating, promoting, or furthering defamatory content, including the creation of defamatory statements, images, or other content
+
+Generating, promoting, or further distributing spam
+
+Impersonating another individual without consent, authorization, or legal right
+
+Representing that outputs of FAIR research materials or outputs from technology using FAIR research materials are human-generated
+
+Generating or facilitating false online engagement, including fake reviews and other means of fake online engagement
+
+4. Fail to appropriately disclose to end users any known dangers of your Research Materials.
+
+Please report any violation of this Policy or other problems that could lead to a violation of this Policy by submitting a report here [https://docs.google.com/forms/d/e/1FAIpQLSeb11cryAopJ7LNrC4nxEUXrHY26hfkXQMf_uH-oFgA3WlYZQ/viewform].

README.md ADDED
@@ -0,0 +1,48 @@
+---
+title: CoWTracker
+emoji: 🐮
+colorFrom: green
+colorTo: yellow
+sdk: gradio
+sdk_version: 6.2.0
+app_file: app.py
+pinned: false
+license: cc-by-nc-4.0
+suggested_hardware: a10g-small
+short_description: Dense Point Tracking with Cost-Volume Free Warping
+---
+
+# 🐮 CoWTracker
+
+**Cost-Volume Free Warping-Based Dense Point Tracking**
+
+Zihang Lai<sup>1,2</sup>, Eldar Insafutdinov<sup>1</sup>, Edgar Sucar<sup>1</sup>, Andrea Vedaldi<sup>1,2</sup>
+
+<sup>1</sup>Visual Geometry Group, University of Oxford <sup>2</sup>Meta AI
+
+---
+
+Upload a video and CoWTracker will track every pixel through time, visualizing the motion with colorful trajectories.
+
+## Features
+
+- **Dense Tracking**: Track all pixels simultaneously
+- **Bidirectional**: Track forwards and backwards from any query frame
+- **Interactive**: Choose query frame and visualization settings
+- **Fast**: Efficient warping-based architecture
+
+## Links
+
+- [Project Page](https://cowtracker.github.io/)
+- [GitHub](https://github.com/facebookresearch/cowtracker/)
+- [Paper](#)
+
+## Usage
+
+1. Upload a video (or select an example)
+2. Click "Process Video"
+3. Select query frame using the slider
+4. Click "Start Tracking"
+5. Adjust visualization settings as needed
+
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
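
For programmatic use outside the Gradio UI, here is a minimal sketch of the API that app.py below exercises. The device/dtype selection, the `queries=None` dense mode, and the 0.1 visibility threshold all mirror the demo; the dummy clip and the output shapes are assumptions drawn from app.py's comments.

import torch
from cowtracker import CoWTracker

device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if device == "cuda" else torch.float32

# None falls back to the default HuggingFace Hub checkpoint,
# exactly as initialize_model() in app.py does.
model = CoWTracker.from_checkpoint(None, device=device, dtype=dtype)

video = torch.zeros(16, 3, 336, 560, dtype=dtype).to(device)  # (T, C, H, W) dummy clip
with torch.no_grad():
    preds = model.forward(video=video, queries=None)  # dense tracking from frame 0

tracks = preds["track"][0]                    # (T, H, W, 2) per-pixel trajectories
visconf = preds["vis"][0] * preds["conf"][0]  # visibility x confidence maps
visible = visconf > 0.1                       # the demo's visibility threshold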

app.py ADDED
@@ -0,0 +1,591 @@
+#!/usr/bin/env python3
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+CoWTracker Gradio Demo.
+
+Interactive web demo for dense point tracking using CoWTracker.
+
+Usage:
+    python app.py
+    python app.py --checkpoint /path/to/model.pth --port 8086
+"""
+
+import glob
+import os
+import tempfile
+import uuid
+from typing import Optional
+
+import gradio as gr
+import spaces
+import matplotlib
+import mediapy
+import numpy as np
+import PIL.Image
+import torch
+
+from cowtracker import CoWTracker
+from cowtracker.utils.padding import (
+    apply_padding,
+    compute_padding_params,
+    remove_padding_and_scale_back,
+)
+from cowtracker.utils.visualization import (
+    get_2d_colors,
+    get_colors_from_cmap,
+    paint_point_track,
+)
+
+# --- Constants ---
+PREVIEW_WIDTH = 1024
+PREVIEW_HEIGHT = 1024
+FRAME_LIMIT = 512
+# Default checkpoint: None means use the model's default HuggingFace URL
+DEFAULT_CHECKPOINT = None
+
+# --- Model Initialization ---
+
+
+def initialize_model(checkpoint_path: Optional[str] = None):
+    """Initialize and load the CoWTracker model once at startup.
+
+    Args:
+        checkpoint_path: Path to local checkpoint file.
+            If None, downloads from HuggingFace Hub.
+    """
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    dtype = torch.float16 if device == "cuda" else torch.float32
+
+    ckpt_path = checkpoint_path if checkpoint_path is not None else DEFAULT_CHECKPOINT
+    if ckpt_path:
+        print(f"Initializing CoWTracker model from {ckpt_path}...")
+    else:
+        print("Initializing CoWTracker model from HuggingFace Hub...")
+
+    model = CoWTracker.from_checkpoint(
+        ckpt_path,
+        device=device,
+        dtype=dtype,
+    )
+
+    print("Model initialized successfully!")
+    return model
+
+
+# Initialize model once at module level
+GLOBAL_MODEL = None
+
+
+def get_model():
+    """Get the global model, initializing if needed."""
+    global GLOBAL_MODEL
+    if GLOBAL_MODEL is None:
+        GLOBAL_MODEL = initialize_model()
+    return GLOBAL_MODEL
+
+
+# --- Core Logic Functions ---
+
+
+def preprocess_video_input(video_path):
+    """Process uploaded video for tracking."""
+    if not video_path:
+        return None
+
+    video_arr = mediapy.read_video(video_path)
+    video_fps = video_arr.metadata.fps
+    num_frames = video_arr.shape[0]
+
+    if num_frames > FRAME_LIMIT:
+        gr.Warning(
+            f"Video is too long. Truncating to first {FRAME_LIMIT} frames.", duration=5
+        )
+        video_arr = video_arr[:FRAME_LIMIT]
+        num_frames = FRAME_LIMIT
+
+    height, width = video_arr.shape[1:3]
+    if height > width:
+        new_height, new_width = PREVIEW_HEIGHT, int(PREVIEW_WIDTH * width / height)
+    else:
+        new_height, new_width = int(PREVIEW_WIDTH * height / width), PREVIEW_WIDTH
+
+    # Resize logic to keep manageable size
+    if new_height * new_width > 768 * 1024:
+        new_height = new_height * 3 // 4
+        new_width = new_width * 3 // 4
+
+    # Make divisible by 16 for ffmpeg compatibility
+    new_height, new_width = new_height // 16 * 16, new_width // 16 * 16
+
+    preview_video = mediapy.resize_video(video_arr, (new_height, new_width))
+    input_video = preview_video  # using preview size for processing
+
+    preview_video = np.array(preview_video)
+    input_video = np.array(input_video)
+
+    return (
+        video_arr,
+        preview_video,
+        preview_video.copy(),
+        input_video,
+        video_fps,
+        preview_video[0],
+        gr.update(minimum=0, maximum=num_frames - 1, value=0, interactive=True),
+        gr.update(interactive=True),
+    )
+
+
+def choose_frame(frame_num, video_preview_array):
+    """Select frame for preview."""
+    if video_preview_array is None:
+        return None
+    return video_preview_array[int(frame_num)]
+
+
+def paint_video(
+    video_preview,
+    query_frame,
+    video_fps,
+    tracks,
+    visibs,
+    rate=1,
+    show_bkg=True,
+    cmap="gist_rainbow",
+):
+    """Paint tracks onto video and save to file."""
+    T, H, W, _ = video_preview.shape
+
+    # Get colors based on colormap choice
+    if cmap == "bremm":
+        xy0 = tracks[:, query_frame]
+        colors = get_2d_colors(xy0, H, W)
+    else:
+        query_count = tracks.shape[0]
+        colors = get_colors_from_cmap(query_count, cmap)
+
+    painted_video = paint_point_track(
+        video_preview, tracks, visibs, colors, rate=rate, show_bkg=show_bkg
+    )
+
+    # Save video to temp directory
+    video_file_name = uuid.uuid4().hex + ".mp4"
+    video_path = os.path.join(tempfile.gettempdir(), "cowtracker_output")
+    video_file_path = os.path.join(video_path, video_file_name)
+    os.makedirs(video_path, exist_ok=True)
+
+    # Cleanup old jpgs
+    for f in glob.glob(os.path.join(video_path, "*.jpg")):
+        os.remove(f)
+
+    # Save frames and compile with ffmpeg
+    for ti in range(T):
+        temp_out_f = "%s/%03d.jpg" % (video_path, ti)
+        im = PIL.Image.fromarray(painted_video[ti])
+        im.save(temp_out_f)
+
+    os.system(
+        f'ffmpeg -y -hide_banner -loglevel error -f image2 -framerate {video_fps} '
+        f'-pattern_type glob -i "{video_path}/*.jpg" -c:v libx264 -crf 20 '
+        f'-pix_fmt yuv420p {video_file_path}'
+    )
+
+    # Cleanup used jpgs
+    for ti in range(T):
+        temp_out_f = "%s/%03d.jpg" % (video_path, ti)
+        if os.path.exists(temp_out_f):
+            os.remove(temp_out_f)
+
+    return video_file_path
+
+
+@spaces.GPU
+def update_vis(
+    rate, show_bkg, cmap, video_preview, query_frame, video_fps, tracks, visibs
+):
+    """Update visualization with new settings."""
+    if video_preview is None or len(tracks) == 0:
+        return None
+    T, H, W, _ = video_preview.shape
+    tracks_ = tracks.reshape(H, W, T, 2)[::rate, ::rate].reshape(-1, T, 2)
+    visibs_ = visibs.reshape(H, W, T)[::rate, ::rate].reshape(-1, T)
+    return paint_video(
+        video_preview,
+        query_frame,
+        video_fps,
+        tracks_,
+        visibs_,
+        rate=rate,
+        show_bkg=show_bkg,
+        cmap=cmap,
+    )
+
+
+@spaces.GPU
+def track(video_preview, video_input, video_fps, query_frame, rate, show_bkg, cmap):
+    """Run tracking on video with bidirectional propagation."""
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    dtype = torch.float16 if device == "cuda" else torch.float32
+
+    video_tensor = torch.tensor(video_input).unsqueeze(0).to(dtype)
+
+    # Use the globally initialized model
+    model = get_model()
+    print("Using pre-loaded model for tracking...")
+
+    video_tensor = video_tensor.permute(0, 1, 4, 2, 3)
+    _, T, _, H, W = video_tensor.shape
+
+    # Store original resolution
+    orig_H, orig_W = H, W
+
+    # Configure inference size and compute padding parameters
+    inf_H, inf_W = 336, 560
+    skip_upscaling = True
+
+    print(f"Original video size: {orig_H}x{orig_W}")
+    print(f"Inference size: {inf_H}x{inf_W}")
+
+    # Compute padding parameters
+    padding_info = compute_padding_params(
+        orig_H, orig_W, inf_H, inf_W, skip_upscaling=skip_upscaling
+    )
+    print(f"Scale factor: {padding_info['scale']:.4f}")
+    if padding_info["upscaling_skipped"]:
+        print(
+            f"Upscaling skipped (scale > 1.0) - using original size: {orig_H}x{orig_W}"
+        )
+    else:
+        print(
+            f"Scaled size (before padding): {padding_info['scaled_H']}x{padding_info['scaled_W']}"
+        )
+        print(
+            f"Padding: top={padding_info['pad_top']}, bottom={padding_info['pad_bottom']}, "
+            f"left={padding_info['pad_left']}, right={padding_info['pad_right']}"
+        )
+
+    torch.cuda.empty_cache()
+
+    # Initialize output tensors for INFERENCE resolution
+    traj_maps_e = torch.zeros(
+        (1, T, inf_H, inf_W, 2), dtype=torch.float32, device="cpu"
+    )
+    visconf_maps_e = torch.zeros(
+        (1, T, inf_H, inf_W), dtype=torch.float32, device="cpu"
+    )
+
+    with torch.no_grad():
+        # Forward pass
+        if query_frame < T - 1:
+            with torch.amp.autocast(device_type="cuda", dtype=torch.float16):
+                # Apply padding to forward video
+                forward_video = video_tensor[0, query_frame:]
+                forward_video_padded = apply_padding(forward_video, padding_info).to(device)
+
+                predictions = model.forward(
+                    video=forward_video_padded,
+                    queries=None,
+                )
+
+                # Extract dense predictions (at INFERENCE resolution)
+                tracks_dense = predictions["track"][0]  # (T_forward, inf_H, inf_W, 2)
+                visibility_dense = predictions["vis"][0]  # (T_forward, inf_H, inf_W)
+                confidence_dense = predictions["conf"][0]  # (T_forward, inf_H, inf_W)
+
+                # Store forward predictions
+                T_forward = tracks_dense.shape[0]
+                traj_maps_e[0, query_frame : query_frame + T_forward] = (
+                    tracks_dense.cpu()
+                )
+                visconf_maps_e[0, query_frame : query_frame + T_forward] = (
+                    visibility_dense * confidence_dense
+                ).cpu()
+
+        # Backward pass
+        if query_frame > 0:
+            with torch.amp.autocast(device_type="cuda", dtype=torch.float16):
+                # Flip video for backward tracking and apply padding
+                backward_video = video_tensor[0, : query_frame + 1].flip([0])
+                backward_video_padded = apply_padding(
+                    backward_video, padding_info
+                ).to(device)
+
+                predictions = model.forward(
+                    video=backward_video_padded,
+                    queries=None,
+                )
+
+                # Extract dense predictions (at INFERENCE resolution)
+                tracks_dense = predictions["track"][0]  # (T_backward, inf_H, inf_W, 2)
+                visibility_dense = predictions["vis"][0]  # (T_backward, inf_H, inf_W)
+                confidence_dense = predictions["conf"][0]  # (T_backward, inf_H, inf_W)
+
+                # Flip back to original temporal order
+                backward_tracks = tracks_dense.flip([0]).cpu()
+                backward_visconf = (visibility_dense * confidence_dense).flip([0]).cpu()
+
+                # Store backward predictions (excluding query frame if needed)
+                end_idx = query_frame if query_frame < T - 1 else query_frame + 1
+                traj_maps_e[0, :end_idx] = backward_tracks[:end_idx]
+                visconf_maps_e[0, :end_idx] = backward_visconf[:end_idx]
+
+    # Remove padding and scale back to original resolution
+    print(f"Removing padding and scaling back to {orig_H}x{orig_W}")
+    tracks_final, _, confidence_final = remove_padding_and_scale_back(
+        traj_maps_e[0],  # (T, inf_H, inf_W, 2)
+        torch.ones_like(visconf_maps_e[0]),  # dummy visibility (not used here)
+        visconf_maps_e[0],  # (T, inf_H, inf_W)
+        padding_info,
+    )
+    print(f"Tracks shape after unpadding: {tracks_final.shape}")
+    print(f"Confidence shape after unpadding: {confidence_final.shape}")
+
+    # Convert to numpy format
+    tracks = tracks_final.permute(1, 2, 0, 3).reshape(-1, T, 2).numpy()
+    confs = confidence_final.permute(1, 2, 0).reshape(-1, T).numpy()
+    visibs = confs > 0.1
+
+    return (
+        update_vis(
+            rate, show_bkg, cmap, video_preview, query_frame, video_fps, tracks, visibs
+        ),
+        tracks,
+        visibs,
+        gr.update(interactive=True),
+        gr.update(interactive=True),
+        gr.update(interactive=True),
+    )
+
+
+# --- Gradio UI Layout ---
+
+custom_css = """
+h1 {text-align: center; margin-bottom: 0 !important;}
+.contain {max-width: 95% !important;}
+#examples-accordion {margin-top: 10px;}
+"""
+
+
+def create_demo():
+    """Create and return the Gradio demo interface."""
+    with gr.Blocks(title="CoWTracker Demo", theme=gr.themes.Ocean(), css=custom_css) as demo:
+        # State Variables
+        video_state = gr.State()
+        video_queried_preview = gr.State()
+        video_preview = gr.State()
+        video_input = gr.State()
+        video_fps = gr.State(24)
+        tracks = gr.State([])
+        visibs = gr.State([])
+
+        # Header
+        gr.Markdown(
+            """
+            <div style="text-align: center; max-width: 800px; margin: 0 auto;">
+            <h1 style="font-weight: 900; margin-bottom: 7px;">CoWTracker</h1>
+            <p style="margin-bottom: 10px; font-size: 94%">
+            Cost-Volume Free Warping-Based Dense Point Tracking.
+            <a href='https://cowtracker.github.io/' target='_blank'>Project Page</a> |
+            <a href='https://github.com/facebookresearch/cowtracker/' target='_blank'>GitHub</a> |
+            <a href='' target='_blank'>Paper</a>
+            </p>
+            </div>
+            """
+        )
+
+        with gr.Row():
+            # --- Left Column: Input & Query ---
+            with gr.Column(scale=1):
+                with gr.Group():
+                    gr.Markdown("### 1. Upload Video")
+                    video_in = gr.Video(label="Input Video", format="mp4", height=300)
+                    submit_btn = gr.Button("Step 1: Process Video", variant="primary")
+
+                # Query Frame Preview
+                with gr.Group():
+                    gr.Markdown("### 2. Select Query Frame")
+                    query_frame_slider = gr.Slider(
+                        minimum=0,
+                        maximum=100,
+                        value=0,
+                        step=1,
+                        label="Frame Number",
+                        interactive=False,
+                    )
+                    current_frame = gr.Image(
+                        label="Query Frame Preview",
+                        type="numpy",
+                        interactive=False,
+                        height=300,
+                    )
+
+            # --- Right Column: Visualization & Output ---
+            with gr.Column(scale=2):
+                gr.Markdown("### 3. Configure & Track")
+
+                with gr.Group():
+                    with gr.Row():
+                        rate_radio = gr.Radio(
+                            [1, 2, 4, 8],
+                            value=8,
+                            label="Subsampling Rate",
+                            interactive=False,
+                        )
+                        cmap_radio = gr.Radio(
+                            ["gist_rainbow", "rainbow", "jet", "turbo"],
+                            value="gist_rainbow",
+                            label="Colormap",
+                            interactive=False,
+                        )
+
+                    with gr.Row():
+                        bkg_check = gr.Checkbox(
+                            value=True, label="Overlay on Video", interactive=False
+                        )
+                        track_button = gr.Button(
+                            "Step 2: Start Tracking", variant="primary", interactive=False
+                        )
+
+                # Output takes entire width of this column
+                output_video = gr.Video(
+                    label="Tracking Result",
+                    interactive=False,
+                    autoplay=True,
+                    loop=True,
+                    height=550,
+                )
+
+        # --- Full Width Row: Examples ---
+        with gr.Row():
+            with gr.Column():
+                video_folder = "videos"
+                gr.Markdown("### 📚 Example Videos")
+                video_dir = os.path.join(os.path.dirname(__file__), video_folder)
+                video_files = []
+                if os.path.exists(video_dir):
+                    for filename in sorted(os.listdir(video_dir)):
+                        if filename.endswith((".mp4", ".avi", ".mov", ".mkv", ".webm")):
+                            video_files.append(os.path.join(video_dir, filename))
+
+                if video_files:
+                    gr.Examples(
+                        examples=video_files,
+                        inputs=[video_in],
+                        examples_per_page=16,
+                    )
+
+        # --- Interaction Logic ---
+
+        # 1. Submit Video
+        submit_btn.click(
+            fn=preprocess_video_input,
+            inputs=[video_in],
+            outputs=[
+                video_state,
+                video_preview,
+                video_queried_preview,
+                video_input,
+                video_fps,
+                current_frame,
+                query_frame_slider,
+                track_button,
+            ],
+            queue=False,
+        )
+
+        # 2. Update Preview Frame on Slider Change
+        query_frame_slider.change(
+            fn=choose_frame,
+            inputs=[query_frame_slider, video_queried_preview],
+            outputs=[current_frame],
+            queue=False,
+        )
+
+        # 3. Run Tracking
+        track_button.click(
+            fn=track,
+            inputs=[
+                video_preview,
+                video_input,
+                video_fps,
+                query_frame_slider,
+                rate_radio,
+                bkg_check,
+                cmap_radio,
+            ],
+            outputs=[
+                output_video,
+                tracks,
+                visibs,
+                rate_radio,
+                bkg_check,
+                cmap_radio,
+            ],
+            queue=True,
+        )
+
+        # 4. Instant Updates for Visualization Settings (after tracking is done)
+        vis_args = [
+            rate_radio,
+            bkg_check,
+            cmap_radio,
+            video_preview,
+            query_frame_slider,
+            video_fps,
+            tracks,
+            visibs,
+        ]
+        rate_radio.change(
+            fn=update_vis, inputs=vis_args, outputs=[output_video], queue=False
+        )
+        cmap_radio.change(
+            fn=update_vis, inputs=vis_args, outputs=[output_video], queue=False
+        )
+        bkg_check.change(
+            fn=update_vis, inputs=vis_args, outputs=[output_video], queue=False
+        )
+
+    return demo
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser(description="CoWTracker Gradio Demo")
+    parser.add_argument(
+        "--checkpoint",
+        type=str,
+        default=None,
+        help="Path to model checkpoint",
+    )
+    parser.add_argument(
+        "--port",
+        type=int,
+        default=7860,
+        help="Server port",
+    )
+    parser.add_argument(
+        "--share",
+        action="store_true",
+        help="Create a public share link",
+    )
+    args = parser.parse_args()
+
+    # Initialize model with custom checkpoint if provided
+    if args.checkpoint:
+        GLOBAL_MODEL = initialize_model(args.checkpoint)
+
+    print("=" * 60)
+    print("Starting CoWTracker Gradio Demo")
+    print("=" * 60)
+
+    demo = create_demo()
+    demo.launch(
+        share=args.share,
+        show_error=True,
+        server_port=args.port,
+    )
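
A reusable piece of `track()` above is the pad-infer-unpad round trip. Here is a minimal sketch of it under the same shape conventions as app.py's comments; the resolutions are illustrative and the output shapes are assumptions based on those comments rather than documented API guarantees.

import torch
from cowtracker.utils.padding import (
    apply_padding,
    compute_padding_params,
    remove_padding_and_scale_back,
)

orig_H, orig_W = 480, 854   # example source resolution
inf_H, inf_W = 336, 560     # inference resolution used by the demo

info = compute_padding_params(orig_H, orig_W, inf_H, inf_W, skip_upscaling=True)

video = torch.zeros(8, 3, orig_H, orig_W)  # (T, C, H, W) dummy clip
padded = apply_padding(video, info)        # scaled and padded to the inference grid

# Dummy dense predictions at inference resolution, as model.forward would return.
T = video.shape[0]
tracks = torch.zeros(T, inf_H, inf_W, 2)
visconf = torch.ones(T, inf_H, inf_W)
tracks_out, _, conf_out = remove_padding_and_scale_back(
    tracks, torch.ones_like(visconf), visconf, info
)
# tracks_out is expected at (T, orig_H, orig_W, 2), i.e. back in source pixel coordinates.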

cowtracker/__init__.py ADDED
@@ -0,0 +1,23 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""CoWTracker: Cost-Volume Free Warping-Based Dense Point Tracking."""
+
+
+def __getattr__(name):
+    """Lazy import to avoid import errors when dependencies are missing."""
+    if name == "CoWTracker":
+        from cowtracker.models.cowtracker import CoWTracker
+
+        return CoWTracker
+    if name == "CoWTrackerWindowed":
+        from cowtracker.models.cowtracker_windowed import CoWTrackerWindowed
+
+        return CoWTrackerWindowed
+    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
+
+
+__all__ = ["CoWTracker", "CoWTrackerWindowed"]
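
The module-level `__getattr__` above is the PEP 562 lazy-import pattern: `import cowtracker` stays cheap, and the model classes (with their heavy torch dependency chain) are only imported the first time the attribute is actually touched.

import cowtracker

print(cowtracker.__all__)            # ['CoWTracker', 'CoWTrackerWindowed'] - nothing heavy imported yet
tracker_cls = cowtracker.CoWTracker  # first access triggers __getattr__ and the real import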

cowtracker/heads/__init__.py ADDED
@@ -0,0 +1,14 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""CowTracker heads."""
+
+from cowtracker.heads.tracking_head import CowTrackingHead
+from cowtracker.heads.feature_extractor import FeatureExtractor
+import cowtracker.thirdparty  # noqa: F401 - sets up vggt path
+from vggt.heads.dpt_head import DPTHead
+
+__all__ = ["CowTrackingHead", "FeatureExtractor", "DPTHead"]
cowtracker/heads/feature_extractor.py
ADDED
|
@@ -0,0 +1,100 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

"""Feature extraction: DPT + ResNet side features."""

import torch
import torch.nn as nn
import torch.nn.functional as F

import cowtracker.thirdparty  # noqa: F401 - sets up vggt path
from vggt.heads.dpt_head import DPTHead
from cowtracker.layers.resnet_deconv import ResNet18Deconv


class FeatureExtractor(nn.Module):
    """
    Combined DPT and ResNet feature extractor.

    Takes aggregated tokens from backbone and raw images,
    outputs combined features for tracking.
    """

    DIM_IN = 2048  # 2 * embed_dim (1024)
    PATCH_SIZE = 14
    INTERMEDIATE_LAYER_IDX = [4, 11, 17, 23]

    def __init__(
        self,
        features: int = 128,
        down_ratio: int = 2,
        side_resnet_channels: int = 128,
    ):
        """
        Args:
            features: Number of DPT output features.
            down_ratio: Downsampling ratio relative to input image.
            side_resnet_channels: Number of ResNet side feature channels.
        """
        super().__init__()

        self.features = features
        self.down_ratio = down_ratio

        # DPT head for backbone features
        self.dpt_head = DPTHead(
            dim_in=self.DIM_IN,
            patch_size=self.PATCH_SIZE,
            features=features,
            feature_only=True,
            down_ratio=down_ratio,
            pos_embed=False,
            intermediate_layer_idx=self.INTERMEDIATE_LAYER_IDX,
        )

        # ResNet for raw image features
        self.fnet = ResNet18Deconv(3, side_resnet_channels)

        self.out_dim = features + side_resnet_channels

    def forward(
        self,
        aggregated_tokens_list: list,
        images: torch.Tensor,
        patch_start_idx: int,
    ) -> torch.Tensor:
        """
        Extract combined features from backbone tokens and raw images.

        Args:
            aggregated_tokens_list: List of tokens from aggregator.
            images: Input images [B, S, 3, H, W].
            patch_start_idx: Patch start index for DPT.

        Returns:
            combined_features: [B, S, C, H_out, W_out] where C = features + side_resnet_channels.
        """
        B, S, _, H_img, W_img = images.shape

        # DPT features from backbone tokens
        backbone_features = self.dpt_head(aggregated_tokens_list, images, patch_start_idx)
        _, _, _, H_out, W_out = backbone_features.shape

        # Side ResNet features from raw images
        images_flat = images.view(B * S, 3, H_img, W_img)
        side_features = self.fnet(images_flat)[0]
        _, side_channels, H_side, W_side = side_features.shape

        # Resize side features to match backbone output if needed
        if H_side != H_out or W_side != W_out:
            side_features = F.interpolate(
                side_features, size=(H_out, W_out), mode="bilinear", align_corners=True
            )

        side_features = side_features.view(B, S, side_channels, H_out, W_out)

        return torch.cat([backbone_features, side_features], dim=2)

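With the defaults above, out_dim = features + side_resnet_channels = 128 + 128 = 256 channels at 1/down_ratio of the input resolution. A shape-only sketch of the concatenation step, with random tensors standing in for the real DPT and ResNet outputs:

import torch

B, S, H_out, W_out = 1, 4, 112, 112
backbone_features = torch.randn(B, S, 128, H_out, W_out)  # stand-in for the DPT output
side_features = torch.randn(B, S, 128, H_out, W_out)      # stand-in for the ResNet side features
combined = torch.cat([backbone_features, side_features], dim=2)
print(combined.shape)  # torch.Size([1, 4, 256, 112, 112])
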
cowtracker/heads/tracking_head.py
ADDED
|
@@ -0,0 +1,243 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

"""CowTracker tracking head - warping-based iterative refinement."""

from typing import Optional, Tuple

import torch
import torch.nn as nn
import torch.nn.functional as F

from cowtracker.layers.video_transformer import MODEL_CONFIGS, VisionTransformerVideo
from cowtracker.utils.ops import bilinear_sampler, coords_grid


class CowTrackingHead(nn.Module):
    """
    Warping-based iterative refinement module.

    Responsibility: features -> (tracks, visibility, confidence)
    Does NOT handle: feature extraction, windowing
    """

    TEMPORAL_INTERLEAVE_STRIDE = 2
    MAX_FRAMES = 256
    MLP_RATIO = 4.0
    REFINE_PATCH_SIZE = 4

    def __init__(
        self,
        feature_dim: int,
        down_ratio: int = 2,
        warp_iters: int = 5,
        warp_model: str = "vits",
        warp_vit_num_blocks: Optional[int] = None,
    ):
        """
        Args:
            feature_dim: Input feature dimension (features + side_resnet_channels).
            down_ratio: Feature downsampling ratio relative to the original image.
            warp_iters: Number of warping-based refinement iterations.
            warp_model: Model configuration for the video transformer.
            warp_vit_num_blocks: Number of transformer blocks (None = use default).
        """
        super().__init__()

        self.warp_iters = warp_iters
        self.down_ratio = down_ratio

        # Dimension used throughout the refinement iterations
        self.iter_dim = MODEL_CONFIGS[warp_model]["features"]

        # Video transformer for temporal attention
        self.refine_net = VisionTransformerVideo(
            warp_model,
            self.iter_dim,
            patch_size=self.REFINE_PATCH_SIZE,
            temporal_interleave_stride=self.TEMPORAL_INTERLEAVE_STRIDE,
            max_frames=self.MAX_FRAMES,
            mlp_ratio=self.MLP_RATIO,
            attn_dropout=0.0,
            proj_dropout=0.0,
            drop_path=0.0,
            num_blocks=warp_vit_num_blocks,
        )

        # Feature processing layers
        self.fmap_conv = nn.Conv2d(feature_dim, self.iter_dim, 1, 1, 0, bias=True)
        self.hidden_conv = nn.Conv2d(self.iter_dim * 2, self.iter_dim, 1, 1, 0, bias=True)
        self.warp_linear = nn.Conv2d(3 * self.iter_dim + 2, self.iter_dim, 1, 1, 0, bias=True)
        self.refine_transform = nn.Conv2d(self.iter_dim // 2 * 3, self.iter_dim, 1, 1, 0, bias=True)

        # Upsampling weights
        self.upsample_weight = nn.Sequential(
            nn.Conv2d(self.iter_dim, 2 * self.iter_dim, 3, padding=1, bias=True),
            nn.ReLU(inplace=True),
            nn.Conv2d(2 * self.iter_dim, (down_ratio**2) * 9, 1, padding=0, bias=True),
        )

        # Flow + visibility + confidence head
        self.flow_head = nn.Sequential(
            nn.Conv2d(self.iter_dim, 2 * self.iter_dim, 3, padding=1, bias=True),
            nn.ReLU(inplace=True),
            nn.Conv2d(2 * self.iter_dim, 4, 1, padding=0, bias=True),
        )

        print(f"CowTrackingHead initialized: iter_dim={self.iter_dim}, warp_iters={warp_iters}")

    def forward(
        self,
        features: torch.Tensor,
        image_size: Tuple[int, int],
        first_frame_features: Optional[torch.Tensor] = None,
    ) -> dict:
        """
        Run warping-based iterative refinement.

        Args:
            features: Extracted features [B, S, C, H, W].
            image_size: Original image size (H_img, W_img) for upsampling.
            first_frame_features: Optional first frame features [B, 1, C, H, W]
                for cross-window tracking.

        Returns:
            dict with:
                - track: Dense tracks [B, S, H_img, W_img, 2].
                - vis: Visibility scores [B, S, H_img, W_img].
                - conf: Confidence scores [B, S, H_img, W_img].
        """
        B, S, _, H, W = features.shape
        H_img, W_img = image_size

        # Project features to the iteration dimension
        fmap = self.fmap_conv(features.view(B * S, -1, H, W)).view(B, S, -1, H, W)

        # Frame 0 reference features
        if first_frame_features is not None:
            frame0_fmap = self.fmap_conv(first_frame_features.view(B, -1, H, W)).view(B, 1, -1, H, W)
        else:
            frame0_fmap = fmap[:, 0:1]
        frame0_expanded = frame0_fmap.expand(B, S, -1, H, W)

        # Initialize hidden state from the concatenation of frame-0 and current features
        net = self.hidden_conv(
            torch.cat([frame0_expanded, fmap], dim=2).view(B * S, -1, H, W)
        ).view(B, S, -1, H, W)

        # Initialize flow to zero
        flow = torch.zeros(B, S, 2, H, W, device=features.device, dtype=features.dtype)

        # Iterative refinement
        for _ in range(self.warp_iters):
            flow = flow.detach()

            # Compute warped coordinates
            coords = coords_grid(B * S, H, W, device=features.device).to(fmap.dtype).view(B, S, 2, H, W)
            coords_warped = coords + flow

            # Warp features using the current flow estimate
            warped_fmap = bilinear_sampler(
                fmap.view(B * S, -1, H, W), coords_warped.view(B * S, 2, H, W).permute(0, 2, 3, 1)
            ).view(B, S, -1, H, W)

            # Build refinement input
            refine_inp = self.warp_linear(
                torch.cat([frame0_expanded, warped_fmap, net, flow], dim=2).view(B * S, -1, H, W)
            ).view(B, S, -1, H, W)

            # Apply the video transformer with temporal attention
            refine_out = self.refine_net(refine_inp)["out"]

            # Update hidden state
            net = self.refine_transform(
                torch.cat([refine_out.view(B * S, -1, H, W), net.view(B * S, -1, H, W)], dim=1)
            ).view(B, S, -1, H, W)

            # Predict flow and info update
            update = self.flow_head(net.view(B * S, -1, H, W)).view(B, S, 4, H, W)
            flow = flow + update[:, :, :2]
            info = update[:, :, 2:]

        # Upsample to the original resolution
        weight = 0.25 * self.upsample_weight(net.view(B * S, -1, H, W)).view(B, S, -1, H, W)
        flow_up, info_up = self._upsample_predictions(flow, info, weight)

        # Convert flow to absolute track coordinates
        tracks = self._flow_to_tracks(flow_up, H_img, W_img)

        return {
            "track": tracks,
            "vis": torch.sigmoid(info_up[..., 0]),
            "conf": torch.sigmoid(info_up[..., 1]),
        }

    def _upsample_predictions(
        self,
        flow: torch.Tensor,
        info: torch.Tensor,
        weight: torch.Tensor,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Upsample flow and info using a learned convex combination."""
        B, S, _, H, W = flow.shape

        flow_ups, info_ups = [], []
        for t in range(S):
            f_up, i_up = self._upsample_single(flow[:, t], info[:, t], weight[:, t])
            flow_ups.append(f_up)
            info_ups.append(i_up)

        return torch.stack(flow_ups, dim=1), torch.stack(info_ups, dim=1)

    def _upsample_single(
        self,
        flow: torch.Tensor,
        info: torch.Tensor,
        mask: torch.Tensor,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Upsample a single frame using a soft convex combination."""
        N, _, H, W = flow.shape
        C = info.shape[1]
        factor = self.down_ratio

        mask = mask.view(N, 1, 9, factor, factor, H, W)
        mask = torch.softmax(mask, dim=2)

        up_flow = F.unfold(factor * flow, [3, 3], padding=1).view(N, 2, 9, 1, 1, H, W)
        up_info = F.unfold(info, [3, 3], padding=1).view(N, C, 9, 1, 1, H, W)

        up_flow = torch.sum(mask * up_flow, dim=2).permute(0, 1, 4, 2, 5, 3)
        up_info = torch.sum(mask * up_info, dim=2).permute(0, 1, 4, 2, 5, 3)

        return (
            up_flow.reshape(N, 2, factor * H, factor * W).permute(0, 2, 3, 1),
            up_info.reshape(N, C, factor * H, factor * W).permute(0, 2, 3, 1),
        )

    def _flow_to_tracks(
        self,
        flow: torch.Tensor,
        H_img: int,
        W_img: int,
    ) -> torch.Tensor:
        """Convert flow to absolute track coordinates."""
        B, S = flow.shape[:2]
        device, dtype = flow.device, flow.dtype

        # Create the coordinate grid
        y, x = torch.meshgrid(
            torch.arange(H_img, device=device, dtype=dtype),
            torch.arange(W_img, device=device, dtype=dtype),
            indexing="ij",
        )
        coords = torch.stack([x, y], dim=-1).unsqueeze(0).unsqueeze(0).expand(B, S, -1, -1, -1)

        # Normalize flow relative to frame 0 during inference
        if not self.training:
            flow = flow - flow[:, 0:1]
            flow[:, 0] = 0

        return coords + flow

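_upsample_single above is the RAFT-style learned convex upsampling: every high-resolution pixel is a softmax-weighted (hence convex) combination of its 3x3 low-resolution neighborhood. A self-contained sketch of the same arithmetic with random weights, assuming down_ratio = 2:

import torch
import torch.nn.functional as F

N, H, W, factor = 1, 8, 8, 2
flow = torch.randn(N, 2, H, W)
mask = torch.randn(N, (factor**2) * 9, H, W)  # what upsample_weight would predict

mask = mask.view(N, 1, 9, factor, factor, H, W)
mask = torch.softmax(mask, dim=2)             # convex weights over each 3x3 neighborhood

up = F.unfold(factor * flow, [3, 3], padding=1).view(N, 2, 9, 1, 1, H, W)
up = torch.sum(mask * up, dim=2)              # [N, 2, factor, factor, H, W]
up = up.permute(0, 1, 4, 2, 5, 3).reshape(N, 2, factor * H, factor * W)
print(up.shape)  # torch.Size([1, 2, 16, 16])
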
cowtracker/inference/__init__.py
ADDED
|
@@ -0,0 +1,12 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

"""Inference utilities for CowTracker."""

from cowtracker.inference.windowed import WindowedInference

__all__ = ["WindowedInference"]

cowtracker/inference/windowed.py
ADDED
|
@@ -0,0 +1,144 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

"""Windowed inference for long video processing."""

from typing import Dict, List, Tuple


class WindowedInference:
    """
    Manages windowed inference for long videos.

    Handles window computation, memory frame selection, and prediction merging.
    """

    def __init__(
        self,
        window_len: int = 100,
        stride: int = 100,
        num_memory_frames: int = 10,
    ):
        """
        Args:
            window_len: Number of frames per window.
            stride: Step size between windows.
            num_memory_frames: Maximum number of memory frames to use.
        """
        self.window_len = window_len
        self.stride = stride
        self.num_memory_frames = num_memory_frames

    def compute_windows(self, total_frames: int) -> List[Tuple[int, int]]:
        """
        Compute all window (start, end) indices.

        Args:
            total_frames: Total number of frames in the video.

        Returns:
            List of (start, end) tuples, one per window.
        """
        S = self.window_len
        step = self.stride

        if total_frames <= S:
            return [(0, total_frames)]

        windows = []
        start = 0
        while start < total_frames:
            end = min(start + S, total_frames)
            windows.append((start, end))
            if end == total_frames:
                break
            start += step

        return windows

    def select_memory_frames(
        self,
        window_idx: int,
        window_start: int,
    ) -> List[int]:
        """
        Select memory frame indices using a hybrid strategy.

        The strategy combines:
        - The first frame (always included as a global reference)
        - Recent frames (temporal continuity)
        - Uniformly sampled middle frames (long-range context)

        Args:
            window_idx: Current window index.
            window_start: Start frame index of the current window.

        Returns:
            Sorted list of memory frame indices.
        """
        if window_idx == 0:
            return []

        memory_indices = [0]  # Always include the first frame

        # Recent frames for temporal continuity
        for offset in [2, 1]:
            idx = window_start - offset
            if idx > 0 and idx not in memory_indices:
                memory_indices.append(idx)

        # Uniform sampling from the middle history for long-range context
        if window_start > 10:
            mid_start, mid_end = 5, window_start - 3
            step = (mid_end - mid_start) / 6
            for i in range(5):
                idx = int(mid_start + (i + 1) * step)
                if idx not in memory_indices:
                    memory_indices.append(idx)

        # Limit to the maximum number of memory frames
        if len(memory_indices) > self.num_memory_frames:
            memory_indices = sorted(memory_indices)[-self.num_memory_frames :]

        return sorted(memory_indices)

    def merge_predictions(
        self,
        window_idx: int,
        window_start: int,
        window_end: int,
        window_pred: Dict,
        accumulated: Dict,
    ) -> None:
        """
        Merge window predictions into the accumulated results.

        Handles overlapping regions by using only the non-overlapping parts
        of subsequent windows.

        Args:
            window_idx: Current window index.
            window_start: Start frame index.
            window_end: End frame index.
            window_pred: Predictions for the current window (track, vis, conf).
            accumulated: Accumulated predictions, updated in place.
        """
        S_actual = window_end - window_start

        if window_idx > 0 and self.stride < self.window_len:
            # Overlaps the previous window - take only the non-overlapping part
            overlap_len = min(self.window_len - self.stride, S_actual)
            if overlap_len < S_actual:
                start_offset = overlap_len
                for key in ["track", "vis", "conf"]:
                    accumulated[key][:, window_start + start_offset : window_end] = window_pred[key][
                        :, start_offset:S_actual
                    ]
        else:
            # No overlap or first window - take everything
            for key in ["track", "vis", "conf"]:
                accumulated[key][:, window_start:window_end] = window_pred[key][:, :S_actual]

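A usage sketch for the helper above on a hypothetical 250-frame video; with window_len = stride = 100 the windows do not overlap, so merge_predictions takes each window wholesale:

from cowtracker.inference.windowed import WindowedInference

wi = WindowedInference(window_len=100, stride=100, num_memory_frames=10)

print(wi.compute_windows(250))
# [(0, 100), (100, 200), (200, 250)]

# Memory frames for the second window: frame 0, the two frames just before
# the window (98, 99), and uniformly sampled frames from the middle history.
print(wi.select_memory_frames(window_idx=1, window_start=100))
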
cowtracker/layers/__init__.py
ADDED
|
@@ -0,0 +1,26 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

"""Network layers and backbone modules for CowTracker."""

from cowtracker.layers.temporal_attention import TemporalSelfAttentionBlock
from cowtracker.layers.video_transformer import (
    MODEL_CONFIGS,
    VisionTransformerVideo,
    FlashAttention3,
    replace_attention_with_flash3,
)
from cowtracker.layers.patch_embed import PatchEmbed

__all__ = [
    "TemporalSelfAttentionBlock",
    "MODEL_CONFIGS",
    "VisionTransformerVideo",
    "FlashAttention3",
    "replace_attention_with_flash3",
    "PatchEmbed",
]

cowtracker/layers/dpt_head.py
ADDED
|
@@ -0,0 +1,173 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

"""
Custom DPTHead with intermediate feature extraction support.

This module imports the base components from the Depth-Anything-V2 submodule
and provides a modified DPTHead that can return intermediate features.
"""

import torch
import torch.nn as nn
import torch.nn.functional as F

# Import base components from the submodule
from cowtracker.thirdparty.DepthAnythingV2.depth_anything_v2.util.blocks import (
    FeatureFusionBlock,
    _make_scratch,
)


def _make_fusion_block(features, use_bn, size=None):
    return FeatureFusionBlock(
        features,
        nn.ReLU(False),
        deconv=False,
        bn=use_bn,
        expand=False,
        align_corners=True,
        size=size,
    )


class DPTHead(nn.Module):
    """
    DPT decoder head with support for returning intermediate features.

    This is a modified version that:
    - Removes output_conv2 (final depth prediction layers)
    - Removes resConfUnit1 from refinenet4
    - Supports returning intermediate feature maps via the return_intermediate flag
    """

    def __init__(
        self,
        in_channels,
        features=256,
        use_bn=False,
        out_channels=[256, 512, 1024, 1024],
        use_clstoken=False,
    ):
        super(DPTHead, self).__init__()

        self.use_clstoken = use_clstoken

        self.projects = nn.ModuleList([
            nn.Conv2d(
                in_channels=in_channels,
                out_channels=out_channel,
                kernel_size=1,
                stride=1,
                padding=0,
            ) for out_channel in out_channels
        ])

        self.resize_layers = nn.ModuleList([
            nn.ConvTranspose2d(
                in_channels=out_channels[0],
                out_channels=out_channels[0],
                kernel_size=4,
                stride=4,
                padding=0),
            nn.ConvTranspose2d(
                in_channels=out_channels[1],
                out_channels=out_channels[1],
                kernel_size=2,
                stride=2,
                padding=0),
            nn.Identity(),
            nn.Conv2d(
                in_channels=out_channels[3],
                out_channels=out_channels[3],
                kernel_size=3,
                stride=2,
                padding=1)
        ])

        if use_clstoken:
            self.readout_projects = nn.ModuleList()
            for _ in range(len(self.projects)):
                self.readout_projects.append(
                    nn.Sequential(
                        nn.Linear(2 * in_channels, in_channels),
                        nn.GELU()))

        self.scratch = _make_scratch(
            out_channels,
            features,
            groups=1,
            expand=False,
        )

        self.scratch.stem_transpose = None

        self.scratch.refinenet1 = _make_fusion_block(features, use_bn)
        self.scratch.refinenet2 = _make_fusion_block(features, use_bn)
        self.scratch.refinenet3 = _make_fusion_block(features, use_bn)
        self.scratch.refinenet4 = _make_fusion_block(features, use_bn)

        head_features_1 = features

        self.scratch.output_conv1 = nn.Conv2d(
            head_features_1, head_features_1 // 2, kernel_size=3, stride=1, padding=1
        )

        # Remove resConfUnit1 from refinenet4 (not needed for intermediate feature extraction)
        del self.scratch.refinenet4.resConfUnit1

    def forward(self, out_features, patch_h, patch_w, return_intermediate=True):
        """
        Forward pass through the DPT head.

        Args:
            out_features: List of intermediate features from the encoder.
            patch_h: Height in patches.
            patch_w: Width in patches.
            return_intermediate: If True, return intermediate feature maps.

        Returns:
            If return_intermediate=True:
                (out, path_1, path_2, path_3, path_4) - output and intermediate features
            Else:
                out - final output only
        """
        out = []
        for i, x in enumerate(out_features):
            if self.use_clstoken:
                x, cls_token = x[0], x[1]
                readout = cls_token.unsqueeze(1).expand_as(x)
                x = self.readout_projects[i](torch.cat((x, readout), -1))
            else:
                x = x[0]

            x = x.permute(0, 2, 1).reshape((x.shape[0], x.shape[-1], patch_h, patch_w))

            x = self.projects[i](x)
            x = self.resize_layers[i](x)

            out.append(x)

        layer_1, layer_2, layer_3, layer_4 = out

        layer_1_rn = self.scratch.layer1_rn(layer_1)
        layer_2_rn = self.scratch.layer2_rn(layer_2)
        layer_3_rn = self.scratch.layer3_rn(layer_3)
        layer_4_rn = self.scratch.layer4_rn(layer_4)

        path_4 = self.scratch.refinenet4(layer_4_rn, size=layer_3_rn.shape[2:])
        path_3 = self.scratch.refinenet3(path_4, layer_3_rn, size=layer_2_rn.shape[2:])
        path_2 = self.scratch.refinenet2(path_3, layer_2_rn, size=layer_1_rn.shape[2:])
        path_1 = self.scratch.refinenet1(path_2, layer_1_rn)

        out = self.scratch.output_conv1(path_1)

        if return_intermediate:
            return out, path_1, path_2, path_3, path_4
        else:
            out = F.relu(out)
            return out

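The per-layer reshape in the forward pass above turns [B, N, C] patch tokens back into a [B, C, patch_h, patch_w] feature map before projection and resizing. A minimal sketch of just that step:

import torch

B, patch_h, patch_w, C = 1, 14, 14, 384
tokens = torch.randn(B, patch_h * patch_w, C)                   # [B, N, C]
fmap = tokens.permute(0, 2, 1).reshape(B, C, patch_h, patch_w)  # [B, C, patch_h, patch_w]
print(fmap.shape)  # torch.Size([1, 384, 14, 14])
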
cowtracker/layers/patch_embed.py
ADDED
|
@@ -0,0 +1,90 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

# References:
# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
# https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py

from typing import Callable, Optional, Tuple, Union

from torch import Tensor
import torch.nn as nn


def make_2tuple(x):
    if isinstance(x, tuple):
        assert len(x) == 2
        return x

    assert isinstance(x, int)
    return (x, x)


class PatchEmbed(nn.Module):
    """
    2D image to patch embedding: (B,C,H,W) -> (B,N,D)

    Args:
        img_size: Image size.
        patch_size: Patch token size.
        in_chans: Number of input image channels.
        embed_dim: Number of linear projection output channels.
        norm_layer: Normalization layer.
    """

    def __init__(
        self,
        img_size: Union[int, Tuple[int, int]] = 224,
        patch_size: Union[int, Tuple[int, int]] = 16,
        in_chans: int = 3,
        embed_dim: int = 768,
        norm_layer: Optional[Callable] = None,
        flatten_embedding: bool = True,
    ) -> None:
        super().__init__()

        image_HW = make_2tuple(img_size)
        patch_HW = make_2tuple(patch_size)
        patch_grid_size = (
            image_HW[0] // patch_HW[0],
            image_HW[1] // patch_HW[1],
        )

        self.img_size = image_HW
        self.patch_size = patch_HW
        self.patches_resolution = patch_grid_size
        self.num_patches = patch_grid_size[0] * patch_grid_size[1]

        self.in_chans = in_chans
        self.embed_dim = embed_dim

        self.flatten_embedding = flatten_embedding

        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_HW, stride=patch_HW)
        self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()

    def forward(self, x: Tensor) -> Tensor:
        _, _, H, W = x.shape
        patch_H, patch_W = self.patch_size

        assert H % patch_H == 0, f"Input image height {H} is not a multiple of patch height {patch_H}"
        assert W % patch_W == 0, f"Input image width {W} is not a multiple of patch width {patch_W}"

        x = self.proj(x)  # B C H W
        H, W = x.size(2), x.size(3)
        x = x.flatten(2).transpose(1, 2)  # B HW C
        x = self.norm(x)
        if not self.flatten_embedding:
            x = x.reshape(-1, H, W, self.embed_dim)  # B H W C
        return x

    def flops(self) -> float:
        Ho, Wo = self.patches_resolution
        flops = Ho * Wo * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1])
        if self.norm is not None:
            flops += Ho * Wo * self.embed_dim
        return flops

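A usage sketch for PatchEmbed; the input height and width must be multiples of the patch size or the asserts in forward fire:

import torch
from cowtracker.layers.patch_embed import PatchEmbed

embed = PatchEmbed(img_size=224, patch_size=16, in_chans=3, embed_dim=768)
x = torch.randn(2, 3, 224, 224)
tokens = embed(x)
print(tokens.shape)  # torch.Size([2, 196, 768]) -- (B, N, D) with N = (224 // 16) ** 2
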
cowtracker/layers/resnet_deconv.py
ADDED
|
@@ -0,0 +1,61 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

"""ResNet18-style encoder-decoder for image features."""

import torch.nn as nn


class resconv(nn.Module):
    """Residual convolution block."""

    def __init__(self, inp, oup, k=3, s=1):
        super(resconv, self).__init__()
        self.conv = nn.Sequential(
            nn.GELU(),
            nn.Conv2d(inp, oup, kernel_size=k, stride=s, padding=k // 2, bias=True),
            nn.GELU(),
            nn.Conv2d(oup, oup, kernel_size=3, stride=1, padding=1, bias=True),
        )
        if inp != oup or s != 1:
            self.skip_conv = nn.Conv2d(
                inp, oup, kernel_size=1, stride=s, padding=0, bias=True
            )
        else:
            self.skip_conv = nn.Identity()

    def forward(self, x):
        return self.conv(x) + self.skip_conv(x)


class ResNet18Deconv(nn.Module):
    """ResNet18-style encoder-decoder for image features."""

    def __init__(self, inp, oup):
        super(ResNet18Deconv, self).__init__()
        self.ds1 = resconv(inp, 64, k=7, s=2)
        self.conv1 = resconv(64, 64, k=3, s=1)
        self.conv2 = resconv(64, 128, k=3, s=2)
        self.conv3 = resconv(128, 256, k=3, s=2)
        self.conv4 = resconv(256, 512, k=3, s=2)
        self.up_4 = nn.ConvTranspose2d(512, 256, kernel_size=2, stride=2, padding=0, bias=True)
        self.proj_3 = resconv(256, 256, k=3, s=1)
        self.up_3 = nn.ConvTranspose2d(256, 128, kernel_size=2, stride=2, padding=0, bias=True)
        self.proj_2 = resconv(128, 128, k=3, s=1)
        self.up_2 = nn.ConvTranspose2d(128, 64, kernel_size=2, stride=2, padding=0, bias=True)
        self.proj_1 = resconv(64, oup, k=3, s=1)

    def forward(self, x):
        out_1 = self.ds1(x)
        out_1 = self.conv1(out_1)
        out_2 = self.conv2(out_1)
        out_3 = self.conv3(out_2)
        out_4 = self.conv4(out_3)
        out_3 = self.proj_3(out_3 + self.up_4(out_4))
        out_2 = self.proj_2(out_2 + self.up_3(out_3))
        out_1 = self.proj_1(out_1 + self.up_2(out_2))
        return [out_1, out_2, out_3, out_4]

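ResNet18Deconv returns a four-level pyramid; the finest level (stride 2, oup channels) is what FeatureExtractor uses as the side feature. A quick shape check on a 224x224 input:

import torch
from cowtracker.layers.resnet_deconv import ResNet18Deconv

net = ResNet18Deconv(3, 128)
outs = net(torch.randn(1, 3, 224, 224))
for o in outs:
    print(o.shape)
# [1, 128, 112, 112]  stride 2, decoded (used as the side feature)
# [1, 128, 56, 56]    stride 4, decoded
# [1, 256, 28, 28]    stride 8, decoded
# [1, 512, 14, 14]    stride 16, encoder bottleneck
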
cowtracker/layers/temporal_attention.py
ADDED
|
@@ -0,0 +1,307 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

"""
Enhanced Cross Attention Block implementation.
Self-contained version with all necessary components inline.
"""

from typing import Callable, Optional

import torch
import torch.nn.functional as F
from torch import nn, Tensor


# ============================================================================
# Inline helper modules (to avoid external dependencies)
# ============================================================================


class DropPath(nn.Module):
    """Drop paths (stochastic depth) per sample."""

    def __init__(self, drop_prob: float = 0.0):
        super().__init__()
        self.drop_prob = drop_prob

    def forward(self, x: Tensor) -> Tensor:
        if self.drop_prob == 0.0 or not self.training:
            return x
        keep_prob = 1 - self.drop_prob
        shape = (x.shape[0],) + (1,) * (x.ndim - 1)
        random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
        if keep_prob > 0.0:
            random_tensor.div_(keep_prob)
        return x * random_tensor


class LayerScale(nn.Module):
    """Layer scale module."""

    def __init__(self, dim: int, init_values: float = 1e-5):
        super().__init__()
        self.gamma = nn.Parameter(init_values * torch.ones(dim))

    def forward(self, x: Tensor) -> Tensor:
        return x * self.gamma


class Mlp(nn.Module):
    """MLP as used in Vision Transformer."""

    def __init__(
        self,
        in_features: int,
        hidden_features: Optional[int] = None,
        out_features: Optional[int] = None,
        act_layer: Callable[..., nn.Module] = nn.GELU,
        drop: float = 0.0,
        bias: bool = True,
    ):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features, bias=bias)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features, bias=bias)
        self.drop = nn.Dropout(drop)

    def forward(self, x: Tensor) -> Tensor:
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.drop(x)
        return x


class MemEffAttention(nn.Module):
    """Memory-efficient self-attention using PyTorch's scaled_dot_product_attention."""

    def __init__(
        self,
        dim: int,
        num_heads: int = 8,
        qkv_bias: bool = True,
        proj_bias: bool = True,
        attn_drop: float = 0.0,
        proj_drop: float = 0.0,
        norm_layer: nn.Module = nn.LayerNorm,
        qk_norm: bool = False,
        fused_attn: bool = True,
        rope=None,
    ) -> None:
        super().__init__()
        assert dim % num_heads == 0, "dim should be divisible by num_heads"
        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        self.scale = self.head_dim**-0.5
        self.fused_attn = fused_attn

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.q_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
        self.k_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim, bias=proj_bias)
        self.proj_drop = nn.Dropout(proj_drop)
        self.rope = rope

    def forward(self, x: Tensor, pos=None) -> Tensor:
        B, N, C = x.shape
        qkv = (
            self.qkv(x)
            .reshape(B, N, 3, self.num_heads, self.head_dim)
            .permute(2, 0, 3, 1, 4)
        )
        q, k, v = qkv.unbind(0)
        q, k = self.q_norm(q), self.k_norm(k)

        if self.rope is not None and pos is not None:
            q = self.rope(q, pos)
            k = self.rope(k, pos)

        if self.fused_attn:
            x = F.scaled_dot_product_attention(
                q, k, v, dropout_p=self.attn_drop.p if self.training else 0.0
            )
        else:
            q = q * self.scale
            attn = q @ k.transpose(-2, -1)
            attn = attn.softmax(dim=-1)
            attn = self.attn_drop(attn)
            x = attn @ v

        x = x.transpose(1, 2).reshape(B, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x


# ============================================================================
# Main attention block classes
# ============================================================================


class SelfAttentionBlock(nn.Module):
    """
    Self-attention block using the same architecture as CrossAttentionBlock, but for self-attention.
    """

    def __init__(
        self,
        dim: int,
        num_heads: int,
        mlp_ratio: float = 4.0,
        qkv_bias: bool = True,
        proj_bias: bool = True,
        ffn_bias: bool = True,
        drop: float = 0.0,
        attn_drop: float = 0.0,
        init_values=None,
        drop_path: float = 0.0,
        act_layer: Callable[..., nn.Module] = nn.GELU,
        norm_layer: Callable[..., nn.Module] = nn.LayerNorm,
        ffn_layer: Callable[..., nn.Module] = Mlp,
        qk_norm: bool = False,
        fused_attn: bool = True,
        rope=None,
    ) -> None:
        super().__init__()

        self.norm1 = norm_layer(dim)

        # Use standard attention for self-attention
        self.attn = MemEffAttention(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            proj_bias=proj_bias,
            attn_drop=attn_drop,
            proj_drop=drop,
            qk_norm=qk_norm,
            fused_attn=fused_attn,
            rope=rope,
        )

        self.ls1 = (
            LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
        )
        self.drop_path1 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()

        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = ffn_layer(
            in_features=dim,
            hidden_features=mlp_hidden_dim,
            act_layer=act_layer,
            drop=drop,
            bias=ffn_bias,
        )
        self.ls2 = (
            LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
        )
        self.drop_path2 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()

        self.sample_drop_ratio = drop_path

    def forward(self, x: Tensor, pos=None) -> Tensor:
        def attn_residual_func(x: Tensor, pos=None) -> Tensor:
            return self.ls1(self.attn(self.norm1(x), pos=pos))

        def ffn_residual_func(x: Tensor) -> Tensor:
            return self.ls2(self.mlp(self.norm2(x)))

        if self.training and self.sample_drop_ratio > 0.0:
            x = x + self.drop_path1(attn_residual_func(x, pos))
            x = x + self.drop_path2(ffn_residual_func(x))
        else:
            x = x + attn_residual_func(x, pos)
            x = x + ffn_residual_func(x)

        return x


class TemporalSelfAttentionBlock(nn.Module):
    """
    Temporal self-attention block that applies self-attention across time for each spatial position.
    Input: [B, S, N, C] -> Output: [B, S, N, C]
    For each position n, performs self-attention across all time steps.
    """

    def __init__(
        self,
        dim: int,
        num_heads: int,
        mlp_ratio: float = 4.0,
        qkv_bias: bool = True,
        proj_bias: bool = True,
        ffn_bias: bool = True,
        drop: float = 0.0,
        attn_drop: float = 0.0,
        init_values=None,
        drop_path: float = 0.0,
        act_layer: Callable[..., nn.Module] = nn.GELU,
        norm_layer: Callable[..., nn.Module] = nn.LayerNorm,
        ffn_layer: Callable[..., nn.Module] = Mlp,
        qk_norm: bool = False,
        fused_attn: bool = True,
        rope=None,
    ) -> None:
        super().__init__()

        self.self_attn_block = SelfAttentionBlock(
            dim,
            num_heads,
            mlp_ratio,
            qkv_bias,
            proj_bias,
            ffn_bias,
            drop,
            attn_drop,
            init_values,
            drop_path,
            act_layer,
            norm_layer,
            ffn_layer,
            qk_norm,
            fused_attn,
            rope,
        )

    def forward(self, x: Tensor, pos=None):
        """
        Apply temporal self-attention across time for each spatial position.

        Args:
            x: Input tensor of shape [B, S, N, C]
            pos: Position encoding

        Returns:
            Output tensor of shape [B, S, N, C]
        """
        if len(x.shape) != 4:
            raise ValueError(
                f"TemporalSelfAttentionBlock expects 4D input [B, S, N, C], got {x.shape}"
            )

        B, S, N, C = x.shape

        if S <= 1:
            # No temporal dimension to attend over; return the input unchanged
            return x

        # Reshape to [B*N, S, C] to process each spatial position independently
        x_reshaped = x.permute(0, 2, 1, 3).reshape(B * N, S, C)  # [B*N, S, C]

        # Apply temporal self-attention
        x_reshaped = self.self_attn_block(x_reshaped, pos=pos)

        # Reshape back to [B, S, N, C]
        x = x_reshaped.reshape(B, N, S, C).permute(0, 2, 1, 3)

        return x

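The reshape pair in TemporalSelfAttentionBlock routes each spatial position through attention as its own length-S sequence over time. A shape-only sketch verifying the round trip is lossless:

import torch

B, S, N, C = 2, 8, 49, 64
x = torch.randn(B, S, N, C)

x_seq = x.permute(0, 2, 1, 3).reshape(B * N, S, C)   # one temporal sequence per position
# ... self-attention over the S dimension happens here ...
x_back = x_seq.reshape(B, N, S, C).permute(0, 2, 1, 3)

assert torch.equal(x, x_back)  # the layout is preserved exactly
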
cowtracker/layers/video_transformer.py
ADDED
|
@@ -0,0 +1,411 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import math

import timm
import torch
import torch.nn as nn
import torch.nn.functional as F
import xformers.ops as xops
from timm.models.vision_transformer import Attention as TimmAttention
from cowtracker.layers.temporal_attention import TemporalSelfAttentionBlock
from cowtracker.layers.patch_embed import PatchEmbed
from cowtracker.layers.dpt_head import DPTHead

print("timm version: ", timm.__version__)


def get_1d_sincos_pos_embed_from_grid(
    embed_dim: int, pos: torch.Tensor
) -> torch.Tensor:
    """
    Generate a 1D positional embedding from a given grid using sine and cosine functions.

    Args:
        embed_dim: The embedding dimension.
        pos: The positions to generate the embedding from.

    Returns:
        emb: The generated 1D positional embedding.
    """
    assert embed_dim % 2 == 0
    omega = torch.arange(embed_dim // 2, dtype=torch.double)
    omega /= embed_dim / 2.0
    omega = 1.0 / 10000**omega  # (D/2,)

    pos = pos.reshape(-1)  # (M,)
    out = torch.einsum("m,d->md", pos, omega)  # (M, D/2), outer product

    emb_sin = torch.sin(out)  # (M, D/2)
    emb_cos = torch.cos(out)  # (M, D/2)

    emb = torch.cat([emb_sin, emb_cos], dim=1)  # (M, D)
    return emb[None].float()


def _get_flash_attention_ops():
    """Automatically detect the GPU and return appropriate flash attention ops.

    Returns Flash Attention 3 ops for H100 (compute capability >= 9.0),
    otherwise returns Flash Attention 2 ops.
    """
    if not torch.cuda.is_available():
        return None

    # Get the compute capability of the current device
    major, _ = torch.cuda.get_device_capability()
    # H100 has compute capability 9.0
    if major >= 9:
        # Use Flash Attention 3 for H100 and newer
        try:
            return (xops.fmha.flash3.FwOp, xops.fmha.flash3.BwOp)
        except AttributeError:
            # Fall back to Flash Attention 2 if Flash Attention 3 is unavailable
            print("Flash Attention 3 not available, falling back to Flash Attention 2")
            return (xops.fmha.flash.FwOp, xops.fmha.flash.BwOp)
    else:
        # Use Flash Attention 2 for older GPUs
        return (xops.fmha.flash.FwOp, xops.fmha.flash.BwOp)


class FlashAttention3(nn.Module):
    """
    Drop-in replacement for timm.models.vision_transformer.Attention using xformers Flash Attention 3.
    """

    def __init__(
        self,
        dim: int,
        num_heads: int = 8,
        qkv_bias: bool = False,
        attn_drop: float = 0.0,
        proj_drop: float = 0.0,
    ):
        super().__init__()
        assert dim % num_heads == 0, "dim should be divisible by num_heads"
        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        self.scale = self.head_dim**-0.5

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = attn_drop
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

        # Get Flash Attention ops
        self.flash_ops = _get_flash_attention_ops()

    def forward(
        self, x: torch.Tensor, attn_mask: torch.Tensor | None = None
    ) -> torch.Tensor:
        B, N, C = x.shape

        # Compute Q, K, V
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, self.head_dim)
        q, k, v = qkv.unbind(2)  # Each is (B, N, num_heads, head_dim)

        # xformers expects the [B, M, H, K] layout, which we already have.
        # Use xformers memory_efficient_attention with Flash Attention 3.
        x = xops.memory_efficient_attention(
            q,
            k,
            v,
            attn_bias=attn_mask,  # Pass the attention mask if provided
            p=self.attn_drop if self.training else 0.0,
            scale=self.scale,
            op=self.flash_ops,
        )

        # Reshape back to [B, N, C]
        x = x.reshape(B, N, C)

        # Output projection
        x = self.proj(x)
        x = self.proj_drop(x)

        return x


def replace_attention_with_flash3(model: nn.Module) -> nn.Module:
    """
    Recursively replace all timm Attention modules with FlashAttention3.
    """
    for name, module in model.named_children():
        if isinstance(module, TimmAttention):
            # Extract parameters from the timm Attention module
            flash_attn = FlashAttention3(
                dim=module.qkv.in_features,
                num_heads=module.num_heads,
                qkv_bias=module.qkv.bias is not None,
                attn_drop=module.attn_drop.p if hasattr(module, "attn_drop") else 0.0,
                proj_drop=module.proj_drop.p if hasattr(module, "proj_drop") else 0.0,
            )
            # Copy weights from the original attention module
            flash_attn.qkv.weight.data = module.qkv.weight.data.clone()
            if module.qkv.bias is not None:
                flash_attn.qkv.bias.data = module.qkv.bias.data.clone()
            flash_attn.proj.weight.data = module.proj.weight.data.clone()
            if module.proj.bias is not None:
                flash_attn.proj.bias.data = module.proj.bias.data.clone()

            # Replace the module
            setattr(model, name, flash_attn)
        else:
            # Recurse into child modules
            replace_attention_with_flash3(module)

    return model


MODEL_CONFIGS = {
    "vitl": {
        "encoder": "vit_large_patch16_224",
        "features": 256,
        "out_channels": [256, 512, 1024, 1024],
    },
    "vitb": {
        "encoder": "vit_base_patch16_224",
        "features": 128,
        "out_channels": [96, 192, 384, 768],
    },
    "vits": {
        "encoder": "vit_small_patch16_224",
        "features": 64,
        "out_channels": [48, 96, 192, 384],
    },
    "vitt": {
        "encoder": "vit_tiny_patch16_224",
        "features": 32,
        "out_channels": [24, 48, 96, 192],
    },
}


class VisionTransformerVideo(nn.Module):
    """
    Input: (B, T, C, H, W)
    Pipeline: per-frame ViT + interleaved temporal attention (across frames)
    Time pos: 1D sinusoidal encoding + linear interpolation for variable T
    """

    def __init__(
        self,
        model_name,
        input_dim,
        patch_size=16,
        temporal_interleave_stride=2,
        max_frames=256,
        mlp_ratio=4.0,
        attn_dropout=0.0,
        proj_dropout=0.0,
        drop_path=0.0,
        shared_temporal_block=False,
        num_blocks=None,
        use_flash_attention3=False,
    ):
        super().__init__()
        model = timm.create_model(
            MODEL_CONFIGS[model_name]["encoder"],
            pretrained=False,
            num_classes=0,
        )
        self.intermediate_layer_idx = {
            "vitt": [2, 5, 8, 11],
            "vits": [2, 5, 8, 11],
            "vitb": [2, 5, 8, 11],
            "vitl": [4, 11, 17, 23],
            "vitg": [9, 19, 29, 39],
        }
        self.idx = self.intermediate_layer_idx[model_name]
        self.blks = model.blocks if num_blocks is None else model.blocks[:num_blocks]

        # Replace attention with Flash Attention 3 if enabled
        if use_flash_attention3:
            self.blks = replace_attention_with_flash3(self.blks)
            num_fa3_modules = sum(
                1 for m in self.blks.modules() if isinstance(m, FlashAttention3)
            )
            print(
                f"✓ Flash Attention 3 enabled for spatial attention: replaced {num_fa3_modules} attention modules"
            )

        self.embed_dim = model.embed_dim
        self.input_dim = input_dim
        self.img_size = (224, 224)
        self.patch_size = patch_size
        self.output_dim = MODEL_CONFIGS[model_name]["features"]

        # Spatial positional embedding (64 corresponds to 224/16 = 14x14)
        self.pos_embed = nn.Parameter(torch.zeros(1, 64, self.embed_dim))
        nn.init.trunc_normal_(self.pos_embed, std=0.02)

        # ====== New: sinusoidal time positional embedding (buffer) ======
|
| 249 |
+
self.max_frames = max_frames
|
| 250 |
+
time_grid = torch.arange(max_frames, dtype=torch.float32) # (T0,)
|
| 251 |
+
time_emb = get_1d_sincos_pos_embed_from_grid(
|
| 252 |
+
self.embed_dim, time_grid
|
| 253 |
+
) # (1, T0, D)
|
| 254 |
+
# Follow your pattern: register_buffer + interpolate_time_embed
|
| 255 |
+
self.register_buffer("time_emb", time_emb, persistent=False)
|
| 256 |
+
|
| 257 |
+
# Patch embed and DPT head
|
| 258 |
+
self.patch_embed = PatchEmbed(
|
| 259 |
+
img_size=self.img_size,
|
| 260 |
+
patch_size=self.patch_size,
|
| 261 |
+
in_chans=input_dim,
|
| 262 |
+
embed_dim=self.embed_dim,
|
| 263 |
+
)
|
| 264 |
+
self.dpt_head = DPTHead(
|
| 265 |
+
self.embed_dim,
|
| 266 |
+
MODEL_CONFIGS[model_name]["features"],
|
| 267 |
+
out_channels=MODEL_CONFIGS[model_name]["out_channels"],
|
| 268 |
+
)
|
| 269 |
+
|
| 270 |
+
# Temporal block(s)
|
| 271 |
+
num_heads = getattr(model.blocks[0].attn, "num_heads", 8)
|
| 272 |
+
self.shared_temporal_block = shared_temporal_block
|
| 273 |
+
|
| 274 |
+
# Insert a temporal block after every N spatial blocks
|
| 275 |
+
self.temporal_interleave_stride = max(1, int(temporal_interleave_stride))
|
| 276 |
+
|
| 277 |
+
# Calculate how many temporal blocks we need
|
| 278 |
+
num_temporal_blocks = sum(
|
| 279 |
+
1
|
| 280 |
+
for i in range(len(self.blks))
|
| 281 |
+
if (i + 1) % self.temporal_interleave_stride == 0
|
| 282 |
+
)
|
| 283 |
+
|
| 284 |
+
if shared_temporal_block:
|
| 285 |
+
# Single shared temporal block for all layers
|
| 286 |
+
self.temporal_block = TemporalSelfAttentionBlock(
|
| 287 |
+
dim=self.embed_dim,
|
| 288 |
+
num_heads=num_heads,
|
| 289 |
+
mlp_ratio=mlp_ratio,
|
| 290 |
+
attn_drop=attn_dropout,
|
| 291 |
+
drop=proj_dropout,
|
| 292 |
+
drop_path=drop_path,
|
| 293 |
+
)
|
| 294 |
+
self.temporal_blocks = None
|
| 295 |
+
else:
|
| 296 |
+
# Separate temporal block for each layer
|
| 297 |
+
self.temporal_block = None
|
| 298 |
+
self.temporal_blocks = nn.ModuleList(
|
| 299 |
+
[
|
| 300 |
+
TemporalSelfAttentionBlock(
|
| 301 |
+
dim=self.embed_dim,
|
| 302 |
+
num_heads=num_heads,
|
| 303 |
+
mlp_ratio=mlp_ratio,
|
| 304 |
+
attn_drop=attn_dropout,
|
| 305 |
+
drop=proj_dropout,
|
| 306 |
+
drop_path=drop_path,
|
| 307 |
+
)
|
| 308 |
+
for _ in range(num_temporal_blocks)
|
| 309 |
+
]
|
| 310 |
+
)
|
| 311 |
+
|
| 312 |
+
# ====== New: interpolate temporal positional embedding ======
|
| 313 |
+
def interpolate_time_embed(self, x_like: torch.Tensor, t: int) -> torch.Tensor:
|
| 314 |
+
"""
|
| 315 |
+
x_like: used only to fetch dtype (e.g., fp16)
|
| 316 |
+
Return: time positional embedding of shape (1, t, D)
|
| 317 |
+
"""
|
| 318 |
+
previous_dtype = x_like.dtype
|
| 319 |
+
T0 = self.time_emb.shape[1]
|
| 320 |
+
if t == T0:
|
| 321 |
+
return self.time_emb.to(previous_dtype)
|
| 322 |
+
temb = self.time_emb.float() # (1, T0, D)
|
| 323 |
+
temb = F.interpolate(
|
| 324 |
+
temb.permute(0, 2, 1), size=t, mode="linear", align_corners=False
|
| 325 |
+
).permute(0, 2, 1) # (1, t, D)
|
| 326 |
+
return temb.to(previous_dtype)
|
| 327 |
+
|
| 328 |
+
def interpolate_pos_encoding(self, x, h, w):
|
| 329 |
+
"""
|
| 330 |
+
Interpolate the 2D spatial positional encoding to match HxW (in patches).
|
| 331 |
+
"""
|
| 332 |
+
previous_dtype = x.dtype
|
| 333 |
+
npatch = x.shape[1]
|
| 334 |
+
N = self.pos_embed.shape[1]
|
| 335 |
+
if npatch == N and w == h:
|
| 336 |
+
return self.pos_embed
|
| 337 |
+
pos_embed = self.pos_embed.float()
|
| 338 |
+
dim = x.shape[-1]
|
| 339 |
+
w0 = w // self.patch_size
|
| 340 |
+
h0 = h // self.patch_size
|
| 341 |
+
sqrt_N = math.sqrt(N)
|
| 342 |
+
sx, sy = float(w0) / sqrt_N, float(h0) / sqrt_N
|
| 343 |
+
pos_embed = nn.functional.interpolate(
|
| 344 |
+
pos_embed.reshape(1, int(sqrt_N), int(sqrt_N), dim).permute(0, 3, 1, 2),
|
| 345 |
+
scale_factor=(sy, sx),
|
| 346 |
+
mode="bicubic",
|
| 347 |
+
antialias=False,
|
| 348 |
+
)
|
| 349 |
+
assert int(w0) == pos_embed.shape[-1] and int(h0) == pos_embed.shape[-2]
|
| 350 |
+
pos_embed = pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
|
| 351 |
+
return pos_embed.to(previous_dtype)
|
| 352 |
+
|
| 353 |
+
def forward(self, x):
|
| 354 |
+
"""
|
| 355 |
+
x: (B, T, C, H, W)
|
| 356 |
+
"""
|
| 357 |
+
B, T, C, H, W = x.shape
|
| 358 |
+
# Merge time into batch for per-frame spatial encoding
|
| 359 |
+
x = x.view(B * T, C, H, W)
|
| 360 |
+
x = self.patch_embed(x) # (B*T, Np, D)
|
| 361 |
+
|
| 362 |
+
x = x.view(B, T, *x.shape[1:])
|
| 363 |
+
# Get time positional embedding for current T via linear interpolation: (1, T, D)
|
| 364 |
+
tpos = self.interpolate_time_embed(x, T).unsqueeze(2) # (1, T, 1, D)
|
| 365 |
+
x = x + tpos # (B, T, Np, D)
|
| 366 |
+
x = x.view(B * T, *x.shape[2:]) # (B*T, Np, D)
|
| 367 |
+
|
| 368 |
+
x = x + self.interpolate_pos_encoding(x, H, W)
|
| 369 |
+
|
| 370 |
+
outputs = []
|
| 371 |
+
temporal_block_idx = 0
|
| 372 |
+
for i in range(len(self.blks)):
|
| 373 |
+
# 1) Spatial self-attention (per frame)
|
| 374 |
+
x = self.blks[i](x) # (B*T, Np, D)
|
| 375 |
+
# 2) Interleave temporal self-attention (across frames, same spatial patch)
|
| 376 |
+
if (i + 1) % self.temporal_interleave_stride == 0:
|
| 377 |
+
x = x.view(B, T, *x.shape[1:])
|
| 378 |
+
if self.shared_temporal_block:
|
| 379 |
+
x = self.temporal_block(x)
|
| 380 |
+
else:
|
| 381 |
+
x = self.temporal_blocks[temporal_block_idx](x)
|
| 382 |
+
temporal_block_idx += 1
|
| 383 |
+
x = x.view(B * T, *x.shape[2:])
|
| 384 |
+
# 3) Collect intermediate features for DPT head
|
| 385 |
+
if i in self.idx:
|
| 386 |
+
outputs.append([x])
|
| 387 |
+
|
| 388 |
+
patch_h, patch_w = H // self.patch_size, W // self.patch_size
|
| 389 |
+
# DPT head consumes (B*T, Np, D); here batch is B*T
|
| 390 |
+
out, path_1, path_2, path_3, path_4 = self.dpt_head.forward(
|
| 391 |
+
outputs, patch_h, patch_w, return_intermediate=True
|
| 392 |
+
)
|
| 393 |
+
# Upsample per frame
|
| 394 |
+
out = F.interpolate(
|
| 395 |
+
out, (H, W), mode="bilinear", align_corners=True
|
| 396 |
+
) # (B*T, Cout, H, W)
|
| 397 |
+
|
| 398 |
+
# Restore (B, T, ...)
|
| 399 |
+
def bt_to_btensor(tensor_or_none):
|
| 400 |
+
if tensor_or_none is None:
|
| 401 |
+
return None
|
| 402 |
+
return tensor_or_none.view(B, T, *tensor_or_none.shape[1:])
|
| 403 |
+
|
| 404 |
+
return {
|
| 405 |
+
"out": out.view(B, T, *out.shape[1:]),
|
| 406 |
+
"path_1": bt_to_btensor(path_1),
|
| 407 |
+
"path_2": bt_to_btensor(path_2),
|
| 408 |
+
"path_3": bt_to_btensor(path_3),
|
| 409 |
+
"path_4": bt_to_btensor(path_4),
|
| 410 |
+
}
|
| 411 |
+
|
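
To see the pipeline end to end, a minimal smoke test could look like the sketch below. It assumes the class is importable from cowtracker.layers.video_transformer (consistent with the file listing above) and that timm is installed; the exact output channel count depends on the DPT head for the chosen config, so treat the printed shape as indicative only:

import torch

from cowtracker.layers.video_transformer import VisionTransformerVideo

vit = VisionTransformerVideo(model_name="vits", input_dim=3)
clip = torch.randn(1, 4, 3, 224, 224)  # (B, T, C, H, W): one clip of 4 frames
with torch.no_grad():
    feats = vit(clip)
# "out" is upsampled back to input resolution per frame: (B, T, C_out, 224, 224)
print(feats["out"].shape)
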
cowtracker/models/__init__.py
ADDED
@@ -0,0 +1,23 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

"""CoWTracker models."""


def __getattr__(name):
    """Lazy import to avoid import errors when dependencies are missing."""
    if name == "CoWTracker":
        from cowtracker.models.cowtracker import CoWTracker

        return CoWTracker
    if name == "CoWTrackerWindowed":
        from cowtracker.models.cowtracker_windowed import CoWTrackerWindowed

        return CoWTrackerWindowed
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")


__all__ = ["CoWTracker", "CoWTrackerWindowed"]
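
Because __getattr__ is defined at module level (PEP 562), importing cowtracker.models stays cheap; the torch/vggt dependency chain is only resolved when a class is first accessed. An illustrative snippet:

import cowtracker.models  # no heavy imports triggered yet

# cowtracker.models.cowtracker (and its torch/vggt imports) load only here:
CoWTracker = cowtracker.models.CoWTracker
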
cowtracker/models/cowtracker.py
ADDED
@@ -0,0 +1,228 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

"""CoWTracker: Simple version for short videos."""

import torch
import torch.nn as nn
from huggingface_hub import PyTorchModelHubMixin, hf_hub_download

import cowtracker.thirdparty  # noqa: F401 - sets up vggt path
from vggt.models.aggregator import Aggregator
from cowtracker.heads.feature_extractor import FeatureExtractor
from cowtracker.heads.tracking_head import CowTrackingHead


class CoWTracker(nn.Module, PyTorchModelHubMixin):
    """
    CoWTracker simple version: processes the entire video at once.

    Suitable for: short videos / sufficient GPU memory.
    For long videos, use CoWTrackerWindowed instead.
    """

    # Backbone configuration
    IMG_SIZE = 518
    PATCH_SIZE = 14
    EMBED_DIM = 1024
    PATCH_EMBED = "dinov2_vitl14_reg"
    DEPTH = 24

    # Default HuggingFace repo for model loading
    DEFAULT_REPO_ID = "facebook/cowtracker"
    DEFAULT_FILENAME = "cowtracker_model.pth"

    def __init__(
        self,
        features: int = 128,
        side_resnet_channels: int = 128,
        down_ratio: int = 2,
        warp_iters: int = 5,
        warp_vit_num_blocks: int = None,
    ):
        """
        Args:
            features: Number of DPT output features.
            side_resnet_channels: Number of ResNet side feature channels.
            down_ratio: Feature downsampling ratio.
            warp_iters: Number of warping-based iterative refinement iterations.
            warp_vit_num_blocks: Number of transformer blocks (None = default).
        """
        super().__init__()

        print("Initializing CoWTracker...")

        # Backbone: VGGT backbone
        self.aggregator = Aggregator(
            img_size=self.IMG_SIZE,
            patch_size=self.PATCH_SIZE,
            embed_dim=self.EMBED_DIM,
            patch_embed=self.PATCH_EMBED,
            depth=self.DEPTH,
        )

        # High-resolution feature extraction
        self.feature_extractor = FeatureExtractor(
            features=features,
            down_ratio=down_ratio,
            side_resnet_channels=side_resnet_channels,
        )

        # Tracking head: warping-based iterative refinement
        self.tracking_head = CowTrackingHead(
            feature_dim=self.feature_extractor.out_dim,
            down_ratio=down_ratio,
            warp_iters=warp_iters,
            warp_vit_num_blocks=warp_vit_num_blocks,
        )

        print(f"  - Features: {features}, Side channels: {side_resnet_channels}")
        print(f"  - Warping-based iterative refinement iterations: {warp_iters}")

    def forward(self, video: torch.Tensor, queries: torch.Tensor = None) -> dict:
        """
        Forward pass for dense tracking.

        Args:
            video: Input video [B, S, 3, H, W] or [S, 3, H, W] in range [0, 255].
            queries: Optional query points (unused, for API compatibility).

        Returns:
            dict with:
                - track: Dense tracks [B, S, H, W, 2].
                - vis: Visibility scores [B, S, H, W].
                - conf: Confidence scores [B, S, H, W].
        """
        # Normalize input
        images = video / 255.0
        if images.ndim == 4:
            images = images.unsqueeze(0)

        B, S, C, H, W = images.shape

        # Extract backbone tokens
        tokens, patch_idx = self.aggregator(images)

        # Extract high-resolution features
        features = self.feature_extractor(tokens, images, patch_idx)

        # Run tracking
        predictions = self.tracking_head(features, image_size=(H, W))

        if not self.training:
            predictions["images"] = images

        return predictions

    @staticmethod
    def _remap_legacy_state_dict(state_dict: dict) -> dict:
        """
        Remap legacy checkpoint keys to the new model structure.

        Old structure:
            tracking_head.aggregator.*        -> aggregator.*
            tracking_head.feature_extractor.* -> feature_extractor.dpt_head.*
            tracking_head.fnet.*              -> feature_extractor.fnet.*
            tracking_head.* (rest)            -> tracking_head.*

        Args:
            state_dict: Original state dict.

        Returns:
            Remapped state dict.
        """
        new_state_dict = {}

        for key, value in state_dict.items():
            new_key = key

            # Remap tracking_head.aggregator -> aggregator
            if key.startswith("tracking_head.aggregator."):
                new_key = key.replace("tracking_head.aggregator.", "aggregator.")
            # Remap tracking_head.feature_extractor -> feature_extractor.dpt_head
            elif key.startswith("tracking_head.feature_extractor."):
                new_key = key.replace(
                    "tracking_head.feature_extractor.", "feature_extractor.dpt_head."
                )
            # Remap tracking_head.fnet -> feature_extractor.fnet
            elif key.startswith("tracking_head.fnet."):
                new_key = key.replace("tracking_head.fnet.", "feature_extractor.fnet.")

            new_state_dict[new_key] = value

        return new_state_dict

    @classmethod
    def _load_checkpoint(cls, checkpoint_path: str = None) -> dict:
        """
        Load a checkpoint from a local path or the HuggingFace Hub.

        Args:
            checkpoint_path: Local file path to checkpoint.
                If None, downloads from the default HuggingFace repo.

        Returns:
            Loaded checkpoint dict.
        """
        import os

        if checkpoint_path is None:
            # Download from HuggingFace Hub (uses HF_TOKEN env var automatically)
            print(f"Downloading checkpoint from HuggingFace: {cls.DEFAULT_REPO_ID}/{cls.DEFAULT_FILENAME}")
            checkpoint_path = hf_hub_download(
                repo_id=cls.DEFAULT_REPO_ID,
                filename=cls.DEFAULT_FILENAME,
            )
            print(f"Downloaded to: {checkpoint_path}")
        else:
            checkpoint_path = os.path.expanduser(checkpoint_path)
            print(f"Loading checkpoint from local path: {checkpoint_path}")

        with open(checkpoint_path, "rb") as fp:
            ckpt = torch.load(fp, map_location="cpu")

        return ckpt

    @classmethod
    def from_checkpoint(
        cls,
        checkpoint_path: str = None,
        device: str = "cuda",
        dtype=torch.bfloat16,
    ):
        """
        Load model from checkpoint (local path or HuggingFace Hub).

        Args:
            checkpoint_path: Path to local checkpoint file.
                If None, downloads from the default HuggingFace repo.
            device: Target device.
            dtype: Target dtype.

        Returns:
            Loaded model in eval mode.
        """
        model = cls()

        ckpt = cls._load_checkpoint(checkpoint_path)
        state_dict = ckpt.get("model", ckpt)

        # Remap legacy checkpoint keys if needed
        legacy_prefixes = ["tracking_head.feature_extractor.", "tracking_head.aggregator.", "tracking_head.fnet."]
        if any(k.startswith(p) for k in state_dict.keys() for p in legacy_prefixes):
            print("Detected legacy checkpoint format, remapping keys...")
            state_dict = cls._remap_legacy_state_dict(state_dict)

        msg = model.load_state_dict(state_dict, strict=False)
        print(f"Load message: {msg}")

        model = model.to(device).to(dtype)
        model.eval()
        for p in model.parameters():
            p.requires_grad = False

        print("Model loaded successfully!")
        return model
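
Putting the loading path together, a typical inference call might look like this sketch. It assumes a CUDA device, uses the default facebook/cowtracker checkpoint named above, and substitutes a random tensor for real frames in [0, 255]:

import torch

from cowtracker.models.cowtracker import CoWTracker

model = CoWTracker.from_checkpoint()  # downloads DEFAULT_REPO_ID/DEFAULT_FILENAME
video = torch.randint(0, 256, (24, 3, 518, 518)).to("cuda", torch.bfloat16)
with torch.no_grad():
    pred = model(video)  # a (S, 3, H, W) input is unsqueezed to B=1 internally
print(pred["track"].shape)  # expected: (1, 24, 518, 518, 2)
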
cowtracker/models/cowtracker_windowed.py
ADDED
@@ -0,0 +1,218 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

"""CoWTracker Windowed: Full version for long videos."""

import torch
import torch.nn as nn
from huggingface_hub import PyTorchModelHubMixin

from cowtracker.models.cowtracker import CoWTracker
from cowtracker.inference.windowed import WindowedInference


class CoWTrackerWindowed(nn.Module, PyTorchModelHubMixin):
    """
    CoWTracker windowed version: processes video in sliding windows.

    Suitable for: long videos / limited GPU memory.
    Composes CoWTracker with WindowedInference.
    """

    def __init__(
        self,
        # Window parameters
        window_len: int = 100,
        stride: int = 100,
        num_memory_frames: int = 10,
        # CoWTracker parameters
        **cow_tracker_kwargs,
    ):
        """
        Args:
            window_len: Number of frames per window.
            stride: Step size between windows.
            num_memory_frames: Maximum number of memory frames.
            **cow_tracker_kwargs: Arguments passed to CoWTracker.
        """
        super().__init__()

        print(f"Initializing CoWTrackerWindowed: window_len={window_len}, stride={stride}")

        self.model = CoWTracker(**cow_tracker_kwargs)
        self.windowed = WindowedInference(
            window_len=window_len,
            stride=stride,
            num_memory_frames=num_memory_frames,
        )

    def forward(self, video: torch.Tensor, queries: torch.Tensor = None) -> dict:
        """
        Forward pass with windowed inference.

        Args:
            video: Input video [B, S, 3, H, W] or [S, 3, H, W] in range [0, 255].
            queries: Optional query points (unused, for API compatibility).

        Returns:
            dict with:
                - track: Dense tracks [B, T, H, W, 2].
                - vis: Visibility scores [B, T, H, W].
                - conf: Confidence scores [B, T, H, W].
        """
        # Normalize input
        images = video / 255.0
        if images.ndim == 4:
            images = images.unsqueeze(0)

        B, T, C, H, W = images.shape
        device, dtype = images.device, images.dtype

        # Initialize accumulated outputs
        accumulated = {
            "track": torch.zeros((B, T, H, W, 2), device=device, dtype=dtype),
            "vis": torch.zeros((B, T, H, W), device=device, dtype=dtype),
            "conf": torch.zeros((B, T, H, W), device=device, dtype=dtype),
        }

        windows = self.windowed.compute_windows(T)
        first_frame = images[:, 0:1]
        first_frame_features = None

        for window_idx, (start, end) in enumerate(windows):
            if not self.training:
                print(f"Processing window {window_idx + 1}/{len(windows)}: frames [{start}, {end})")

            # Get memory frame indices
            memory_indices = self.windowed.select_memory_frames(window_idx, start)
            if not self.training and memory_indices:
                print(f"  Memory frames: {memory_indices}")

            # Gather frames: first_frame + memory + window
            frames = self._gather_frames(images, first_frame, start, end, memory_indices)

            # Extract backbone tokens
            tokens, patch_idx = self.model.aggregator(frames)

            # Extract combined features
            features = self.model.feature_extractor(tokens, frames, patch_idx)

            # Split features: first_frame | memory | window
            first_frame_features = features[:, 0:1]
            num_memory = len(memory_indices)
            offset = 1 + num_memory

            # Run tracking on extended features (memory + window), using first_frame as reference
            extended_features = features[:, 1:]  # Exclude first_frame from input
            pred = self.model.tracking_head(
                extended_features,
                image_size=(H, W),
                first_frame_features=first_frame_features,
            )

            # Extract window predictions (remove memory frames from output)
            window_pred = {
                "track": pred["track"][:, num_memory:],
                "vis": pred["vis"][:, num_memory:],
                "conf": pred["conf"][:, num_memory:],
            }

            # Merge into accumulated results
            self.windowed.merge_predictions(window_idx, start, end, window_pred, accumulated)

            # Cleanup for memory efficiency
            if not self.training:
                del features, tokens, pred
                torch.cuda.empty_cache()

        if not self.training:
            accumulated["images"] = images

        return accumulated

    def _gather_frames(
        self,
        images: torch.Tensor,
        first_frame: torch.Tensor,
        start: int,
        end: int,
        memory_indices: list,
    ) -> torch.Tensor:
        """Gather first_frame + memory + window frames."""
        parts = [first_frame]

        if memory_indices:
            parts.append(images[:, memory_indices])

        parts.append(images[:, start:end])

        return torch.cat(parts, dim=1)

    # Proxy properties for convenient access to internal components
    @property
    def aggregator(self):
        return self.model.aggregator

    @property
    def feature_extractor(self):
        return self.model.feature_extractor

    @property
    def tracking_head(self):
        return self.model.tracking_head

    @classmethod
    def from_checkpoint(
        cls,
        checkpoint_path: str = None,
        window_len: int = 100,
        stride: int = 100,
        device: str = "cuda",
        dtype=torch.bfloat16,
    ):
        """
        Load model from checkpoint (local path or HuggingFace Hub).

        Args:
            checkpoint_path: Path to local checkpoint file.
                If None, downloads from the default HuggingFace repo.
            window_len: Number of frames per window.
            stride: Step size between windows.
            device: Target device.
            dtype: Target dtype.

        Returns:
            Loaded model in eval mode.
        """
        model = cls(window_len=window_len, stride=stride)

        # Use CoWTracker's checkpoint loading method (handles local path and HuggingFace download)
        ckpt = CoWTracker._load_checkpoint(checkpoint_path)
        state_dict = ckpt.get("model", ckpt)

        # Remap legacy checkpoint keys if needed (delegate to CoWTracker)
        legacy_prefixes = ["tracking_head.feature_extractor.", "tracking_head.aggregator.", "tracking_head.fnet."]
        if any(k.startswith(p) for k in state_dict.keys() for p in legacy_prefixes):
            print("Detected legacy checkpoint format, remapping keys...")
            state_dict = CoWTracker._remap_legacy_state_dict(state_dict)

        # Add "model." prefix if checkpoint is from CoWTracker (no prefix)
        # CoWTrackerWindowed wraps CoWTracker as self.model, so keys need "model." prefix
        if not any(k.startswith("model.") for k in state_dict.keys()):
            print("Adding 'model.' prefix to state dict keys...")
            state_dict = {f"model.{k}": v for k, v in state_dict.items()}

        msg = model.load_state_dict(state_dict, strict=False)
        print(f"Load message: {msg}")

        model = model.to(device).to(dtype)
        model.eval()
        for p in model.parameters():
            p.requires_grad = False

        print("Model loaded successfully!")
        return model
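
For intuition about the sliding windows, with window_len=100 and stride=100 a 250-frame video presumably tiles as in the standalone sketch below. The authoritative rule lives in WindowedInference.compute_windows (cowtracker/inference/windowed.py, not reproduced here), so treat this purely as an illustration of the tiling, not the actual implementation:

def sketch_windows(num_frames: int, window_len: int = 100, stride: int = 100):
    """Illustrative tiling only; see WindowedInference.compute_windows for the real logic."""
    windows, start = [], 0
    while start < num_frames:
        windows.append((start, min(start + window_len, num_frames)))
        start += stride
    return windows

print(sketch_windows(250))  # [(0, 100), (100, 200), (200, 250)]
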
cowtracker/thirdparty/DepthAnythingV2
ADDED
@@ -0,0 +1 @@
Subproject commit e5a2732d3ea2cddc081d7bfd708fc0bf09f812f1
cowtracker/thirdparty/__init__.py
ADDED
@@ -0,0 +1,19 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

"""Third-party modules.

This module sets up sys.path for third-party packages.
"""

import os
import sys

# Add vggt to sys.path so that 'from vggt.xxx' imports work
_vggt_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "vggt")
if _vggt_path not in sys.path:
    sys.path.insert(0, _vggt_path)
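
With the shim in place, a single side-effect import makes the vendored submodule importable under its top-level name, which is exactly how cowtracker/models/cowtracker.py uses it:

import cowtracker.thirdparty  # noqa: F401 - prepends thirdparty/vggt to sys.path

from vggt.models.aggregator import Aggregator  # now resolves inside the submodule
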
cowtracker/thirdparty/vggt
ADDED
@@ -0,0 +1 @@
Subproject commit 44b3afbd1869d8bde4894dd8ea1e293112dd5eba
cowtracker/utils/__init__.py
ADDED
@@ -0,0 +1,32 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

from cowtracker.utils.padding import (
    compute_padding_params,
    apply_padding,
    remove_padding_and_scale_back,
)
from cowtracker.utils.visualization import paint_point_track
from cowtracker.utils.ops import (
    bilinear_sampler,
    coords_grid,
    Padder,
    load_ckpt,
    upflow8,
)

__all__ = [
    "compute_padding_params",
    "apply_padding",
    "remove_padding_and_scale_back",
    "paint_point_track",
    "bilinear_sampler",
    "coords_grid",
    "Padder",
    "load_ckpt",
    "upflow8",
]
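
The re-exports let downstream code import helpers from the package root rather than from individual modules, e.g.:

from cowtracker.utils import Padder, bilinear_sampler, paint_point_track
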
cowtracker/utils/ops.py
ADDED
@@ -0,0 +1,151 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

"""Common operations for tracking: bilinear sampling, coordinate grids, etc."""

import cv2
import torch
import torch.nn.functional as F
import numpy as np
from scipy import interpolate


def load_ckpt(model, path):
    """Load checkpoint."""
    state_dict = torch.load(path, map_location=torch.device("cpu"))
    model.load_state_dict(state_dict, strict=False)


def resize_data(img1, img2, flow, factor=1.0):
    _, _, h, w = img1.shape
    h = int(h * factor)
    w = int(w * factor)
    img1 = F.interpolate(img1, (h, w), mode="area")
    img2 = F.interpolate(img2, (h, w), mode="area")
    flow = F.interpolate(flow, (h, w), mode="area") * factor
    return img1, img2, flow


class Padder:
    """Pads images such that dimensions are divisible by factor."""

    def __init__(self, dims, mode="sintel", factor=32):
        self.ht, self.wd = dims[-2:]
        pad_ht = (((self.ht + 8) // factor) + 1) * factor - self.ht
        pad_wd = (((self.wd + 8) // factor) + 1) * factor - self.wd
        if mode == "sintel":
            self._pad = [pad_wd // 2, pad_wd - pad_wd // 2, pad_ht // 2, pad_ht - pad_ht // 2]
        else:
            self._pad = [pad_wd // 2, pad_wd - pad_wd // 2, 0, pad_ht]

    def pad(self, x):
        return F.pad(x, self._pad, mode="constant", value=0)

    def unpad(self, x):
        ht, wd = x.shape[-2:]
        c = [self._pad[2], ht - self._pad[3], self._pad[0], wd - self._pad[1]]
        return x[..., c[0] : c[1], c[2] : c[3]]


def forward_interpolate(flow):
    flow = flow.detach().cpu().numpy()
    dx, dy = flow[0], flow[1]

    ht, wd = dx.shape
    x0, y0 = np.meshgrid(np.arange(wd), np.arange(ht))

    x1 = x0 + dx
    y1 = y0 + dy

    x1 = x1.reshape(-1)
    y1 = y1.reshape(-1)
    dx = dx.reshape(-1)
    dy = dy.reshape(-1)

    valid = (x1 > 0) & (x1 < wd) & (y1 > 0) & (y1 < ht)
    x1 = x1[valid]
    y1 = y1[valid]
    dx = dx[valid]
    dy = dy[valid]

    flow_x = interpolate.griddata((x1, y1), dx, (x0, y0), method="nearest", fill_value=0)
    flow_y = interpolate.griddata((x1, y1), dy, (x0, y0), method="nearest", fill_value=0)

    flow = np.stack([flow_x, flow_y], axis=0)
    return torch.from_numpy(flow).float()


def bilinear_sampler(img, coords, mode="bilinear", mask=False):
    """Wrapper for grid_sample, uses pixel coordinates."""
    H, W = img.shape[-2:]
    xgrid, ygrid = coords.split([1, 1], dim=-1)
    xgrid = 2 * xgrid / (W - 1) - 1
    ygrid = 2 * ygrid / (H - 1) - 1

    grid = torch.cat([xgrid, ygrid], dim=-1)
    img = F.grid_sample(img, grid, align_corners=True)

    if mask:
        mask = (xgrid > -1) & (ygrid > -1) & (xgrid < 1) & (ygrid < 1)
        return img, mask.float()

    return img


def coords_grid(batch, ht, wd, device):
    # indexing="ij" matches the pre-deprecation default of torch.meshgrid;
    # coords[::-1] then flips (row, col) into (x, y) channel order
    coords = torch.meshgrid(
        torch.arange(ht, device=device), torch.arange(wd, device=device), indexing="ij"
    )
    coords = torch.stack(coords[::-1], dim=0).float()
    return coords[None].repeat(batch, 1, 1, 1)


def upflow8(flow, mode="bilinear"):
    new_size = (8 * flow.shape[2], 8 * flow.shape[3])
    return 8 * F.interpolate(flow, size=new_size, mode=mode, align_corners=True)


def transform(T, p):
    assert T.shape == (4, 4)
    return np.einsum("H W j, i j -> H W i", p, T[:3, :3]) + T[:3, 3]


def from_homog(x):
    return x[..., :-1] / x[..., [-1]]


def reproject(depth1, pose1, pose2, K1, K2):
    H, W = depth1.shape
    x, y = np.meshgrid(np.arange(W), np.arange(H), indexing="xy")
    img_1_coords = np.stack((x, y, np.ones_like(x)), axis=-1).astype(np.float64)
    cam1_coords = np.einsum("H W, H W j, i j -> H W i", depth1, img_1_coords, np.linalg.inv(K1))
    rel_pose = pose2 @ np.linalg.inv(pose1)
    cam2_coords = transform(rel_pose, cam1_coords)
    return from_homog(np.einsum("H W j, i j -> H W i", cam2_coords, K2))


def induced_flow(depth0, depth1, data):
    H, W = depth0.shape
    coords1 = reproject(depth0, data["T0"], data["T1"], data["K0"], data["K1"])
    x, y = np.meshgrid(np.arange(W), np.arange(H), indexing="xy")
    coords0 = np.stack([x, y], axis=-1)
    flow_01 = coords1 - coords0
    H, W = depth1.shape
    coords1 = reproject(depth1, data["T1"], data["T0"], data["K1"], data["K0"])
    x, y = np.meshgrid(np.arange(W), np.arange(H), indexing="xy")
    coords0 = np.stack([x, y], axis=-1)
    flow_10 = coords1 - coords0
    return flow_01, flow_10


def check_cycle_consistency(flow_01, flow_10):
    H, W = flow_01.shape[:2]
    new_coords = flow_01 + np.stack(np.meshgrid(np.arange(W), np.arange(H), indexing="xy"), axis=-1)
    flow_reprojected = cv2.remap(flow_10, new_coords.astype(np.float32), None, interpolation=cv2.INTER_LINEAR)
    cycle = flow_reprojected + flow_01
    cycle = np.linalg.norm(cycle, axis=-1)
    mask = (cycle < 0.1 * min(H, W)).astype(np.float32)
    return mask
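
As a quick sanity check of how coords_grid and bilinear_sampler compose, sampling a feature map at its own pixel grid should return (approximately) the same map; the shapes in this sketch are arbitrary:

import torch

fmap = torch.randn(2, 128, 64, 64)            # (B, C, H, W) feature map
coords = coords_grid(2, 64, 64, fmap.device)  # (B, 2, H, W) with (x, y) channels
coords = coords.permute(0, 2, 3, 1)           # bilinear_sampler expects coords last
sampled = bilinear_sampler(fmap, coords)      # identity warp: ~fmap recovered
print(sampled.shape)  # torch.Size([2, 128, 64, 64])
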
cowtracker/utils/padding.py
ADDED
@@ -0,0 +1,168 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

"""Padding utilities for video preprocessing and postprocessing."""

import torch
import torch.nn.functional as F


def compute_padding_params(orig_H, orig_W, inf_H, inf_W, skip_upscaling=False):
    """Compute padding parameters to preserve aspect ratio.

    Args:
        orig_H: Original height
        orig_W: Original width
        inf_H: Inference height
        inf_W: Inference width
        skip_upscaling: If True and scale > 1, skip upscaling and just pad

    Returns:
        Dictionary containing:
        - scale: Scale factor that would be applied (1.0 if skipped)
        - scaled_H, scaled_W: Dimensions after scaling (before padding)
        - pad_top, pad_bottom, pad_left, pad_right: Padding amounts
        - orig_H, orig_W: Original dimensions (for reference)
        - upscaling_skipped: Whether upscaling was skipped
    """
    scale = min(inf_H / orig_H, inf_W / orig_W)

    upscaling_skipped = False
    if skip_upscaling and scale > 1.0:
        scaled_H = orig_H
        scaled_W = orig_W
        upscaling_skipped = True
    else:
        scaled_H = int(orig_H * scale)
        scaled_W = int(orig_W * scale)

    pad_H = inf_H - scaled_H
    pad_W = inf_W - scaled_W

    pad_top = pad_H // 2
    pad_bottom = pad_H - pad_top
    pad_left = pad_W // 2
    pad_right = pad_W - pad_left

    return {
        "scale": scale,
        "scaled_H": scaled_H,
        "scaled_W": scaled_W,
        "pad_top": pad_top,
        "pad_bottom": pad_bottom,
        "pad_left": pad_left,
        "pad_right": pad_right,
        "orig_H": orig_H,
        "orig_W": orig_W,
        "upscaling_skipped": upscaling_skipped,
    }


def apply_padding(rgbs, padding_info):
    """Apply padding to input images to reach inference size.

    Args:
        rgbs: Input tensor (T, C, H, W)
        padding_info: Dictionary from compute_padding_params

    Returns:
        Padded tensor (T, C, inf_H, inf_W)
    """
    T, C, H, W = rgbs.shape
    scaled_H = padding_info["scaled_H"]
    scaled_W = padding_info["scaled_W"]

    if (scaled_H, scaled_W) != (H, W):
        rgbs_scaled = F.interpolate(
            rgbs,
            size=(scaled_H, scaled_W),
            mode="bilinear",
            align_corners=False,
        )
    else:
        rgbs_scaled = rgbs

    pad_left = padding_info["pad_left"]
    pad_right = padding_info["pad_right"]
    pad_top = padding_info["pad_top"]
    pad_bottom = padding_info["pad_bottom"]

    rgbs_padded = F.pad(
        rgbs_scaled,
        (pad_left, pad_right, pad_top, pad_bottom),
        mode="constant",
        value=0,
    )

    return rgbs_padded


def remove_padding_and_scale_back(tracks, visibility, confidence, padding_info):
    """Remove padding from model outputs and scale back to original resolution.

    Args:
        tracks: Track predictions (T, inf_H, inf_W, 2)
        visibility: Visibility predictions (T, inf_H, inf_W)
        confidence: Confidence predictions (T, inf_H, inf_W)
        padding_info: Dictionary from compute_padding_params

    Returns:
        Tuple of (tracks, visibility, confidence) scaled to original resolution
    """
    scaled_H = padding_info["scaled_H"]
    scaled_W = padding_info["scaled_W"]
    pad_top = padding_info["pad_top"]
    pad_left = padding_info["pad_left"]
    orig_H = padding_info["orig_H"]
    orig_W = padding_info["orig_W"]

    tracks_unpadded = tracks[
        :, pad_top : pad_top + scaled_H, pad_left : pad_left + scaled_W, :
    ]
    visibility_unpadded = visibility[
        :, pad_top : pad_top + scaled_H, pad_left : pad_left + scaled_W
    ]
    confidence_unpadded = confidence[
        :, pad_top : pad_top + scaled_H, pad_left : pad_left + scaled_W
    ]

    tracks_unpadded = tracks_unpadded.clone()
    tracks_unpadded[:, :, :, 0] -= pad_left
    tracks_unpadded[:, :, :, 1] -= pad_top

    if (scaled_H, scaled_W) != (orig_H, orig_W):
        tracks_permuted = tracks_unpadded.permute(0, 3, 1, 2)
        tracks_scaled = F.interpolate(
            tracks_permuted,
            size=(orig_H, orig_W),
            mode="bilinear",
            align_corners=False,
        )
        tracks_final = tracks_scaled.permute(0, 2, 3, 1)

        tracks_final[:, :, :, 0] *= orig_W / scaled_W
        tracks_final[:, :, :, 1] *= orig_H / scaled_H

        visibility_final = F.interpolate(
            visibility_unpadded.unsqueeze(1),
            size=(orig_H, orig_W),
            mode="bilinear",
            align_corners=False,
        ).squeeze(1)

        confidence_final = F.interpolate(
            confidence_unpadded.unsqueeze(1),
            size=(orig_H, orig_W),
            mode="bilinear",
            align_corners=False,
        ).squeeze(1)
    else:
        tracks_final = tracks_unpadded
        visibility_final = visibility_unpadded
        confidence_final = confidence_unpadded

    return tracks_final, visibility_final, confidence_final
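
As a worked example, fitting a 512x1036 clip onto a 518x518 inference canvas gives scale = min(518/512, 518/1036) = 0.5, so frames shrink to 256x518 and the leftover 262 rows split evenly top and bottom. In code (a sketch using the functions above):

import torch

info = compute_padding_params(orig_H=512, orig_W=1036, inf_H=518, inf_W=518)
print(info["scaled_H"], info["scaled_W"])   # 256 518
print(info["pad_top"], info["pad_bottom"])  # 131 131

rgbs = torch.zeros(8, 3, 512, 1036)         # (T, C, H, W) dummy clip
print(apply_padding(rgbs, info).shape)      # torch.Size([8, 3, 518, 518])
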
cowtracker/utils/visualization.py
ADDED
|
@@ -0,0 +1,229 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""Visualization utilities for point tracking."""
|
| 8 |
+
|
| 9 |
+
import colorsys
|
| 10 |
+
import os
|
| 11 |
+
import random
|
| 12 |
+
from typing import List, Optional, Tuple, Union
|
| 13 |
+
|
| 14 |
+
import matplotlib
|
| 15 |
+
import matplotlib.pyplot as plt
|
| 16 |
+
import numpy as np
|
| 17 |
+
import torch
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
# Bremm 2D colormap for position-based coloring
|
| 21 |
+
# This creates a smooth 2D color gradient based on x,y position
|
| 22 |
+
BREMM_COLORMAP = None # Lazy loaded
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def _create_bremm_colormap():
|
| 26 |
+
"""Create a 2D colormap programmatically (Bremm-style).
|
| 27 |
+
|
| 28 |
+
This creates a smooth 2D color gradient where:
|
| 29 |
+
- X position maps to hue variation
|
| 30 |
+
- Y position maps to saturation/value variation
|
| 31 |
+
"""
|
| 32 |
+
size = 256
|
| 33 |
+
colormap = np.zeros((size, size, 3), dtype=np.uint8)
|
| 34 |
+
|
| 35 |
+
for y in range(size):
|
| 36 |
+
for x in range(size):
|
| 37 |
+
# Normalize to [0, 1]
|
| 38 |
+
nx = x / (size - 1)
|
| 39 |
+
ny = y / (size - 1)
|
| 40 |
+
|
| 41 |
+
# Create a 2D color mapping using HSV
|
| 42 |
+
# Hue varies with x, saturation/value with y
|
| 43 |
+
hue = (nx * 0.8 + ny * 0.2) % 1.0 # Mix of x and y for hue
|
| 44 |
+
saturation = 0.6 + 0.4 * (1 - ny) # Higher saturation at top
|
| 45 |
+
value = 0.7 + 0.3 * nx # Higher value on right
|
| 46 |
+
|
| 47 |
+
# Convert HSV to RGB
|
| 48 |
+
rgb = colorsys.hsv_to_rgb(hue, saturation, value)
|
| 49 |
+
colormap[y, x] = [int(c * 255) for c in rgb]
|
| 50 |
+
|
| 51 |
+
return colormap
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
def _get_bremm_colormap():
|
| 55 |
+
"""Get or create the bremm colormap."""
|
| 56 |
+
global BREMM_COLORMAP
|
| 57 |
+
if BREMM_COLORMAP is None:
|
| 58 |
+
# Try to load from file first
|
| 59 |
+
colormap_file = os.path.join(os.path.dirname(__file__), "bremm.png")
|
| 60 |
+
if os.path.exists(colormap_file):
|
| 61 |
+
BREMM_COLORMAP = (plt.imread(colormap_file) * 255).astype(np.uint8)
|
| 62 |
+
if BREMM_COLORMAP.shape[2] == 4: # RGBA
|
| 63 |
+
BREMM_COLORMAP = BREMM_COLORMAP[:, :, :3]
|
| 64 |
+
else:
|
| 65 |
+
BREMM_COLORMAP = _create_bremm_colormap()
|
| 66 |
+
return BREMM_COLORMAP
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def get_2d_colors(xys: np.ndarray, H: int, W: int) -> np.ndarray:
|
| 70 |
+
"""Get colors based on 2D position using Bremm colormap.
|
| 71 |
+
|
| 72 |
+
This creates position-dependent colors where nearby points have
|
| 73 |
+
similar colors, useful for visualizing spatial coherence.
|
| 74 |
+
|
| 75 |
+
Args:
|
| 76 |
+
xys: Point coordinates [N, 2] in pixel space (x, y)
|
| 77 |
+
H: Image height
|
| 78 |
+
W: Image width
|
| 79 |
+
|
| 80 |
+
Returns:
|
| 81 |
+
Array of RGB colors [N, 3] as uint8
|
| 82 |
+
"""
|
| 83 |
+
colormap = _get_bremm_colormap()
|
| 84 |
+
height, width = colormap.shape[:2]
|
| 85 |
+
|
| 86 |
+
N = xys.shape[0]
|
| 87 |
+
output = np.zeros((N, 3), dtype=np.uint8)
|
| 88 |
+
|
| 89 |
+
# Normalize coordinates to [0, 1]
|
| 90 |
+
xys_norm = xys.copy().astype(np.float32)
|
| 91 |
+
xys_norm[:, 0] = xys_norm[:, 0] / max(W - 1, 1)
|
| 92 |
+
xys_norm[:, 1] = xys_norm[:, 1] / max(H - 1, 1)
|
| 93 |
+
|
| 94 |
+
# Clip to valid range
|
| 95 |
+
xys_norm = np.clip(xys_norm, 0, 1)
|
| 96 |
+
|
| 97 |
+
# Map to colormap coordinates
|
| 98 |
+
for i in range(N):
|
| 99 |
+
x, y = xys_norm[i]
|
| 100 |
+
xp = int((width - 1) * x)
|
| 101 |
+
yp = int((height - 1) * y)
|
| 102 |
+
output[i] = colormap[yp, xp]
|
| 103 |
+
|
| 104 |
+
return output
|


def get_colors_from_cmap(num_colors: int, cmap: str = "gist_rainbow") -> np.ndarray:
    """Get a colormap for points using a matplotlib colormap.

    Args:
        num_colors: Number of colors to generate
        cmap: Matplotlib colormap name (e.g., "gist_rainbow", "jet", "turbo")

    Returns:
        Array of RGB colors [num_colors, 3] as uint8
    """
    cmap_ = matplotlib.colormaps.get_cmap(cmap)
    colors = []
    for i in range(num_colors):
        c = cmap_(i / float(num_colors))
        colors.append((int(c[0] * 255), int(c[1] * 255), int(c[2] * 255)))
    return np.array(colors, dtype=np.uint8)  # honor the documented uint8 dtype


def paint_point_track(
    frames: np.ndarray,
    point_tracks: np.ndarray,
    visibles: np.ndarray,
    colormap: Optional[Union[List[Tuple[int, int, int]], np.ndarray]] = None,
    rate: int = 1,
    show_bkg: bool = True,
) -> np.ndarray:
    """Paint point tracks on video frames using GPU-accelerated scatter.

    Args:
        frames: Video frames [T, H, W, C] in uint8
        point_tracks: Track coordinates [P, T, 2] (x, y)
        visibles: Visibility mask [P, T]
        colormap: Optional list/array of RGB colors for each point
        rate: Subsampling rate for visualization (affects point size)
        show_bkg: Whether to show the background (True) or black it out (False)

    Returns:
        Painted frames [T, H, W, C] in uint8
    """
    print("Starting visualization...")
    device = "cuda" if torch.cuda.is_available() else "cpu"
    frames_t = (
        torch.from_numpy(frames).float().permute(0, 3, 1, 2).to(device)
    )  # [T,C,H,W]

    if show_bkg:
        frames_t = frames_t * 0.5  # darken the frame so tracks stand out
    else:
        frames_t = frames_t * 0.0  # black out the background entirely

    point_tracks_t = torch.from_numpy(point_tracks).to(device)  # [P,T,2]
    visibles_t = torch.from_numpy(visibles).to(device)  # [P,T]
    T, C, H, W = frames_t.shape
    P = point_tracks.shape[0]

    # Default to the gist_rainbow colormap
    if colormap is None:
        colormap = get_colors_from_cmap(P, "gist_rainbow")
    colors = torch.tensor(colormap, dtype=torch.float32, device=device)  # [P,3]

    # Scale the point radius with the subsampling rate
    if rate == 1:
        radius = 1
    elif rate == 2:
        radius = 1
    elif rate == 4:
        radius = 2
    elif rate == 8:
        radius = 4
    else:
        radius = 6

    sharpness = 0.15 + 0.05 * np.log2(rate)

    # Precompute a soft circular "icon" stamped at each point location
    D = radius * 2 + 1
    y = torch.arange(D, device=device).float()[:, None] - radius
    x = torch.arange(D, device=device).float()[None, :] - radius
    dist2 = x**2 + y**2
    icon = torch.clamp(1 - (dist2 - (radius**2) / 2.0) / (radius * 2 * sharpness), 0, 1)
    icon = icon.view(1, D, D)
    dx = torch.arange(-radius, radius + 1, device=device)
    dy = torch.arange(-radius, radius + 1, device=device)
    disp_y, disp_x = torch.meshgrid(dy, dx, indexing="ij")

    for t in range(T):
        mask = visibles_t[:, t]
        if mask.sum() == 0:
            continue
        xy = point_tracks_t[mask, t] + 0.5
        xy[:, 0] = xy[:, 0].clamp(0, W - 1)
        xy[:, 1] = xy[:, 1].clamp(0, H - 1)
        colors_now = colors[mask]
        N = xy.shape[0]
        cx = xy[:, 0].long()
        cy = xy[:, 1].long()
        x_grid = cx[:, None, None] + disp_x
        y_grid = cy[:, None, None] + disp_y
        valid = (x_grid >= 0) & (x_grid < W) & (y_grid >= 0) & (y_grid < H)
        x_valid = x_grid[valid]
        y_valid = y_grid[valid]
        icon_weights = icon.expand(N, D, D)[valid]
        colors_valid = (
            colors_now[:, :, None, None]
            .expand(N, 3, D, D)
            .permute(1, 0, 2, 3)[:, valid]
        )
        idx_flat = (y_valid * W + x_valid).long()

        # Scatter-accumulate weighted colors, then alpha-blend onto the frame
        accum = torch.zeros_like(frames_t[t])
        weight = torch.zeros(1, H * W, device=device)
        img_flat = accum.view(C, -1)
        weighted_colors = colors_valid * icon_weights
        img_flat.scatter_add_(1, idx_flat.unsqueeze(0).expand(C, -1), weighted_colors)
        weight.scatter_add_(1, idx_flat.unsqueeze(0), icon_weights.unsqueeze(0))
        weight = weight.view(1, H, W)

        alpha = weight.clamp(0, 1)
        accum = accum / (weight + 1e-6)
        frames_t[t] = frames_t[t] * (1 - alpha) + accum * alpha

    print("Visualization done.")
    return frames_t.clamp(0, 255).byte().permute(0, 2, 3, 1).cpu().numpy()
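
Taken together, get_colors_from_cmap and paint_point_track form a self-contained painting pipeline. A minimal smoke-test sketch (not part of the commit; the toy dimensions and random tracks are illustrative assumptions):

import numpy as np
from cowtracker.utils.visualization import paint_point_track

T, H, W, P = 24, 240, 320, 16  # assumed toy video size and point count
frames = np.random.randint(0, 256, (T, H, W, 3), dtype=np.uint8)
xs = np.random.uniform(0, W - 1, (P, T))
ys = np.random.uniform(0, H - 1, (P, T))
point_tracks = np.stack([xs, ys], axis=-1)  # [P, T, 2] in (x, y) pixels
visibles = np.ones((P, T), dtype=bool)  # every point visible in every frame

painted = paint_point_track(frames, point_tracks, visibles, rate=8)
assert painted.shape == (T, H, W, 3) and painted.dtype == np.uint8
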
demo.py
ADDED
@@ -0,0 +1,179 @@
#!/usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

"""
Minimal CoWTracker inference demo.

Usage:
    python demo.py --video input.mp4 --output output.mp4
    python demo.py --video input.mp4 --output output.mp4 --checkpoint ~/run168/cow_tracker_model.pth
"""

import argparse
import os

import mediapy
import numpy as np
import torch

from cowtracker import CoWTracker
from cowtracker.utils.visualization import paint_point_track

inf_dtype = torch.float16


def preprocess_video(video_path, max_frames=200, target_size=(336, 560)):
    """Load and preprocess a video.

    Args:
        video_path: Path to the input video
        max_frames: Maximum number of frames to process
        target_size: Target size (H, W) for inference

    Returns:
        Tuple of (video_array, fps)
    """
    video_arr = mediapy.read_video(video_path)
    video_fps = video_arr.metadata.fps
    num_frames = video_arr.shape[0]

    # Truncate if too long
    if num_frames > max_frames:
        print(f"Video is too long. Truncating to the first {max_frames} frames.")
        video_arr = video_arr[:max_frames]

    # Resize to the target size
    video_arr = mediapy.resize_video(video_arr, target_size)

    return np.array(video_arr), video_fps


def run_inference(model, video):
    """Run tracking inference on a video.

    Args:
        model: CoWTracker model
        video: Video array [T, H, W, C] in uint8

    Returns:
        Tuple of (tracks, visibilities, confidences)
        - tracks: [T, H, W, 2]
        - visibilities: [T, H, W] boolean mask (visibility * confidence > 0.1)
        - confidences: [T, H, W] visibility * confidence
    """
    device = next(model.parameters()).device

    # Convert to a tensor [T, C, H, W]
    video_tensor = torch.from_numpy(video).permute(0, 3, 1, 2).float().to(device)
    T, C, H, W = video_tensor.shape
    print(f"Video size: {H}x{W}")

    torch.cuda.empty_cache()

    with torch.no_grad():
        with torch.amp.autocast(device_type="cuda", dtype=inf_dtype):
            predictions = model.forward(video=video_tensor, queries=None)

    tracks = predictions["track"][0].cpu()
    visibility = predictions["vis"][0].cpu()
    confidence = predictions["conf"][0].cpu()

    visconf = visibility * confidence
    return tracks, visconf > 0.1, visconf


def create_visualization(video, tracks, visibilities, rate=8, fps=30, show_bkg=True):
    """Create the visualization video.

    Args:
        video: Video array [T, H, W, C]
        tracks: Tracks [T, H, W, 2]
        visibilities: Visibility mask [T, H, W]
        rate: Subsampling rate for points
        fps: Output video fps (unused here; the caller passes it to write_video)
        show_bkg: Whether to show the background

    Returns:
        Painted video frames [T, H, W, C]
    """
    T, H, W, _ = video.shape

    # Flatten the dense per-pixel tracks
    tracks_np = tracks.permute(1, 2, 0, 3).reshape(-1, T, 2).numpy()  # [HW, T, 2]
    vis_np = visibilities.permute(1, 2, 0).reshape(-1, T).numpy()  # [HW, T]

    # Subsample
    tracks_sub = tracks_np.reshape(H, W, T, 2)[::rate, ::rate].reshape(-1, T, 2)
    vis_sub = vis_np.reshape(H, W, T)[::rate, ::rate].reshape(-1, T)

    # Paint tracks
    painted_video = paint_point_track(
        video, tracks_sub, vis_sub, rate=rate, show_bkg=show_bkg
    )

    return painted_video


def main():
    parser = argparse.ArgumentParser(description="CoWTracker Inference Demo")
    parser.add_argument("--video", type=str, required=True, help="Path to input video")
    parser.add_argument("--output", type=str, default=None, help="Path to output video")
    parser.add_argument(
        "--checkpoint",
        type=str,
        default=None,
        help="Path to model checkpoint",
    )
    parser.add_argument(
        "--rate", type=int, default=8, help="Subsampling rate for visualization"
    )
    parser.add_argument(
        "--max_frames", type=int, default=200, help="Maximum number of frames"
    )
    parser.add_argument("--no_bkg", action="store_true", help="Hide background in visualization")
    args = parser.parse_args()

    # Set the output path
    if args.output is None:
        base_name = os.path.splitext(os.path.basename(args.video))[0]
        args.output = f"{base_name}_tracked.mp4"

    print("=" * 60)
    print("CoWTracker Inference Demo")
    print("=" * 60)

    # Load the model
    print("\n[1/4] Loading model...")
    model = CoWTracker.from_checkpoint(
        args.checkpoint,
        device="cuda" if torch.cuda.is_available() else "cpu",
        dtype=inf_dtype if torch.cuda.is_available() else torch.float32,
    )

    # Load the video
    print("\n[2/4] Loading video...")
    video, fps = preprocess_video(args.video, max_frames=args.max_frames)
    print(f"Video shape: {video.shape}, FPS: {fps}")

    # Run inference
    print("\n[3/4] Running inference...")
    tracks, visibilities, confidences = run_inference(model, video)
    print(f"Tracks shape: {tracks.shape}")

    # Create the visualization
    print("\n[4/4] Creating visualization...")
    painted_video = create_visualization(
        video, tracks, visibilities, rate=args.rate, fps=fps, show_bkg=not args.no_bkg
    )

    # Save the output
    mediapy.write_video(args.output, painted_video, fps=fps)
    print(f"\nSaved output to: {args.output}")
    print("=" * 60)


if __name__ == "__main__":
    main()
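
The same four stages can be driven programmatically instead of through the CLI. A minimal sketch, assuming the functions above are importable from demo.py and that from_checkpoint handles a None path the way demo.py's --checkpoint default implies (the sample video comes from the videos/ directory added below):

import mediapy
import torch
from demo import preprocess_video, run_inference, create_visualization
from cowtracker import CoWTracker

model = CoWTracker.from_checkpoint(
    None,  # mirrors demo.py's --checkpoint default
    device="cuda" if torch.cuda.is_available() else "cpu",
    dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
)
video, fps = preprocess_video("videos/cows.mp4", max_frames=100)
tracks, visibilities, confidences = run_inference(model, video)  # visibilities = visconf > 0.1
painted = create_visualization(video, tracks, visibilities, rate=8, fps=fps)
mediapy.write_video("cows_tracked.mp4", painted, fps=fps)
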
docs/logo.jpg
ADDED
Git LFS Details
docs/teaser.jpg
ADDED
Git LFS Details
environments.yml
ADDED
@@ -0,0 +1,36 @@
name: cowtracker
channels:
  - pytorch
  - nvidia
  - conda-forge
  - defaults
dependencies:
  - python=3.12
  - pip
  - pip:
      # Core deep learning
      - torch>=2.0.0
      - torchvision>=0.15.0
      - xformers
      - timm

      # Numerical / scientific
      - numpy
      - scipy
      - einops

      # Image / video processing
      - opencv-python
      - Pillow
      - mediapy
      - matplotlib

      # Model hub
      - huggingface_hub

      # Gradio demo
      - gradio

      # Optional: development tools
      - ipython
      - ipdb
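
This conda spec largely mirrors requirements.txt (below) but pins Python 3.12, keeps opencv-python rather than the headless build, and adds optional development tools. Running conda env create -f environments.yml followed by conda activate cowtracker should reproduce it locally.
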
output.mp4
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:989c1e01c2f3eb2e55387ba74c78497d3aba3805c63dc0eeb7f581754465bf7c
size 2550238
packages.txt
ADDED
@@ -0,0 +1,2 @@
ffmpeg
requirements.txt
ADDED
@@ -0,0 +1,26 @@
# Core deep learning
torch>=2.0.0
torchvision>=0.15.0
xformers==0.0.33.post1
timm

# Numerical / scientific
numpy
scipy
einops

# Image / video processing
opencv-python-headless
Pillow
mediapy
matplotlib

# Model hub
huggingface_hub

# Gradio demo
gradio>=4.0.0

# HuggingFace Spaces (ZeroGPU support)
spaces
videos/apple.mp4
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c7f48c5cfb1479e1dbc1df2373d5cad4f55c198bbdb379da0ece10087971542a
size 1219872
videos/bear.mp4
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:eeffab1780be601b19b2097be81a8c2d4fa2b624ac1028be0a32191d25acca0f
size 893943
videos/bmx-bumps.mp4
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b4d4aa73e0342d8dc08c4a7e3c9ea10e46507d363f0363f3e38bfb3ececa1588
size 3094667
videos/cows.mp4
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b4ca9ba3b3f720142917dc20935b03c9bbdc629e55502955526daccec567170d
size 5282840
videos/lab-coat.mp4
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:cf43f05f1a011e6cf376d4dc17b18b249613fa5e082df9e3af941fa012c34c9e
size 1850114
videos/longboard.mp4
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:13c977306e09a3c0766952497b7e51d5accfb5b15bdcb5ac32a1c1afc7893f67
size 2879038
videos/motocross-jump.mp4
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f757c903177154b7eceb4b9fe5386bb38cba20ef4da8645cbfc450e0ac39ffef
size 1343986