Buckets:

hf-doc-build/doc-dev / computer-vision-course /pr_397 /en /unit8 /monocular_depth_estimation.html
rtrm's picture
download
raw
145 kB
<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Metric and Relative Monocular Depth Estimation: An Overview. Fine-Tuning Depth Anything V2 👐 📚&quot;,&quot;local&quot;:&quot;metric-and-relative-monocular-depth-estimation-an-overview-fine-tuning-depth-anything-v2--&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Evolution of Models&quot;,&quot;local&quot;:&quot;evolution-of-models&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;The Basics&quot;,&quot;local&quot;:&quot;the-basics&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Relative and Absolute (aka Metric) Depth Estimation: What’s the Point?&quot;,&quot;local&quot;:&quot;relative-and-absolute-aka-metric-depth-estimation-whats-the-point&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Scale and Shift Invariant Loss 😎&quot;,&quot;local&quot;:&quot;scale-and-shift-invariant-loss-&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Metrics&quot;,&quot;local&quot;:&quot;metrics&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Important Considerations&quot;,&quot;local&quot;:&quot;important-considerations&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Four Methods of Calculating Metrics&quot;,&quot;local&quot;:&quot;four-methods-of-calculating-metrics&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Depth Anything V2 Absolute Depth Estimation Fine-Tuning&quot;,&quot;local&quot;:&quot;depth-anything-v2-absolute-depth-estimation-fine-tuning&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Key Ideas Behind Depth Anything V2&quot;,&quot;local&quot;:&quot;key-ideas-behind-depth-anything-v2&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Getting Started with Fine-Tuning&quot;,&quot;local&quot;:&quot;getting-started-with-fine-tuning&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Step 1: Clone the Repository and Download Pre-trained Weights&quot;,&quot;local&quot;:&quot;step-1-clone-the-repository-and-download-pre-trained-weights&quot;,&quot;sections&quot;:[],&quot;depth&quot;:4},{&quot;title&quot;:&quot;Step 2: Import Required Modules&quot;,&quot;local&quot;:&quot;step-2-import-required-modules&quot;,&quot;sections&quot;:[],&quot;depth&quot;:4},{&quot;title&quot;:&quot;Step 3: Get All File Paths for Training and Validation&quot;,&quot;local&quot;:&quot;step-3-get-all-file-paths-for-training-and-validation&quot;,&quot;sections&quot;:[],&quot;depth&quot;:4},{&quot;title&quot;:&quot;Step 4: Define the PyTorch Dataset&quot;,&quot;local&quot;:&quot;step-4-define-the-pytorch-dataset&quot;,&quot;sections&quot;:[],&quot;depth&quot;:4},{&quot;title&quot;:&quot;Step 5: Data Visualization&quot;,&quot;local&quot;:&quot;step-5-data-visualization&quot;,&quot;sections&quot;:[],&quot;depth&quot;:4},{&quot;title&quot;:&quot;Step 6: Prepare Dataloaders&quot;,&quot;local&quot;:&quot;step-6-prepare-dataloaders&quot;,&quot;sections&quot;:[],&quot;depth&quot;:4},{&quot;title&quot;:&quot;Step 7: Metric Evaluation&quot;,&quot;local&quot;:&quot;step-7-metric-evaluation&quot;,&quot;sections&quot;:[],&quot;depth&quot;:4},{&quot;title&quot;:&quot;Step 8: Define Hyperparameters&quot;,&quot;local&quot;:&quot;step-8-define-hyperparameters&quot;,&quot;sections&quot;:[],&quot;depth&quot;:4},{&quot;title&quot;:&quot;Step 9: Training Function&quot;,&quot;local&quot;:&quot;step-9-training-function&quot;,&quot;sections&quot;:[],&quot;depth&quot;:4},{&quot;title&quot;:&quot;Step 10: Launch the Training&quot;,&quot;local&quot;:&quot;step-10-launch-the-training&quot;,&quot;sections&quot;:[],&quot;depth&quot;:4}],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;References&quot;,&quot;local&quot;:&quot;references&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}">
<link href="/docs/computer-vision-course/pr_397/en/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload">
<link rel="modulepreload" href="/docs/computer-vision-course/pr_397/en/_app/immutable/entry/start.7f209408.js">
<link rel="modulepreload" href="/docs/computer-vision-course/pr_397/en/_app/immutable/chunks/scheduler.7bc62968.js">
<link rel="modulepreload" href="/docs/computer-vision-course/pr_397/en/_app/immutable/chunks/singletons.b15acae1.js">
<link rel="modulepreload" href="/docs/computer-vision-course/pr_397/en/_app/immutable/chunks/paths.11cdc4b4.js">
<link rel="modulepreload" href="/docs/computer-vision-course/pr_397/en/_app/immutable/entry/app.32e8338e.js">
<link rel="modulepreload" href="/docs/computer-vision-course/pr_397/en/_app/immutable/chunks/index.2f8492b0.js">
<link rel="modulepreload" href="/docs/computer-vision-course/pr_397/en/_app/immutable/nodes/0.e37092e8.js">
<link rel="modulepreload" href="/docs/computer-vision-course/pr_397/en/_app/immutable/nodes/85.69b59d0a.js">
<link rel="modulepreload" href="/docs/computer-vision-course/pr_397/en/_app/immutable/chunks/CodeBlock.bb61a5a9.js">
<link rel="modulepreload" href="/docs/computer-vision-course/pr_397/en/_app/immutable/chunks/index.514d62da.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Metric and Relative Monocular Depth Estimation: An Overview. Fine-Tuning Depth Anything V2 👐 📚&quot;,&quot;local&quot;:&quot;metric-and-relative-monocular-depth-estimation-an-overview-fine-tuning-depth-anything-v2--&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Evolution of Models&quot;,&quot;local&quot;:&quot;evolution-of-models&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;The Basics&quot;,&quot;local&quot;:&quot;the-basics&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Relative and Absolute (aka Metric) Depth Estimation: What’s the Point?&quot;,&quot;local&quot;:&quot;relative-and-absolute-aka-metric-depth-estimation-whats-the-point&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Scale and Shift Invariant Loss 😎&quot;,&quot;local&quot;:&quot;scale-and-shift-invariant-loss-&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Metrics&quot;,&quot;local&quot;:&quot;metrics&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Important Considerations&quot;,&quot;local&quot;:&quot;important-considerations&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Four Methods of Calculating Metrics&quot;,&quot;local&quot;:&quot;four-methods-of-calculating-metrics&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Depth Anything V2 Absolute Depth Estimation Fine-Tuning&quot;,&quot;local&quot;:&quot;depth-anything-v2-absolute-depth-estimation-fine-tuning&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Key Ideas Behind Depth Anything V2&quot;,&quot;local&quot;:&quot;key-ideas-behind-depth-anything-v2&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Getting Started with Fine-Tuning&quot;,&quot;local&quot;:&quot;getting-started-with-fine-tuning&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Step 1: Clone the Repository and Download Pre-trained Weights&quot;,&quot;local&quot;:&quot;step-1-clone-the-repository-and-download-pre-trained-weights&quot;,&quot;sections&quot;:[],&quot;depth&quot;:4},{&quot;title&quot;:&quot;Step 2: Import Required Modules&quot;,&quot;local&quot;:&quot;step-2-import-required-modules&quot;,&quot;sections&quot;:[],&quot;depth&quot;:4},{&quot;title&quot;:&quot;Step 3: Get All File Paths for Training and Validation&quot;,&quot;local&quot;:&quot;step-3-get-all-file-paths-for-training-and-validation&quot;,&quot;sections&quot;:[],&quot;depth&quot;:4},{&quot;title&quot;:&quot;Step 4: Define the PyTorch Dataset&quot;,&quot;local&quot;:&quot;step-4-define-the-pytorch-dataset&quot;,&quot;sections&quot;:[],&quot;depth&quot;:4},{&quot;title&quot;:&quot;Step 5: Data Visualization&quot;,&quot;local&quot;:&quot;step-5-data-visualization&quot;,&quot;sections&quot;:[],&quot;depth&quot;:4},{&quot;title&quot;:&quot;Step 6: Prepare Dataloaders&quot;,&quot;local&quot;:&quot;step-6-prepare-dataloaders&quot;,&quot;sections&quot;:[],&quot;depth&quot;:4},{&quot;title&quot;:&quot;Step 7: Metric Evaluation&quot;,&quot;local&quot;:&quot;step-7-metric-evaluation&quot;,&quot;sections&quot;:[],&quot;depth&quot;:4},{&quot;title&quot;:&quot;Step 8: Define Hyperparameters&quot;,&quot;local&quot;:&quot;step-8-define-hyperparameters&quot;,&quot;sections&quot;:[],&quot;depth&quot;:4},{&quot;title&quot;:&quot;Step 9: Training Function&quot;,&quot;local&quot;:&quot;step-9-training-function&quot;,&quot;sections&quot;:[],&quot;depth&quot;:4},{&quot;title&quot;:&quot;Step 10: Launch the Training&quot;,&quot;local&quot;:&quot;step-10-launch-the-training&quot;,&quot;sections&quot;:[],&quot;depth&quot;:4}],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;References&quot;,&quot;local&quot;:&quot;references&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <h1 class="relative group"><a id="metric-and-relative-monocular-depth-estimation-an-overview-fine-tuning-depth-anything-v2--" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#metric-and-relative-monocular-depth-estimation-an-overview-fine-tuning-depth-anything-v2--"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Metric and Relative Monocular Depth Estimation: An Overview. Fine-Tuning Depth Anything V2 👐 📚</span></h1> <h2 class="relative group"><a id="evolution-of-models" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#evolution-of-models"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Evolution of Models</span></h2> <p data-svelte-h="svelte-142xz5k">Over the past decade, monocular depth estimation models have undergone remarkable advancements. Let’s take a visual journey through this evolution.</p> <p data-svelte-h="svelte-uwse9e">We started with basic models like this:</p> <p data-svelte-h="svelte-q9mjbd"><img src="https://huggingface.co/datasets/hf-vision/course-assets/resolve/main/Metric%20and%20Relative%20Monocular%20Depth%20Estimation%20An%20Overview.%20Fine-Tuning%20Depth%20Anything%20V2/depth_estimation_evolution1.png" alt="image/png">
Progressed to more sophisticated models:</p> <p data-svelte-h="svelte-1cbidz2"><img src="https://huggingface.co/datasets/hf-vision/course-assets/resolve/main/Metric%20and%20Relative%20Monocular%20Depth%20Estimation%20An%20Overview.%20Fine-Tuning%20Depth%20Anything%20V2/depth_estimation_evolution2.png" alt="image/png"></p> <p data-svelte-h="svelte-g72asz">And now, we have the state-of-the-art model, Depth Anything V2:</p> <p data-svelte-h="svelte-1o5jpkb"><img src="https://huggingface.co/datasets/hf-vision/course-assets/resolve/main/Metric%20and%20Relative%20Monocular%20Depth%20Estimation%20An%20Overview.%20Fine-Tuning%20Depth%20Anything%20V2/depth_estimation_evolution3.png" alt="image/png"></p> <p data-svelte-h="svelte-jjef7a">Impressive, isn’t it?</p> <p data-svelte-h="svelte-s8bzno">Today, we will demystify how these models work and simplify complex concepts. Moreover, we will fine-tune our own model using a custom dataset. ”<em>But wait,</em>” you might ask, ”<em>why would we need to fine-tune a model on our own dataset when the latest model performs so well in any environment?</em></p> <p data-svelte-h="svelte-1m44jav">This is where the nuances and specifics come into play, which is precisely the focus of this article. If you’re eager to explore the intricacies of monocular depth estimation, keep reading.</p> <h2 class="relative group"><a id="the-basics" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#the-basics"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>The Basics</span></h2> <p data-svelte-h="svelte-952prg"><em>Okay, what exactly is depth?</em>” Typically, it’s a single-channel image where each pixel represents the distance from the camera or sensor to a point in space corresponding to that pixel. However, it turns out that these distances can be absolute or relative — what a twist!</p> <ul data-svelte-h="svelte-vnboup"><li><strong>Absolute Depth</strong>: Each pixel value directly corresponds to a physical distance (e.g., in meters or centimeters).</li> <li><strong>Relative Depth</strong>: The pixel values indicate which points are closer or further away without referencing real-world units of measurement. Often the relative depth is inverted, i.e. the smaller the number, the farther the point is.</li></ul> <p data-svelte-h="svelte-1ar7mrx">We’ll explore these concepts in more detail a bit later.</p> <p data-svelte-h="svelte-1pbkc55"><em>Well, but what does monocular mean?</em>” It simply means that we need to estimate depth using just a single photo. What’s so challenging about that? Take a look at this:</p> <p data-svelte-h="svelte-8j5iul"><img src="https://huggingface.co/datasets/hf-vision/course-assets/resolve/main/Metric%20and%20Relative%20Monocular%20Depth%20Estimation%20An%20Overview.%20Fine-Tuning%20Depth%20Anything%20V2/depth_ambiguity1.gif" alt="image/gif"></p> <p data-svelte-h="svelte-43yuhw"><img src="https://huggingface.co/datasets/hf-vision/course-assets/resolve/main/Metric%20and%20Relative%20Monocular%20Depth%20Estimation%20An%20Overview.%20Fine-Tuning%20Depth%20Anything%20V2/depth_ambiguity2.gif" alt="image/gif"></p> <p data-svelte-h="svelte-munlfj">As you can see, projecting a 3D space onto a 2D plane can create ambiguity due to perspective. To address this, there are precise mathematical methods for depth estimation using multiple images, such as Stereo Vision, Structure from Motion, and the broader field of Photogrammetry. Additionally, technologies like laser scanners (e.g., LiDAR) can be used for depth measurement.</p> <p data-svelte-h="svelte-awra7p"><img src="https://huggingface.co/datasets/hf-vision/course-assets/resolve/main/Metric%20and%20Relative%20Monocular%20Depth%20Estimation%20An%20Overview.%20Fine-Tuning%20Depth%20Anything%20V2/stereo_vision_sfm.png" alt="image/png"></p> <h2 class="relative group"><a id="relative-and-absolute-aka-metric-depth-estimation-whats-the-point" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#relative-and-absolute-aka-metric-depth-estimation-whats-the-point"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Relative and Absolute (aka Metric) Depth Estimation: What’s the Point?</span></h2> <p data-svelte-h="svelte-14adj62">Let’s explore some challenges that highlight the necessity of relative depth estimation. And to be more scientific, let’s refer to some papers.</p> <blockquote data-svelte-h="svelte-6em976"><p>The advantage of predicting metric depth is the practical utility for many downstream applications in computer vision and robotics, such as mapping, planning, navigation, object recognition, 3D reconstruction, and image editing. However, training a single metric depth estimation model across multiple datasets often deteriorates the performance, especially when the collection includes images with large differences in depth scale, e.g. indoor and outdoor images. As a result, current MDE models usually overfit to specific datasets and do not generalize well to other datasets.</p></blockquote> <p data-svelte-h="svelte-14l3avz">Typically, the architecture for this image-to-image task is an encoder-decoder model, like a U-Net, with various modifications. Formally, this is a pixel-wise regression problem. Imagine how challenging it is for a neural network to accurately predict distances for each pixel, ranging from a few meters to several hundred meters. <br> This brings us to the idea of moving away from a universal model that predicts exact distances in all scenarios. Instead, let’s develop a model that approximately (relatively) predicts depth, capturing the shape and structure of the scene by indicating which objects are farther and which are closer relative to each other and to us. If precise distances are needed, we can fine-tune this relative model on a specific dataset, leveraging its existing understanding of the task.</p> <p data-svelte-h="svelte-1o5wdpe"><img src="https://huggingface.co/datasets/hf-vision/course-assets/resolve/main/Metric%20and%20Relative%20Monocular%20Depth%20Estimation%20An%20Overview.%20Fine-Tuning%20Depth%20Anything%20V2/different_scales.png" alt="image/png"></p> <p data-svelte-h="svelte-ppcwfx">There are even more details we have to pay attention to.</p> <blockquote data-svelte-h="svelte-1b3u3ow"><p>The model not only has to handle images taken with different cameras and camera settings but also has to learn to adjust for the large variations in the overall scale of the scenes.</p></blockquote> <p data-svelte-h="svelte-wf4ua1">Apart from different scales, as we mentioned earlier, a significant problem lies in the cameras themselves, which can have vastly different perspectives of the world.</p> <p data-svelte-h="svelte-ikewdx"><img src="https://huggingface.co/datasets/hf-vision/course-assets/resolve/main/Metric%20and%20Relative%20Monocular%20Depth%20Estimation%20An%20Overview.%20Fine-Tuning%20Depth%20Anything%20V2/focal_lenght.png" alt="image/png"></p> <p data-svelte-h="svelte-1cl1emg">Notice how changes in focal length dramatically alter the perception of background distances!</p> <p data-svelte-h="svelte-18djyrd">Lastly, many datasets lack absolute depth maps altogether and only have relative ones (for instance, due to the lack of camera calibration). Additionally, each method of obtaining depth has its own advantages, disadvantages, biases, and problems.</p> <p data-svelte-h="svelte-108g7gr"><img src="https://huggingface.co/datasets/hf-vision/course-assets/resolve/main/Metric%20and%20Relative%20Monocular%20Depth%20Estimation%20An%20Overview.%20Fine-Tuning%20Depth%20Anything%20V2/metrics1.png" alt="image/png"></p> <blockquote data-svelte-h="svelte-1kebj4j"><p>We identify three major challenges. 1) Inherently different representations of depth: direct vs. inverse depth representations. 2) Scale ambiguity: for some data sources, depth is only given up to an unknown scale. 3) Shift ambiguity: some datasets provide disparity only up to an unknown scale and global disparity shift that is a function of the unknown baseline and a horizontal shift of the principal points due to post-processing</p></blockquote> <p data-svelte-h="svelte-1ksy5lu"><em>Disparity refers to the difference in the apparent position of an object when viewed from two different viewpoints, commonly used in stereo vision to estimate depth.</em></p> <p data-svelte-h="svelte-1cf2sqp">In short, I hope I’ve convinced you that you can’t just take scattered depth maps from the internet and train a model with them using some pixel-wise MSE.</p> <p data-svelte-h="svelte-q9h8lq">But how do we equalize all these variations? How can we abstract as much as possible from the differences and extract commonalities from all these datasets — namely, the shape and structure of the scene, the proportional relationships between objects, indicating what is closer and what is farther away?</p> <h2 class="relative group"><a id="scale-and-shift-invariant-loss-" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#scale-and-shift-invariant-loss-"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Scale and Shift Invariant Loss 😎</span></h2> <p data-svelte-h="svelte-1j8tvr">Simply put, we need to perform some sort of normalization on all the depth maps we want to train on and evaluate metrics with. Here is an idea: we want to create a loss function that doesn’t consider the scale of the environment or the various shifts. The remaining task is to translate this idea into mathematical terms.</p> <blockquote><p>Concretely, the depth value is first transformed into the disparity space by<!-- HTML_TAG_START --><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mi>d</mi><mo>=</mo><mfrac><mn>1</mn><mi>t</mi></mfrac></mrow><annotation encoding="application/x-tex"> d = \frac{1}{t} </annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.6944em;"></span><span class="mord mathnormal">d</span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">=</span><span class="mspace" style="margin-right:0.2778em;"></span></span><span class="base"><span class="strut" style="height:1.1901em;vertical-align:-0.345em;"></span><span class="mord"><span class="mopen nulldelimiter"></span><span class="mfrac"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.8451em;"><span style="top:-2.655em;"><span class="pstrut" style="height:3em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">t</span></span></span></span><span style="top:-3.23em;"><span class="pstrut" style="height:3em;"></span><span class="frac-line" style="border-bottom-width:0.04em;"></span></span><span style="top:-3.394em;"><span class="pstrut" style="height:3em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mtight">1</span></span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.345em;"><span></span></span></span></span></span><span class="mclose nulldelimiter"></span></span></span></span></span><!-- HTML_TAG_END --> and then normalized to<!-- HTML_TAG_START --><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mn>0</mn><mo></mo><mn>1</mn></mrow><annotation encoding="application/x-tex"> 0 \sim 1 </annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.6444em;"></span><span class="mord">0</span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel"></span><span class="mspace" style="margin-right:0.2778em;"></span></span><span class="base"><span class="strut" style="height:0.6444em;"></span><span class="mord">1</span></span></span></span><!-- HTML_TAG_END --> on each depth map. To enable multi-dataset joint training, we adopt the affine-invariant loss to ignore the unknown scale and shift of each sample:<!-- HTML_TAG_START --><span class="katex-display"><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML" display="block"><semantics><mrow><msub><mi mathvariant="script">L</mi><mn>1</mn></msub><mo>=</mo><mfrac><mn>1</mn><mrow><mi>H</mi><mi>W</mi></mrow></mfrac><munderover><mo></mo><mrow><mi>i</mi><mo>=</mo><mn>1</mn></mrow><mrow><mi>H</mi><mi>W</mi></mrow></munderover><mi>ρ</mi><mo stretchy="false">(</mo><msubsup><mi>d</mi><mi>i</mi><mo></mo></msubsup><mo separator="true">,</mo><msub><mi>d</mi><mi>i</mi></msub><mo stretchy="false">)</mo><mo separator="true">,</mo></mrow><annotation encoding="application/x-tex">\mathcal{L}_1 = \frac{1}{HW} \sum_{i=1}^{HW} \rho(d_i^*, d_i),</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.8333em;vertical-align:-0.15em;"></span><span class="mord"><span class="mord mathcal">L</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3011em;"><span style="top:-2.55em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight">1</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">=</span><span class="mspace" style="margin-right:0.2778em;"></span></span><span class="base"><span class="strut" style="height:3.106em;vertical-align:-1.2777em;"></span><span class="mord"><span class="mopen nulldelimiter"></span><span class="mfrac"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:1.3214em;"><span style="top:-2.314em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord mathnormal" style="margin-right:0.08125em;">H</span><span class="mord mathnormal" style="margin-right:0.13889em;">W</span></span></span><span style="top:-3.23em;"><span class="pstrut" style="height:3em;"></span><span class="frac-line" style="border-bottom-width:0.04em;"></span></span><span style="top:-3.677em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord">1</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.686em;"><span></span></span></span></span></span><span class="mclose nulldelimiter"></span></span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mop op-limits"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:1.8283em;"><span style="top:-1.8723em;margin-left:0em;"><span class="pstrut" style="height:3.05em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">i</span><span class="mrel mtight">=</span><span class="mord mtight">1</span></span></span></span><span style="top:-3.05em;"><span class="pstrut" style="height:3.05em;"></span><span><span class="mop op-symbol large-op"></span></span></span><span style="top:-4.3em;margin-left:0em;"><span class="pstrut" style="height:3.05em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight" style="margin-right:0.08125em;">H</span><span class="mord mathnormal mtight" style="margin-right:0.13889em;">W</span></span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:1.2777em;"><span></span></span></span></span></span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord mathnormal">ρ</span><span class="mopen">(</span><span class="mord"><span class="mord mathnormal">d</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.7387em;"><span style="top:-2.453em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathnormal mtight">i</span></span></span><span style="top:-3.113em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mbin mtight"></span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.247em;"><span></span></span></span></span></span></span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord"><span class="mord mathnormal">d</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3117em;"><span style="top:-2.55em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathnormal mtight">i</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mclose">)</span><span class="mpunct">,</span></span></span></span></span><!-- HTML_TAG_END -->
where<!-- HTML_TAG_START --><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><msubsup><mi>d</mi><mi>i</mi><mo></mo></msubsup></mrow><annotation encoding="application/x-tex"> d_i^* </annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.9531em;vertical-align:-0.2587em;"></span><span class="mord"><span class="mord mathnormal">d</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.6887em;"><span style="top:-2.4413em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathnormal mtight">i</span></span></span><span style="top:-3.063em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mbin mtight"></span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.2587em;"><span></span></span></span></span></span></span></span></span></span><!-- HTML_TAG_END --> and<!-- HTML_TAG_START --><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><msub><mi>d</mi><mi>i</mi></msub></mrow><annotation encoding="application/x-tex"> d_i </annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.8444em;vertical-align:-0.15em;"></span><span class="mord"><span class="mord mathnormal">d</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3117em;"><span style="top:-2.55em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathnormal mtight">i</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span></span></span></span><!-- HTML_TAG_END --> are the prediction and ground truth, respectively. And<!-- HTML_TAG_START --><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mi>ρ</mi></mrow><annotation encoding="application/x-tex"> \rho </annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.625em;vertical-align:-0.1944em;"></span><span class="mord mathnormal">ρ</span></span></span></span><!-- HTML_TAG_END --> is the affine-invariant mean absolute error loss:<!-- HTML_TAG_START --><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mi>ρ</mi><mo stretchy="false">(</mo><msubsup><mi>d</mi><mi>i</mi><mo></mo></msubsup><mo separator="true">,</mo><msub><mi>d</mi><mi>i</mi></msub><mo stretchy="false">)</mo><mo>=</mo><mrow><mo fence="true"></mo><msubsup><mover accent="true"><mi>d</mi><mo>^</mo></mover><mi>i</mi><mo></mo></msubsup><mo></mo><msub><mover accent="true"><mi>d</mi><mo>^</mo></mover><mi>i</mi></msub><mo fence="true"></mo></mrow></mrow><annotation encoding="application/x-tex"> \rho(d_i^*, d_i) = \left| \hat{d}_i^* - \hat{d}_i \right| </annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:1.0087em;vertical-align:-0.2587em;"></span><span class="mord mathnormal">ρ</span><span class="mopen">(</span><span class="mord"><span class="mord mathnormal">d</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.6887em;"><span style="top:-2.4413em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathnormal mtight">i</span></span></span><span style="top:-3.063em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mbin mtight"></span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.2587em;"><span></span></span></span></span></span></span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord"><span class="mord mathnormal">d</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3117em;"><span style="top:-2.55em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathnormal mtight">i</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mclose">)</span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">=</span><span class="mspace" style="margin-right:0.2778em;"></span></span><span class="base"><span class="strut" style="height:1.8em;vertical-align:-0.65em;"></span><span class="minner"><span class="mopen"><span class="delimsizing mult"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:1.15em;"><span style="top:-3.15em;"><span class="pstrut" style="height:3.8em;"></span><span style="width:0.333em;height:1.800em;"><svg xmlns="http://www.w3.org/2000/svg" width='0.333em' height='1.800em' viewBox='0 0 333 1800'><path d='M145 15 v585 v600 v585 c2.667,10,9.667,15,21,15
c10,0,16.667,-5,20,-15 v-585 v-600 v-585 c-2.667,-10,-9.667,-15,-21,-15
c-10,0,-16.667,5,-20,15z M188 15 H145 v585 v600 v585 h43z'/></svg></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.65em;"><span></span></span></span></span></span></span><span class="mord"><span class="mord accent"><span class="vlist-t"><span class="vlist-r"><span class="vlist" style="height:0.9579em;"><span style="top:-3em;"><span class="pstrut" style="height:3em;"></span><span class="mord mathnormal">d</span></span><span style="top:-3.2634em;"><span class="pstrut" style="height:3em;"></span><span class="accent-body" style="left:-0.0833em;"><span class="mord">^</span></span></span></span></span></span></span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.6887em;"><span style="top:-2.4413em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathnormal mtight">i</span></span></span><span style="top:-3.063em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mbin mtight"></span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.2587em;"><span></span></span></span></span></span></span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mbin"></span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mord"><span class="mord accent"><span class="vlist-t"><span class="vlist-r"><span class="vlist" style="height:0.9579em;"><span style="top:-3em;"><span class="pstrut" style="height:3em;"></span><span class="mord mathnormal">d</span></span><span style="top:-3.2634em;"><span class="pstrut" style="height:3em;"></span><span class="accent-body" style="left:-0.0833em;"><span class="mord">^</span></span></span></span></span></span></span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3117em;"><span style="top:-2.55em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathnormal mtight">i</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mclose"><span class="delimsizing mult"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:1.15em;"><span style="top:-3.15em;"><span class="pstrut" style="height:3.8em;"></span><span style="width:0.333em;height:1.800em;"><svg xmlns="http://www.w3.org/2000/svg" width='0.333em' height='1.800em' viewBox='0 0 333 1800'><path d='M145 15 v585 v600 v585 c2.667,10,9.667,15,21,15
c10,0,16.667,-5,20,-15 v-585 v-600 v-585 c-2.667,-10,-9.667,-15,-21,-15
c-10,0,-16.667,5,-20,15z M188 15 H145 v585 v600 v585 h43z'/></svg></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.65em;"><span></span></span></span></span></span></span></span></span></span></span><!-- HTML_TAG_END -->, where<!-- HTML_TAG_START --><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><msubsup><mover accent="true"><mi>d</mi><mo>^</mo></mover><mi>i</mi><mo></mo></msubsup></mrow><annotation encoding="application/x-tex"> \hat{d}_i^* </annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:1.2165em;vertical-align:-0.2587em;"></span><span class="mord"><span class="mord accent"><span class="vlist-t"><span class="vlist-r"><span class="vlist" style="height:0.9579em;"><span style="top:-3em;"><span class="pstrut" style="height:3em;"></span><span class="mord mathnormal">d</span></span><span style="top:-3.2634em;"><span class="pstrut" style="height:3em;"></span><span class="accent-body" style="left:-0.0833em;"><span class="mord">^</span></span></span></span></span></span></span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.6887em;"><span style="top:-2.4413em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathnormal mtight">i</span></span></span><span style="top:-3.063em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mbin mtight"></span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.2587em;"><span></span></span></span></span></span></span></span></span></span><!-- HTML_TAG_END --> and<!-- HTML_TAG_START --><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><msub><mover accent="true"><mi>d</mi><mo>^</mo></mover><mi>i</mi></msub></mrow><annotation encoding="application/x-tex"> \hat{d}_i </annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:1.1079em;vertical-align:-0.15em;"></span><span class="mord"><span class="mord accent"><span class="vlist-t"><span class="vlist-r"><span class="vlist" style="height:0.9579em;"><span style="top:-3em;"><span class="pstrut" style="height:3em;"></span><span class="mord mathnormal">d</span></span><span style="top:-3.2634em;"><span class="pstrut" style="height:3em;"></span><span class="accent-body" style="left:-0.0833em;"><span class="mord">^</span></span></span></span></span></span></span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3117em;"><span style="top:-2.55em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathnormal mtight">i</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span></span></span></span><!-- HTML_TAG_END --> are the scaled and shifted versions of the prediction<!-- HTML_TAG_START --><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><msubsup><mi>d</mi><mi>i</mi><mo></mo></msubsup></mrow><annotation encoding="application/x-tex"> d_i^* </annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.9531em;vertical-align:-0.2587em;"></span><span class="mord"><span class="mord mathnormal">d</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.6887em;"><span style="top:-2.4413em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathnormal mtight">i</span></span></span><span style="top:-3.063em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mbin mtight"></span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.2587em;"><span></span></span></span></span></span></span></span></span></span><!-- HTML_TAG_END --> and ground truth<!-- HTML_TAG_START --><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><msub><mi>d</mi><mi>i</mi></msub></mrow><annotation encoding="application/x-tex"> d_i </annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.8444em;vertical-align:-0.15em;"></span><span class="mord"><span class="mord mathnormal">d</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3117em;"><span style="top:-2.55em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathnormal mtight">i</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span></span></span></span><!-- HTML_TAG_END -->:<!-- HTML_TAG_START --><span class="katex-display"><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML" display="block"><semantics><mrow><msub><mover accent="true"><mi>d</mi><mo>^</mo></mover><mi>i</mi></msub><mo>=</mo><mfrac><mrow><msub><mi>d</mi><mi>i</mi></msub><mo></mo><mi>t</mi><mo stretchy="false">(</mo><mi>d</mi><mo stretchy="false">)</mo></mrow><mrow><mi>s</mi><mo stretchy="false">(</mo><mi>d</mi><mo stretchy="false">)</mo></mrow></mfrac><mo separator="true">,</mo></mrow><annotation encoding="application/x-tex">\hat{d}_i = \frac{d_i - t(d)}{s(d)},</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:1.1079em;vertical-align:-0.15em;"></span><span class="mord"><span class="mord accent"><span class="vlist-t"><span class="vlist-r"><span class="vlist" style="height:0.9579em;"><span style="top:-3em;"><span class="pstrut" style="height:3em;"></span><span class="mord mathnormal">d</span></span><span style="top:-3.2634em;"><span class="pstrut" style="height:3em;"></span><span class="accent-body" style="left:-0.0833em;"><span class="mord">^</span></span></span></span></span></span></span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3117em;"><span style="top:-2.55em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathnormal mtight">i</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">=</span><span class="mspace" style="margin-right:0.2778em;"></span></span><span class="base"><span class="strut" style="height:2.363em;vertical-align:-0.936em;"></span><span class="mord"><span class="mopen nulldelimiter"></span><span class="mfrac"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:1.427em;"><span style="top:-2.314em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord mathnormal">s</span><span class="mopen">(</span><span class="mord mathnormal">d</span><span class="mclose">)</span></span></span><span style="top:-3.23em;"><span class="pstrut" style="height:3em;"></span><span class="frac-line" style="border-bottom-width:0.04em;"></span></span><span style="top:-3.677em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord"><span class="mord mathnormal">d</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3117em;"><span style="top:-2.55em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathnormal mtight">i</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mbin"></span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mord mathnormal">t</span><span class="mopen">(</span><span class="mord mathnormal">d</span><span class="mclose">)</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.936em;"><span></span></span></span></span></span><span class="mclose nulldelimiter"></span></span><span class="mpunct">,</span></span></span></span></span><!-- HTML_TAG_END -->
where<!-- HTML_TAG_START --><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mi>t</mi><mo stretchy="false">(</mo><mi>d</mi><mo stretchy="false">)</mo></mrow><annotation encoding="application/x-tex"> t(d) </annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:1em;vertical-align:-0.25em;"></span><span class="mord mathnormal">t</span><span class="mopen">(</span><span class="mord mathnormal">d</span><span class="mclose">)</span></span></span></span><!-- HTML_TAG_END --> and<!-- HTML_TAG_START --><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mi>s</mi><mo stretchy="false">(</mo><mi>d</mi><mo stretchy="false">)</mo></mrow><annotation encoding="application/x-tex"> s(d) </annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:1em;vertical-align:-0.25em;"></span><span class="mord mathnormal">s</span><span class="mopen">(</span><span class="mord mathnormal">d</span><span class="mclose">)</span></span></span></span><!-- HTML_TAG_END --> are used to align the prediction and ground truth to have zero translation and unit scale:<!-- HTML_TAG_START --><span class="katex-display"><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML" display="block"><semantics><mrow><mi>t</mi><mo stretchy="false">(</mo><mi>d</mi><mo stretchy="false">)</mo><mo>=</mo><mrow><mi mathvariant="normal">m</mi><mi mathvariant="normal">e</mi><mi mathvariant="normal">d</mi><mi mathvariant="normal">i</mi><mi mathvariant="normal">a</mi><mi mathvariant="normal">n</mi></mrow><mo stretchy="false">(</mo><mi>d</mi><mo stretchy="false">)</mo><mo separator="true">,</mo><mspace width="1em"/><mi>s</mi><mo stretchy="false">(</mo><mi>d</mi><mo stretchy="false">)</mo><mo>=</mo><mfrac><mn>1</mn><mrow><mi>H</mi><mi>W</mi></mrow></mfrac><munderover><mo></mo><mrow><mi>i</mi><mo>=</mo><mn>1</mn></mrow><mrow><mi>H</mi><mi>W</mi></mrow></munderover><mrow><mo fence="true"></mo><msub><mi>d</mi><mi>i</mi></msub><mo></mo><mi>t</mi><mo stretchy="false">(</mo><mi>d</mi><mo stretchy="false">)</mo><mo fence="true"></mo></mrow><mi mathvariant="normal">.</mi></mrow><annotation encoding="application/x-tex">t(d) = \mathrm{median}(d), \quad s(d) = \frac{1}{HW} \sum_{i=1}^{HW} \left| d_i - t(d) \right|.</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:1em;vertical-align:-0.25em;"></span><span class="mord mathnormal">t</span><span class="mopen">(</span><span class="mord mathnormal">d</span><span class="mclose">)</span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">=</span><span class="mspace" style="margin-right:0.2778em;"></span></span><span class="base"><span class="strut" style="height:1em;vertical-align:-0.25em;"></span><span class="mord"><span class="mord mathrm">median</span></span><span class="mopen">(</span><span class="mord mathnormal">d</span><span class="mclose">)</span><span class="mpunct">,</span><span class="mspace" style="margin-right:1em;"></span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord mathnormal">s</span><span class="mopen">(</span><span class="mord mathnormal">d</span><span class="mclose">)</span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">=</span><span class="mspace" style="margin-right:0.2778em;"></span></span><span class="base"><span class="strut" style="height:3.106em;vertical-align:-1.2777em;"></span><span class="mord"><span class="mopen nulldelimiter"></span><span class="mfrac"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:1.3214em;"><span style="top:-2.314em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord mathnormal" style="margin-right:0.08125em;">H</span><span class="mord mathnormal" style="margin-right:0.13889em;">W</span></span></span><span style="top:-3.23em;"><span class="pstrut" style="height:3em;"></span><span class="frac-line" style="border-bottom-width:0.04em;"></span></span><span style="top:-3.677em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord">1</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.686em;"><span></span></span></span></span></span><span class="mclose nulldelimiter"></span></span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mop op-limits"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:1.8283em;"><span style="top:-1.8723em;margin-left:0em;"><span class="pstrut" style="height:3.05em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">i</span><span class="mrel mtight">=</span><span class="mord mtight">1</span></span></span></span><span style="top:-3.05em;"><span class="pstrut" style="height:3.05em;"></span><span><span class="mop op-symbol large-op"></span></span></span><span style="top:-4.3em;margin-left:0em;"><span class="pstrut" style="height:3.05em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight" style="margin-right:0.08125em;">H</span><span class="mord mathnormal mtight" style="margin-right:0.13889em;">W</span></span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:1.2777em;"><span></span></span></span></span></span><span class="mspace" style="margin-right:0.1667em;"></span><span class="minner"><span class="mopen delimcenter" style="top:0em;"></span><span class="mord"><span class="mord mathnormal">d</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3117em;"><span style="top:-2.55em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathnormal mtight">i</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mbin"></span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mord mathnormal">t</span><span class="mopen">(</span><span class="mord mathnormal">d</span><span class="mclose">)</span><span class="mclose delimcenter" style="top:0em;"></span></span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord">.</span></span></span></span></span><!-- HTML_TAG_END --></p></blockquote> <p data-svelte-h="svelte-mwszoe">In fact, there are many other methods and functions that help eliminate scale and shifts. There are also different additions to loss functions, such as gradient loss, which focuses not on the pixel values themselves but on how quickly they change (hence the name — gradient). You can read more about this in the <a href="https://arxiv.org/abs/1907.01341" rel="nofollow">MiDaS</a> paper, i’ll include a list of useful literature at the end. Let’s briefly discuss metrics before moving on to the most exciting part — fine-tuning on absolute depth using a custom dataset.</p> <h2 class="relative group"><a id="metrics" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#metrics"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Metrics</span></h2> <p data-svelte-h="svelte-67ww49">In depth estimation, several standard metrics are used to evaluate performance, including MAE (Mean Absolute Error), RMSE (Root Mean Square Error), and their logarithmic variations to smooth out large gaps in distance. Additionally, consider the following:</p> <ul><li><strong data-svelte-h="svelte-1jwrow9">Absolute Relative Error (AbsRel)</strong>: This metric is similar to MAE but expressed in percentage terms, measuring how much the predicted distances differ from the true ones on average in percentage terms. <br><!-- HTML_TAG_START --><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mtext>AbsRel</mtext><mo>=</mo><mfrac><mn>1</mn><mi>N</mi></mfrac><msubsup><mo></mo><mrow><mi>i</mi><mo>=</mo><mn>1</mn></mrow><mi>N</mi></msubsup><mfrac><mrow><mi mathvariant="normal"></mi><msub><mi>d</mi><mi>i</mi></msub><mo></mo><msub><mover accent="true"><mi>d</mi><mo>^</mo></mover><mi>i</mi></msub><mi mathvariant="normal"></mi></mrow><msub><mi>d</mi><mi>i</mi></msub></mfrac></mrow><annotation encoding="application/x-tex">\text{AbsRel} = \frac{1}{N} \sum_{i=1}^{N} \frac{|d_i - \hat{d}_i|}{d_i}</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.6944em;"></span><span class="mord text"><span class="mord">AbsRel</span></span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">=</span><span class="mspace" style="margin-right:0.2778em;"></span></span><span class="base"><span class="strut" style="height:1.6006em;vertical-align:-0.4451em;"></span><span class="mord"><span class="mopen nulldelimiter"></span><span class="mfrac"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.8451em;"><span style="top:-2.655em;"><span class="pstrut" style="height:3em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight" style="margin-right:0.10903em;">N</span></span></span></span><span style="top:-3.23em;"><span class="pstrut" style="height:3em;"></span><span class="frac-line" style="border-bottom-width:0.04em;"></span></span><span style="top:-3.394em;"><span class="pstrut" style="height:3em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mtight">1</span></span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.345em;"><span></span></span></span></span></span><span class="mclose nulldelimiter"></span></span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mop"><span class="mop op-symbol small-op" style="position:relative;top:0em;"></span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.9812em;"><span style="top:-2.4003em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">i</span><span class="mrel mtight">=</span><span class="mord mtight">1</span></span></span></span><span style="top:-3.2029em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight" style="margin-right:0.10903em;">N</span></span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.2997em;"><span></span></span></span></span></span></span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord"><span class="mopen nulldelimiter"></span><span class="mfrac"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:1.1555em;"><span style="top:-2.655em;"><span class="pstrut" style="height:3em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mtight"><span class="mord mathnormal mtight">d</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3281em;"><span style="top:-2.357em;margin-left:0em;margin-right:0.0714em;"><span class="pstrut" style="height:2.5em;"></span><span class="sizing reset-size3 size1 mtight"><span class="mord mathnormal mtight">i</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.143em;"><span></span></span></span></span></span></span></span></span></span><span style="top:-3.23em;"><span class="pstrut" style="height:3em;"></span><span class="frac-line" style="border-bottom-width:0.04em;"></span></span><span style="top:-3.485em;"><span class="pstrut" style="height:3em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mtight"></span><span class="mord mtight"><span class="mord mathnormal mtight">d</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3281em;"><span style="top:-2.357em;margin-left:0em;margin-right:0.0714em;"><span class="pstrut" style="height:2.5em;"></span><span class="sizing reset-size3 size1 mtight"><span class="mord mathnormal mtight">i</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.143em;"><span></span></span></span></span></span></span><span class="mbin mtight"></span><span class="mord mtight"><span class="mord accent mtight"><span class="vlist-t"><span class="vlist-r"><span class="vlist" style="height:0.9579em;"><span style="top:-2.7em;"><span class="pstrut" style="height:2.7em;"></span><span class="mord mathnormal mtight">d</span></span><span style="top:-2.9634em;"><span class="pstrut" style="height:2.7em;"></span><span class="accent-body" style="left:-0.0833em;"><span class="mord mtight">^</span></span></span></span></span></span></span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3281em;"><span style="top:-2.357em;margin-left:0em;margin-right:0.0714em;"><span class="pstrut" style="height:2.5em;"></span><span class="sizing reset-size3 size1 mtight"><span class="mord mathnormal mtight">i</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.143em;"><span></span></span></span></span></span></span><span class="mord mtight"></span></span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.4451em;"><span></span></span></span></span></span><span class="mclose nulldelimiter"></span></span></span></span></span><!-- HTML_TAG_END --></li> <li><strong>Threshold Accuracy (<!-- HTML_TAG_START --><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><msub><mi>δ</mi><mn>1</mn></msub></mrow><annotation encoding="application/x-tex">\delta_1</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.8444em;vertical-align:-0.15em;"></span><span class="mord"><span class="mord mathnormal" style="margin-right:0.03785em;">δ</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3011em;"><span style="top:-2.55em;margin-left:-0.0379em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight">1</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span></span></span></span><!-- HTML_TAG_END -->)</strong>: This measures the percentage of predicted pixels that differ from the true pixels by no more than 25%. <br><!-- HTML_TAG_START --><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><msub><mi>δ</mi><mn>1</mn></msub><mo>=</mo><mtext> proportion of predicted depths where </mtext><mi>max</mi><mo></mo><mrow><mo fence="true">(</mo><mfrac><msub><mi>d</mi><mi>i</mi></msub><msub><mover accent="true"><mi>d</mi><mo>^</mo></mover><mi>i</mi></msub></mfrac><mo separator="true">,</mo><mfrac><msub><mover accent="true"><mi>d</mi><mo>^</mo></mover><mi>i</mi></msub><msub><mi>d</mi><mi>i</mi></msub></mfrac><mo fence="true">)</mo></mrow><mo>&lt;</mo><mn>1.25</mn></mrow><annotation encoding="application/x-tex">\delta_1 = \text{ proportion of predicted depths where } \max\left(\frac{d_i}{\hat{d}_i}, \frac{\hat{d}_i}{d_i}\right) &lt; 1.25</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.8444em;vertical-align:-0.15em;"></span><span class="mord"><span class="mord mathnormal" style="margin-right:0.03785em;">δ</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3011em;"><span style="top:-2.55em;margin-left:-0.0379em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight">1</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">=</span><span class="mspace" style="margin-right:0.2778em;"></span></span><span class="base"><span class="strut" style="height:1.8em;vertical-align:-0.65em;"></span><span class="mord text"><span class="mord"> proportion of predicted depths where </span></span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mop">max</span><span class="mspace" style="margin-right:0.1667em;"></span><span class="minner"><span class="mopen delimcenter" style="top:0em;"><span class="delimsizing size2">(</span></span><span class="mord"><span class="mopen nulldelimiter"></span><span class="mfrac"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.8962em;"><span style="top:-2.5195em;"><span class="pstrut" style="height:3em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mtight"><span class="mord accent mtight"><span class="vlist-t"><span class="vlist-r"><span class="vlist" style="height:0.9579em;"><span style="top:-2.7em;"><span class="pstrut" style="height:2.7em;"></span><span class="mord mathnormal mtight">d</span></span><span style="top:-2.9634em;"><span class="pstrut" style="height:2.7em;"></span><span class="accent-body" style="left:-0.0833em;"><span class="mord mtight">^</span></span></span></span></span></span></span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3281em;"><span style="top:-2.357em;margin-left:0em;margin-right:0.0714em;"><span class="pstrut" style="height:2.5em;"></span><span class="sizing reset-size3 size1 mtight"><span class="mord mathnormal mtight">i</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.143em;"><span></span></span></span></span></span></span></span></span></span><span style="top:-3.23em;"><span class="pstrut" style="height:3em;"></span><span class="frac-line" style="border-bottom-width:0.04em;"></span></span><span style="top:-3.4101em;"><span class="pstrut" style="height:3em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mtight"><span class="mord mathnormal mtight">d</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3281em;"><span style="top:-2.357em;margin-left:0em;margin-right:0.0714em;"><span class="pstrut" style="height:2.5em;"></span><span class="sizing reset-size3 size1 mtight"><span class="mord mathnormal mtight">i</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.143em;"><span></span></span></span></span></span></span></span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.5806em;"><span></span></span></span></span></span><span class="mclose nulldelimiter"></span></span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord"><span class="mopen nulldelimiter"></span><span class="mfrac"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:1.0806em;"><span style="top:-2.655em;"><span class="pstrut" style="height:3em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mtight"><span class="mord mathnormal mtight">d</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3281em;"><span style="top:-2.357em;margin-left:0em;margin-right:0.0714em;"><span class="pstrut" style="height:2.5em;"></span><span class="sizing reset-size3 size1 mtight"><span class="mord mathnormal mtight">i</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.143em;"><span></span></span></span></span></span></span></span></span></span><span style="top:-3.23em;"><span class="pstrut" style="height:3em;"></span><span class="frac-line" style="border-bottom-width:0.04em;"></span></span><span style="top:-3.4101em;"><span class="pstrut" style="height:3em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mtight"><span class="mord accent mtight"><span class="vlist-t"><span class="vlist-r"><span class="vlist" style="height:0.9579em;"><span style="top:-2.7em;"><span class="pstrut" style="height:2.7em;"></span><span class="mord mathnormal mtight">d</span></span><span style="top:-2.9634em;"><span class="pstrut" style="height:2.7em;"></span><span class="accent-body" style="left:-0.0833em;"><span class="mord mtight">^</span></span></span></span></span></span></span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3281em;"><span style="top:-2.357em;margin-left:0em;margin-right:0.0714em;"><span class="pstrut" style="height:2.5em;"></span><span class="sizing reset-size3 size1 mtight"><span class="mord mathnormal mtight">i</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.143em;"><span></span></span></span></span></span></span></span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.4451em;"><span></span></span></span></span></span><span class="mclose nulldelimiter"></span></span><span class="mclose delimcenter" style="top:0em;"><span class="delimsizing size2">)</span></span></span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">&lt;</span><span class="mspace" style="margin-right:0.2778em;"></span></span><span class="base"><span class="strut" style="height:0.6444em;"></span><span class="mord">1.25</span></span></span></span><!-- HTML_TAG_END --></li></ul> <h3 class="relative group"><a id="important-considerations" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#important-considerations"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Important Considerations</span></h3> <blockquote data-svelte-h="svelte-hlxn52"><p>For all our models and baselines, we align predictions and ground truth in scale and shift for each image before measuring errors.</p></blockquote> <p data-svelte-h="svelte-pt44s1">Indeed, if we are training to predict relative depth but want to measure quality on a dataset with absolute values, and we are not interested in fine-tuning on this dataset or in the absolute values, we can, similar to the loss function, exclude scale and shift from the calculations and standardize everything to a unified measure.</p> <h3 class="relative group"><a id="four-methods-of-calculating-metrics" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#four-methods-of-calculating-metrics"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Four Methods of Calculating Metrics</span></h3> <p data-svelte-h="svelte-ygtdws">Understanding these methods helps avoid confusion when analyzing metrics in papers:</p> <ol data-svelte-h="svelte-susvm9"><li><strong>Zero-shot Relative Depth Estimation</strong><ul><li>Train to predict relative depth on one set of datasets and measure quality on others. Since the depth is relative, significantly different scales are not a concern, and metrics on other datasets usually remain high, similar to the test sets of the training datasets.</li></ul></li></ol> <p data-svelte-h="svelte-1bd5b2y"><img src="https://huggingface.co/datasets/hf-vision/course-assets/resolve/main/Metric%20and%20Relative%20Monocular%20Depth%20Estimation%20An%20Overview.%20Fine-Tuning%20Depth%20Anything%20V2/metrics2.png" alt="image/png"></p> <ol start="2" data-svelte-h="svelte-1c3uys9"><li><strong>Zero-shot Absolute Depth Estimation</strong><ul><li>Train a universal relative model, then fine-tune it on a good dataset for predicting absolute depth, and measure the quality of absolute depth predictions on a different dataset. Metrics in this case tend to be worse compared to the previous method, highlighting the challenge of predicting absolute depth well across different environments.</li></ul></li></ol> <p data-svelte-h="svelte-4cr0hd"><img src="https://huggingface.co/datasets/hf-vision/course-assets/resolve/main/Metric%20and%20Relative%20Monocular%20Depth%20Estimation%20An%20Overview.%20Fine-Tuning%20Depth%20Anything%20V2/metrics3.png" alt="image/png"></p> <ol start="3" data-svelte-h="svelte-1x8hiyz"><li><strong>Fine-tuned (In-domain) Absolute Depth Estimation</strong><ul><li>Similar to the previous method, but now measure quality on the test set of the dataset used for fine-tuning absolute depth prediction. This is one of the most practical approaches.</li></ul></li></ol> <p data-svelte-h="svelte-1v44rns"><img src="https://huggingface.co/datasets/hf-vision/course-assets/resolve/main/Metric%20and%20Relative%20Monocular%20Depth%20Estimation%20An%20Overview.%20Fine-Tuning%20Depth%20Anything%20V2/metrics4.png" alt="image/png"></p> <ol start="4" data-svelte-h="svelte-ud0pes"><li><strong>Fine-tuned (In-domain) Relative Depth Estimation</strong><ul><li>Train to predict relative depth and measure quality on the test set of the training datasets. This might not be the most precise name, but the idea is straightforward.</li></ul></li></ol> <h2 class="relative group"><a id="depth-anything-v2-absolute-depth-estimation-fine-tuning" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#depth-anything-v2-absolute-depth-estimation-fine-tuning"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Depth Anything V2 Absolute Depth Estimation Fine-Tuning</span></h2> <p data-svelte-h="svelte-1qoukxz">In this section, we will reproduce the results from the Depth Anything V2 paper by fine-tuning the model to predict absolute depth on the NYU-D dataset, aiming to achieve metrics similar to those shown in the last table from the previous section.</p> <h3 class="relative group"><a id="key-ideas-behind-depth-anything-v2" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#key-ideas-behind-depth-anything-v2"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Key Ideas Behind Depth Anything V2</span></h3> <p data-svelte-h="svelte-13canjn">Depth Anything V2 is a powerful model for depth estimation, achieving remarkable results due to several innovative concepts:</p> <ul data-svelte-h="svelte-1vc4uw9"><li><strong>Universal Training Method on Heterogeneous Data</strong>: This method, introduced in the MiDaS 2020 paper, enables robust training across various types of datasets.</li> <li><strong>DPT Architecture</strong>: The “Vision Transformers for Dense Prediction” paper presents this architecture, which is essentially a U-Net with a Vision Transformer (ViT) encoder and several modifications.</li></ul> <p data-svelte-h="svelte-c926y3"><img src="https://huggingface.co/datasets/hf-vision/course-assets/resolve/main/Metric%20and%20Relative%20Monocular%20Depth%20Estimation%20An%20Overview.%20Fine-Tuning%20Depth%20Anything%20V2/dpt.png" alt="image/png"></p> <ul data-svelte-h="svelte-rq0o1t"><li><strong>DINOv2 Encoder</strong>: This standard ViT, pre-trained using a self-supervised method on a massive dataset, serves as a powerful and versatile feature extractor. In recent years, CV researchers have aimed to create foundation models similar to GPT and BERT in NLP, and DINOv2 is a significant step in that direction.</li> <li><strong>Use of Synthetic Data</strong>: The training pipeline is very well described in the image below. This approach allowed the authors to achieve such clarity and accuracy in the depth maps. After all, if you think about it, the labels obtained from synthetic data are truly “ground truth.”</li></ul> <p data-svelte-h="svelte-1t0xzi5"><img src="https://huggingface.co/datasets/hf-vision/course-assets/resolve/main/Metric%20and%20Relative%20Monocular%20Depth%20Estimation%20An%20Overview.%20Fine-Tuning%20Depth%20Anything%20V2/DA2_pipeline.png" alt="image/png"></p> <h3 class="relative group"><a id="getting-started-with-fine-tuning" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#getting-started-with-fine-tuning"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Getting Started with Fine-Tuning</span></h3> <p data-svelte-h="svelte-2u1d73">Now, let’s dive into the code. If you don’t have access to a powerful GPU, I highly recommend using Kaggle over Colab. Kaggle offers several advantages:</p> <ul data-svelte-h="svelte-q6rxwd"><li>Up to 30 hours of GPU usage per week</li> <li>No connection interruptions</li> <li>Very fast and convenient access to datasets</li> <li>The ability to use two GPUs simultaneously in one of the configurations, which will help you practice distributed training</li></ul> <p data-svelte-h="svelte-qgwn6r">You can jump straight into the code using this <a href="https://www.kaggle.com/code/amanattheedge/depth-anything-v2-metric-fine-tunning-on-nyu/notebook" rel="nofollow">notebook on Kaggle</a>.</p> <p data-svelte-h="svelte-134kezh">We’ll go through everything in detail here. To start, let’s download all the necessary modules from the authors’ repository and the checkpoint of the smallest model with the ViT-S encoder.</p> <h4 class="relative group"><a id="step-1-clone-the-repository-and-download-pre-trained-weights" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#step-1-clone-the-repository-and-download-pre-trained-weights"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Step 1: Clone the Repository and Download Pre-trained Weights</span></h4> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->!git <span class="hljs-built_in">clone</span> https://github.com/DepthAnything/Depth-Anything-V2
!wget -O depth_anything_v2_vits.pth https://huggingface.co/depth-anything/Depth-Anything-V2-Small/resolve/main/depth_anything_v2_vits.pth?download=<span class="hljs-literal">true</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-yvhtch">You can also download the dataset <a href="http://datasets.lids.mit.edu/fastdepth/data/" rel="nofollow">here</a></p> <h4 class="relative group"><a id="step-2-import-required-modules" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#step-2-import-required-modules"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Step 2: Import Required Modules</span></h4> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> numpy <span class="hljs-keyword">as</span> np
<span class="hljs-keyword">import</span> matplotlib.pyplot <span class="hljs-keyword">as</span> plt
<span class="hljs-keyword">import</span> os
<span class="hljs-keyword">from</span> tqdm <span class="hljs-keyword">import</span> tqdm
<span class="hljs-keyword">import</span> cv2
<span class="hljs-keyword">import</span> random
<span class="hljs-keyword">import</span> h5py
<span class="hljs-keyword">import</span> sys
sys.path.append(<span class="hljs-string">&quot;/kaggle/working/Depth-Anything-V2/metric_depth&quot;</span>)
<span class="hljs-keyword">from</span> accelerate <span class="hljs-keyword">import</span> Accelerator
<span class="hljs-keyword">from</span> accelerate.utils <span class="hljs-keyword">import</span> set_seed
<span class="hljs-keyword">from</span> accelerate <span class="hljs-keyword">import</span> notebook_launcher
<span class="hljs-keyword">from</span> accelerate <span class="hljs-keyword">import</span> DistributedDataParallelKwargs
<span class="hljs-keyword">import</span> transformers
<span class="hljs-keyword">import</span> torch
<span class="hljs-keyword">import</span> torchvision
<span class="hljs-keyword">from</span> torchvision.transforms <span class="hljs-keyword">import</span> v2
<span class="hljs-keyword">from</span> torchvision.transforms <span class="hljs-keyword">import</span> Compose
<span class="hljs-keyword">import</span> torch.nn.functional <span class="hljs-keyword">as</span> F
<span class="hljs-keyword">import</span> albumentations <span class="hljs-keyword">as</span> A
<span class="hljs-keyword">from</span> depth_anything_v2.dpt <span class="hljs-keyword">import</span> DepthAnythingV2
<span class="hljs-keyword">from</span> util.loss <span class="hljs-keyword">import</span> SiLogLoss
<span class="hljs-keyword">from</span> dataset.transform <span class="hljs-keyword">import</span> Resize, NormalizeImage, PrepareForNet, Crop<!-- HTML_TAG_END --></pre></div> <h4 class="relative group"><a id="step-3-get-all-file-paths-for-training-and-validation" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#step-3-get-all-file-paths-for-training-and-validation"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Step 3: Get All File Paths for Training and Validation</span></h4> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">def</span> <span class="hljs-title function_">get_all_files</span>(<span class="hljs-params">directory</span>):
all_files = []
<span class="hljs-keyword">for</span> root, dirs, files <span class="hljs-keyword">in</span> os.walk(directory):
<span class="hljs-keyword">for</span> file <span class="hljs-keyword">in</span> files:
all_files.append(os.path.join(root, file))
<span class="hljs-keyword">return</span> all_files
train_paths = get_all_files(<span class="hljs-string">&quot;/kaggle/input/nyu-depth-dataset-v2/nyudepthv2/train&quot;</span>)
val_paths = get_all_files(<span class="hljs-string">&quot;/kaggle/input/nyu-depth-dataset-v2/nyudepthv2/val&quot;</span>)<!-- HTML_TAG_END --></pre></div> <h4 class="relative group"><a id="step-4-define-the-pytorch-dataset" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#step-4-define-the-pytorch-dataset"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Step 4: Define the PyTorch Dataset</span></h4> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># NYU Depth V2 40k. Original NYU is 400k</span>
<span class="hljs-keyword">class</span> <span class="hljs-title class_">NYU</span>(torch.utils.data.Dataset):
<span class="hljs-keyword">def</span> <span class="hljs-title function_">__init__</span>(<span class="hljs-params">self, paths, mode, size=(<span class="hljs-params"><span class="hljs-number">518</span>, <span class="hljs-number">518</span></span>)</span>):
self.mode = mode <span class="hljs-comment"># train or val</span>
self.size = size
self.paths = paths
net_w, net_h = size
<span class="hljs-comment"># author&#x27;s transforms</span>
self.transform = Compose(
[
Resize(
width=net_w,
height=net_h,
resize_target=<span class="hljs-literal">True</span> <span class="hljs-keyword">if</span> mode == <span class="hljs-string">&quot;train&quot;</span> <span class="hljs-keyword">else</span> <span class="hljs-literal">False</span>,
keep_aspect_ratio=<span class="hljs-literal">True</span>,
ensure_multiple_of=<span class="hljs-number">14</span>,
resize_method=<span class="hljs-string">&quot;lower_bound&quot;</span>,
image_interpolation_method=cv2.INTER_CUBIC,
),
NormalizeImage(mean=[<span class="hljs-number">0.485</span>, <span class="hljs-number">0.456</span>, <span class="hljs-number">0.406</span>], std=[<span class="hljs-number">0.229</span>, <span class="hljs-number">0.224</span>, <span class="hljs-number">0.225</span>]),
PrepareForNet(),
]
+ ([Crop(size[<span class="hljs-number">0</span>])] <span class="hljs-keyword">if</span> self.mode == <span class="hljs-string">&quot;train&quot;</span> <span class="hljs-keyword">else</span> [])
)
<span class="hljs-comment"># only horizontal flip in the paper</span>
self.augs = A.Compose(
[
A.HorizontalFlip(),
A.ColorJitter(hue=<span class="hljs-number">0.1</span>, contrast=<span class="hljs-number">0.1</span>, brightness=<span class="hljs-number">0.1</span>, saturation=<span class="hljs-number">0.1</span>),
A.GaussNoise(var_limit=<span class="hljs-number">25</span>),
]
)
<span class="hljs-keyword">def</span> <span class="hljs-title function_">__getitem__</span>(<span class="hljs-params">self, item</span>):
path = self.paths[item]
image, depth = self.h5_loader(path)
<span class="hljs-keyword">if</span> self.mode == <span class="hljs-string">&quot;train&quot;</span>:
augmented = self.augs(image=image, mask=depth)
image = augmented[<span class="hljs-string">&quot;image&quot;</span>] / <span class="hljs-number">255.0</span>
depth = augmented[<span class="hljs-string">&quot;mask&quot;</span>]
<span class="hljs-keyword">else</span>:
image = image / <span class="hljs-number">255.0</span>
sample = self.transform({<span class="hljs-string">&quot;image&quot;</span>: image, <span class="hljs-string">&quot;depth&quot;</span>: depth})
sample[<span class="hljs-string">&quot;image&quot;</span>] = torch.from_numpy(sample[<span class="hljs-string">&quot;image&quot;</span>])
sample[<span class="hljs-string">&quot;depth&quot;</span>] = torch.from_numpy(sample[<span class="hljs-string">&quot;depth&quot;</span>])
<span class="hljs-comment"># sometimes there are masks for valid depths in datasets because of noise e.t.c</span>
<span class="hljs-comment"># sample[&#x27;valid_mask&#x27;] = ...</span>
<span class="hljs-keyword">return</span> sample
<span class="hljs-keyword">def</span> <span class="hljs-title function_">__len__</span>(<span class="hljs-params">self</span>):
<span class="hljs-keyword">return</span> <span class="hljs-built_in">len</span>(self.paths)
<span class="hljs-keyword">def</span> <span class="hljs-title function_">h5_loader</span>(<span class="hljs-params">self, path</span>):
h5f = h5py.File(path, <span class="hljs-string">&quot;r&quot;</span>)
rgb = np.array(h5f[<span class="hljs-string">&quot;rgb&quot;</span>])
rgb = np.transpose(rgb, (<span class="hljs-number">1</span>, <span class="hljs-number">2</span>, <span class="hljs-number">0</span>))
depth = np.array(h5f[<span class="hljs-string">&quot;depth&quot;</span>])
<span class="hljs-keyword">return</span> rgb, depth<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1lxosh">Here are a few points to note:</p> <ul data-svelte-h="svelte-1csofyi"><li>The original NYU-D dataset contains 407k samples, but we are using a subset of 40k. This will slightly impact the final model quality.</li> <li>The authors of the paper used only horizontal flips for data augmentation.</li> <li>Occasionally, some points in the depth maps may not be processed correctly, resulting in “bad pixels”. Some datasets include a mask that differentiates between valid and invalid pixels in addition to the image and depth map. This mask is necessary to exclude bad pixels from loss and metric calculations.</li></ul> <p data-svelte-h="svelte-qh24xo"><img src="https://huggingface.co/datasets/hf-vision/course-assets/resolve/main/Metric%20and%20Relative%20Monocular%20Depth%20Estimation%20An%20Overview.%20Fine-Tuning%20Depth%20Anything%20V2/depth_holes.png" alt="image/png"></p> <ul data-svelte-h="svelte-109cal8"><li>During training, we resize images so that the smaller side is 518 pixels and then crop them. For validation, we do not crop or resize the depth maps. Instead, we upsample the predicted depth maps and compute metrics at the original resolution.</li></ul> <h4 class="relative group"><a id="step-5-data-visualization" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#step-5-data-visualization"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Step 5: Data Visualization</span></h4> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->num_images = <span class="hljs-number">5</span>
fig, axes = plt.subplots(num_images, <span class="hljs-number">2</span>, figsize=(<span class="hljs-number">10</span>, <span class="hljs-number">5</span> * num_images))
train_set = NYU(train_paths, mode=<span class="hljs-string">&quot;train&quot;</span>)
<span class="hljs-keyword">for</span> i <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(num_images):
sample = train_set[i * <span class="hljs-number">1000</span>]
img, depth = sample[<span class="hljs-string">&quot;image&quot;</span>].numpy(), sample[<span class="hljs-string">&quot;depth&quot;</span>].numpy()
mean = np.array([<span class="hljs-number">0.485</span>, <span class="hljs-number">0.456</span>, <span class="hljs-number">0.406</span>]).reshape((<span class="hljs-number">3</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>))
std = np.array([<span class="hljs-number">0.229</span>, <span class="hljs-number">0.224</span>, <span class="hljs-number">0.225</span>]).reshape((<span class="hljs-number">3</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>))
img = img * std + mean
axes[i, <span class="hljs-number">0</span>].imshow(np.transpose(img, (<span class="hljs-number">1</span>, <span class="hljs-number">2</span>, <span class="hljs-number">0</span>)))
axes[i, <span class="hljs-number">0</span>].set_title(<span class="hljs-string">&quot;Image&quot;</span>)
axes[i, <span class="hljs-number">0</span>].axis(<span class="hljs-string">&quot;off&quot;</span>)
im1 = axes[i, <span class="hljs-number">1</span>].imshow(depth, cmap=<span class="hljs-string">&quot;viridis&quot;</span>, vmin=<span class="hljs-number">0</span>)
axes[i, <span class="hljs-number">1</span>].set_title(<span class="hljs-string">&quot;True Depth&quot;</span>)
axes[i, <span class="hljs-number">1</span>].axis(<span class="hljs-string">&quot;off&quot;</span>)
fig.colorbar(im1, ax=axes[i, <span class="hljs-number">1</span>])
plt.tight_layout()<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1qwdzlr"><img src="https://huggingface.co/datasets/hf-vision/course-assets/resolve/main/Metric%20and%20Relative%20Monocular%20Depth%20Estimation%20An%20Overview.%20Fine-Tuning%20Depth%20Anything%20V2/dataset.png" alt="image/png"></p> <p data-svelte-h="svelte-1vezx4k">As you can see, the images are very blurry and noisy. Because of this, we won’t be able to get fine-grained depth maps seen in the previews of Depth Anything V2. In the black hole artifacts, the depth is 0, and we will use this fact later for masking these holes. Also, the dataset contains many nearly identical photos of the same location.</p> <h4 class="relative group"><a id="step-6-prepare-dataloaders" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#step-6-prepare-dataloaders"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Step 6: Prepare Dataloaders</span></h4> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">def</span> <span class="hljs-title function_">get_dataloaders</span>(<span class="hljs-params">batch_size</span>):
train_dataset = NYU(train_paths, mode=<span class="hljs-string">&quot;train&quot;</span>)
val_dataset = NYU(val_paths, mode=<span class="hljs-string">&quot;val&quot;</span>)
train_dataloader = torch.utils.data.DataLoader(
train_dataset, batch_size=batch_size, shuffle=<span class="hljs-literal">True</span>, num_workers=<span class="hljs-number">4</span>, drop_last=<span class="hljs-literal">True</span>
)
val_dataloader = torch.utils.data.DataLoader(
val_dataset,
batch_size=<span class="hljs-number">1</span>, <span class="hljs-comment"># for dynamic resolution evaluations without padding</span>
shuffle=<span class="hljs-literal">False</span>,
num_workers=<span class="hljs-number">4</span>,
drop_last=<span class="hljs-literal">True</span>,
)
<span class="hljs-keyword">return</span> train_dataloader, val_dataloader<!-- HTML_TAG_END --></pre></div> <h4 class="relative group"><a id="step-7-metric-evaluation" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#step-7-metric-evaluation"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Step 7: Metric Evaluation</span></h4> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">def</span> <span class="hljs-title function_">eval_depth</span>(<span class="hljs-params">pred, target</span>):
<span class="hljs-keyword">assert</span> pred.shape == target.shape
thresh = torch.<span class="hljs-built_in">max</span>((target / pred), (pred / target))
d1 = torch.<span class="hljs-built_in">sum</span>(thresh &lt; <span class="hljs-number">1.25</span>).<span class="hljs-built_in">float</span>() / <span class="hljs-built_in">len</span>(thresh)
diff = pred - target
diff_log = torch.log(pred) - torch.log(target)
abs_rel = torch.mean(torch.<span class="hljs-built_in">abs</span>(diff) / target)
rmse = torch.sqrt(torch.mean(torch.<span class="hljs-built_in">pow</span>(diff, <span class="hljs-number">2</span>)))
mae = torch.mean(torch.<span class="hljs-built_in">abs</span>(diff))
silog = torch.sqrt(
torch.<span class="hljs-built_in">pow</span>(diff_log, <span class="hljs-number">2</span>).mean() - <span class="hljs-number">0.5</span> * torch.<span class="hljs-built_in">pow</span>(diff_log.mean(), <span class="hljs-number">2</span>)
)
<span class="hljs-keyword">return</span> {
<span class="hljs-string">&quot;d1&quot;</span>: d1.detach(),
<span class="hljs-string">&quot;abs_rel&quot;</span>: abs_rel.detach(),
<span class="hljs-string">&quot;rmse&quot;</span>: rmse.detach(),
<span class="hljs-string">&quot;mae&quot;</span>: mae.detach(),
<span class="hljs-string">&quot;silog&quot;</span>: silog.detach(),
}<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-qzvf1e">Our loss function is SiLog. It might seem that when training on absolute depth, we should forget about invariance to scale and other techniques used for relative depth training. However, it turns out this is not entirely true, and we often still want to use a kind of “scale regularization”, but to a lesser extent. The parameter λ=0.5 helps balance between global consistency and local accuracy.</p> <h4 class="relative group"><a id="step-8-define-hyperparameters" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#step-8-define-hyperparameters"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Step 8: Define Hyperparameters</span></h4> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->model_weights_path = <span class="hljs-string">&quot;/kaggle/working/depth_anything_v2_vits.pth&quot;</span>
model_configs = {
<span class="hljs-string">&quot;vits&quot;</span>: {<span class="hljs-string">&quot;encoder&quot;</span>: <span class="hljs-string">&quot;vits&quot;</span>, <span class="hljs-string">&quot;features&quot;</span>: <span class="hljs-number">64</span>, <span class="hljs-string">&quot;out_channels&quot;</span>: [<span class="hljs-number">48</span>, <span class="hljs-number">96</span>, <span class="hljs-number">192</span>, <span class="hljs-number">384</span>]},
<span class="hljs-string">&quot;vitb&quot;</span>: {<span class="hljs-string">&quot;encoder&quot;</span>: <span class="hljs-string">&quot;vitb&quot;</span>, <span class="hljs-string">&quot;features&quot;</span>: <span class="hljs-number">128</span>, <span class="hljs-string">&quot;out_channels&quot;</span>: [<span class="hljs-number">96</span>, <span class="hljs-number">192</span>, <span class="hljs-number">384</span>, <span class="hljs-number">768</span>]},
<span class="hljs-string">&quot;vitl&quot;</span>: {<span class="hljs-string">&quot;encoder&quot;</span>: <span class="hljs-string">&quot;vitl&quot;</span>, <span class="hljs-string">&quot;features&quot;</span>: <span class="hljs-number">256</span>, <span class="hljs-string">&quot;out_channels&quot;</span>: [<span class="hljs-number">256</span>, <span class="hljs-number">512</span>, <span class="hljs-number">1024</span>, <span class="hljs-number">1024</span>]},
<span class="hljs-string">&quot;vitg&quot;</span>: {
<span class="hljs-string">&quot;encoder&quot;</span>: <span class="hljs-string">&quot;vitg&quot;</span>,
<span class="hljs-string">&quot;features&quot;</span>: <span class="hljs-number">384</span>,
<span class="hljs-string">&quot;out_channels&quot;</span>: [<span class="hljs-number">1536</span>, <span class="hljs-number">1536</span>, <span class="hljs-number">1536</span>, <span class="hljs-number">1536</span>],
},
}
model_encoder = <span class="hljs-string">&quot;vits&quot;</span>
max_depth = <span class="hljs-number">10</span>
batch_size = <span class="hljs-number">11</span>
lr = <span class="hljs-number">5e-6</span>
weight_decay = <span class="hljs-number">0.01</span>
num_epochs = <span class="hljs-number">10</span>
warmup_epochs = <span class="hljs-number">0.5</span>
scheduler_rate = <span class="hljs-number">1</span>
load_state = <span class="hljs-literal">False</span>
state_path = <span class="hljs-string">&quot;/kaggle/working/cp&quot;</span>
save_model_path = <span class="hljs-string">&quot;/kaggle/working/model&quot;</span>
seed = <span class="hljs-number">42</span>
mixed_precision = <span class="hljs-string">&quot;fp16&quot;</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-ze4ya1">Pay attention to the parameter ”<strong>max_depth</strong>”. The last layer in our model is a sigmoid for each pixel, producing an output from 0 to 1. We simply multiply each pixel by ”<strong>max_depth</strong>” to represent distances from 0 to ”<strong>max_depth</strong>“.</p> <h4 class="relative group"><a id="step-9-training-function" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#step-9-training-function"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Step 9: Training Function</span></h4> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">def</span> <span class="hljs-title function_">train_fn</span>():
set_seed(seed)
ddp_kwargs = DistributedDataParallelKwargs(find_unused_parameters=<span class="hljs-literal">True</span>)
accelerator = Accelerator(
mixed_precision=mixed_precision,
kwargs_handlers=[ddp_kwargs],
)
<span class="hljs-comment"># in the paper they initialize decoder randomly and use only encoder pretrained weights. Then full fine-tune</span>
<span class="hljs-comment"># ViT-S encoder here</span>
model = DepthAnythingV2(**{**model_configs[model_encoder], <span class="hljs-string">&quot;max_depth&quot;</span>: max_depth})
model.load_state_dict(
{k: v <span class="hljs-keyword">for</span> k, v <span class="hljs-keyword">in</span> torch.load(model_weights_path).items() <span class="hljs-keyword">if</span> <span class="hljs-string">&quot;pretrained&quot;</span> <span class="hljs-keyword">in</span> k},
strict=<span class="hljs-literal">False</span>,
)
optim = torch.optim.AdamW(
[
{
<span class="hljs-string">&quot;params&quot;</span>: [
param
<span class="hljs-keyword">for</span> name, param <span class="hljs-keyword">in</span> model.named_parameters()
<span class="hljs-keyword">if</span> <span class="hljs-string">&quot;pretrained&quot;</span> <span class="hljs-keyword">in</span> name
],
<span class="hljs-string">&quot;lr&quot;</span>: lr,
},
{
<span class="hljs-string">&quot;params&quot;</span>: [
param
<span class="hljs-keyword">for</span> name, param <span class="hljs-keyword">in</span> model.named_parameters()
<span class="hljs-keyword">if</span> <span class="hljs-string">&quot;pretrained&quot;</span> <span class="hljs-keyword">not</span> <span class="hljs-keyword">in</span> name
],
<span class="hljs-string">&quot;lr&quot;</span>: lr * <span class="hljs-number">10</span>,
},
],
lr=lr,
weight_decay=weight_decay,
)
criterion = SiLogLoss() <span class="hljs-comment"># author&#x27;s loss</span>
train_dataloader, val_dataloader = get_dataloaders(batch_size)
scheduler = transformers.get_cosine_schedule_with_warmup(
optim,
<span class="hljs-built_in">len</span>(train_dataloader) * warmup_epochs,
num_epochs * scheduler_rate * <span class="hljs-built_in">len</span>(train_dataloader),
)
model, optim, train_dataloader, val_dataloader, scheduler = accelerator.prepare(
model, optim, train_dataloader, val_dataloader, scheduler
)
<span class="hljs-keyword">if</span> load_state:
accelerator.wait_for_everyone()
accelerator.load_state(state_path)
best_val_absrel = <span class="hljs-number">1000</span>
<span class="hljs-keyword">for</span> epoch <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(<span class="hljs-number">1</span>, num_epochs):
model.train()
train_loss = <span class="hljs-number">0</span>
<span class="hljs-keyword">for</span> sample <span class="hljs-keyword">in</span> tqdm(
train_dataloader, disable=<span class="hljs-keyword">not</span> accelerator.is_local_main_process
):
optim.zero_grad()
img, depth = sample[<span class="hljs-string">&quot;image&quot;</span>], sample[<span class="hljs-string">&quot;depth&quot;</span>]
pred = model(img)
<span class="hljs-comment"># mask</span>
loss = criterion(pred, depth, (depth &lt;= max_depth) &amp; (depth &gt;= <span class="hljs-number">0.001</span>))
accelerator.backward(loss)
optim.step()
scheduler.step()
train_loss += loss.detach()
train_loss /= <span class="hljs-built_in">len</span>(train_dataloader)
train_loss = accelerator.reduce(train_loss, reduction=<span class="hljs-string">&quot;mean&quot;</span>).item()
model.<span class="hljs-built_in">eval</span>()
results = {<span class="hljs-string">&quot;d1&quot;</span>: <span class="hljs-number">0</span>, <span class="hljs-string">&quot;abs_rel&quot;</span>: <span class="hljs-number">0</span>, <span class="hljs-string">&quot;rmse&quot;</span>: <span class="hljs-number">0</span>, <span class="hljs-string">&quot;mae&quot;</span>: <span class="hljs-number">0</span>, <span class="hljs-string">&quot;silog&quot;</span>: <span class="hljs-number">0</span>}
<span class="hljs-keyword">for</span> sample <span class="hljs-keyword">in</span> tqdm(val_dataloader, disable=<span class="hljs-keyword">not</span> accelerator.is_local_main_process):
img, depth = sample[<span class="hljs-string">&quot;image&quot;</span>].<span class="hljs-built_in">float</span>(), sample[<span class="hljs-string">&quot;depth&quot;</span>][<span class="hljs-number">0</span>]
<span class="hljs-keyword">with</span> torch.no_grad():
pred = model(img)
<span class="hljs-comment"># evaluate on the original resolution</span>
pred = F.interpolate(
pred[:, <span class="hljs-literal">None</span>], depth.shape[-<span class="hljs-number">2</span>:], mode=<span class="hljs-string">&quot;bilinear&quot;</span>, align_corners=<span class="hljs-literal">True</span>
)[<span class="hljs-number">0</span>, <span class="hljs-number">0</span>]
valid_mask = (depth &lt;= max_depth) &amp; (depth &gt;= <span class="hljs-number">0.001</span>)
cur_results = eval_depth(pred[valid_mask], depth[valid_mask])
<span class="hljs-keyword">for</span> k <span class="hljs-keyword">in</span> results.keys():
results[k] += cur_results[k]
<span class="hljs-keyword">for</span> k <span class="hljs-keyword">in</span> results.keys():
results[k] = results[k] / <span class="hljs-built_in">len</span>(val_dataloader)
results[k] = <span class="hljs-built_in">round</span>(accelerator.reduce(results[k], reduction=<span class="hljs-string">&quot;mean&quot;</span>).item(), <span class="hljs-number">3</span>)
accelerator.wait_for_everyone()
accelerator.save_state(state_path, safe_serialization=<span class="hljs-literal">False</span>)
<span class="hljs-keyword">if</span> results[<span class="hljs-string">&quot;abs_rel&quot;</span>] &lt; best_val_absrel:
best_val_absrel = results[<span class="hljs-string">&quot;abs_rel&quot;</span>]
unwrapped_model = accelerator.unwrap_model(model)
<span class="hljs-keyword">if</span> accelerator.is_local_main_process:
torch.save(unwrapped_model.state_dict(), save_model_path)
accelerator.<span class="hljs-built_in">print</span>(
<span class="hljs-string">f&quot;epoch_<span class="hljs-subst">{epoch}</span>, train_loss = <span class="hljs-subst">{train_loss:<span class="hljs-number">.5</span>f}</span>, val_metrics = <span class="hljs-subst">{results}</span>&quot;</span>
)
<span class="hljs-comment"># P.S. While testing one configuration, I encountered an error in which the loss turned into nan.</span>
<span class="hljs-comment"># This is fixed by adding a small epsilon to the predictions to prevent division by 0</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1k6513x">In the paper, the authors randomly initialize the decoder and use only the encoder weights. They then fine-tune the entire model. Other notable points include:</p> <ul data-svelte-h="svelte-1ms1z4v"><li>Using different learning rates for the decoder and encoder. The encoder’s learning rate is lower since we don’t want to significantly alter the already excellent weights, unlike the randomly initialized decoder.</li> <li>The authors used a polynomial scheduler in the paper, while I used a cosine scheduler with warmup because I like it.</li> <li>In the mask, as mentioned earlier, we avoid the black holes in the depth maps by using the condition ”<strong>depth &gt;= 0.001</strong></li> <li>During training cycles, we calculate the loss on resized depth maps. During validation, we upsample the predictions and compute metrics at the original resolution.</li> <li>And look how easily we can wrap custom PyTorch code for distributed computing using HF accelerate.</li></ul> <h4 class="relative group"><a id="step-10-launch-the-training" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#step-10-launch-the-training"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Step 10: Launch the Training</span></h4> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># You can run this code with 1 gpu. Just set num_processes=1</span>
notebook_launcher(train_fn, num_processes=<span class="hljs-number">2</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1x9rrcm">I believe we achieved our desired goal. The slight difference in performance can be attributed to the significant difference in dataset sizes (40k vs 400k). Keep in mind, we used the ViT-S encoder.</p> <p data-svelte-h="svelte-qy23g7"><img src="https://huggingface.co/datasets/hf-vision/course-assets/resolve/main/Metric%20and%20Relative%20Monocular%20Depth%20Estimation%20An%20Overview.%20Fine-Tuning%20Depth%20Anything%20V2/metrics5.png" alt="image/png"></p> <p data-svelte-h="svelte-17xid3k">Let’s show some results</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->model = DepthAnythingV2(**{**model_configs[model_encoder], <span class="hljs-string">&quot;max_depth&quot;</span>: max_depth}).to(
<span class="hljs-string">&quot;cuda&quot;</span>
)
model.load_state_dict(torch.load(save_model_path))
num_images = <span class="hljs-number">10</span>
fig, axes = plt.subplots(num_images, <span class="hljs-number">3</span>, figsize=(<span class="hljs-number">15</span>, <span class="hljs-number">5</span> * num_images))
val_dataset = NYU(val_paths, mode=<span class="hljs-string">&quot;val&quot;</span>)
model.<span class="hljs-built_in">eval</span>()
<span class="hljs-keyword">for</span> i <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(num_images):
sample = val_dataset[i]
img, depth = sample[<span class="hljs-string">&quot;image&quot;</span>], sample[<span class="hljs-string">&quot;depth&quot;</span>]
mean = torch.tensor([<span class="hljs-number">0.485</span>, <span class="hljs-number">0.456</span>, <span class="hljs-number">0.406</span>]).view(<span class="hljs-number">3</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>)
std = torch.tensor([<span class="hljs-number">0.229</span>, <span class="hljs-number">0.224</span>, <span class="hljs-number">0.225</span>]).view(<span class="hljs-number">3</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>)
<span class="hljs-keyword">with</span> torch.inference_mode():
pred = model(img.unsqueeze(<span class="hljs-number">0</span>).to(<span class="hljs-string">&quot;cuda&quot;</span>))
pred = F.interpolate(
pred[:, <span class="hljs-literal">None</span>], depth.shape[-<span class="hljs-number">2</span>:], mode=<span class="hljs-string">&quot;bilinear&quot;</span>, align_corners=<span class="hljs-literal">True</span>
)[<span class="hljs-number">0</span>, <span class="hljs-number">0</span>]
img = img * std + mean
axes[i, <span class="hljs-number">0</span>].imshow(img.permute(<span class="hljs-number">1</span>, <span class="hljs-number">2</span>, <span class="hljs-number">0</span>))
axes[i, <span class="hljs-number">0</span>].set_title(<span class="hljs-string">&quot;Image&quot;</span>)
axes[i, <span class="hljs-number">0</span>].axis(<span class="hljs-string">&quot;off&quot;</span>)
max_depth = <span class="hljs-built_in">max</span>(depth.<span class="hljs-built_in">max</span>(), pred.cpu().<span class="hljs-built_in">max</span>())
im1 = axes[i, <span class="hljs-number">1</span>].imshow(depth, cmap=<span class="hljs-string">&quot;viridis&quot;</span>, vmin=<span class="hljs-number">0</span>, vmax=max_depth)
axes[i, <span class="hljs-number">1</span>].set_title(<span class="hljs-string">&quot;True Depth&quot;</span>)
axes[i, <span class="hljs-number">1</span>].axis(<span class="hljs-string">&quot;off&quot;</span>)
fig.colorbar(im1, ax=axes[i, <span class="hljs-number">1</span>])
im2 = axes[i, <span class="hljs-number">2</span>].imshow(pred.cpu(), cmap=<span class="hljs-string">&quot;viridis&quot;</span>, vmin=<span class="hljs-number">0</span>, vmax=max_depth)
axes[i, <span class="hljs-number">2</span>].set_title(<span class="hljs-string">&quot;Predicted Depth&quot;</span>)
axes[i, <span class="hljs-number">2</span>].axis(<span class="hljs-string">&quot;off&quot;</span>)
fig.colorbar(im2, ax=axes[i, <span class="hljs-number">2</span>])
plt.tight_layout()<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-dn6epa"><img src="https://huggingface.co/datasets/hf-vision/course-assets/resolve/main/Metric%20and%20Relative%20Monocular%20Depth%20Estimation%20An%20Overview.%20Fine-Tuning%20Depth%20Anything%20V2/inference.png" alt="image/png"></p> <p data-svelte-h="svelte-nl5w42">The images in the validation set are much cleaner and more accurate than those in the training set, which is why our predictions appear a bit blurry in comparison. Take another look at the training samples above.</p> <p data-svelte-h="svelte-lt9zy0">In general, the key takeaway is that the model’s quality heavily depends on the quality of the provided depth maps. Kudos to the authors of Depth Anything V2 for overcoming this limitation and producing very sharp depth maps. The only drawback is that they are relative.</p> <h2 class="relative group"><a id="references" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#references"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>References</span></h2> <ul data-svelte-h="svelte-1h1ipi7"><li><a href="https://arxiv.org/abs/1907.01341" rel="nofollow">Towards Robust Monocular Depth Estimation: Mixing Datasets for Zero-shot Cross-dataset Transfer</a></li> <li><a href="https://arxiv.org/abs/2302.12288" rel="nofollow">ZoeDepth: Zero-shot Transfer by Combining Relative and Metric Depth</a></li> <li><a href="https://arxiv.org/abs/2103.13413" rel="nofollow">Vision Transformers for Dense Prediction</a></li> <li><a href="https://arxiv.org/abs/2401.10891" rel="nofollow">Depth Anything: Unleashing the Power of Large-Scale Unlabeled Data</a></li> <li><a href="https://arxiv.org/abs/2406.09414" rel="nofollow">Depth Anything V2</a></li></ul> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/computer-vision-course/blob/main/chapters/en/unit8/monocular_depth_estimation.mdx" target="_blank"><span data-svelte-h="svelte-1kd6by1">&lt;</span> <span data-svelte-h="svelte-x0xyl0">&gt;</span> <span data-svelte-h="svelte-1dajgef"><span class="underline ml-1.5">Update</span> on GitHub</span></a> <p></p>
<script>
{
__sveltekit_1p6gie1 = {
assets: "/docs/computer-vision-course/pr_397/en",
base: "/docs/computer-vision-course/pr_397/en",
env: {}
};
const element = document.currentScript.parentElement;
const data = [null,null];
Promise.all([
import("/docs/computer-vision-course/pr_397/en/_app/immutable/entry/start.7f209408.js"),
import("/docs/computer-vision-course/pr_397/en/_app/immutable/entry/app.32e8338e.js")
]).then(([kit, app]) => {
kit.start(app, element, {
node_ids: [0, 85],
data,
form: null,
error: null
});
});
}
</script>

Xet Storage Details

Size:
145 kB
·
Xet hash:
025d7bb45f1babbf4307c3f64c3ddf8105d4e5910069f67467a2483e55f352b6

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.