"""FastAPI service exposing metric depth estimation via Depth Anything V2.

Loads a metric-depth checkpoint once at startup and serves per-image depth
statistics (meters) over HTTP.
"""

import io

import cv2
import numpy as np
import torch
from fastapi import FastAPI, HTTPException, UploadFile
from PIL import Image, UnidentifiedImageError

from depth_anything_v2.dpt import DepthAnythingV2

app = FastAPI()

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
ENCODER = "vitl"
DATASET = "hypersim"  # indoor metric checkpoint; the outdoor one is "vkitti"
MAX_DEPTH = 20  # meters; upper bound of the metric head's output range

# Architecture hyper-parameters per encoder size (must match the checkpoint).
MODEL_CONFIGS = {
    "vits": {"encoder": "vits", "features": 64, "out_channels": [48, 96, 192, 384]},
    "vitb": {"encoder": "vitb", "features": 128, "out_channels": [96, 192, 384, 768]},
    "vitl": {"encoder": "vitl", "features": 256, "out_channels": [256, 512, 1024, 1024]},
}

model = DepthAnythingV2(**{**MODEL_CONFIGS[ENCODER], "max_depth": MAX_DEPTH})
# weights_only=True: the checkpoint is a plain state dict, and this prevents
# torch.load's pickle layer from executing arbitrary code (torch >= 1.13).
model.load_state_dict(
    torch.load(
        f"/app/checkpoints/depth_anything_v2_metric_{DATASET}_{ENCODER}.pth",
        map_location="cpu",
        weights_only=True,
    )
)
model = model.to(DEVICE).eval()


@app.get("/")
def root():
    """Health/info endpoint describing the loaded model configuration."""
    return {
        "message": "Depth Anything V2 Metric API running",
        "device": DEVICE,
        "encoder": ENCODER,
        "dataset": DATASET,
        "max_depth_meters": MAX_DEPTH,
    }


@app.post("/depth")
async def depth(file: UploadFile):
    """Estimate metric depth statistics (in meters) for an uploaded image.

    Returns min/max/mean depth over the whole frame plus the depth at the
    image center, along with the input image size.

    Raises:
        HTTPException(400): when the upload is empty or not a decodable
            image (previously these surfaced as unhandled 500 errors).
    """
    contents = await file.read()
    if not contents:
        raise HTTPException(status_code=400, detail="Empty upload")
    try:
        pil_img = Image.open(io.BytesIO(contents)).convert("RGB")
    except UnidentifiedImageError:
        raise HTTPException(status_code=400, detail="File is not a valid image")

    # infer_image expects an OpenCV-style BGR uint8 array.
    img_np = cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2BGR)

    with torch.inference_mode():
        depth_map = model.infer_image(img_np)  # (H, W) float array, meters

    h, w = depth_map.shape
    return {
        "unit": "meters",
        "closest_distance_m": round(float(np.min(depth_map)), 3),
        "farthest_distance_m": round(float(np.max(depth_map)), 3),
        "mean_distance_m": round(float(np.mean(depth_map)), 3),
        "center_distance_m": round(float(depth_map[h // 2, w // 2]), 3),
        "image_size": {"width": w, "height": h},
    }