# Hugging Face Space: SebRincon/anycalib-gpu (status: Paused)
# File size: 11,007 Bytes


"""
AnyCalib — Full-Resolution Camera Calibration & Lens Correction

Gradio Space running the full AnyCalib pipeline:
  1. DINOv2 ViT-L/14 backbone → LightDPT decoder → ConvexTangentDecoder head
  2. RANSAC + Gauss-Newton calibrator → camera intrinsics [f, cx, cy, k1, ...]
  3. Full-resolution undistortion via grid_sample

No resolution limits. No quantization. Full FP32 inference.
Runs on GPU if available (ZeroGPU / dedicated), falls back to CPU.
"""
from __future__ import annotations

import json
import time

import gradio as gr
import numpy as np
import torch

# ── GPU decorator (works on ZeroGPU Spaces, no-op elsewhere) ──
# Any failure while importing `spaces` or while building its decorator falls
# back to a pass-through, so the same file also runs outside a ZeroGPU Space.
try:
    import spaces
    gpu_decorator = spaces.GPU(duration=120)
except Exception:  # ImportError is an Exception subclass; one clause covers both
    # Not on a ZeroGPU Space — use identity decorator
    def gpu_decorator(fn):
        """Return *fn* unchanged (fallback when the ZeroGPU decorator is unavailable)."""
        return fn

# ── Load model at startup ──
from anycalib.model.anycalib_pretrained import AnyCalib
from anycalib.cameras.factory import CameraFactory

# Build the network once at import time so every request reuses the same
# instance; eval() is called right away because the app only runs inference.
print("[anycalib] Loading model...")
t0 = time.time()
MODEL = AnyCalib(model_id="anycalib_gen")
MODEL.eval()
# Total number of scalar parameters, reported in the startup log line below.
TOTAL_PARAMS = sum(weight.numel() for weight in MODEL.parameters())
print(
    "[anycalib] Model loaded in "
    f"{time.time() - t0:.1f}s ({TOTAL_PARAMS:,} params)"
)


def _build_undistort_grid(camera, params, h, w, scale=1.0, target_proj="perspective"):
    """Compute the ``grid_sample`` lookup grid for undistorting an ``h`` x ``w`` image.

    Mirrors ``AnyCalibRunner._undistort_grid``.

    Args:
        camera: Camera model object providing ``NUM_F``, ``pixel_grid_coords``,
            ``ideal_unprojection`` and ``project``.
        params: Intrinsics tensor laid out as ``[f..., cx, cy, ...]``; a 1-D
            tensor gets a leading batch axis added.
        h: Output grid height in pixels.
        w: Output grid width in pixels.
        scale: Divisor applied to the normalised radius before unprojection.
        target_proj: Projection name forwarded to ``camera.ideal_unprojection``.

    Returns:
        ``(grid, valid)``: ``grid`` has shape ``(1, h, w, 2)`` with projected
        pixel coordinates mapped through ``2 * xy / (w, h) - 1``; ``valid`` is
        the ``(h, w)`` mask returned by ``camera.project`` or ``None``.
    """
    batched = params if params.ndim != 1 else params[None, ...]
    n_focal = int(camera.NUM_F)
    focal = batched[..., None, :n_focal]
    center = batched[..., None, n_focal:n_focal + 2]

    # Pixel grid of the output image, shifted by the principal point and
    # divided by the focal length(s).
    pixels = camera.pixel_grid_coords(h, w, batched, 0.0).reshape(-1, 2)
    normalized = (pixels - center) / focal

    # Polar decomposition: the (rescaled) radius is turned into a polar angle
    # by the target projection, the azimuth comes straight from atan2.
    radius = torch.linalg.norm(normalized, dim=-1) / scale
    polar = camera.ideal_unprojection(radius, target_proj)
    azimuth = torch.atan2(normalized[..., 1], normalized[..., 0])

    sin_polar = torch.sin(polar)
    bearings = torch.stack(
        (
            sin_polar * torch.cos(azimuth),
            sin_polar * torch.sin(azimuth),
            torch.cos(polar),
        ),
        dim=-1,
    )

    # Two-focal models are projected with both focal entries replaced by the
    # larger of the two; a clone keeps the caller's tensor untouched.
    if n_focal == 2:
        proj_params = batched.clone()
        proj_params[..., :2] = focal.amax(dim=-1, keepdim=True)
    else:
        proj_params = batched

    sample_xy, valid = camera.project(proj_params, bearings)
    valid_mask = None if valid is None else valid.reshape(1, h, w)[0]

    extent = sample_xy.new_tensor((w, h))
    grid = 2.0 * sample_xy.reshape(1, h, w, 2) / extent - 1.0
    return grid, valid_mask


def _sync_if_cuda(device):
    """Wait for queued CUDA work on *device* to finish; do nothing on CPU.

    CUDA kernels are launched asynchronously, so a wall-clock timer must only
    be read after a synchronize or it measures launch time, not the work.
    """
    if device.type == "cuda":
        torch.cuda.synchronize(device)


def _fov_degrees(extent_px, focal_px):
    """Return ``2 * atan(extent / (2 * focal))`` in degrees, or 0.0 if focal is not positive."""
    if focal_px > 0:
        return float(2 * np.degrees(np.arctan(extent_px / (2 * focal_px))))
    return 0.0


@gpu_decorator
@torch.no_grad()
def run_calibration(
    input_image: np.ndarray | None,
    cam_id: str,
    scale: float,
    target_proj: str,
    padding_mode: str,
    interp_mode: str,
    k1_threshold: float,
) -> tuple[np.ndarray, str, str]:
    """Full pipeline: predict -> fit -> undistort at original resolution.

    Args:
        input_image: Channel-last image array (pixel values are divided by
            255, so a 0-255 range is assumed), or ``None`` when no image was
            provided.
        cam_id: Camera model identifier, forwarded to ``MODEL.predict`` and
            ``CameraFactory.create_from_id``.
        scale: Forwarded to ``_build_undistort_grid`` as ``scale``.
        target_proj: Forwarded to ``_build_undistort_grid`` as ``target_proj``.
        padding_mode: ``padding_mode`` argument of ``grid_sample``.
        interp_mode: ``mode`` argument of ``grid_sample``.
        k1_threshold: When positive, undistortion is skipped (the input is
            returned as a copy) if ``|k1|`` is below this value; ``0``
            disables the gate.

    Returns:
        A tuple ``(corrected, params_md, raw_json)``: the output image as a
        ``uint8`` array, a Markdown report, and the same data as a JSON string.

    Raises:
        gr.Error: If ``input_image`` is ``None``.
    """

    if input_image is None:
        raise gr.Error("Please upload an image.")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    MODEL.to(device)

    h, w = input_image.shape[:2]
    # perf_counter is monotonic, so elapsed times cannot be skewed by
    # system clock adjustments.
    t_total = time.perf_counter()

    # Preprocess: HWC 0-255 -> CHW float32 in [0, 1]
    x = input_image.astype("float32") / 255.0
    x = np.transpose(x, (2, 0, 1))
    x_t = torch.from_numpy(x).to(device)

    # Neural network inference
    t0 = time.perf_counter()
    out = MODEL.predict(x_t, cam_id=cam_id)
    _sync_if_cuda(device)  # make sure the GPU work is included in t_infer
    t_infer = time.perf_counter() - t0
    intrinsics = out["intrinsics"]
    pred_size = out.get("pred_size")

    # Parse intrinsics: [f..., cx, cy, k1, ...]
    camera = CameraFactory.create_from_id(cam_id)
    num_f = int(camera.NUM_F)
    intr_list = intrinsics.detach().cpu().numpy().astype(np.float64).tolist()

    focal = intr_list[:num_f]
    cx_val, cy_val = intr_list[num_f], intr_list[num_f + 1]
    # Models without distortion terms have nothing after the principal point.
    k1_val = intr_list[num_f + 2] if len(intr_list) > num_f + 2 else 0.0

    # With two focal lengths the first scales x and the last scales y (they
    # divide the (x, y) pixel offsets element-wise when the grid is built),
    # so the vertical FOV must use the last one. With a single focal length
    # both expressions pick the same value.
    f_px = focal[0]
    f_py = focal[-1]
    fov_h = _fov_degrees(w, f_px)
    fov_v = _fov_degrees(h, f_py)

    if k1_val < -0.001:
        dist_type = "Barrel (k1 < 0)"
    elif k1_val > 0.001:
        dist_type = "Pincushion (k1 > 0)"
    else:
        dist_type = "Negligible"

    skip_undistort = k1_threshold > 0 and abs(k1_val) < k1_threshold

    if skip_undistort:
        # Distortion below the user threshold: hand back the input untouched.
        corrected = input_image.copy()
        valid_frac = 1.0
        t_undistort = 0.0
    else:
        t0 = time.perf_counter()
        grid, valid = _build_undistort_grid(
            camera, intrinsics, h, w,
            scale=scale, target_proj=target_proj,
        )
        y_t = torch.nn.functional.grid_sample(
            x_t[None, ...], grid,
            mode=interp_mode,
            padding_mode=padding_mode,
            align_corners=False,
        )
        _sync_if_cuda(device)  # grid_sample is asynchronous on CUDA
        t_undistort = time.perf_counter() - t0

        valid_frac = float(valid.float().mean().item()) if valid is not None else 1.0

        # CHW float in [0, 1] -> HWC uint8 with round-to-nearest.
        y = y_t[0].clamp(0, 1).detach().cpu().numpy()
        y = np.transpose(y, (1, 2, 0))
        corrected = (y * 255.0 + 0.5).astype("uint8")

    t_total_elapsed = time.perf_counter() - t_total

    hw_label = "GPU" if device.type == "cuda" else "CPU"

    params_md = f"""
### Camera Intrinsics

| Parameter | Value |
|-----------|-------|
| **Focal length** | `{f_px:.2f}` px |
| **Principal point** | `({cx_val:.2f}, {cy_val:.2f})` px |
| **Distortion k1** | `{k1_val:.6f}` |
| **Distortion type** | {dist_type} |
| **FOV (horizontal)** | `{fov_h:.1f}` deg |
| **FOV (vertical)** | `{fov_v:.1f}` deg |
| **Valid pixel fraction** | `{valid_frac:.3f}` |
| **k1 gated (skipped)** | `{skip_undistort}` |

### Image Info

| Property | Value |
|----------|-------|
| **Input resolution** | `{w} x {h}` ({w*h:,} px) |
| **Model working size** | `{pred_size}` |
| **Camera model** | `{cam_id}` |
| **Scale** | `{scale}` |
| **Target projection** | `{target_proj}` |

### Timing ({hw_label})

| Stage | Time |
|-------|------|
| Neural net inference | `{t_infer*1000:.0f}` ms |
| Undistortion (grid_sample) | `{t_undistort*1000:.0f}` ms |
| **Total** | **`{t_total_elapsed*1000:.0f}` ms** |
| Hardware | `{device}` ({hw_label}) |
"""

    raw_json = json.dumps({
        "intrinsics": {
            "focal_length_px": focal,
            "principal_point": [cx_val, cy_val],
            "k1": k1_val,
        },
        "fov": {"horizontal_deg": fov_h, "vertical_deg": fov_v},
        "distortion": {"type": dist_type, "k1_gated": skip_undistort},
        "image": {
            "input_resolution": [w, h],
            "total_pixels": w * h,
            "model_working_size": pred_size,
        },
        "camera": {
            "model": cam_id,
            "scale": scale,
            "target_projection": target_proj,
            "padding_mode": padding_mode,
            "interpolation": interp_mode,
        },
        "quality": {
            "valid_pixel_fraction": valid_frac,
        },
        "timing_ms": {
            "neural_net": round(t_infer * 1000, 1),
            "undistortion": round(t_undistort * 1000, 1),
            "total": round(t_total_elapsed * 1000, 1),
        },
        "device": str(device),
        "all_intrinsics_raw": intr_list,
    }, indent=2)

    return corrected, params_md, raw_json


# ── Gradio UI ──

# Page layout, top to bottom: intro text, a row with the input column
# (image, advanced settings, run button) next to the output image, a row with
# the Markdown report next to the raw JSON, then a help/links footer.
with gr.Blocks() as demo:

    gr.Markdown("""
# AnyCalib — Full-Resolution Camera Calibration

Single-image lens calibration & distortion correction powered by
[AnyCalib](https://github.com/javrtg/AnyCalib) (DINOv2 ViT-L/14 + LightDPT + ConvexTangentDecoder, ~320M params).

Full FP32 inference, no quantization, no resolution limits. Automatically uses GPU when available.

Upload any image and get the **corrected (undistorted) image** at original resolution,
plus camera intrinsics, FOV, distortion parameters, and timing.
    """)

    with gr.Row():
        with gr.Column(scale=1):
            # Input image, handed to run_calibration as its first argument.
            input_image = gr.Image(
                label="Input Image",
                type="numpy",
                sources=["upload", "clipboard"],
            )

            # Each control below maps one-to-one onto a run_calibration
            # parameter of the same name.
            with gr.Accordion("Advanced Settings", open=False):
                cam_id = gr.Dropdown(
                    label="Camera Model",
                    choices=[
                        "simple_division:1",
                        "division:1",
                        "simple_radial:1",
                        "simple_kb:1",
                        "simple_pinhole",
                        "pinhole",
                    ],
                    value="simple_division:1",
                )
                scale = gr.Slider(
                    label="Focal Length Scale (< 1 = wider FOV, less crop)",
                    minimum=0.5, maximum=1.5, step=0.05, value=1.0,
                )
                target_proj = gr.Dropdown(
                    label="Target Projection",
                    choices=["perspective", "stereographic", "equidistant", "equisolid", "orthographic"],
                    value="perspective",
                )
                # Forwarded to grid_sample as `padding_mode`.
                padding_mode = gr.Dropdown(
                    label="Padding Mode",
                    choices=["border", "zeros", "reflection"],
                    value="border",
                )
                # Forwarded to grid_sample as `mode`.
                interp_mode = gr.Dropdown(
                    label="Interpolation",
                    choices=["bilinear", "bicubic", "nearest"],
                    value="bilinear",
                )
                # The default of 0.0 disables the gate: run_calibration only
                # skips undistortion when the threshold is positive.
                k1_threshold = gr.Slider(
                    label="k1 Threshold (skip undistortion if |k1| below this)",
                    minimum=0.0, maximum=0.1, step=0.005, value=0.0,
                )

            run_btn = gr.Button("Run Calibration", variant="primary", size="lg")

        with gr.Column(scale=1):
            output_image = gr.Image(label="Corrected (Undistorted) Image", type="numpy")

    # Second row: the two textual reports returned by run_calibration.
    with gr.Row():
        with gr.Column():
            params_output = gr.Markdown(label="Camera Parameters")
        with gr.Column():
            json_output = gr.Code(label="Raw JSON Output", language="json")

    gr.Markdown("""
---
### How it works

1. **Upload** any image (phone photo, action cam, drone, dashcam, etc.)
2. The model predicts per-pixel **ray directions** using a DINOv2 ViT-L/14 backbone
3. **RANSAC + Gauss-Newton** calibrator fits camera intrinsics `[f, cx, cy, k1]` from the rays
4. Image is **undistorted at full resolution** via differentiable grid_sample
5. All parameters and raw JSON output are displayed

### Links

- Raw weights: [SebRincon/anycalib](https://huggingface.co/SebRincon/anycalib) (safetensors)
- ONNX models: [SebRincon/anycalib-onnx](https://huggingface.co/SebRincon/anycalib-onnx) (FP32/FP16/INT8)
- WASM demo: [SebRincon/anycalib-wasm](https://huggingface.co/spaces/SebRincon/anycalib-wasm) (browser-only)
- Source: [github.com/javrtg/AnyCalib](https://github.com/javrtg/AnyCalib)
    """)

    # The same pipeline call is wired to two triggers with identical
    # inputs/outputs: the explicit button and any change of the input image.
    run_btn.click(
        fn=run_calibration,
        inputs=[input_image, cam_id, scale, target_proj, padding_mode, interp_mode, k1_threshold],
        outputs=[output_image, params_output, json_output],
    )

    # NOTE(review): `change` presumably also fires when the image is cleared,
    # in which case run_calibration raises gr.Error("Please upload an image.")
    # — confirm against the Gradio version in use.
    input_image.change(
        fn=run_calibration,
        inputs=[input_image, cam_id, scale, target_proj, padding_mode, interp_mode, k1_threshold],
        outputs=[output_image, params_output, json_output],
    )


# Launch the Gradio app only when this file is executed as a script, so
# importing the module does not start a server.
if __name__ == "__main__":
    demo.launch()