SPILL / app.py
Louadria's picture
fix: depth scaling, device detection, and sanity check for 3D tab
1e53f0f
"""
HuggingFace Space app for SPILL: Glass Keypoint Detection + 3D Reconstruction
Gradio demo with two tabs:
1. 2D Reconstruction — detect keypoints on uploaded images
2. 3D Reconstruction — monocular depth estimation + cylinder reconstruction
To deploy:
1. Create a new Space on HuggingFace with `gradio` SDK
2. Upload this directory contents
3. Or run: `huggingface-cli login` then push from terminal
"""
import os
import cv2
import numpy as np
import torch
import gradio as gr
from pathlib import Path
# Import SPILL library (installed via requirements.txt)
from spill import GlassDetector, Monocular3DReconstructor
from spill import depth_overlay, create_3d_figure, build_3d_info
# Model checkpoints ship inside the repository, in a "checkpoints" folder
# next to this file.
BASE_DIR = Path(__file__).parent
CHECKPOINT_PATH = BASE_DIR.joinpath("checkpoints", "wild_glasses.ckpt")
YOLO_PATH = BASE_DIR.joinpath("checkpoints", "yolov8m.pt")
# Detect device — default to CPU for safety (Blackwell GPU + old PyTorch issue).
# HF Spaces set SPILL_FORCE_CUDA=1 in the Dockerfile to enable GPU.
#
# The flag is parsed instead of truth-tested so that explicit "off" spellings
# (SPILL_FORCE_CUDA=0 / false / no / off) keep the CPU default rather than
# enabling CUDA merely because the variable is a non-empty string.
_force_cuda_flag = os.environ.get("SPILL_FORCE_CUDA", "").strip().lower()
if _force_cuda_flag in ("", "0", "false", "no", "off"):
    DEVICE = "cpu"
elif torch.cuda.is_available():
    DEVICE = "cuda"
else:
    # CUDA was requested but torch reports none: loading the models on "cuda"
    # could not succeed, so say why and stay on the CPU instead.
    print("SPILL_FORCE_CUDA is set but torch reports no CUDA device; falling back to CPU.")
    DEVICE = "cpu"
print(f"Loading SPILL models on {DEVICE}...")
# The 2D detector is built once at import time and shared by both tab
# callbacks; it receives the keypoint checkpoint, the YOLO weights and the
# device selected above.
detector = GlassDetector(
    device=DEVICE,
    yolo_model_path=str(YOLO_PATH),
    keypoint_checkpoint=str(CHECKPOINT_PATH),
)
print("2D models loaded!")
# The 3D reconstructor is created on first use rather than at startup
# (saves GPU VRAM at startup); this module-level slot caches the instance.
_reconstructor = None


def get_reconstructor():
    """Return the shared Monocular3DReconstructor, building it on first call.

    The instance is constructed with the "large" depth model on DEVICE and
    stored in the module-level ``_reconstructor`` so later calls reuse it.
    """
    global _reconstructor
    if _reconstructor is not None:
        return _reconstructor

    print("[3D] Lazy-loading 3D reconstructor...")
    _reconstructor = Monocular3DReconstructor(
        depth_model_size="large",
        device=DEVICE,
    )
    return _reconstructor
# Marker colour per keypoint name, used by draw_detections().
# The colour names below read each tuple as (R, G, B).
# NOTE(review): draw_detections() hands these tuples to OpenCV on a BGR frame
# that is converted to RGB afterwards, so the first and last channels end up
# swapped on screen (e.g. (255, 0, 0) is displayed as blue) — confirm which
# channel order is intended.
KP_COLORS = dict(
    bottom_front=(255, 0, 0),     # Red
    top_front=(0, 255, 0),        # Green
    top_left=(0, 0, 255),         # Blue
    top_right=(255, 0, 255),      # Magenta
    fluid_level=(0, 255, 255),    # Cyan
    fluid_level_2=(255, 255, 0),  # Yellow
)
def draw_detections(image, keypoints_list):
    """Return a copy of ``image`` with every detected glass drawn on it.

    For each entry of ``keypoints_list`` this draws the bounding box with a
    "Glass #N" caption (N counted from 1) and a filled dot plus a thin outer
    ring for every keypoint that is not None. The input image is not modified.

    NOTE(review): colours from KP_COLORS are passed to OpenCV unchanged; both
    callers in this file supply a BGR frame, so the channel order of those
    tuples should be confirmed.
    """
    canvas = image.copy()
    box_color = (0, 255, 0)
    keypoint_names = (
        "bottom_front",
        "top_front",
        "top_left",
        "top_right",
        "fluid_level",
        "fluid_level_2",
    )

    for number, glass in enumerate(keypoints_list, start=1):
        # Original YOLO bounding box (no padding), captioned just above its top edge.
        left, top, right, bottom = (int(value) for value in glass.bounding_box)
        cv2.rectangle(canvas, (left, top), (right, bottom), box_color, 2)
        cv2.putText(
            canvas,
            f"Glass #{number}",
            (left, top - 8),
            cv2.FONT_HERSHEY_SIMPLEX,
            0.6,
            box_color,
            2,
        )

        # Keypoint markers only; no text labels are drawn on the image.
        points = [(name, getattr(glass, name)) for name in keypoint_names]
        for name, point in points:
            if point is None:
                continue
            center = (int(point[0]), int(point[1]))
            marker_color = KP_COLORS[name]
            cv2.circle(canvas, center, 6, marker_color, -1)
            cv2.circle(canvas, center, 8, marker_color, 1)

    return canvas
def _format_keypoint_report(keypoints_list):
    """Build the per-glass pixel-coordinate report shown in the 2D info box.

    Keypoints that are None are left out of the report. draw_detections()
    already treats every keypoint as optional, so the text report applies the
    same guard instead of failing on a missing structural keypoint.
    """
    keypoint_names = (
        "bottom_front",
        "top_front",
        "top_left",
        "top_right",
        "fluid_level",
        "fluid_level_2",
    )
    # Per-keypoint pixel coordinates (CriticBarista style).
    lines = [f"Found {len(keypoints_list)} glass(es):\n"]
    for number, kp in enumerate(keypoints_list, start=1):
        lines.append(f"\n--- Glass #{number} ---")
        for name in keypoint_names:
            point = getattr(kp, name)
            if point is not None:
                lines.append(f" {name}: ({point[0]:.0f}, {point[1]:.0f})")
    return "\n".join(lines)


def detect_glasses_2d(image):
    """2D tab callback: detect glasses and return annotated image + info.

    Args:
        image: The value of the Gradio image input — an RGB array, a dict
            holding the array under the "image" key, or None when nothing
            was uploaded.

    Returns:
        A ``(image, info)`` tuple:
        * ``(None, prompt)`` when no image was supplied;
        * ``(copy of the input, message)`` when nothing was detected;
        * ``(annotated RGB image, coordinate report)`` otherwise.
    """
    if image is None:
        return None, "Please upload an image."

    # Gradio gives us RGB, convert to BGR for OpenCV
    if isinstance(image, dict):
        image = image["image"]
    image_bgr = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)

    keypoints_list = detector.detect(image_bgr)
    if not keypoints_list:
        return image.copy(), "No glasses detected. Try an image with clear glasses/cups/wine glasses."

    # Draw on the BGR frame, then convert back to RGB for display.
    annotated_bgr = draw_detections(image_bgr, keypoints_list)
    output_rgb = cv2.cvtColor(annotated_bgr, cv2.COLOR_BGR2RGB)

    return output_rgb, _format_keypoint_report(keypoints_list)
def detect_glasses_3d(image):
    """3D tab callback: run the full monocular 3D reconstruction pipeline.

    Args:
        image: The value of the Gradio image input — an RGB array, a dict
            holding the array under the "image" key, or None.

    Returns:
        A 4-tuple ``(annotated image, depth overlay, 3D plot, info text)``.
        When no image is supplied, or no glass is detected, the depth overlay
        and plot are None and the text explains why.
    """
    if image is None:
        return None, None, None, "Please upload an image."

    rgb = image["image"] if isinstance(image, dict) else image
    bgr = cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR)

    # 1) 2D keypoints; without any detection there is nothing to reconstruct.
    detections = detector.detect(bgr)
    if not detections:
        return (
            rgb.copy(),
            None,
            None,
            "No glasses detected. Try an image with clear glasses/cups/wine glasses.",
        )

    # 2-3) Fetch the lazily created reconstructor and run it on the BGR frame.
    glasses, depth_map, info = get_reconstructor().reconstruct(bgr, detections)

    # 4) 2D keypoints overlaid, converted back to RGB for display.
    overlay_rgb = cv2.cvtColor(draw_detections(bgr, detections), cv2.COLOR_BGR2RGB)

    # 5-7) Depth overlay, 3D figure and textual summary.
    depth_vis = depth_overlay(rgb, depth_map)
    figure = create_3d_figure(glasses, info, rgb.shape)
    summary = build_3d_info(glasses, detections, info)

    return overlay_rgb, depth_vis, figure, summary
# ── Build Gradio interface ──────────────────────────────────────
# Markdown shown at the top of the demo page. Sections are separated by blank
# lines so that the bold section titles render as their own paragraphs instead
# of being folded into the preceding list item.
DESCRIPTION = """
# SPILL: Glass Detection & 3D Reconstruction

Detect **transparent glassware** in images using semantic keypoint detection — and reconstruct full 3D cylinders from a single RGB image.

**How it works:**

1. YOLOv8 detects glass bounding boxes (cups, vases, wine glasses)
2. A keypoint detector predicts structural points + fluid level on each glass
3. (3D tab) DepthAnythingV2 estimates monocular depth → RANSAC finds the table plane → cylinder estimation gives radius, height, tilt, and fluid level

**Key points:**

- 🔴 Bottom Front — base of the glass facing the camera
- 🟢 Top Front — rim edge facing the camera
- 🔵 Top Left — left edge of the rim
- 🟣 Top Right — right edge of the rim
- 🟡 Fluid Level — liquid surface detected by the model
- 🟠 Fluid Level (alt) — secondary fluid level candidate (shown when the model detects multiple peaks on the fluid level heatmap; useful when the first peak is uncertain, so downstream use cases can conservatively pick the highest or lowest value)

**3D Reconstruction** uses DepthAnythingV2-Large for monocular depth estimation, Open3D RANSAC for plane detection, and the SPILL cylinder solver for full 3D parameters.
"""
# Two-tab Gradio UI: each tab pairs an upload widget with its result widgets
# and wires one button to the matching callback.
# NOTE(review): the nesting below (image pairs inside rows; button and info
# box directly under the tab; footer under the Blocks root) is reconstructed
# from statement order — confirm it matches the intended layout.
with gr.Blocks(title="SPILL Glass Detection & 3D Reconstruction") as demo:
    gr.Markdown(DESCRIPTION)

    with gr.Tabs():
        # ── 2D Detection Tab ──
        with gr.Tab("2D Reconstruction"):
            gr.Markdown(
                "### 2D Keypoint Detection\n\n"
                "Upload an image and see detected glass keypoints overlaid. "
                "No depth camera needed."
            )
            with gr.Row():
                input_2d = gr.Image(type="numpy", label="Upload Image", sources=["upload", "clipboard"])
                output_2d = gr.Image(type="numpy", label="Detection Result")
            detect_2d_btn = gr.Button("Detect Glasses", variant="primary")
            info_2d = gr.Textbox(label="Detection Info")
            # Callback returns (annotated image, info text), in this order.
            detect_2d_btn.click(
                fn=detect_glasses_2d,
                inputs=input_2d,
                outputs=[output_2d, info_2d],
            )

        # ── 3D Reconstruction Tab ──
        with gr.Tab("3D Reconstruction"):
            gr.Markdown(
                "### Monocular 3D Reconstruction\n\n"
                "From a single RGB image: estimate depth (DepthAnythingV2), find the table plane (RANSAC), "
                "and reconstruct 3D glass cylinders — radius, height, tilt angle, and fluid level. "
                "No depth camera required!\n\n"
                "**Best results:** place the glass on a flat surface (table), keep the camera roughly level."
            )
            with gr.Row():
                input_3d = gr.Image(type="numpy", label="Upload Image", sources=["upload", "clipboard"])
                output_3d_annotated = gr.Image(type="numpy", label="Keypoints Overlay")
            with gr.Row():
                output_3d_depth = gr.Image(type="numpy", label="Depth Estimate")
                output_3d_plot = gr.Plot(label="3D Reconstruction")
            detect_3d_btn = gr.Button("Reconstruct 3D", variant="primary")
            info_3d = gr.Textbox(label="3D Reconstruction Info")
            # Callback returns (annotated image, depth overlay, plot, info text).
            detect_3d_btn.click(
                fn=detect_glasses_3d,
                inputs=input_3d,
                outputs=[output_3d_annotated, output_3d_depth, output_3d_plot, info_3d],
            )

    # Footer with paper / dataset / code links.
    gr.Markdown(
        "\n"
        "---\n"
        "**Paper:** [SPILL: Size, Pose, and Internal Liquid Level Estimation](https://github.com/Louadria/SPILL)\n"
        "| **Dataset:** [Glasses-in-the-Wild](https://doi.org/10.5281/zenodo.17288314)\n"
        "| **Code:** [Louadria/SPILL](https://github.com/Louadria/SPILL)\n"
    )
if __name__ == "__main__":
    # When run as a script, serve the demo on all network interfaces, port 7860.
    launch_options = {"server_name": "0.0.0.0", "server_port": 7860}
    demo.launch(**launch_options)