Spaces:

Louadria
/

SPILL

Running

App Files Files Community

SPILL / app.py

Louadria

fix: depth scaling, device detection, and sanity check for 3D tab

1e53f0f 1 day ago

raw

history blame contribute delete

10 kB

	"""
	HuggingFace Space app for SPILL: Glass Keypoint Detection + 3D Reconstruction

	Gradio demo with two tabs:
	1. 2D Reconstruction — detect keypoints on uploaded images
	2. 3D Reconstruction — monocular depth estimation + cylinder reconstruction

	To deploy:
	1. Create a new Space on HuggingFace with `gradio` SDK
	2. Upload the contents of this directory
	3. Or run: `huggingface-cli login` then push from terminal
	"""
	import os
	import cv2
	import numpy as np
	import torch
	import gradio as gr
	from pathlib import Path

	# Import SPILL library (installed via requirements.txt)
	from spill import GlassDetector, Monocular3DReconstructor
	from spill import depth_overlay, create_3d_figure, build_3d_info

	# Checkpoint files live in a "checkpoints" folder next to this file.
	BASE_DIR = Path(__file__).parent
	CHECKPOINT_PATH = BASE_DIR.joinpath("checkpoints", "wild_glasses.ckpt")
	YOLO_PATH = BASE_DIR.joinpath("checkpoints", "yolov8m.pt")

	# Detect device — default to CPU for safety (Blackwell GPU + old PyTorch issue).
	# HF Spaces set SPILL_FORCE_CUDA=1 in the Dockerfile to enable GPU.
	#
	# The flag is parsed explicitly: a plain truthiness test on the raw string
	# would turn CUDA on for values such as "0" or "false", because any non-empty
	# string is truthy. Unset, blank, "0", "false", "no" and "off" keep the CPU;
	# every other value selects CUDA, as before.
	# NOTE(review): the flag is trusted as-is — CUDA availability is not verified here.
	_force_cuda = os.environ.get("SPILL_FORCE_CUDA", "").strip().lower()
	DEVICE = "cpu" if _force_cuda in ("", "0", "false", "no", "off") else "cuda"

	# The 2D pipeline is built eagerly, at import time, before the interface is assembled.
	print(f"Loading SPILL models on {DEVICE}...")
	detector = GlassDetector(
	    device=DEVICE,
	    yolo_model_path=str(YOLO_PATH),
	    keypoint_checkpoint=str(CHECKPOINT_PATH),
	)
	print("2D models loaded!")

	# The 3D reconstructor is created on first use (saves GPU VRAM at startup);
	# this module-level slot holds the instance once it exists.
	_reconstructor = None


	def get_reconstructor():
	    """Return the shared Monocular3DReconstructor, creating it on the first call.

	    The instance is stored in the module-level ``_reconstructor`` and handed
	    back unchanged by every later call.
	    """
	    global _reconstructor
	    if _reconstructor is not None:
	        return _reconstructor

	    print("[3D] Lazy-loading 3D reconstructor...")
	    _reconstructor = Monocular3DReconstructor(device=DEVICE, depth_model_size="large")
	    return _reconstructor


	# Marker colour for each keypoint name. The colour names in the trailing
	# comments read each tuple in (R, G, B) order.
	KP_COLORS = dict(
	    bottom_front=(255, 0, 0),      # Red
	    top_front=(0, 255, 0),         # Green
	    top_left=(0, 0, 255),          # Blue
	    top_right=(255, 0, 255),       # Magenta
	    fluid_level=(0, 255, 255),     # Cyan
	    fluid_level_2=(255, 255, 0),   # Yellow
	)


	def draw_detections(image, keypoints_list):
	    """Draw bounding boxes and keypoint markers on a copy of ``image``.

	    Args:
	        image: Image array in OpenCV's BGR channel order (both callbacks in
	            this file pass the BGR-converted frame). It is not modified.
	        keypoints_list: Iterable of detections. Each item must expose
	            ``bounding_box`` as ``(x1, y1, x2, y2)`` plus the attributes
	            ``bottom_front``, ``top_front``, ``top_left``, ``top_right``,
	            ``fluid_level`` and ``fluid_level_2``; each of those is either
	            an ``(x, y)`` pair or ``None``.

	    Returns:
	        The annotated copy, in the same channel order as the input.
	    """
	    output = image.copy()
	    box_color = (0, 255, 0)  # green reads the same in RGB and BGR

	    for idx, kp in enumerate(keypoints_list, start=1):
	        # Draw original YOLO bounding box (no padding) with its label above it.
	        x1, y1, x2, y2 = (int(v) for v in kp.bounding_box)
	        cv2.rectangle(output, (x1, y1), (x2, y2), box_color, 2)
	        cv2.putText(
	            output, f"Glass #{idx}", (x1, y1 - 8),
	            cv2.FONT_HERSHEY_SIMPLEX, 0.6, box_color, 2,
	        )

	        # Draw keypoints (no text labels on image); missing ones are skipped.
	        for name, rgb in KP_COLORS.items():
	            pt = getattr(kp, name)
	            if pt is None:
	                continue
	            # KP_COLORS names its colours in RGB order, but this frame is BGR.
	            # Reverse the tuple so that e.g. "Red" is still red once the
	            # caller converts the result back to RGB for display.
	            color = rgb[::-1]
	            center = (int(pt[0]), int(pt[1]))
	            cv2.circle(output, center, 6, color, -1)  # filled dot
	            cv2.circle(output, center, 8, color, 1)   # thin outer ring

	    return output


	def detect_glasses_2d(image):
	    """2D tab callback: detect glasses and return annotated image + info.

	    Args:
	        image: RGB image array from the Gradio image input, a dict holding
	            that array under the ``"image"`` key, or ``None`` when nothing
	            was uploaded.

	    Returns:
	        A ``(image, text)`` pair. With no upload: ``(None, prompt)``. With
	        no detections: a copy of the input and a hint. Otherwise the
	        annotated RGB image and one line of pixel coordinates per keypoint
	        that is present.
	    """
	    if image is None:
	        return None, "Please upload an image."

	    if isinstance(image, dict):
	        image = image["image"]
	    # Gradio gives us RGB, convert to BGR for OpenCV
	    image_bgr = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)

	    keypoints_list = detector.detect(image_bgr)
	    if not keypoints_list:
	        return image.copy(), "No glasses detected. Try an image with clear glasses/cups/wine glasses."

	    # Annotate on the BGR frame, then convert back to RGB for display.
	    annotated_bgr = draw_detections(image_bgr, keypoints_list)
	    output_rgb = cv2.cvtColor(annotated_bgr, cv2.COLOR_BGR2RGB)

	    # Build info text — per-keypoint pixel coordinates (CriticBarista style).
	    # Every keypoint is treated as optional, exactly as draw_detections does,
	    # so a detection with a missing structural point no longer raises here.
	    keypoint_names = (
	        "bottom_front",
	        "top_front",
	        "top_left",
	        "top_right",
	        "fluid_level",
	        "fluid_level_2",
	    )
	    info_lines = [f"Found {len(keypoints_list)} glass(es):\n"]
	    for i, kp in enumerate(keypoints_list, start=1):
	        info_lines.append(f"\n--- Glass #{i} ---")
	        for name in keypoint_names:
	            pt = getattr(kp, name)
	            if pt is not None:
	                info_lines.append(f" {name}: ({pt[0]:.0f}, {pt[1]:.0f})")

	    return output_rgb, "\n".join(info_lines)


	def detect_glasses_3d(image):
	    """3D tab callback: full monocular 3D reconstruction.

	    Accepts the same input forms as the 2D callback (RGB array, dict with an
	    ``"image"`` entry, or ``None``) and returns a 4-tuple
	    ``(annotated_rgb, depth_vis, plot, info_text)``. Slots that cannot be
	    produced (no upload, or no detections) are ``None``.
	    """
	    if image is None:
	        return None, None, None, "Please upload an image."

	    rgb = image["image"] if isinstance(image, dict) else image
	    image_bgr = cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR)

	    # 2D keypoints come first; without them there is nothing to reconstruct.
	    keypoints_list = detector.detect(image_bgr)
	    if not keypoints_list:
	        message = "No glasses detected. Try an image with clear glasses/cups/wine glasses."
	        return rgb.copy(), None, None, message

	    # The reconstructor is only instantiated once a detection exists.
	    glasses, depth_map, info = get_reconstructor().reconstruct(image_bgr, keypoints_list)

	    # Keypoint overlay: drawn on the BGR frame, converted back for display.
	    annotated_rgb = cv2.cvtColor(
	        draw_detections(image_bgr, keypoints_list), cv2.COLOR_BGR2RGB
	    )

	    # Depth overlay and 3D plot are built from the RGB input.
	    depth_vis = depth_overlay(rgb, depth_map)
	    plot = create_3d_figure(glasses, info, rgb.shape)
	    info_text = build_3d_info(glasses, keypoints_list, info)

	    return annotated_rgb, depth_vis, plot, info_text


	# ── Build Gradio interface ──────────────────────────────────────

	# Markdown text rendered at the top of the page via gr.Markdown(DESCRIPTION).
	# NOTE(review): the colour legend under "Key points" should be checked against
	# KP_COLORS — the fluid-level entries (🟡 / 🟠) do not match the Cyan / Yellow
	# named there.
	DESCRIPTION = """
	# SPILL: Glass Detection & 3D Reconstruction

	Detect transparent glassware in images using semantic keypoint detection — and reconstruct full 3D cylinders from a single RGB image.

	How it works:
	1. YOLOv8 detects glass bounding boxes (cups, vases, wine glasses)
	2. A keypoint detector predicts structural points + fluid level on each glass
	3. (3D tab) DepthAnythingV2 estimates monocular depth → RANSAC finds the table plane → cylinder estimation gives radius, height, tilt, and fluid level

	Key points:
	- 🔴 Bottom Front — base of the glass facing the camera
	- 🟢 Top Front — rim edge facing the camera
	- 🔵 Top Left — left edge of the rim
	- 🟣 Top Right — right edge of the rim
	- 🟡 Fluid Level — liquid surface detected by the model
	- 🟠 Fluid Level (alt) — secondary fluid level candidate (shown when the model detects multiple peaks on the fluid level heatmap; useful when the first peak is uncertain, so downstream use cases can conservatively pick the highest or lowest value)

	3D Reconstruction uses DepthAnythingV2-Large for monocular depth estimation, Open3D RANSAC for plane detection, and the SPILL cylinder solver for full 3D parameters.
	"""

	# Assemble the page: the description, one tab per pipeline, and a footer with links.
	# Components are laid out in the order they are created inside each context manager.
	with gr.Blocks(title="SPILL Glass Detection & 3D Reconstruction") as demo:
	    gr.Markdown(DESCRIPTION)

	    with gr.Tabs():
	        # ── 2D Detection Tab ──
	        with gr.Tab("2D Reconstruction"):
	            gr.Markdown(
	                "### 2D Keypoint Detection\n\n"
	                "Upload an image and see detected glass keypoints overlaid. "
	                "No depth camera needed."
	            )
	            # Input image and annotated result sit side by side.
	            with gr.Row():
	                input_2d = gr.Image(type="numpy", label="Upload Image", sources=["upload", "clipboard"])
	                output_2d = gr.Image(type="numpy", label="Detection Result")
	            detect_2d_btn = gr.Button("Detect Glasses", variant="primary")
	            info_2d = gr.Textbox(label="Detection Info")
	            # The callback's two return values fill the result image and the info box.
	            detect_2d_btn.click(
	                fn=detect_glasses_2d,
	                inputs=input_2d,
	                outputs=[output_2d, info_2d],
	            )

	        # ── 3D Reconstruction Tab ──
	        with gr.Tab("3D Reconstruction"):
	            gr.Markdown(
	                "### Monocular 3D Reconstruction\n\n"
	                "From a single RGB image: estimate depth (DepthAnythingV2), find the table plane (RANSAC), "
	                "and reconstruct 3D glass cylinders — radius, height, tilt angle, and fluid level. "
	                "No depth camera required!\n\n"
	                "Best results: place the glass on a flat surface (table), keep the camera roughly level."
	            )
	            # First row: input image and keypoint overlay.
	            with gr.Row():
	                input_3d = gr.Image(type="numpy", label="Upload Image", sources=["upload", "clipboard"])
	                output_3d_annotated = gr.Image(type="numpy", label="Keypoints Overlay")
	            # Second row: depth visualisation and the 3D plot.
	            with gr.Row():
	                output_3d_depth = gr.Image(type="numpy", label="Depth Estimate")
	                output_3d_plot = gr.Plot(label="3D Reconstruction")
	            detect_3d_btn = gr.Button("Reconstruct 3D", variant="primary")
	            info_3d = gr.Textbox(label="3D Reconstruction Info")
	            # The callback's four return values map, in order, onto these four outputs.
	            detect_3d_btn.click(
	                fn=detect_glasses_3d,
	                inputs=input_3d,
	                outputs=[output_3d_annotated, output_3d_depth, output_3d_plot, info_3d],
	            )

	    # Footer with paper, dataset and code links.
	    # NOTE(review): "\|" is not a recognised Python escape sequence, so the
	    # backslash stays in the string (and newer Python versions warn about it) —
	    # confirm whether a plain "|" or a raw string was intended.
	    gr.Markdown("""
	---
	Paper: [SPILL: Size, Pose, and Internal Liquid Level Estimation](https://github.com/Louadria/SPILL)
	\| Dataset: [Glasses-in-the-Wild](https://doi.org/10.5281/zenodo.17288314)
	\| Code: [Louadria/SPILL](https://github.com/Louadria/SPILL)
	""")

	# Script entry point: serve the demo on every network interface, port 7860.
	if __name__ == "__main__":
	    demo.launch(
	        server_port=7860,
	        server_name="0.0.0.0",
	    )