| """ |
| HuggingFace Space app for SPILL: Glass Keypoint Detection + 3D Reconstruction |
| |
| Gradio demo with two tabs: |
| 1. 2D Reconstruction β detect keypoints on uploaded images |
| 2. 3D Reconstruction β monocular depth estimation + cylinder reconstruction |
| |
| To deploy: |
| 1. Create a new Space on HuggingFace with `gradio` SDK |
| 2. Upload this directory contents |
| 3. Or run: `huggingface-cli login` then push from terminal |
| """ |
| import os |
| import cv2 |
| import numpy as np |
| import torch |
| import gradio as gr |
| from pathlib import Path |
|
|
| |
| from spill import GlassDetector, Monocular3DReconstructor |
| from spill import depth_overlay, create_3d_figure, build_3d_info |
|
|
| |
| BASE_DIR = Path(__file__).parent |
| CHECKPOINT_PATH = BASE_DIR / "checkpoints" / "wild_glasses.ckpt" |
| YOLO_PATH = BASE_DIR / "checkpoints" / "yolov8m.pt" |
|
|
| |
| |
| if os.environ.get("SPILL_FORCE_CUDA"): |
| DEVICE = "cuda" |
| else: |
| DEVICE = "cpu" |
|
|
| print(f"Loading SPILL models on {DEVICE}...") |
| detector = GlassDetector( |
| keypoint_checkpoint=str(CHECKPOINT_PATH), |
| yolo_model_path=str(YOLO_PATH), |
| device=DEVICE, |
| ) |
| print("2D models loaded!") |
|
|
| |
| _reconstructor = None |
|
|
|
|
| def get_reconstructor(): |
| global _reconstructor |
| if _reconstructor is None: |
| print("[3D] Lazy-loading 3D reconstructor...") |
| _reconstructor = Monocular3DReconstructor( |
| depth_model_size="large", |
| device=DEVICE, |
| ) |
| return _reconstructor |
|
|
|
|
| KP_COLORS = { |
| "bottom_front": (255, 0, 0), |
| "top_front": (0, 255, 0), |
| "top_left": (0, 0, 255), |
| "top_right": (255, 0, 255), |
| "fluid_level": (0, 255, 255), |
| "fluid_level_2": (255, 255, 0), |
| } |
|
|
|
|
| def draw_detections(image, keypoints_list): |
| """Draw keypoints and bounding boxes on the image.""" |
| output = image.copy() |
|
|
| for idx, kp in enumerate(keypoints_list): |
| |
| x1, y1, x2, y2 = kp.bounding_box |
| cv2.rectangle(output, (int(x1), int(y1)), (int(x2), int(y2)), (0, 255, 0), 2) |
| cv2.putText(output, f"Glass #{idx+1}", (int(x1), int(y1) - 8), |
| cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 2) |
|
|
| |
| all_kps = [ |
| ("bottom_front", kp.bottom_front), |
| ("top_front", kp.top_front), |
| ("top_left", kp.top_left), |
| ("top_right", kp.top_right), |
| ("fluid_level", kp.fluid_level), |
| ("fluid_level_2", kp.fluid_level_2), |
| ] |
|
|
| for name, pt in all_kps: |
| if pt is not None: |
| color = KP_COLORS[name] |
| cv2.circle(output, (int(pt[0]), int(pt[1])), 6, color, -1) |
| cv2.circle(output, (int(pt[0]), int(pt[1])), 8, color, 1) |
|
|
| return output |
|
|
|
|
| def detect_glasses_2d(image): |
| """2D tab callback: detect glasses and return annotated image + info.""" |
| if image is None: |
| return None, "Please upload an image." |
|
|
| |
| if isinstance(image, dict): |
| image = image["image"] |
| image_bgr = cv2.cvtColor(image, cv2.COLOR_RGB2BGR) |
|
|
| |
| keypoints_list = detector.detect(image_bgr) |
|
|
| if not keypoints_list: |
| output = image.copy() |
| return output, "No glasses detected. Try an image with clear glasses/cups/wine glasses." |
|
|
| |
| output_rgb = draw_detections(image_bgr, keypoints_list) |
| output_rgb = cv2.cvtColor(output_rgb, cv2.COLOR_BGR2RGB) |
|
|
| |
| info_lines = [f"Found {len(keypoints_list)} glass(es):\n"] |
| for i, kp in enumerate(keypoints_list): |
| info_lines.append(f"\n--- Glass #{i+1} ---") |
| info_lines.append(f" bottom_front: ({kp.bottom_front[0]:.0f}, {kp.bottom_front[1]:.0f})") |
| info_lines.append(f" top_front: ({kp.top_front[0]:.0f}, {kp.top_front[1]:.0f})") |
| info_lines.append(f" top_left: ({kp.top_left[0]:.0f}, {kp.top_left[1]:.0f})") |
| info_lines.append(f" top_right: ({kp.top_right[0]:.0f}, {kp.top_right[1]:.0f})") |
| if kp.fluid_level is not None: |
| info_lines.append(f" fluid_level: ({kp.fluid_level[0]:.0f}, {kp.fluid_level[1]:.0f})") |
| if kp.fluid_level_2 is not None: |
| info_lines.append(f" fluid_level_2: ({kp.fluid_level_2[0]:.0f}, {kp.fluid_level_2[1]:.0f})") |
|
|
| info = "\n".join(info_lines) |
| return output_rgb, info |
|
|
|
|
| def detect_glasses_3d(image): |
| """3D tab callback: full monocular 3D reconstruction.""" |
| if image is None: |
| return None, None, None, "Please upload an image." |
|
|
| if isinstance(image, dict): |
| image = image["image"] |
| image_bgr = cv2.cvtColor(image, cv2.COLOR_RGB2BGR) |
|
|
| |
| keypoints_list = detector.detect(image_bgr) |
|
|
| if not keypoints_list: |
| output = image.copy() |
| return (output, None, None, |
| "No glasses detected. Try an image with clear glasses/cups/wine glasses.") |
|
|
| |
| reconstructor = get_reconstructor() |
|
|
| |
| glasses, depth_map, info = reconstructor.reconstruct(image_bgr, keypoints_list) |
|
|
| |
| annotated_rgb = draw_detections(image_bgr, keypoints_list) |
| annotated_rgb = cv2.cvtColor(annotated_rgb, cv2.COLOR_BGR2RGB) |
|
|
| |
| depth_vis = depth_overlay(image, depth_map) |
|
|
| |
| plot = create_3d_figure(glasses, info, image.shape) |
|
|
| |
| info_text = build_3d_info(glasses, keypoints_list, info) |
|
|
| return annotated_rgb, depth_vis, plot, info_text |
|
|
|
|
| |
|
|
| DESCRIPTION = """ |
| # SPILL: Glass Detection & 3D Reconstruction |
| |
| Detect **transparent glassware** in images using semantic keypoint detection β and reconstruct full 3D cylinders from a single RGB image. |
| |
| **How it works:** |
| 1. YOLOv8 detects glass bounding boxes (cups, vases, wine glasses) |
| 2. A keypoint detector predicts structural points + fluid level on each glass |
| 3. (3D tab) DepthAnythingV2 estimates monocular depth β RANSAC finds the table plane β cylinder estimation gives radius, height, tilt, and fluid level |
| |
| **Key points:** |
| - π΄ Bottom Front β base of the glass facing the camera |
| - π’ Top Front β rim edge facing the camera |
| - π΅ Top Left β left edge of the rim |
| - π£ Top Right β right edge of the rim |
| - π‘ Fluid Level β liquid surface detected by the model |
| - π Fluid Level (alt) β secondary fluid level candidate (shown when the model detects multiple peaks on the fluid level heatmap; useful when the first peak is uncertain, so downstream use cases can conservatively pick the highest or lowest value) |
| |
| **3D Reconstruction** uses DepthAnythingV2-Large for monocular depth estimation, Open3D RANSAC for plane detection, and the SPILL cylinder solver for full 3D parameters. |
| """ |
|
|
| with gr.Blocks(title="SPILL Glass Detection & 3D Reconstruction") as demo: |
| gr.Markdown(DESCRIPTION) |
|
|
| with gr.Tabs(): |
| |
| with gr.Tab("2D Reconstruction"): |
| gr.Markdown( |
| "### 2D Keypoint Detection\n\n" |
| "Upload an image and see detected glass keypoints overlaid. " |
| "No depth camera needed." |
| ) |
| with gr.Row(): |
| input_2d = gr.Image(type="numpy", label="Upload Image", sources=["upload", "clipboard"]) |
| output_2d = gr.Image(type="numpy", label="Detection Result") |
| detect_2d_btn = gr.Button("Detect Glasses", variant="primary") |
| info_2d = gr.Textbox(label="Detection Info") |
| detect_2d_btn.click( |
| fn=detect_glasses_2d, |
| inputs=input_2d, |
| outputs=[output_2d, info_2d], |
| ) |
|
|
| |
| with gr.Tab("3D Reconstruction"): |
| gr.Markdown( |
| "### Monocular 3D Reconstruction\n\n" |
| "From a single RGB image: estimate depth (DepthAnythingV2), find the table plane (RANSAC), " |
| "and reconstruct 3D glass cylinders β radius, height, tilt angle, and fluid level. " |
| "No depth camera required!\n\n" |
| "**Best results:** place the glass on a flat surface (table), keep the camera roughly level." |
| ) |
| with gr.Row(): |
| input_3d = gr.Image(type="numpy", label="Upload Image", sources=["upload", "clipboard"]) |
| output_3d_annotated = gr.Image(type="numpy", label="Keypoints Overlay") |
| with gr.Row(): |
| output_3d_depth = gr.Image(type="numpy", label="Depth Estimate") |
| output_3d_plot = gr.Plot(label="3D Reconstruction") |
| detect_3d_btn = gr.Button("Reconstruct 3D", variant="primary") |
| info_3d = gr.Textbox(label="3D Reconstruction Info") |
| detect_3d_btn.click( |
| fn=detect_glasses_3d, |
| inputs=input_3d, |
| outputs=[output_3d_annotated, output_3d_depth, output_3d_plot, info_3d], |
| ) |
|
|
| gr.Markdown(""" |
| --- |
| **Paper:** [SPILL: Size, Pose, and Internal Liquid Level Estimation](https://github.com/Louadria/SPILL) |
| | **Dataset:** [Glasses-in-the-Wild](https://doi.org/10.5281/zenodo.17288314) |
| | **Code:** [Louadria/SPILL](https://github.com/Louadria/SPILL) |
| """) |
|
|
| if __name__ == "__main__": |
| demo.launch(server_name="0.0.0.0", server_port=7860) |
|
|