# Author: Arthur Niu
# Fix: post_process_grounded_object_detection version compatibility (commit cdcb644)
import gradio as gr
import numpy as np
import cv2
import torch
from PIL import Image
from transformers import (
AutoProcessor,
AutoModelForZeroShotObjectDetection,
SamModel,
SamProcessor,
)
# Run on GPU when available; both models are moved to this device below.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# -------------------------
# Models
# -------------------------
# Grounding DINO: open-vocabulary (text-prompted) object detection,
# used to locate the building bounding box. Weights download on first run.
DINO_ID = "IDEA-Research/grounding-dino-tiny"
dino_processor = AutoProcessor.from_pretrained(DINO_ID)
dino_model = AutoModelForZeroShotObjectDetection.from_pretrained(DINO_ID).to(DEVICE)
# SAM: box-prompted segmentation, used to turn the detected bbox into a mask.
SAM_ID = "facebook/sam-vit-base"
sam_processor = SamProcessor.from_pretrained(SAM_ID)
sam_model = SamModel.from_pretrained(SAM_ID).to(DEVICE)
# -------------------------
# Mask + geometry helpers
# -------------------------
def _ensure_2d_mask(mask) -> np.ndarray:
if torch.is_tensor(mask):
mask = mask.detach().cpu().numpy()
mask = np.array(mask)
mask = np.squeeze(mask)
if mask.ndim == 3:
if mask.shape[0] <= 16 and mask.shape[1] > 32 and mask.shape[2] > 32:
mask = mask[0]
else:
mask = mask[:, :, 0]
mask = np.squeeze(mask)
if mask.ndim != 2:
raise ValueError(f"Mask is not 2D after normalization. Got shape: {mask.shape}")
return (mask > 0).astype(np.uint8)
def _clean_mask(mask01: np.ndarray) -> np.ndarray:
    """Morphologically tidy a binary mask: close small holes, then remove
    speckles. Returns a 0/1 uint8 mask."""
    binary = _ensure_2d_mask(mask01)
    work = np.ascontiguousarray((binary * 255).astype(np.uint8))
    se = cv2.getStructuringElement(cv2.MORPH_RECT, (7, 7))
    work = cv2.morphologyEx(work, cv2.MORPH_CLOSE, se, iterations=1)
    work = cv2.morphologyEx(work, cv2.MORPH_OPEN, se, iterations=1)
    return (work > 0).astype(np.uint8)
def _order_points(pts4: np.ndarray) -> np.ndarray:
pts4 = np.asarray(pts4, dtype=np.float32)
s = pts4.sum(axis=1)
d = pts4[:, 0] - pts4[:, 1]
tl = pts4[np.argmin(s)]
br = pts4[np.argmax(s)]
tr = pts4[np.argmax(d)]
bl = pts4[np.argmin(d)]
return np.array([tl, tr, br, bl], dtype=np.float32)
def _warp_with_bounds(img: np.ndarray, H: np.ndarray, border_value=(255, 255, 255), interp=cv2.INTER_LINEAR):
    """Warp `img` by homography `H`, translating and sizing the output
    canvas so no warped corner is cropped.

    Returns (warped_image, adjusted_homography), where the adjusted
    homography includes the translation applied.
    """
    rows, cols = img.shape[:2]
    src_corners = np.array(
        [[0, 0], [cols - 1, 0], [cols - 1, rows - 1], [0, rows - 1]],
        dtype=np.float32,
    ).reshape(-1, 1, 2)
    dst_corners = cv2.perspectiveTransform(src_corners, H).reshape(-1, 2)
    min_x, min_y = (float(v) for v in dst_corners.min(axis=0))
    max_x, max_y = (float(v) for v in dst_corners.max(axis=0))
    # Shift only when corners land at negative coordinates.
    tx = max(0.0, -min_x)
    ty = max(0.0, -min_y)
    out_size = (int(np.ceil(max_x + tx)) + 1, int(np.ceil(max_y + ty)) + 1)
    shift = np.array([[1.0, 0.0, tx], [0.0, 1.0, ty], [0.0, 0.0, 1.0]], dtype=np.float32)
    H_adj = (shift @ H).astype(np.float32)
    warped = cv2.warpPerspective(
        img,
        H_adj,
        out_size,
        flags=interp,
        borderMode=cv2.BORDER_CONSTANT,
        borderValue=border_value,
    )
    return warped, H_adj
# -------------------------
# GroundingDINO post-process compatibility
# -------------------------
def _dino_post_process(outputs, inputs, pil_img: Image.Image):
    """
    Version-tolerant wrapper for `post_process_grounded_object_detection`.

    The transformers API has changed across releases:
      - newer versions accept `threshold` / `text_threshold` keywords,
      - some accept different kw names,
      - older ones accept no thresholds at all.
    Each signature is tried in turn; the first image's result dict
    (containing 'boxes' and 'scores') is returned. Thresholds are passed
    as 0 so the caller can filter scores itself, uniformly across versions.
    """
    target_sizes = [pil_img.size[::-1]]  # PIL size is (w, h); post-process wants (h, w)
    # Try most common signature (newer)
    try:
        return dino_processor.post_process_grounded_object_detection(
            outputs,
            inputs.input_ids,
            threshold=0.0,  # let us filter ourselves later
            text_threshold=0.0,
            target_sizes=target_sizes,
        )[0]
    except TypeError:
        pass
    # Try without thresholds (older)
    try:
        return dino_processor.post_process_grounded_object_detection(
            outputs,
            inputs.input_ids,
            target_sizes=target_sizes,
        )[0]
    except TypeError:
        pass
    # Last resort: positional args only
    try:
        return dino_processor.post_process_grounded_object_detection(
            outputs,
            inputs.input_ids,
            target_sizes,
        )[0]
    except Exception as e:
        raise RuntimeError(f"GroundingDINO post_process API mismatch: {e}")
# -------------------------
# Detection + segmentation
# -------------------------
def _detect_building_box(pil_img: Image.Image, box_threshold=0.35, text_threshold=0.25) -> np.ndarray:
    """Run Grounding DINO and return the best building bbox as xyxy float32.

    Score filtering happens here (not in the processor) for cross-version
    compatibility; `text_threshold` is accepted for API compatibility but
    is not used. Raises ValueError when no boxes come back at all.
    """
    # A single prompt string is the most broadly compatible form.
    prompt = "building. building facade. house. house facade. facade."
    # Some processor versions want a plain string, others a list of strings.
    try:
        inputs = dino_processor(images=pil_img, text=prompt, return_tensors="pt")
    except TypeError:
        inputs = dino_processor(images=pil_img, text=[prompt], return_tensors="pt")
    inputs = inputs.to(DEVICE)
    with torch.no_grad():
        outputs = dino_model(**inputs)
    results = _dino_post_process(outputs, inputs, pil_img)
    if "boxes" not in results or len(results["boxes"]) == 0:
        raise ValueError("No building detected. Try a closer crop or adjust thresholds.")
    boxes = results["boxes"].detach().cpu().numpy().astype(np.float32)
    scores = results["scores"].detach().cpu().numpy().astype(np.float32)
    passing = scores >= float(box_threshold)
    if not np.any(passing):
        # Nothing cleared the threshold: fall back to the single best candidate.
        return boxes[int(np.argmax(scores))]
    candidate_boxes = boxes[passing]
    candidate_scores = scores[passing]
    return candidate_boxes[int(np.argmax(candidate_scores))]
def _segment_box_mask(pil_img: Image.Image, box_xyxy: np.ndarray) -> np.ndarray:
    """Segment the region inside `box_xyxy` with SAM; returns a binary 2D mask."""
    x1, y1, x2, y2 = (float(v) for v in box_xyxy[:4])
    sam_inputs = sam_processor(images=pil_img, input_boxes=[[[x1, y1, x2, y2]]], return_tensors="pt").to(DEVICE)
    with torch.no_grad():
        sam_outputs = sam_model(**sam_inputs, multimask_output=False)
    post = sam_processor.image_processor.post_process_masks(
        sam_outputs.pred_masks.cpu(),
        sam_inputs["original_sizes"].cpu(),
        sam_inputs["reshaped_input_sizes"].cpu(),
    )
    first = post[0]
    if torch.is_tensor(first):
        first = first.detach().cpu().numpy()
    first = np.array(first)
    if first.ndim >= 3:
        # Drop the leading mask/batch axis; _ensure_2d_mask handles the rest.
        first = first[0]
    return _ensure_2d_mask(first)
# -------------------------
# Outline helpers
# -------------------------
def _get_mask_contours(mask01: np.ndarray):
    """Return the external contours of a binary mask (OpenCV contour format)."""
    binary = _ensure_2d_mask(mask01)
    img255 = np.ascontiguousarray((binary * 255).astype(np.uint8))
    contours, _ = cv2.findContours(img255, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    return contours
def _draw_outline_on_image(rgb_img: np.ndarray, mask01: np.ndarray, thickness: int = 3) -> np.ndarray:
    """Return a copy of `rgb_img` with the mask boundary drawn in white."""
    result = rgb_img.copy()
    contours = _get_mask_contours(mask01)
    if contours:
        cv2.drawContours(result, contours, contourIdx=-1, color=(255, 255, 255), thickness=int(thickness))
    return result
# -------------------------
# Architectural chart (Option A)
# -------------------------
def architectural_chart(
    rgb_img: np.ndarray,
    mode: str = "blueprint",
    edge1: int = 60,
    edge2: int = 160,
    hough_threshold: int = 80,
    min_line_length: int = 40,
    max_line_gap: int = 8,
    thickness: int = 2,
    add_grid: bool = False,
) -> np.ndarray:
    """Render an architectural-style line drawing from an RGB image.

    Edges are extracted with Canny (`edge1`/`edge2` thresholds), straight
    segments with probabilistic Hough, and both are composited onto either a
    blueprint (white on blue) or black-on-white canvas. `add_grid` overlays
    a faint square grid. Returns an HxWx3 uint8 image sized like `rgb_img`.
    """
    gray = cv2.cvtColor(rgb_img, cv2.COLOR_RGB2GRAY)
    # Bilateral filter smooths texture while keeping structural edges sharp.
    gray = cv2.bilateralFilter(gray, d=7, sigmaColor=50, sigmaSpace=50)
    edges = cv2.Canny(gray, int(edge1), int(edge2))
    lines = cv2.HoughLinesP(
        edges,
        rho=1,
        theta=np.pi / 180,
        threshold=int(hough_threshold),
        minLineLength=int(min_line_length),
        maxLineGap=int(max_line_gap),
    )
    h, w = edges.shape[:2]
    if mode == "blueprint":
        canvas = np.zeros((h, w, 3), dtype=np.uint8)
        canvas[:, :] = (20, 40, 90)  # deep blueprint blue
        line_color = (255, 255, 255)
        edge_color = (220, 220, 220)
        grid_color = (255, 255, 255)
    else:
        canvas = np.ones((h, w, 3), dtype=np.uint8) * 255
        line_color = (0, 0, 0)
        edge_color = (30, 30, 30)
        grid_color = (0, 0, 0)
    # Blend the raw edge map faintly under the Hough lines.
    edge_layer = np.zeros_like(canvas)
    ys, xs = np.where(edges > 0)
    edge_layer[ys, xs] = edge_color
    canvas = cv2.addWeighted(canvas, 1.0, edge_layer, 0.35, 0)
    if lines is not None:
        for x1, y1, x2, y2 in lines[:, 0]:
            cv2.line(canvas, (x1, y1), (x2, y2), line_color, int(thickness), cv2.LINE_AA)
    if add_grid:
        step = max(40, min(h, w) // 25)
        grid = canvas.copy()
        for x in range(0, w, step):
            cv2.line(grid, (x, 0), (x, h), grid_color, 1)
        for y in range(0, h, step):
            cv2.line(grid, (0, y), (w, y), grid_color, 1)
        # Bug fix: weights previously were (1.0, 0.08), summing to 1.08 while
        # `grid` already contains the canvas — so enabling the grid uniformly
        # brightened/saturated the whole chart. With (0.92, 0.08) non-grid
        # pixels are unchanged and the grid lines appear at 8% opacity.
        canvas = cv2.addWeighted(canvas, 0.92, grid, 0.08, 0)
    return canvas
# -------------------------
# Vanishing-point-based facade rectification
# -------------------------
def _create_lsd():
    """Build an OpenCV line-segment detector, tolerating API differences
    between builds (the refine-mode constant is missing in some)."""
    try:
        # LSD_REFINE_STD's numeric value is 1; fall back to it when absent.
        mode = getattr(cv2, "LSD_REFINE_STD", 1)
        return cv2.createLineSegmentDetector(mode)
    except Exception:
        return cv2.createLineSegmentDetector()
def _extract_lines_lsd(rgb_img: np.ndarray, mask01: np.ndarray, min_len: float = 40.0):
    """Detect LSD line segments whose midpoints fall inside the mask.

    Returns a list of homogeneous lines (a, b, c) normalized so that
    a^2 + b^2 = 1, making `line @ point` a signed pixel distance.
    """
    mask01 = _ensure_2d_mask(mask01)
    gray = cv2.cvtColor(rgb_img, cv2.COLOR_RGB2GRAY)
    gray = cv2.GaussianBlur(gray, (3, 3), 0)
    segments = _create_lsd().detect(gray)[0]
    if segments is None:
        return []
    h, w = mask01.shape
    result = []
    for seg in segments.reshape(-1, 4):
        x1, y1, x2, y2 = map(float, seg)
        if float(np.hypot(x2 - x1, y2 - y1)) < min_len:
            continue  # too short to be a reliable structural line
        mid_x = int(round((x1 + x2) * 0.5))
        mid_y = int(round((y1 + y2) * 0.5))
        if not (0 <= mid_x < w and 0 <= mid_y < h):
            continue
        if mask01[mid_y, mid_x] == 0:
            continue  # midpoint outside the building mask
        line = np.cross(
            np.array([x1, y1, 1.0], dtype=np.float32),
            np.array([x2, y2, 1.0], dtype=np.float32),
        )
        scale = float(np.hypot(line[0], line[1]))
        if scale < 1e-6:
            continue  # degenerate (coincident endpoints)
        result.append((line / scale).astype(np.float32))
    return result
def _intersection_of_lines(l1, l2):
p = np.cross(l1, l2)
if abs(float(p[2])) < 1e-6:
return None
return (p / p[2]).astype(np.float32)
def _fit_vanishing_point_ransac(lines, iters=900, dist_thresh=3.0, min_inliers=10):
    """RANSAC-fit a vanishing point from unit-normalized homogeneous lines.

    Samples random line pairs, scores candidate intersections by
    point-to-line distance, then refines the best hypothesis by SVD on its
    inliers. Returns (vp, inlier_indices) or (None, None) on failure.
    """
    if len(lines) < 2:
        return None, None
    lines = [np.asarray(l, dtype=np.float32) for l in lines]
    rng = np.random.default_rng(0)  # fixed seed -> deterministic result
    best_vp = best_inliers = None
    best_count = 0
    for _ in range(iters):
        i, j = rng.integers(0, len(lines), size=2)
        if i == j:
            continue
        # Candidate vanishing point = intersection of the sampled pair
        # (inlined from _intersection_of_lines).
        cand = np.cross(lines[i], lines[j])
        if abs(float(cand[2])) < 1e-6:
            continue  # near-parallel pair
        cand = (cand / cand[2]).astype(np.float32)
        inliers = [k for k, l in enumerate(lines) if abs(float(l @ cand)) < dist_thresh]
        if len(inliers) > best_count:
            best_vp, best_inliers, best_count = cand, inliers, len(inliers)
    if best_vp is None or best_inliers is None or best_count < min_inliers:
        return None, None
    # Least-squares refinement: vp is the null vector of the inlier lines.
    A = np.stack([lines[k] for k in best_inliers], axis=0).astype(np.float32)
    _, _, Vt = np.linalg.svd(A)
    refined = Vt[-1, :]
    if abs(float(refined[2])) < 1e-6:
        return None, None
    return (refined / refined[2]).astype(np.float32), best_inliers
def _split_lines_by_orientation(lines):
horiz, vert = [], []
for l in lines:
a, b, _ = map(float, l)
dx, dy = b, -a
ang = (np.degrees(np.arctan2(dy, dx)) + 180.0) % 180.0
if ang < 25 or ang > 155:
horiz.append(l)
elif 65 < ang < 115:
vert.append(l)
return horiz, vert
def _affine_H_from_vanishing_line(l):
l = np.asarray(l, dtype=np.float32)
if abs(float(l[2])) < 1e-6:
return None
l1, l2, l3 = map(float, l)
return np.array([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [l1 / l3, l2 / l3, 1.0]], dtype=np.float32)
def _dominant_directions_from_lines(lines):
    """Estimate mean unit direction vectors (u, v) for the horizontal and
    vertical line families; returns (None, None) when support is too thin."""
    if len(lines) < 6:
        return None, None
    horiz, vert = _split_lines_by_orientation(lines)

    def mean_dir(line_list, mode):
        # Average unit directions after flipping each into a canonical
        # half-plane (dx >= 0 for "h", dy >= 0 for "v") so that opposite
        # orientations do not cancel.
        vecs = []
        for l in line_list:
            a, b, _ = map(float, l)
            dx, dy = b, -a
            n = float(np.hypot(dx, dy))
            if n < 1e-6:
                continue
            dx, dy = dx / n, dy / n
            flip = dx < 0 if mode == "h" else dy < 0
            if flip:
                dx, dy = -dx, -dy
            vecs.append([dx, dy])
        if len(vecs) < 2:
            return None
        mean = np.array(vecs, dtype=np.float32).mean(axis=0)
        norm = float(np.hypot(mean[0], mean[1]))
        if norm < 1e-6:
            return None
        return (mean / norm).astype(np.float32)

    return mean_dir(horiz, "h"), mean_dir(vert, "v")
def _front_facade_rectify(rgb_img: np.ndarray, mask01: np.ndarray):
    """Two-stage rectification of the masked facade.

    Stage 1 (projective): find horizontal and vertical vanishing points via
    RANSAC on LSD segments inside the mask, then send their vanishing line
    to infinity (affine rectification).
    Stage 2 (affine): re-detect lines in the affinely-rectified image and
    apply the linear map that makes the dominant horizontal/vertical
    directions axis-aligned.

    Returns (rectified_rgb, rectified_mask01, debug_image); the first two
    are None whenever any stage lacks enough line support, leaving the
    caller to fall back to the contour-quad method.
    """
    mask01 = _clean_mask(mask01)
    debug = rgb_img.copy()
    debug = _draw_outline_on_image(debug, mask01, thickness=2)
    lines = _extract_lines_lsd(rgb_img, mask01, min_len=40.0)
    if len(lines) < 10:
        return None, None, debug
    horiz, vert = _split_lines_by_orientation(lines)
    if len(horiz) < 4 or len(vert) < 4:
        return None, None, debug
    vp_h, _ = _fit_vanishing_point_ransac(horiz, iters=900, dist_thresh=3.0, min_inliers=10)
    vp_v, _ = _fit_vanishing_point_ransac(vert, iters=900, dist_thresh=3.0, min_inliers=10)
    if vp_h is None or vp_v is None:
        return None, None, debug
    # The line joining the two vanishing points is the image of the facade
    # plane's line at infinity.
    van_line = np.cross(vp_h, vp_v).astype(np.float32)
    H_aff = _affine_H_from_vanishing_line(van_line)
    if H_aff is None:
        return None, None, debug
    # Warp image and mask with the projective correction (white background;
    # nearest-neighbour keeps the mask binary).
    bgr = cv2.cvtColor(rgb_img, cv2.COLOR_RGB2BGR)
    aff_bgr, _ = _warp_with_bounds(bgr, H_aff, border_value=(255, 255, 255), interp=cv2.INTER_LINEAR)
    aff_rgb = cv2.cvtColor(aff_bgr, cv2.COLOR_BGR2RGB)
    mask255 = (mask01 * 255).astype(np.uint8)
    aff_mask255, _ = _warp_with_bounds(mask255, H_aff, border_value=0, interp=cv2.INTER_NEAREST)
    aff_mask01 = (aff_mask255 > 0).astype(np.uint8)
    aff_lines = _extract_lines_lsd(aff_rgb, aff_mask01, min_len=40.0)
    u, v = _dominant_directions_from_lines(aff_lines)
    if u is None or v is None:
        return None, None, debug
    # Affine stage: invert the basis formed by the dominant directions so
    # they map onto the x/y axes.
    M2 = np.array([[u[0], v[0]], [u[1], v[1]]], dtype=np.float32)
    if abs(float(np.linalg.det(M2))) < 1e-6:
        return None, None, debug
    A2 = np.linalg.inv(M2).astype(np.float32)
    H_lin = np.array(
        [[A2[0, 0], A2[0, 1], 0.0], [A2[1, 0], A2[1, 1], 0.0], [0.0, 0.0, 1.0]],
        dtype=np.float32,
    )
    aff_bgr2 = cv2.cvtColor(aff_rgb, cv2.COLOR_RGB2BGR)
    rect_bgr, _ = _warp_with_bounds(aff_bgr2, H_lin, border_value=(255, 255, 255), interp=cv2.INTER_LINEAR)
    rect_rgb = cv2.cvtColor(rect_bgr, cv2.COLOR_BGR2RGB)
    rect_mask255, _ = _warp_with_bounds(aff_mask255, H_lin, border_value=0, interp=cv2.INTER_NEAREST)
    rect_mask01 = (rect_mask255 > 0).astype(np.uint8)
    return rect_rgb, rect_mask01, debug
# -------------------------
# Fallback: full-building quad from mask contour
# -------------------------
def _fitline_to_abc(points_xy: np.ndarray):
    """Least-squares line through 2D points, returned as (a, b, c) with the
    line equation a*x + b*y = c."""
    reshaped = points_xy.astype(np.float32).reshape(-1, 1, 2)
    vx, vy, x0, y0 = cv2.fitLine(reshaped, cv2.DIST_L2, 0, 0.01, 0.01).reshape(-1)
    # The normal of direction (vx, vy) is (-vy, vx); c comes from the
    # fitted point (x0, y0) lying on the line.
    a, b = -vy, vx
    return float(a), float(b), float(a * x0 + b * y0)
def _intersect_lines_abc(l1, l2):
a1, b1, c1 = l1
a2, b2, c2 = l2
det = a1 * b2 - a2 * b1
if abs(det) < 1e-9:
return None
x = (c1 * b2 - c2 * b1) / det
y = (a1 * c2 - a2 * c1) / det
return np.array([x, y], dtype=np.float32)
def _expand_corners(corners: np.ndarray, scale: float = 0.06) -> np.ndarray:
corners = corners.astype(np.float32)
center = corners.mean(axis=0, keepdims=True)
return (center + (corners - center) * (1.0 + float(scale))).astype(np.float32)
def _mask_to_full_building_corners(mask01: np.ndarray, band_frac: float = 0.12, expand: float = 0.06) -> np.ndarray:
    """Estimate four building corners from the mask's outer contour.

    A straight line is fitted to the contour points in each boundary band
    (left/right/top/bottom, `band_frac` of the bbox extent); the lines are
    intersected, the quad is expanded by `expand`, and the corners are
    returned in TL/TR/BR/BL order.

    Raises ValueError when the mask is empty, too small, or a band has too
    few points for a stable fit.
    """
    mask01 = _clean_mask(mask01)
    h, w = mask01.shape
    mask255 = np.ascontiguousarray((mask01 * 255).astype(np.uint8))
    contours, _ = cv2.findContours(mask255, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if not contours:
        raise ValueError("Mask is empty (no contours).")
    largest = max(contours, key=cv2.contourArea)
    if cv2.contourArea(largest) < 500:
        raise ValueError("Mask too small to infer corners.")
    pts = largest.reshape(-1, 2).astype(np.float32)
    x_min, y_min = pts.min(axis=0)
    x_max, y_max = pts.max(axis=0)
    span_x = max(float(x_max - x_min), 1.0)
    span_y = max(float(y_max - y_min), 1.0)
    bf = float(band_frac)
    # Contour points falling into each boundary band of the bounding box.
    left_band = pts[pts[:, 0] <= x_min + bf * span_x]
    right_band = pts[pts[:, 0] >= x_max - bf * span_x]
    top_band = pts[pts[:, 1] <= y_min + bf * span_y]
    bottom_band = pts[pts[:, 1] >= y_max - bf * span_y]
    if min(len(left_band), len(right_band), len(top_band), len(bottom_band)) < 30:
        raise ValueError("Not enough contour points for stable corner fitting.")
    L = _fitline_to_abc(left_band)
    R = _fitline_to_abc(right_band)
    T = _fitline_to_abc(top_band)
    B = _fitline_to_abc(bottom_band)
    tl = _intersect_lines_abc(L, T)
    tr = _intersect_lines_abc(R, T)
    br = _intersect_lines_abc(R, B)
    bl = _intersect_lines_abc(L, B)
    if any(p is None for p in (tl, tr, br, bl)):
        raise ValueError("Failed to intersect boundary lines for corners.")
    quad = _expand_corners(np.array([tl, tr, br, bl], dtype=np.float32), scale=expand)
    return _order_points(quad)
def _rectify_by_quad(rgb_img: np.ndarray, mask01: np.ndarray, band_frac=0.12, expand=0.06):
    """Fallback rectification: perspective-warp the building quad to a rectangle.

    Returns (rectified_rgb, rectified_mask01, debug_image); the debug image
    is simply the input RGB.
    """
    tl, tr, br, bl = _mask_to_full_building_corners(mask01, band_frac=band_frac, expand=expand)
    # Output size: the longer of each opposing edge pair, floored at 200 px.
    width = max(int(max(np.linalg.norm(br - bl), np.linalg.norm(tr - tl))), 200)
    height = max(int(max(np.linalg.norm(tr - br), np.linalg.norm(tl - bl))), 200)
    src = np.array([tl, tr, br, bl], dtype=np.float32)
    dst = np.array([[0, 0], [width - 1, 0], [width - 1, height - 1], [0, height - 1]], dtype=np.float32)
    H = cv2.getPerspectiveTransform(src, dst).astype(np.float32)
    bgr = cv2.cvtColor(rgb_img, cv2.COLOR_RGB2BGR)
    warped_bgr, _ = _warp_with_bounds(bgr, H, border_value=(255, 255, 255), interp=cv2.INTER_LINEAR)
    mask255 = (mask01 * 255).astype(np.uint8)
    warped_mask255, _ = _warp_with_bounds(mask255, H, border_value=0, interp=cv2.INTER_NEAREST)
    warped_rgb = cv2.cvtColor(warped_bgr, cv2.COLOR_BGR2RGB)
    warped_mask01 = (warped_mask255 > 0).astype(np.uint8)
    return warped_rgb, warped_mask01, rgb_img
# -------------------------
# Main pipeline
# -------------------------
def straighten_and_chart(
    image_np,
    box_threshold=0.35,
    text_threshold=0.25,  # kept for UI compatibility, not strictly used now
    padding=0.03,
    outline_thickness=3,
    chart_mode="blueprint",
    canny_low=60,
    canny_high=160,
    hough_threshold=80,
    min_line_length=40,
    max_line_gap=8,
    line_thickness=2,
    add_grid=False,
):
    """End-to-end pipeline behind the Gradio UI.

    Steps: detect the building bbox (Grounding DINO) -> pad the bbox ->
    segment it (SAM) -> clean the mask -> rectify the facade
    (vanishing-point method, falling back to contour-quad warp) -> render
    the architectural chart.

    Returns (chart, rectified+outline, original+outline, bbox debug image,
    3-channel mask image). Raises ValueError on missing input or when no
    building is detected.
    """
    if image_np is None:
        raise ValueError("Please upload an image.")
    pil = Image.fromarray(image_np).convert("RGB")
    W, H = pil.size
    rgb_full = np.array(pil)
    box = _detect_building_box(pil, box_threshold=box_threshold, text_threshold=text_threshold)
    x1, y1, x2, y2 = box
    # Pad the bbox by a fraction of its own size, clamped to the image bounds.
    pad_x = float(padding) * (x2 - x1)
    pad_y = float(padding) * (y2 - y1)
    x1 = max(0, x1 - pad_x)
    y1 = max(0, y1 - pad_y)
    x2 = min(W - 1, x2 + pad_x)
    y2 = min(H - 1, y2 + pad_y)
    box = np.array([x1, y1, x2, y2], dtype=np.float32)
    mask01 = _segment_box_mask(pil, box)
    mask01 = _clean_mask(mask01)
    original_outlined = _draw_outline_on_image(image_np, mask01, thickness=int(outline_thickness))
    # Preferred path: vanishing-point rectification; fallback: contour-quad warp.
    rect_rgb, rect_mask01, dbg = _front_facade_rectify(rgb_full, mask01)
    if rect_rgb is None or rect_mask01 is None:
        rect_rgb, rect_mask01, dbg2 = _rectify_by_quad(rgb_full, mask01, band_frac=0.12, expand=0.06)
        dbg = dbg if dbg is not None else dbg2
    straightened_outlined = _draw_outline_on_image(rect_rgb, rect_mask01, thickness=int(outline_thickness))
    chart = architectural_chart(
        rect_rgb,
        mode=str(chart_mode),
        edge1=int(canny_low),
        edge2=int(canny_high),
        hough_threshold=int(hough_threshold),
        min_line_length=int(min_line_length),
        max_line_gap=int(max_line_gap),
        thickness=int(line_thickness),
        add_grid=bool(add_grid),
    )
    # Mask as a 3-channel uint8 image so Gradio can display it.
    mask_rgb = np.stack([mask01 * 255] * 3, axis=-1).astype(np.uint8)
    debug = image_np.copy()
    x1i, y1i, x2i, y2i = map(int, box)
    cv2.rectangle(debug, (x1i, y1i), (x2i, y2i), (255, 255, 255), 2)
    return chart, straightened_outlined, original_outlined, debug, mask_rgb
# Gradio UI: the inputs mirror straighten_and_chart's parameters in order.
demo = gr.Interface(
    fn=straighten_and_chart,
    inputs=[
        gr.Image(type="numpy", label="Upload photo"),
        gr.Slider(0.1, 0.8, value=0.35, step=0.05, label="Box threshold (DINO)"),
        gr.Slider(0.05, 0.6, value=0.25, step=0.05, label="Text threshold (unused, kept for UI)"),
        gr.Slider(0.0, 0.15, value=0.03, step=0.01, label="BBox padding"),
        gr.Slider(1, 12, value=3, step=1, label="Outline thickness"),
        gr.Radio(["blueprint", "black_on_white"], value="blueprint", label="Architectural chart style"),
        gr.Slider(1, 200, value=60, step=1, label="Canny low threshold"),
        gr.Slider(1, 300, value=160, step=1, label="Canny high threshold"),
        gr.Slider(10, 200, value=80, step=1, label="Hough threshold"),
        gr.Slider(10, 400, value=40, step=5, label="Min line length"),
        gr.Slider(0, 50, value=8, step=1, label="Max line gap"),
        gr.Slider(1, 8, value=2, step=1, label="Chart line thickness"),
        gr.Checkbox(value=False, label="Add grid"),
    ],
    # Outputs correspond to straighten_and_chart's 5-tuple return, in order.
    outputs=[
        gr.Image(type="numpy", label="Architectural chart (front façade corrected)"),
        gr.Image(type="numpy", label="Front façade (rectified) + outline"),
        gr.Image(type="numpy", label="Original + outline"),
        gr.Image(type="numpy", label="Debug (bbox)"),
        gr.Image(type="numpy", label="Building mask (SAM)"),
    ],
    title="Auto Building Front-Façade Rectifier + Architectural Chart",
    description=(
        "GroundingDINO + SAM: detect and segment a building, correct off-angle views toward a front façade "
        "using vanishing-point rectification (fallback to contour quad), then generate an architectural chart."
    ),
)
if __name__ == "__main__":
    demo.launch()