"""Gradio demo: vectorize a raster floor plan into labelled room polygons.

Importing/running this module builds the model, loads the checkpoint and
launches the Gradio server (module-level side effects).
"""

import argparse
import copy
import json
import math

import cv2
import gradio as gr
import numpy as np
import torch
from PIL import Image
from shapely.geometry import Polygon

from datasets.discrete_tokenizer import DiscreteTokenizer
from models import build_model
from util.plot_utils import plot_semantic_rich_floorplan_opencv

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Hyper-parameters handed to ``build_model``; ``vocab_size`` is filled in by
# ``load_model`` once the tokenizer exists.
MODEL_ARGS = argparse.Namespace(
    poly2seq=True,
    seq_len=512,
    num_bins=32,
    image_size=256,
    input_channels=3,
    backbone="resnet50",
    dilation=False,
    position_embedding="sine",
    position_embedding_scale=2 * np.pi,
    num_feature_levels=4,
    enc_layers=6,
    dec_layers=6,
    dim_feedforward=1024,
    hidden_dim=256,
    dropout=0.1,
    nheads=8,
    num_queries=800,
    num_polys=20,
    dec_n_points=4,
    enc_n_points=4,
    query_pos_type="sine",
    with_poly_refine=False,
    masked_attn=False,
    semantic_classes=13,
    aux_loss=False,
    dec_attn_concat_src=True,
    pre_decoder_pos_embed=False,
    learnable_dec_pe=False,
    dec_qkv_proj=False,
    per_token_sem_loss=True,
    add_cls_token=False,
    use_anchor=True,
    inject_cls_embed=False,
    device="cuda" if torch.cuda.is_available() else "cpu",
)

# Class id -> human readable label used in the JSON output and the plot.
R2G_LABEL = {
    0: "Living Room",
    1: "Kitchen",
    2: "Bedroom",
    3: "Bathroom",
    4: "Balcony",
    5: "Corridor",
    6: "Dining Room",
    7: "Study",
    8: "Studio",
    9: "Store Room",
    10: "Garden",
    11: "Laundry Room",
    12: "Office",
    13: "Basement",
    14: "Garage",
    15: "Undefined",
    16: "Door",
    17: "Window",
}

# Polygons whose area (in pixel units, after scaling to the image size) is
# below this value are discarded as rooms.
_MIN_ROOM_AREA = 100


def _process_predictions(
    pred_corners,
    i,
    semantic_rich,
    image_size,
    pred_room_label,
    pred_room_logits,
    per_token_sem_loss,
    add_cls_token=False,
):
    """Extract polygons from poly2seq model output.

    Args:
        pred_corners: Per-scene generated sequences; entry ``i`` is iterated
            and mixes ``int`` tokens with corner entries.
        i: Index of the scene to process.
        semantic_rich: When true, also derive a class per polygon and collect
            two-corner polygons as windows/doors.
        image_size: Corners are multiplied by ``image_size - 1`` and rounded
            to integer pixel coordinates.
        pred_room_label: Per-scene label tensor (indexed with ``i``, moved to
            CPU); required when ``semantic_rich`` is true.
        pred_room_logits: Unused; kept so existing callers keep working.
        per_token_sem_loss: When true the polygon class is the most frequent
            label over the polygon's tokens, otherwise the label at the
            polygon's last position.
        add_cls_token: Adds one extra position per polygon when computing
            label offsets.

    Returns:
        Dict with ``room_polys`` and, when ``semantic_rich`` is true,
        ``room_types``, ``window_doors`` and ``window_doors_types`` (these
        three are ``None`` otherwise).

    Raises:
        ValueError: If ``semantic_rich`` is true but ``pred_room_label`` is
            ``None``.
    """
    if semantic_rich and pred_room_label is None:
        raise ValueError(
            "semantic_rich=True requires per-token room labels, but the model "
            "output contained no 'pred_room_logits'"
        )

    room_polys = []
    room_types = []
    window_doors = []
    window_doors_types = []
    scene_labels = pred_room_label[i].cpu().numpy() if semantic_rich else None

    # Split the flat sequence into polygons. Each polygon occupies
    # ``len(corners) + 1 (+ cls)`` label positions; the cumulative sum of
    # those lengths yields each polygon's offset into ``scene_labels``.
    extra_positions = 1 + int(add_cls_token)
    all_room_polys = []
    segment_lengths = [0]
    current = []
    for token in pred_corners[i]:
        if isinstance(token, int):
            # NOTE(review): 2 is presumably the separator token id — confirm
            # against DiscreteTokenizer. Every int token is skipped; only a
            # 2 with pending corners closes the current polygon.
            if token == 2 and current:
                all_room_polys.append(current)
                segment_lengths.append(len(current) + extra_positions)
                current = []
            continue
        current.append(token)
    if current:
        # Sequence ended without a closing token: keep the trailing polygon.
        all_room_polys.append(current)
        segment_lengths.append(len(current) + extra_positions)
    start_poly_indices = np.cumsum(segment_lengths)

    for j, poly in enumerate(all_room_polys):
        if len(poly) < 2:
            continue
        corners = np.array(poly, dtype=np.float32) * (image_size - 1)
        corners = np.around(corners).astype(np.int32)

        if not semantic_rich:
            if len(corners) >= 4 and Polygon(corners).area >= _MIN_ROOM_AREA:
                room_polys.append(corners)
            continue

        begin, end = start_poly_indices[j], start_poly_indices[j + 1]
        if per_token_sem_loss:
            # Majority vote over the polygon's labels, excluding the last
            # position of the segment.
            classes, counts = np.unique(
                scene_labels[begin:end][:-1], return_counts=True
            )
            pred_class = classes[np.argmax(counts)]
        else:
            pred_class = scene_labels[end - 1]

        if len(corners) >= 3 and Polygon(corners).area >= _MIN_ROOM_AREA:
            room_polys.append(corners)
            room_types.append(pred_class)
        elif len(corners) == 2:
            # Two-corner polygons are kept separately as windows/doors.
            window_doors.append(corners)
            window_doors_types.append(pred_class)

    return {
        "room_polys": room_polys,
        "room_types": room_types if semantic_rich else None,
        "window_doors": window_doors if semantic_rich else None,
        "window_doors_types": window_doors_types if semantic_rich else None,
    }


@torch.no_grad()
def generate(model, samples, semantic_rich=False, use_cache=True, per_token_sem_loss=False):
    """Generate room polygons from model predictions (poly2seq mode only).

    Args:
        model: Object exposing ``eval()`` and
            ``forward_inference(samples, use_cache)``.
        samples: Batch of inputs; ``samples[0].size(2)`` is used as the image
            size for scaling corners.
        semantic_rich: Forwarded to ``_process_predictions``.
        use_cache: Forwarded to ``model.forward_inference``.
        per_token_sem_loss: Forwarded to ``_process_predictions``.

    Returns:
        ``{"room": [...], "labels": [...]}`` with one entry per scene. Each
        ``room`` entry is the list of room polygons followed by any
        window/door polygons; the matching ``labels`` entry is the list of
        their classes, or ``None`` when ``semantic_rich`` is false.
    """
    model.eval()
    image_size = samples[0].size(2)

    outputs = model.forward_inference(samples, use_cache)
    pred_corners = outputs["gen_out"]
    batch_size = outputs["pred_logits"].shape[0]

    pred_room_label = None
    pred_room_logits = None
    if "pred_room_logits" in outputs:
        pred_room_logits = outputs["pred_room_logits"]
        prob = torch.nn.functional.softmax(pred_room_logits, -1)
        # The last class channel is excluded before taking the best class.
        _, pred_room_label = prob[..., :-1].max(-1)

    result_rooms = []
    result_classes = []
    for i in range(batch_size):
        scene = _process_predictions(
            pred_corners,
            i,
            semantic_rich,
            image_size,
            pred_room_label,
            pred_room_logits,
            per_token_sem_loss,
        )
        room_polys = scene["room_polys"]
        room_types = scene["room_types"]
        window_doors = scene["window_doors"]
        if window_doors:
            result_rooms.append(room_polys + window_doors)
            result_classes.append(room_types + scene["window_doors_types"])
        else:
            result_rooms.append(room_polys)
            result_classes.append(room_types)

    return {"room": result_rooms, "labels": result_classes}


def load_model():
    """Build the model from ``MODEL_ARGS`` and load the EMA checkpoint weights.

    Sets ``MODEL_ARGS.vocab_size`` from the tokenizer, moves the model to
    ``DEVICE``, freezes all parameters and switches to eval mode.

    Returns:
        The model returned by ``build_model`` with weights loaded.
    """
    tokenizer = DiscreteTokenizer(
        MODEL_ARGS.num_bins, MODEL_ARGS.seq_len, add_cls=MODEL_ARGS.add_cls_token
    )
    MODEL_ARGS.vocab_size = len(tokenizer)

    model = build_model(MODEL_ARGS, train=False, tokenizer=tokenizer)
    model.to(DEVICE)

    ckpt_path = "checkpoints/r2g_res256_ep0849.pth"
    # NOTE(review): torch.load unpickles the file; only load checkpoints from
    # a trusted source (consider ``weights_only=True`` if the checkpoint
    # contents allow it).
    checkpoint = torch.load(ckpt_path, map_location="cpu")
    ckpt_state_dict = copy.deepcopy(checkpoint["ema"])

    # Strip the "module." prefix from keys that carry it.
    prefix = "module."
    for key in list(ckpt_state_dict.keys()):
        if key.startswith(prefix):
            ckpt_state_dict[key[len(prefix):]] = ckpt_state_dict.pop(key)

    # strict=False tolerates key mismatches; report them instead of silently
    # running with partially initialised weights.
    load_result = model.load_state_dict(ckpt_state_dict, strict=False)
    missing = getattr(load_result, "missing_keys", None)
    unexpected = getattr(load_result, "unexpected_keys", None)
    if missing:
        print(f"Warning: {len(missing)} missing keys in checkpoint: {missing}")
    if unexpected:
        print(f"Warning: {len(unexpected)} unexpected keys in checkpoint: {unexpected}")

    for param in model.parameters():
        param.requires_grad = False
    model.eval()
    return model


print("Loading model...")
MODEL = load_model()
print("Model loaded.")


def preprocess_image(pil_image: Image.Image) -> torch.Tensor:
    """Resize preserving aspect ratio + pad to (image_size, image_size).

    The image is converted to RGB, resized with bicubic interpolation so its
    longer side fits ``MODEL_ARGS.image_size``, centred on a white canvas and
    returned as a float32 ``(3, image_size, image_size)`` tensor in [0, 1].
    """
    target = MODEL_ARGS.image_size
    image_np = np.array(pil_image.convert("RGB"))
    h, w = image_np.shape[:2]

    scale = min(target / h, target / w)
    # Clamp to >= 1 px: for extreme aspect ratios the short side would
    # otherwise truncate to 0 and cv2.resize would raise.
    new_h = max(1, int(h * scale))
    new_w = max(1, int(w * scale))
    resized = cv2.resize(image_np, (new_w, new_h), interpolation=cv2.INTER_CUBIC)

    padded = np.full((target, target, 3), 255, dtype=np.uint8)
    top = (target - new_h) // 2
    left = (target - new_w) // 2
    padded[top:top + new_h, left:left + new_w] = resized

    # HWC uint8 -> CHW float32 in [0, 1].
    tensor = padded.transpose((2, 0, 1)).astype(np.float32) / 255.0
    return torch.as_tensor(tensor)


def predict_floorplan(image: Image.Image):
    """Gradio callback: run the model on one image.

    Returns:
        ``(visualization, polygons)`` where ``visualization`` is a PIL image
        (or ``None`` when nothing could be drawn) and ``polygons`` is a list
        of ``{"label", "label_id", "polygon"}`` dicts. When ``image`` is
        ``None`` the result is ``(None, <JSON string with an "error" key>)``.
    """
    if image is None:
        return None, json.dumps({"error": "No image provided"})

    input_tensor = preprocess_image(image).unsqueeze(0).to(DEVICE)
    outputs = generate(
        MODEL,
        input_tensor,
        semantic_rich=MODEL_ARGS.semantic_classes > 0,
        use_cache=True,
        per_token_sem_loss=MODEL_ARGS.per_token_sem_loss,
    )

    pred_rooms = outputs["room"][0]
    pred_labels = outputs["labels"][0]
    image_size = MODEL_ARGS.image_size

    if pred_labels is None:
        # Non-semantic output: mark every polygon with the placeholder id -1.
        pred_labels = [-1] * len(pred_rooms)

    result_polygons = [
        {
            "label": R2G_LABEL.get(int(label), "Unknown"),
            "label_id": int(label),
            "polygon": poly.astype(float).tolist(),
        }
        for poly, label in zip(pred_rooms, pred_labels)
    ]

    floorplan_map = plot_semantic_rich_floorplan_opencv(
        zip(pred_rooms, pred_labels),
        None,
        door_window_index=[],
        semantics_label_mapping=R2G_LABEL,
        plot_text=True,
        one_color=False,
        is_sem=True,
        img_w=image_size * 2,
        img_h=image_size * 2,
        scale=2,
    )

    if floorplan_map is not None and floorplan_map.size > 0:
        # The plot is converted from BGR to RGB channel order for PIL.
        floorplan_rgb = cv2.cvtColor(floorplan_map, cv2.COLOR_BGR2RGB)
        vis_image = Image.fromarray(floorplan_rgb)
    else:
        vis_image = None

    return vis_image, result_polygons


demo = gr.Interface(
    fn=predict_floorplan,
    inputs=gr.Image(type="pil", label="Floor Plan Image"),
    outputs=[
        gr.Image(type="pil", label="Detected Rooms"),
        gr.JSON(label="Detected Polygons"),
    ],
    title="Raster2Seq - Floor Plan Vectorization",
    description="Upload a floor plan image to detect room polygons and their semantic labels. Returns both a visualization and structured JSON with polygon coordinates.",
)

demo.launch(server_name="0.0.0.0", server_port=7860)