Spaces:
Runtime error
Runtime error
| import argparse | |
| import copy | |
| import json | |
| import math | |
| import cv2 | |
| import gradio as gr | |
| import numpy as np | |
| import torch | |
| from PIL import Image | |
| from shapely.geometry import Polygon | |
| from datasets.discrete_tokenizer import DiscreteTokenizer | |
| from models import build_model | |
| from util.plot_utils import plot_semantic_rich_floorplan_opencv | |
# Use the GPU when torch can see one, otherwise run on the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Fixed configuration namespace consumed by DiscreteTokenizer / build_model
# in load_model() and read again at request time (image_size,
# semantic_classes, per_token_sem_loss).
MODEL_ARGS = argparse.Namespace(
    # Sequence / input settings.
    poly2seq=True,
    seq_len=512,
    num_bins=32,
    image_size=256,
    input_channels=3,
    # Backbone and positional embedding.
    backbone="resnet50",
    dilation=False,
    position_embedding="sine",
    position_embedding_scale=2 * np.pi,
    num_feature_levels=4,
    # Transformer dimensions.
    enc_layers=6,
    dec_layers=6,
    dim_feedforward=1024,
    hidden_dim=256,
    dropout=0.1,
    nheads=8,
    num_queries=800,
    num_polys=20,
    dec_n_points=4,
    enc_n_points=4,
    query_pos_type="sine",
    # Feature switches.
    with_poly_refine=False,
    masked_attn=False,
    semantic_classes=13,
    aux_loss=False,
    dec_attn_concat_src=True,
    pre_decoder_pos_embed=False,
    learnable_dec_pe=False,
    dec_qkv_proj=False,
    per_token_sem_loss=True,
    add_cls_token=False,
    use_anchor=True,
    inject_cls_embed=False,
    # Same device choice as DEVICE, stored as a plain string.
    device=DEVICE.type,
)
# Class id -> display name. The id of each entry is its position in the list.
R2G_LABEL = dict(enumerate([
    "Living Room",   # 0
    "Kitchen",       # 1
    "Bedroom",       # 2
    "Bathroom",      # 3
    "Balcony",       # 4
    "Corridor",      # 5
    "Dining Room",   # 6
    "Study",         # 7
    "Studio",        # 8
    "Store Room",    # 9
    "Garden",        # 10
    "Laundry Room",  # 11
    "Office",        # 12
    "Basement",      # 13
    "Garage",        # 14
    "Undefined",     # 15
    "Door",          # 16
    "Window",        # 17
]))
| def _process_predictions( | |
| pred_corners, i, semantic_rich, image_size, pred_room_label, | |
| pred_room_logits, per_token_sem_loss, add_cls_token=False, | |
| ): | |
| """Extract polygons from poly2seq model output.""" | |
| np_softmax = lambda x: np.exp(x) / np.sum(np.exp(x), axis=-1, keepdims=True) | |
| pred_corners_per_scene = pred_corners[i] | |
| room_polys = [] | |
| if semantic_rich: | |
| room_types = [] | |
| window_doors = [] | |
| window_doors_types = [] | |
| pred_room_label_per_scene = pred_room_label[i].cpu().numpy() | |
| pred_room_logit_per_scene = pred_room_logits[i].cpu().numpy() | |
| all_room_polys = [] | |
| tmp = [] | |
| all_length_list = [0] | |
| for j in range(len(pred_corners_per_scene)): | |
| if isinstance(pred_corners_per_scene[j], int): | |
| if pred_corners_per_scene[j] == 2 and tmp: | |
| all_room_polys.append(tmp) | |
| all_length_list.append(len(tmp) + 1 + add_cls_token) | |
| tmp = [] | |
| continue | |
| tmp.append(pred_corners_per_scene[j]) | |
| if len(tmp): | |
| all_room_polys.append(tmp) | |
| all_length_list.append(len(tmp) + 1 + add_cls_token) | |
| start_poly_indices = np.cumsum(all_length_list) | |
| final_pred_classes = [] | |
| for j, poly in enumerate(all_room_polys): | |
| if len(poly) < 2: | |
| continue | |
| corners = np.array(poly, dtype=np.float32) * (image_size - 1) | |
| corners = np.around(corners).astype(np.int32) | |
| if not semantic_rich: | |
| if len(corners) >= 4 and Polygon(corners).area >= 100: | |
| room_polys.append(corners) | |
| else: | |
| if per_token_sem_loss: | |
| pred_classes, counts = np.unique( | |
| pred_room_label_per_scene[start_poly_indices[j]:start_poly_indices[j + 1]][:-1], | |
| return_counts=True, | |
| ) | |
| pred_class = pred_classes[np.argmax(counts)] | |
| else: | |
| pred_class = pred_room_label_per_scene[start_poly_indices[j + 1] - 1] | |
| final_pred_classes.append(pred_class) | |
| if len(corners) >= 3 and Polygon(corners).area >= 100: | |
| room_polys.append(corners) | |
| room_types.append(pred_class) | |
| elif len(corners) == 2: | |
| window_doors.append(corners) | |
| window_doors_types.append(pred_class) | |
| if not semantic_rich: | |
| pred_room_label_per_scene = len(all_room_polys) * [-1] | |
| return { | |
| "room_polys": room_polys, | |
| "room_types": room_types if semantic_rich else None, | |
| "window_doors": window_doors if semantic_rich else None, | |
| "window_doors_types": window_doors_types if semantic_rich else None, | |
| } | |
def generate(model, samples, semantic_rich=False, use_cache=True,
             per_token_sem_loss=False, add_cls_token=False):
    """Generate room polygons from model predictions (poly2seq mode only).

    Args:
        model: Network exposing ``forward_inference(samples, use_cache)``
            that returns a mapping with ``gen_out``, ``pred_logits`` and,
            optionally, ``pred_room_logits``.
        samples: Batched input; ``samples[0].size(2)`` is taken as the image
            size used to scale the predicted corners.
        semantic_rich: Also predict a class per polygon. Requires the model
            output to contain ``pred_room_logits``.
        use_cache: Forwarded to ``forward_inference``.
        per_token_sem_loss: Forwarded to ``_process_predictions``.
        add_cls_token: Forwarded to ``_process_predictions``; the default
            keeps the previous behaviour.

    Returns:
        ``{"room": [...], "labels": [...]}`` with one entry per scene. Each
        ``room`` entry lists room polygons followed by window/door segments;
        each ``labels`` entry holds the matching classes, or ``None`` when
        ``semantic_rich`` is false.

    Raises:
        ValueError: ``semantic_rich`` was requested but the model returned
            no ``pred_room_logits``.
    """
    model.eval()
    image_size = samples[0].size(2)
    # Pure inference: no autograd bookkeeping needed.
    with torch.no_grad():
        outputs = model.forward_inference(samples, use_cache)
    pred_corners = outputs["gen_out"]
    bs = outputs["pred_logits"].shape[0]

    pred_room_label = None
    pred_room_logits = None
    if "pred_room_logits" in outputs:
        pred_room_logits = outputs["pred_room_logits"]
        prob = torch.nn.functional.softmax(pred_room_logits, -1)
        # The last class is excluded when picking the label.
        _, pred_room_label = prob[..., :-1].max(-1)
    elif semantic_rich:
        raise ValueError(
            "semantic_rich=True requires 'pred_room_logits' in the model output"
        )

    result_rooms = []
    result_classes = []
    for i in range(bs):
        scene_outputs = _process_predictions(
            pred_corners, i, semantic_rich, image_size,
            pred_room_label, pred_room_logits, per_token_sem_loss,
            add_cls_token=add_cls_token,
        )
        room_polys = scene_outputs["room_polys"]
        room_types = scene_outputs["room_types"]
        window_doors = scene_outputs["window_doors"]
        window_doors_types = scene_outputs["window_doors_types"]
        if window_doors:
            # Windows/doors are appended after the rooms, classes in step.
            result_rooms.append(room_polys + window_doors)
            result_classes.append(room_types + window_doors_types)
        else:
            result_rooms.append(room_polys)
            result_classes.append(room_types)
    return {"room": result_rooms, "labels": result_classes}
def load_model():
    """Build the network, load the EMA checkpoint weights and freeze it.

    Side effects: sets ``MODEL_ARGS.vocab_size`` from the tokenizer length and
    reads ``checkpoints/r2g_res256_ep0849.pth`` from disk.

    Returns:
        The model on ``DEVICE``, in eval mode, with gradients disabled on
        every parameter.
    """
    tokenizer = DiscreteTokenizer(
        MODEL_ARGS.num_bins, MODEL_ARGS.seq_len, add_cls=MODEL_ARGS.add_cls_token
    )
    MODEL_ARGS.vocab_size = len(tokenizer)
    model = build_model(MODEL_ARGS, train=False, tokenizer=tokenizer)
    model.to(DEVICE)

    ckpt_path = "checkpoints/r2g_res256_ep0849.pth"
    # NOTE(review): torch.load unpickles the file; recent PyTorch releases
    # default to weights_only=True, which may reject checkpoints that store
    # non-tensor objects - confirm against the pinned torch version.
    checkpoint = torch.load(ckpt_path, map_location="cpu")

    # Shallow copy is enough: only keys are renamed, tensors are not modified.
    ckpt_state_dict = dict(checkpoint["ema"])
    for key in list(ckpt_state_dict.keys()):
        if key.startswith("module."):
            # Drop the "module." prefix from the key.
            ckpt_state_dict[key[len("module."):]] = ckpt_state_dict.pop(key)

    # strict=False tolerates key mismatches; report them instead of hiding them.
    load_result = model.load_state_dict(ckpt_state_dict, strict=False)
    if load_result.missing_keys:
        print(f"Checkpoint is missing {len(load_result.missing_keys)} key(s): "
              f"{load_result.missing_keys}")
    if load_result.unexpected_keys:
        print(f"Checkpoint has {len(load_result.unexpected_keys)} unexpected key(s): "
              f"{load_result.unexpected_keys}")

    for param in model.parameters():
        param.requires_grad = False
    model.eval()
    return model
# Load the model once at module level; predict_floorplan reuses this instance.
print("Loading model...")
MODEL = load_model()
print("Model loaded.")
def preprocess_image(pil_image: Image.Image) -> torch.Tensor:
    """Resize preserving aspect ratio + pad to (image_size, image_size).

    The image is converted to RGB, scaled so its longer side equals
    ``MODEL_ARGS.image_size``, centred on a white square canvas and returned
    as a float32 tensor in channel-first layout with values in [0, 1].

    Args:
        pil_image: Input image in any PIL mode.

    Returns:
        Tensor of shape ``(3, image_size, image_size)``.
    """
    target = MODEL_ARGS.image_size
    image_np = np.array(pil_image.convert("RGB"))
    h, w = image_np.shape[:2]
    scale = min(target / h, target / w)
    # Clamp to 1 px: for very elongated images the short side would otherwise
    # truncate to 0 and cv2.resize would fail.
    new_h = max(1, int(h * scale))
    new_w = max(1, int(w * scale))
    resized = cv2.resize(image_np, (new_w, new_h), interpolation=cv2.INTER_CUBIC)

    # White canvas with the resized image centred on it.
    padded = np.full((target, target, 3), 255, dtype=np.uint8)
    top = (target - new_h) // 2
    left = (target - new_w) // 2
    padded[top:top + new_h, left:left + new_w] = resized

    # HWC uint8 -> CHW float32 in [0, 1].
    tensor = padded.transpose((2, 0, 1)).astype(np.float32) / 255.0
    return torch.as_tensor(tensor)
def predict_floorplan(image: Image.Image):
    """Run the model on one image and return (visualisation, polygons).

    Args:
        image: Uploaded PIL image, or ``None`` when nothing was provided.

    Returns:
        A pair ``(vis_image, payload)``. ``vis_image`` is a PIL image of the
        rendered floor plan, or ``None`` when no input was given or nothing
        was rendered. ``payload`` is a list of
        ``{"label", "label_id", "polygon"}`` dicts, or a JSON error string
        when ``image`` is ``None``. Polygon coordinates are the values
        produced by ``generate`` for the preprocessed (resized and padded)
        input.
    """
    if image is None:
        return None, json.dumps({"error": "No image provided"})

    batch = preprocess_image(image).unsqueeze(0).to(DEVICE)
    outputs = generate(
        MODEL,
        batch,
        semantic_rich=MODEL_ARGS.semantic_classes > 0,
        use_cache=True,
        per_token_sem_loss=MODEL_ARGS.per_token_sem_loss,
    )

    # Single-image batch: take the first (only) scene.
    pred_rooms = outputs["room"][0]
    pred_labels = outputs["labels"][0]
    if pred_labels is None:
        # No semantic head output: mark every polygon with -1.
        pred_labels = [-1] * len(pred_rooms)

    result_polygons = [
        {
            "label": R2G_LABEL.get(int(label), "Unknown"),
            "label_id": int(label),
            "polygon": poly.astype(float).tolist(),
        }
        for poly, label in zip(pred_rooms, pred_labels)
    ]

    # Render at twice the model resolution.
    canvas_side = MODEL_ARGS.image_size * 2
    floorplan_map = plot_semantic_rich_floorplan_opencv(
        zip(pred_rooms, pred_labels),
        None,
        door_window_index=[],
        semantics_label_mapping=R2G_LABEL,
        plot_text=True,
        one_color=False,
        is_sem=True,
        img_w=canvas_side,
        img_h=canvas_side,
        scale=2,
    )

    if floorplan_map is None or floorplan_map.size <= 0:
        return None, result_polygons

    # The plot is converted BGR -> RGB before handing it to PIL.
    floorplan_rgb = cv2.cvtColor(floorplan_map, cv2.COLOR_BGR2RGB)
    return Image.fromarray(floorplan_rgb), result_polygons
# Gradio UI: one image in, rendered floor plan plus polygon JSON out.
demo = gr.Interface(
    fn=predict_floorplan,
    inputs=gr.Image(type="pil", label="Floor Plan Image"),
    outputs=[
        gr.Image(type="pil", label="Detected Rooms"),
        gr.JSON(label="Detected Polygons"),
    ],
    title="Raster2Seq - Floor Plan Vectorization",
    description=(
        "Upload a floor plan image to detect room polygons and their semantic labels. "
        "Returns both a visualization and structured JSON with polygon coordinates."
    ),
)

# Serve on all interfaces, port 7860.
demo.launch(server_name="0.0.0.0", server_port=7860)