""" Semantic Segmentation — Pixel-level classification with DeepLabV3 Courses: 100 ch3, 360 ch4 """ import numpy as np import torch import torchvision.models.segmentation as seg_models import torchvision.transforms as T import gradio as gr from PIL import Image device = torch.device("cpu") # Load DeepLabV3 with MobileNetV3 backbone (lightweight) model = seg_models.deeplabv3_mobilenet_v3_large( weights=seg_models.DeepLabV3_MobileNet_V3_Large_Weights.DEFAULT ).eval().to(device) preprocess = T.Compose([ T.ToTensor(), T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), ]) # PASCAL VOC class names (21 classes) CLASS_NAMES = [ "background", "aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat", "chair", "cow", "dining table", "dog", "horse", "motorbike", "person", "potted plant", "sheep", "sofa", "train", "tv/monitor", ] # Color palette for each class PALETTE = np.array([ [0, 0, 0], # background [128, 0, 0], # aeroplane [0, 128, 0], # bicycle [128, 128, 0], # bird [0, 0, 128], # boat [128, 0, 128], # bottle [0, 128, 128], # bus [128, 128, 128], # car [64, 0, 0], # cat [192, 0, 0], # chair [64, 128, 0], # cow [192, 128, 0], # dining table [64, 0, 128], # dog [192, 0, 128], # horse [64, 128, 128], # motorbike [192, 128, 128], # person [0, 64, 0], # potted plant [128, 64, 0], # sheep [0, 192, 0], # sofa [128, 192, 0], # train [0, 64, 128], # tv/monitor ], dtype=np.uint8) def segment(image: Image.Image, display_mode: str): if image is None: return None, None, "" img = image.convert("RGB") w, h = img.size # Inference inp = preprocess(img).unsqueeze(0).to(device) with torch.no_grad(): output = model(inp)["out"] pred = output.argmax(1).squeeze().cpu().numpy() # Resize prediction to original size pred_resized = np.array( Image.fromarray(pred.astype(np.uint8)).resize((w, h), Image.NEAREST) ) # Color segmentation map seg_color = PALETTE[pred_resized] # Overlay img_np = np.array(img) overlay = (img_np * 0.5 + seg_color * 0.5).astype(np.uint8) # Detected classes unique_classes = np.unique(pred_resized) detected = [CLASS_NAMES[c] for c in unique_classes if c != 0] legend = "**Detected classes:**\n\n" for c in unique_classes: if c == 0: continue color = PALETTE[c] pixel_pct = np.sum(pred_resized == c) / pred_resized.size * 100 color_hex = f"#{color[0]:02x}{color[1]:02x}{color[2]:02x}" legend += f"- ██ {CLASS_NAMES[c]} ({pixel_pct:.1f}%)\n" if not detected: legend += "- No objects detected (background only)" if display_mode == "Overlay": return overlay, seg_color, legend elif display_mode == "Segmentation Only": return seg_color, seg_color, legend else: # Side by Side return overlay, seg_color, legend with gr.Blocks(title="Semantic Segmentation") as demo: gr.Markdown( "# Semantic Segmentation\n" "Upload an image to see pixel-level classification (21 PASCAL VOC classes).\n" "*Courses: 100 Deep Learning ch3, 360 Autonomous Driving ch4*" ) with gr.Row(): with gr.Column(scale=1): input_image = gr.Image(type="pil", label="Upload Image") mode = gr.Radio( ["Overlay", "Segmentation Only", "Side by Side"], value="Overlay", label="Display Mode", ) btn = gr.Button("Segment", variant="primary") with gr.Column(scale=2): with gr.Row(): overlay_out = gr.Image(label="Result") seg_out = gr.Image(label="Segmentation Map") legend_md = gr.Markdown() btn.click(segment, [input_image, mode], [overlay_out, seg_out, legend_md]) gr.Examples( examples=[ ["examples/street.jpg", "Overlay"], ["examples/room.jpg", "Side by Side"], ], inputs=[input_image, mode], ) if __name__ == "__main__": demo.launch()