# Source: app.py from a Hugging Face repository, uploaded by Ayesha-Majeed ("Upload app.py").
# Commit 269676c (verified) - file size 43.7 kB.
import gradio as gr
import numpy as np
import cv2
import time
import torch
import warnings
import os
import zipfile
from PIL import Image
import random
warnings.filterwarnings("ignore")
# ═══════════════════════════════════════════════════════════════════════════════
# STEP 1: Extract any .zip files in current directory
# ═══════════════════════════════════════════════════════════════════════════════
# Startup banner, then unpack every .zip archive found in the working directory.
print("=" * 60)
print(f"[STARTUP] Working dir: {os.getcwd()}")
for entry in os.listdir("."):
    if not entry.endswith(".zip"):
        continue
    try:
        with zipfile.ZipFile(entry, 'r') as archive:
            archive.extractall(".")
    except Exception as err:
        # Best effort: a broken archive is reported but does not stop startup.
        print(f"[ZIP] ERROR: {err}")
    else:
        print(f"[ZIP] Extracted {entry} OK!")
# ═══════════════════════════════════════════════════════════════════════════════
# STEP 2: Copy images to root
# ═══════════════════════════════════════════════════════════════════════════════
def prepare_clean_examples(src_folder, prefix, limit=10):
    """Copy up to ``limit`` images found under ``src_folder`` into the working directory.

    The folder is walked recursively. Every file whose name ends in a known
    image extension is copied (byte-for-byte, no re-encoding) to
    ``"{prefix}_{n}.jpg"``, where ``n`` counts successful copies from 0.

    Args:
        src_folder: Directory to search. A missing path yields an empty list.
        prefix: Filename prefix for the copies.
        limit: Maximum number of files to copy. Values <= 0 copy nothing.

    Returns:
        list[str]: Names of the copied files, in copy order.
    """
    import shutil  # imported once here instead of once per copied file

    results = []
    # Previously the limit was only checked after a copy, so limit=0 still copied one file.
    if limit <= 0 or not os.path.exists(src_folder):
        return results

    image_exts = ('.jpg', '.jpeg', '.png', '.bmp', '.webp')
    for root, dirs, files in os.walk(src_folder):
        dirs.sort()  # make the traversal order (and thus the chosen examples) deterministic
        for fname in sorted(files):
            if not fname.lower().endswith(image_exts):
                continue
            src_path = os.path.join(root, fname)
            dst_name = f"{prefix}_{len(results)}.jpg"
            try:
                shutil.copy2(src_path, dst_name)
            except Exception as e:
                # Skip unreadable files; the index is not consumed by a failed copy.
                print(f"Error copying {src_path}: {e}")
                continue
            results.append(dst_name)
            if len(results) >= limit:
                return results
    return results
# Example gallery images: use the first candidate folder that exists; if that
# yields nothing, fall back to a single bundled "car.jpeg" when present.
_example_folders = ("test car windows", "test_car_windows", "test car windows segmentation")
_first_existing = next((name for name in _example_folders if os.path.exists(name)), None)
if _first_existing is None:
    mirror_examples = []
else:
    mirror_examples = prepare_clean_examples(_first_existing, "mirror", limit=15)
if os.path.exists("car.jpeg") and not mirror_examples:
    mirror_examples = ["car.jpeg"]
# ═══════════════════════════════════════════════════════════════════════════════
# Global Settings
# ═══════════════════════════════════════════════════════════════════════════════
# Torch device string: "cuda" when a CUDA device is available, otherwise "cpu".
# The model runners below move their models/tensors to it via `.to(DEVICE)`.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Confidence threshold passed as `conf=` to the YOLO model calls.
CONF = 0.45
def apply_mask_overlay(img_rgb, mask_bool, color=(0, 215, 255), alpha=0.4):
    """Highlight the masked region of an image and dim everything else.

    Inside the mask the image is tinted with ``color`` and outlined along the
    external contours; outside the mask the image is darkened to half brightness.

    Args:
        img_rgb: Image array of shape (H, W, 3); assumed uint8 RGB as produced
            by the Gradio image input (TODO confirm for other callers).
        mask_bool: (H, W) mask. Any non-zero value counts as foreground, so both
            boolean arrays and 0/1 integer arrays are accepted.
        color: Tint and outline colour, in the same channel order as ``img_rgb``.
        alpha: Weight of the tint in the blend (0 = no tint, 1 = solid colour).

    Returns:
        A new array with the same shape as ``img_rgb``; the input is not modified.
    """
    # Coerce to a real boolean mask. Some callers pass 0/1 uint8 arrays, and
    # indexing with an integer array would select image rows 0 and 1 instead of
    # masking pixels (wrong result and extremely slow).
    mask = np.asarray(mask_bool).astype(bool)

    # 1. Darken the background (50% brightness, no blur)
    dark_bg = cv2.addWeighted(img_rgb, 0.5, np.zeros_like(img_rgb), 0.5, 0)

    # 2. For the mask area, keep original brightness and tint it
    tinted_sharp = img_rgb.copy()
    tinted_sharp[mask] = color
    tinted_sharp = cv2.addWeighted(tinted_sharp, alpha, img_rgb, 1 - alpha, 0)

    # 3. Outline the mask; drawn on the tinted layer so only the part that lies
    #    inside the mask survives the final composite.
    mask_img = mask.astype(np.uint8) * 255
    contours, _ = cv2.findContours(mask_img, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    cv2.drawContours(tinted_sharp, contours, -1, color, 2, cv2.LINE_AA)

    # 4. Combine: dark background outside, tinted object + boundary inside
    return np.where(mask[:, :, None], tinted_sharp, dark_bg)
def draw_boxes(img_rgb, boxes, labels, color=(0, 215, 255)):
    """Draw each box as a thin rectangle with HUD-style corner brackets.

    Works on a copy of ``img_rgb``. ``labels`` is only paired with ``boxes``
    (iteration stops at the shorter of the two); no text is rendered, so the
    segmentation masks stay unobstructed.

    Args:
        img_rgb: Image to annotate.
        boxes: Iterable of ``(x1, y1, x2, y2)`` boxes; coordinates are cast to int.
        labels: One entry per box (not drawn).
        color: Line colour.

    Returns:
        The annotated copy of the image.
    """
    canvas = img_rgb.copy()
    bracket_thickness = 3
    for box, _label in zip(boxes, labels):
        left, top, right, bottom = (int(v) for v in box)

        # Faint full rectangle
        cv2.rectangle(canvas, (left, top), (right, bottom), color, 1)

        # Bracket arm length: 15% of the shorter box side
        arm = int(min(right - left, bottom - top) * 0.15)

        # (corner x, corner y, horizontal direction, vertical direction) in the
        # order top-left, top-right, bottom-left, bottom-right.
        corners = (
            (left, top, 1, 1),
            (right, top, -1, 1),
            (left, bottom, 1, -1),
            (right, bottom, -1, -1),
        )
        for corner_x, corner_y, step_x, step_y in corners:
            cv2.line(canvas, (corner_x, corner_y), (corner_x + step_x * arm, corner_y),
                     color, bracket_thickness, cv2.LINE_AA)
            cv2.line(canvas, (corner_x, corner_y), (corner_x, corner_y + step_y * arm),
                     color, bracket_thickness, cv2.LINE_AA)
    return canvas
# ═══════════════════════════════════════════════════════════════════════════════
# Morphological post-processing helper
# ═══════════════════════════════════════════════════════════════════════════════
def apply_morphology(mask_uint8, close_k=15, open_k=7):
    """Clean a binary mask: closing (fills holes) followed by opening (removes tiny blobs).

    Args:
        mask_uint8: Binary mask image passed to ``cv2.morphologyEx``.
        close_k: Diameter of the elliptical kernel used for closing.
        open_k: Diameter of the elliptical kernel used for opening.

    Returns:
        The cleaned mask.
    """
    def _ellipse(size):
        return cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (size, size))

    holes_filled = cv2.morphologyEx(mask_uint8, cv2.MORPH_CLOSE, _ellipse(close_k))
    return cv2.morphologyEx(holes_filled, cv2.MORPH_OPEN, _ellipse(open_k))
# ═══════════════════════════════════════════════════════════════════════════════
# Model Functions
# ═══════════════════════════════════════════════════════════════════════════════
def run_yolo_generic(img_rgb, model_path, target_classes, color, morph_cleanup=False):
    """Run a YOLO segmentation model and merge the masks of the wanted classes.

    Args:
        img_rgb: Input image array (H, W, 3).
        model_path: Path of the YOLO weights to load.
        target_classes: Class ids to keep; other detections are ignored.
        color: Overlay / box colour.
        morph_cleanup: When true, each instance mask is cleaned with
            ``apply_morphology`` before being merged.

    Returns:
        ``(overlay, mask, stats)``: the annotated image, the merged mask as a
        uint8 0/255 image, and a one-line stats string. The reported time covers
        model loading plus inference.
    """
    from ultralytics import YOLO

    started = time.time()
    # retina_masks=True requests masks at the original image resolution
    predictions = YOLO(model_path)(img_rgb, conf=CONF, verbose=False, retina_masks=True)
    elapsed = time.time() - started
    detection = predictions[0]

    height, width = img_rgb.shape[:2]
    union_mask = np.zeros((height, width), dtype=np.uint8)
    kept_boxes, kept_labels = [], []

    if detection.masks is not None:
        instances = zip(
            detection.masks.data, detection.boxes.xyxy,
            detection.boxes.cls, detection.boxes.conf,
        )
        for inst_mask, inst_box, inst_cls, inst_conf in instances:
            if int(inst_cls) in target_classes:
                instance = inst_mask.cpu().numpy().astype(np.uint8)
                # Morphology is applied per instance on purpose: applying it to
                # the merged mask would bridge the pillars between windows.
                if morph_cleanup:
                    instance = apply_morphology(instance)
                union_mask |= instance
                kept_boxes.append(inst_box.cpu().tolist())
                kept_labels.append(f"glass {inst_conf:.2f}")

    suffix = " | Morphology: ON ✅" if morph_cleanup else ""
    overlay = apply_mask_overlay(img_rgb, union_mask > 0, color=color)
    overlay = draw_boxes(overlay, kept_boxes, kept_labels, color=color)
    binary = (union_mask * 255).astype(np.uint8)
    return overlay, binary, f"Found: {len(kept_boxes)} | Inference Time: {elapsed:.2f}s{suffix}"
def _sam_low_res_mask_prompt(mask, side=256):
    """Convert a full-resolution boolean mask into SAM's ``mask_input`` layout.

    SAM scales an image so its longest side fills the square model input and
    pads the remainder on the bottom/right. The low-resolution mask prompt
    lives in that same frame, so the mask is resized with its aspect ratio
    preserved and the padding is filled with "background" logits.

    Returns:
        float32 array of shape (1, side, side) holding +30 / -30 logits.
    """
    h, w = mask.shape[:2]
    scale = side / max(h, w)
    new_h = max(int(h * scale + 0.5), 1)
    new_w = max(int(w * scale + 0.5), 1)
    resized = cv2.resize(mask.astype(np.float32), (new_w, new_h), interpolation=cv2.INTER_NEAREST)
    prompt = np.zeros((1, side, side), dtype=np.float32)
    prompt[:] = -30.0
    prompt[0, :new_h, :new_w] = np.where(resized > 0.5, 30.0, -30.0)
    return prompt


def run_sam_strategy(img_rgb, yolo_model_path, target_classes, color, strategy, morph_cleanup=False):
    """Refine YOLO detections with SAM (ViT-B) using one of three prompting strategies.

    Strategies:
        1: YOLO box + 5 foreground points (box centre and the four corners inset by 5 px).
        2: YOLO box + 5 foreground points sampled inside the YOLO mask
           (centroid plus points 60% of the way towards the mask's extreme pixels).
        3: YOLO box + the YOLO mask as a dense mask prompt; the SAM result is
           then filled along its external contours.
        Any other value produces an empty mask for each detection.

    The SAM checkpoint is downloaded to the working directory on first use.

    Args:
        img_rgb: Input image array (H, W, 3).
        yolo_model_path: Path of the YOLO weights used for detection.
        target_classes: YOLO class ids to keep.
        color: Overlay / box colour.
        strategy: Prompting strategy id (see above).
        morph_cleanup: When true, each SAM mask is cleaned with ``apply_morphology``.

    Returns:
        ``(overlay, mask, stats)``; on ``ImportError`` returns
        ``(img_rgb, None, error message)``. Other exceptions propagate.
    """
    try:
        from segment_anything import sam_model_registry, SamPredictor
        import urllib.request
        CKPT = "sam_vit_b_01ec64.pth"
        URL = "https://dl.fbaipublicfiles.com/segment_anything/sam_vit_b_01ec64.pth"
        if not os.path.exists(CKPT):
            urllib.request.urlretrieve(URL, CKPT)
        t0 = time.time()
        sam = sam_model_registry["vit_b"](checkpoint=CKPT).to(DEVICE)
        predictor = SamPredictor(sam)
        predictor.set_image(img_rgb)
        from ultralytics import YOLO as _YOLO
        yolo_res = _YOLO(yolo_model_path)(img_rgb, conf=CONF, verbose=False, retina_masks=True)[0]
        h, w = img_rgb.shape[:2]
        combined_mask = np.zeros((h, w), dtype=bool)
        boxes_list, labels = [], []

        def get_mid(x_1, y_1, x_2, y_2, f=0.6):
            # Point a fraction f of the way from (x_1, y_1) to (x_2, y_2)
            return int(x_1 + (x_2 - x_1) * f), int(y_1 + (y_2 - y_1) * f)

        if yolo_res.boxes is not None and yolo_res.masks is not None:
            for box, mask_data, cls, conf in zip(yolo_res.boxes.xyxy, yolo_res.masks.data,
                                                 yolo_res.boxes.cls, yolo_res.boxes.conf):
                if int(cls) not in target_classes:
                    continue
                box_np = box.cpu().numpy()
                yolo_mask = mask_data.cpu().numpy() > 0.5
                if strategy == 1:
                    # Strategy 1: Bbox + 5 Points
                    x1, y1, x2, y2 = map(int, box_np)
                    cx, cy = (x1 + x2) // 2, (y1 + y2) // 2
                    pts = [[cx, cy], [x1 + 5, y1 + 5], [x2 - 5, y1 + 5], [x1 + 5, y2 - 5], [x2 - 5, y2 - 5]]
                    pts_np = np.array(pts)
                    labels_np = np.ones(len(pts))
                    masks_sam, _, _ = predictor.predict(box=box_np, point_coords=pts_np,
                                                        point_labels=labels_np, multimask_output=False)
                    sam_mask = masks_sam[0]
                elif strategy == 2:
                    # Strategy 2: Mask + 5 Points
                    y_coords, x_coords = np.where(yolo_mask)
                    if len(x_coords) == 0:
                        continue
                    cx, cy = int(np.mean(x_coords)), int(np.mean(y_coords))
                    idx_top, idx_bot = np.argmin(y_coords), np.argmax(y_coords)
                    idx_lft, idx_rgt = np.argmin(x_coords), np.argmax(x_coords)
                    pts = []
                    # Use the centroid when it lies on the mask, otherwise a pixel known to be on it
                    if yolo_mask[cy, cx]:
                        pts.append([cx, cy])
                    else:
                        pts.append([x_coords[len(x_coords) // 2], y_coords[len(y_coords) // 2]])
                    for idx in [idx_top, idx_bot, idx_lft, idx_rgt]:
                        px, py = get_mid(cx, cy, x_coords[idx], y_coords[idx])
                        if 0 <= py < h and 0 <= px < w and yolo_mask[py, px]:
                            pts.append([px, py])
                        else:
                            # Keep five points: repeat the first one when the candidate is off-mask
                            pts.append(pts[0])
                    pts_np = np.array(pts)
                    labels_np = np.ones(len(pts))
                    masks_sam, _, _ = predictor.predict(box=box_np, point_coords=pts_np,
                                                        point_labels=labels_np, multimask_output=False)
                    sam_mask = masks_sam[0]
                elif strategy == 3:
                    # Strategy 3: Direct Mask Prompting.
                    # The prompt must follow SAM's resize-longest-side + pad geometry;
                    # squashing the mask straight to 256x256 misaligns it on non-square images.
                    mask_input = _sam_low_res_mask_prompt(yolo_mask, 256)
                    masks_sam, _, _ = predictor.predict(box=box_np, mask_input=mask_input, multimask_output=False)
                    raw_mask = (masks_sam[0].astype(np.uint8) * 255)
                    contours, _ = cv2.findContours(raw_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
                    filled_mask = np.zeros_like(raw_mask)
                    cv2.drawContours(filled_mask, contours, -1, 255, cv2.FILLED)
                    sam_mask = (filled_mask > 0)
                else:
                    sam_mask = np.zeros((h, w), dtype=bool)
                sam_mask_uint = sam_mask.astype(np.uint8)
                if morph_cleanup:
                    sam_mask_uint = apply_morphology(sam_mask_uint)
                combined_mask |= sam_mask_uint.astype(bool)
                boxes_list.append(box_np.tolist())
                labels.append(f"glass {conf:.2f}")
        elapsed = time.time() - t0
        morph_note = " | Morphology: ON ✅" if morph_cleanup else ""
        out = apply_mask_overlay(img_rgb, combined_mask, color=color)
        out = draw_boxes(out, boxes_list, labels, color=color)
        return out, (combined_mask * 255).astype(np.uint8), f"Found: {len(boxes_list)} | Strategy: {strategy} | Inference: {elapsed:.2f}s{morph_note}"
    except ImportError:
        return img_rgb, None, "Error: segment-anything not installed"
def run_mask_rcnn(img_rgb, weights_path):
    """Run a fine-tuned Mask R-CNN (ResNet50-FPN v2, 2 output classes) on an image.

    Args:
        img_rgb: Input image array (H, W, 3).
        weights_path: Checkpoint path; either a plain state dict or a dict
            holding it under ``"model_state_dict"``.

    Returns:
        ``(overlay, mask, stats)``; on any failure returns
        ``(img_rgb, None, "Mask R-CNN Error: ...")`` instead of raising.
    """
    t0 = time.time()
    try:
        from torchvision.models.detection import maskrcnn_resnet50_fpn_v2
        from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
        from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor
        import torchvision.transforms.v2 as T

        # Rebuild the architecture with 2-class box and mask heads before loading weights
        model = maskrcnn_resnet50_fpn_v2(weights=None)
        in_features = model.roi_heads.box_predictor.cls_score.in_features
        model.roi_heads.box_predictor = FastRCNNPredictor(in_features, 2)
        in_features_mask = model.roi_heads.mask_predictor.conv5_mask.in_channels
        model.roi_heads.mask_predictor = MaskRCNNPredictor(in_features_mask, 256, 2)

        # NOTE(security): weights_only=False unpickles arbitrary objects, which can
        # execute code. Only load checkpoints from a trusted source.
        checkpoint = torch.load(weights_path, map_location=DEVICE, weights_only=False)
        if isinstance(checkpoint, dict) and "model_state_dict" in checkpoint:
            model.load_state_dict(checkpoint["model_state_dict"])
        else:
            model.load_state_dict(checkpoint)
        model.to(DEVICE)
        model.eval()

        img_tensor = T.ToTensor()(Image.fromarray(img_rgb)).to(DEVICE)
        with torch.no_grad():
            outputs = model([img_tensor])[0]

        h, w = img_rgb.shape[:2]
        pred_mask = np.zeros((h, w), dtype=bool)
        boxes_list, labels_list = [], []
        for score, mask, box in zip(outputs['scores'], outputs['masks'], outputs['boxes']):
            # Same confidence threshold as the YOLO runners (shared CONF constant)
            if score > CONF:
                pred_mask |= (mask[0].cpu().numpy() > 0.5)
                boxes_list.append(box.cpu().numpy().tolist())
                labels_list.append(f"glass {score:.2f}")

        elapsed = time.time() - t0
        out = apply_mask_overlay(img_rgb, pred_mask, color=(255, 165, 0))
        out = draw_boxes(out, boxes_list, labels_list, color=(255, 165, 0))
        bw_mask = (pred_mask * 255).astype(np.uint8)
        return out, bw_mask, f"Found: {len(boxes_list)} | Inference: {elapsed:.2f}s"
    except Exception as e:
        return img_rgb, None, f"Mask R-CNN Error: {e}"
def run_grounding_dino(img_rgb, text_prompt):
    """Detect objects described by ``text_prompt`` with Grounding DINO (tiny).

    Detection only: the returned mask is always all zeros.

    Args:
        img_rgb: Input image array (H, W, 3).
        text_prompt: Text query passed to the processor.

    Returns:
        ``(overlay, mask, stats)``; on any failure returns
        ``(img_rgb, None, error message)`` instead of raising.
    """
    try:
        from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection

        started = time.time()
        checkpoint_name = "IDEA-Research/grounding-dino-tiny"
        dino_processor = AutoProcessor.from_pretrained(checkpoint_name)
        detector = AutoModelForZeroShotObjectDetection.from_pretrained(checkpoint_name).to(DEVICE)

        encoded = dino_processor(images=img_rgb, text=text_prompt, return_tensors="pt").to(DEVICE)
        with torch.no_grad():
            raw_outputs = detector(**encoded)

        height, width = img_rgb.shape[:2]
        detections = dino_processor.post_process_grounded_object_detection(
            raw_outputs, encoded.input_ids, text_threshold=0.25, target_sizes=[(height, width)]
        )[0]
        det_boxes = detections["boxes"].cpu().numpy().tolist()
        det_scores = detections["scores"].cpu().numpy().tolist()
        det_names = detections["labels"]
        elapsed = time.time() - started

        # Boxes only, so the binary mask stays empty
        empty_mask = np.zeros((height, width), dtype=np.uint8)
        captions = []
        for name, score in zip(det_names, det_scores):
            captions.append(f"{name} {score:.2f}")
        annotated = draw_boxes(img_rgb.copy(), det_boxes, captions, color=(255, 100, 50))
        return annotated, empty_mask, f"Found: {len(det_boxes)} | Inference Time: {elapsed:.2f}s"
    except Exception as e:
        return img_rgb, None, f"Grounding DINO Error: {e}\n(Need transformers>=4.35)"
def run_grounded_sam(img_rgb, text_prompt):
    """Zero-shot segmentation: Grounding DINO boxes used as prompts for SAM (ViT-B).

    The SAM checkpoint is downloaded to the working directory on first use.

    Args:
        img_rgb: Input image array (H, W, 3).
        text_prompt: Text query for the detector.

    Returns:
        ``(overlay, mask, stats)``; on any failure returns
        ``(img_rgb, None, "Grounded SAM Error: ...")`` instead of raising.
    """
    try:
        from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection
        from segment_anything import sam_model_registry, SamPredictor
        import urllib.request

        started = time.time()
        tint = (255, 80, 160)

        # Stage 1: text-prompted detection
        dino_name = "IDEA-Research/grounding-dino-tiny"
        dino_processor = AutoProcessor.from_pretrained(dino_name)
        detector = AutoModelForZeroShotObjectDetection.from_pretrained(dino_name).to(DEVICE)
        encoded = dino_processor(images=img_rgb, text=text_prompt, return_tensors="pt").to(DEVICE)
        with torch.no_grad():
            raw_outputs = detector(**encoded)
        height, width = img_rgb.shape[:2]
        detections = dino_processor.post_process_grounded_object_detection(
            raw_outputs, encoded.input_ids, text_threshold=0.25, target_sizes=[(height, width)]
        )[0]
        det_boxes = detections["boxes"].cpu().numpy()
        det_scores = detections["scores"].cpu().numpy()
        det_names = detections["labels"]

        # Stage 2: one SAM mask per detected box, merged into a single mask
        ckpt_file = "sam_vit_b_01ec64.pth"
        ckpt_url = "https://dl.fbaipublicfiles.com/segment_anything/sam_vit_b_01ec64.pth"
        if not os.path.exists(ckpt_file):
            urllib.request.urlretrieve(ckpt_url, ckpt_file)
        predictor = SamPredictor(sam_model_registry["vit_b"](checkpoint=ckpt_file).to(DEVICE))
        predictor.set_image(img_rgb)

        union_mask = np.zeros((height, width), dtype=bool)
        captions = []
        # Zipping over an empty detection set is a no-op, so no explicit length guard is needed
        for det_box, det_score, det_name in zip(det_boxes, det_scores, det_names):
            sam_masks, _, _ = predictor.predict(box=det_box, multimask_output=False)
            union_mask |= sam_masks[0]
            captions.append(f"{det_name} {det_score:.2f}")

        elapsed = time.time() - started
        annotated = apply_mask_overlay(img_rgb, union_mask, color=tint)
        annotated = draw_boxes(annotated, det_boxes.tolist(), captions, color=tint)
        binary = (union_mask * 255).astype(np.uint8)
        return annotated, binary, f"Found: {len(det_boxes)} | Inference: {elapsed:.2f}s"
    except Exception as e:
        return img_rgb, None, f"Grounded SAM Error: {e}"
def run_intelliarts_car_parts(img_rgb):
    """Segment car glass and mirrors with the Intelliarts Detectron2 car-parts model.

    Installs detectron2 via pip when it is not importable, and downloads the
    model weights to the working directory on first use. Only the classes
    back_glass, front_glass, left_mirror and right_mirror are kept.

    Args:
        img_rgb: Input image array (H, W, 3) in RGB channel order.

    Returns:
        ``(overlay, mask, stats)``; on any failure after the install step returns
        ``(img_rgb, None, "Intelliarts Detectron2 Error: ...")`` instead of raising.
    """
    t0 = time.time()
    try:
        import detectron2
    except ImportError:
        print("Installing detectron2... this may take a few minutes!")
        os.system('pip install git+https://github.com/facebookresearch/detectron2.git --no-build-isolation')
    try:
        from detectron2 import model_zoo
        from detectron2.engine import DefaultPredictor
        from detectron2.config import get_cfg
        import urllib.request

        model_url = "https://huggingface.co/spaces/intelliarts/Car_parts_detection/resolve/main/model_final.pth"
        model_path = "intelliarts_model_final.pth"
        if not os.path.exists(model_path):
            print("Downloading Intelliarts Car Parts weights...")
            urllib.request.urlretrieve(model_url, model_path)

        cfg = get_cfg()
        cfg.merge_from_file(model_zoo.get_config_file("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml"))
        cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.45
        cfg.MODEL.ROI_HEADS.NUM_CLASSES = 19
        cfg.MODEL.WEIGHTS = model_path
        cfg.MODEL.DEVICE = DEVICE
        predictor = DefaultPredictor(cfg)

        # DefaultPredictor expects the image in BGR channel order, so reverse the
        # channel axis of the RGB input (copy() makes the view contiguous).
        outputs = predictor(img_rgb[:, :, ::-1].copy())
        instances = outputs["instances"].to("cpu")

        # Classes: 2: back_glass, 8: front_glass, 14: left_mirror, 15: right_mirror
        target_classes = [2, 8, 14, 15]
        class_names = ['_background_', 'back_bumper', 'back_glass', 'back_left_door', 'back_left_light', 'back_right_door', 'back_right_light', 'front_bumper', 'front_glass', 'front_left_door', 'front_left_light', 'front_right_door', 'front_right_light', 'hood', 'left_mirror', 'right_mirror', 'tailgate', 'trunk', 'wheel']

        h, w = img_rgb.shape[:2]
        combined_mask = np.zeros((h, w), dtype=bool)
        boxes_list, labels_list = [], []
        detections = zip(
            instances.pred_classes.numpy(),
            instances.scores.numpy(),
            instances.pred_boxes.tensor.numpy(),
            instances.pred_masks.numpy(),
        )
        for cls_id, score, box, mask in detections:
            if cls_id in target_classes:
                combined_mask |= mask
                boxes_list.append(box.tolist())
                labels_list.append(f"{class_names[cls_id]} {score:.2f}")

        elapsed = time.time() - t0
        out = apply_mask_overlay(img_rgb, combined_mask, color=(50, 150, 255))
        out = draw_boxes(out, boxes_list, labels_list, color=(50, 150, 255))
        bw_mask = (combined_mask * 255).astype(np.uint8)
        return out, bw_mask, f"Found: {len(boxes_list)} | Inference: {elapsed:.2f}s"
    except Exception as e:
        return img_rgb, None, f"Intelliarts Detectron2 Error: {e}"
# ═══════════════════════════════════════════════════════════════════════════════
# SegFormer Function
# ═══════════════════════════════════════════════════════════════════════════════
def run_segformer(img_rgb, morph_cleanup=False):
    """Run the fine-tuned SegFormer model and return the class-1 mask.

    The model directory is searched in several locations (local project layout,
    working directory, next to this file); a directory qualifies when it
    contains ``config.json``. If none is found, ``"best_segformer_dice_model"``
    is passed to ``from_pretrained`` as-is.

    Args:
        img_rgb: Input image array (H, W, 3).
        morph_cleanup: When true, the predicted mask is cleaned with
            ``apply_morphology``.

    Returns:
        ``(overlay, mask, stats)``; on any failure returns
        ``(img_rgb, None, "SegFormer Error: ...")`` instead of raising.
    """
    try:
        from transformers import SegformerImageProcessor, SegformerForSemanticSegmentation
        import torch.nn.functional as F

        t0 = time.time()
        base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        # Paths to try (works for both local PC and Hugging Face Cloud deployment)
        paths_to_try = [
            os.path.join(base_dir, "SegFormer_Model", "best_segformer_dice_model"),  # Local PC
            "best_segformer_dice_model",  # Hugging Face Root
            os.path.join(os.path.dirname(__file__), "best_segformer_dice_model"),  # Next to app.py
        ]
        # If files were uploaded directly to the root (no folder)
        if os.path.exists("config.json"):
            paths_to_try.append(".")
        if os.path.exists(os.path.join(os.path.dirname(__file__), "config.json")):
            paths_to_try.append(os.path.dirname(__file__))

        model_path = None
        for p in paths_to_try:
            # For SegFormer, the path must contain config.json
            if os.path.exists(p) and os.path.exists(os.path.join(p, "config.json")):
                model_path = p
                break
        # Fallback
        if model_path is None:
            model_path = "best_segformer_dice_model"

        processor = SegformerImageProcessor.from_pretrained(model_path)
        model = SegformerForSemanticSegmentation.from_pretrained(model_path).to(DEVICE)
        inputs = processor(images=Image.fromarray(img_rgb), return_tensors="pt")
        inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)

        # Upsample logits to the input resolution, then threshold the class-1 probability
        h, w = img_rgb.shape[:2]
        logits = F.interpolate(outputs.logits, size=(h, w), mode="bilinear", align_corners=False)[0]
        probs = F.softmax(logits, dim=0)
        pred_mask = (probs[1] > 0.5).cpu().numpy().astype(np.uint8)

        # Apply morphological cleanup if requested
        if morph_cleanup:
            pred_mask = apply_morphology(pred_mask, close_k=15, open_k=7)

        elapsed = time.time() - t0
        morph_note = " | Morphology: ON ✅" if morph_cleanup else ""
        # Pass a boolean mask: a 0/1 uint8 array would be treated as integer
        # indices by the overlay's mask assignment (no tint, and very slow).
        out = apply_mask_overlay(img_rgb, pred_mask > 0, color=(255, 50, 50))
        bw_mask = (pred_mask * 255).astype(np.uint8)
        return out, bw_mask, f"Found: 1 (Semantic) | Inference: {elapsed:.2f}s{morph_note}"
    except Exception as e:
        return img_rgb, None, f"SegFormer Error: {e}"
# ═══════════════════════════════════════════════════════════════════════════════
# BiRefNet Function
# ═══════════════════════════════════════════════════════════════════════════════
def run_birefnet(img_rgb):
    """Run the BiRefNet segmentation model and return its thresholded mask.

    Local model directories are tried first (a directory qualifies when it holds
    both ``config.json`` and ``model.safetensors``); otherwise the model id
    ``"Ayesha-Majeed/birefnet_car_window"`` is handed to ``from_pretrained``.

    Args:
        img_rgb: Input image array (H, W, 3).

    Returns:
        ``(overlay, mask, stats)``; on any failure returns
        ``(img_rgb, None, "BiRefNet Error: ...")`` instead of raising.
    """
    try:
        from transformers import AutoModelForImageSegmentation
        from torchvision import transforms
        import torch.nn.functional as F
        from PIL import Image

        started = time.time()
        here = os.path.dirname(os.path.abspath(__file__))
        project_root = os.path.dirname(here)

        # Candidate locations (local PC layout and Hugging Face deployment)
        candidates = [
            os.path.join(project_root, "BiRefNet_Model", "best_model-20260624T051601Z-3-001", "best_model"),  # Local PC
            "birefnet_model",  # Hugging Face Root / Root dir
            os.path.join(here, "birefnet_model"),  # Next to app.py
            "best_birefnet_model"  # Extra fallback
        ]

        def _is_model_dir(path):
            return (
                os.path.exists(path)
                and os.path.exists(os.path.join(path, "config.json"))
                and os.path.exists(os.path.join(path, "model.safetensors"))
            )

        # Final fallback: the Hugging Face model repo id
        model_path = next((c for c in candidates if _is_model_dir(c)), "Ayesha-Majeed/birefnet_car_window")

        model = AutoModelForImageSegmentation.from_pretrained(model_path, trust_remote_code=True).to(DEVICE)
        model.eval()

        preprocess = transforms.Compose([
            transforms.Resize((1024, 1024)),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
        ])
        batch = preprocess(Image.fromarray(img_rgb)).unsqueeze(0).to(DEVICE)

        with torch.no_grad():
            if DEVICE == "cuda":
                # Mixed precision only on GPU
                with torch.amp.autocast("cuda"):
                    preds = model(batch)
            else:
                preds = model(batch)
        # When the model returns several outputs, the last one is used
        final_pred = preds[-1] if isinstance(preds, (list, tuple)) else preds

        height, width = img_rgb.shape[:2]
        final_pred = F.interpolate(final_pred, size=(height, width), mode="bilinear", align_corners=False)
        pred_mask = (torch.sigmoid(final_pred) > 0.5).squeeze().cpu().numpy().astype(np.uint8)

        elapsed = time.time() - started
        overlay = apply_mask_overlay(img_rgb, pred_mask > 0, color=(255, 0, 0))  # Red
        binary = (pred_mask * 255).astype(np.uint8)
        return overlay, binary, f"Found: 1 (Semantic) | Inference: {elapsed:.2f}s"
    except Exception as e:
        return img_rgb, None, f"BiRefNet Error: {e}"
# ═══════════════════════════════════════════════════════════════════════════════
# Gradio Process Function
# ═══════════════════════════════════════════════════════════════════════════════
# A beautiful palette of pastel and neon colors for dynamic visualizations
# process_image picks one of these at random (random.choice) for each call.
# Values are colour triples drawn onto RGB images, so they read as (R, G, B).
PASTEL_COLORS = [
(255, 105, 180), # Hot/Light Pink
(180, 130, 255), # Light Purple
(0, 215, 255), # Light Sky Blue / Cyan
(255, 220, 50), # Light Yellow
(255, 160, 50), # Light Orange
(150, 255, 150), # Light Mint Green
(240, 240, 255), # Light White / Silver
]
def process_image(img_rgb, model_name, text_prompt="", morph_cleanup=False):
    """Dispatch an image to the model runner selected by ``model_name``.

    Args:
        img_rgb: Input image array, or ``None`` when nothing was uploaded.
        model_name: Display name of the model to run (see the branches below).
        text_prompt: Text query, used only by the zero-shot models.
        morph_cleanup: Forwarded to the runners that support morphological cleanup.

    Returns:
        ``(overlay, mask, stats)``. Returns ``(None, None, message)`` when no
        image is given, and ``(img_rgb, None, message)`` for an unknown model
        name or when a runner raises.
    """
    if img_rgb is None:
        return None, None, "Please upload an image."

    # Pick a random color for this specific inference run
    run_color = random.choice(PASTEL_COLORS)

    def _project_root():
        # Parent of the directory containing this file (local project layout)
        return os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

    try:
        if model_name == "YOLOv8x-seg (Custom Window)":
            return run_yolo_generic(img_rgb, "best.pt", target_classes=[0, 1], color=run_color, morph_cleanup=morph_cleanup)
        elif model_name == "YOLOv8x-seg":
            return run_yolo_generic(img_rgb, "best.pt", target_classes=[0, 1], color=(255, 215, 0), morph_cleanup=morph_cleanup)
        elif model_name == "YOLO11x-seg":
            if os.path.exists("yolo11_best.pt"):
                y11_weights = "yolo11_best.pt"
            else:
                y11_weights = os.path.join(_project_root(), "runs", "segment", "runs", "car_mirror_seg", "yolo11x_seg_1024", "weights", "best.pt")
                if not os.path.exists(y11_weights):
                    y11_weights = "best.pt"  # Fallback
            return run_yolo_generic(img_rgb, y11_weights, target_classes=[0, 1], color=(0, 255, 120), morph_cleanup=morph_cleanup)
        elif model_name == "SAM + YOLO (Strategy 1: Bbox + 5 Points)":
            return run_sam_strategy(img_rgb, "best.pt", target_classes=[0, 1], color=run_color, strategy=1, morph_cleanup=morph_cleanup)
        elif model_name == "SAM + YOLO (Strategy 2: Mask + 5 Points)":
            return run_sam_strategy(img_rgb, "best.pt", target_classes=[0, 1], color=run_color, strategy=2, morph_cleanup=morph_cleanup)
        elif model_name == "SAM + YOLO (Strategy 3: Direct Mask Prompting)":
            return run_sam_strategy(img_rgb, "best.pt", target_classes=[0, 1], color=run_color, strategy=3, morph_cleanup=morph_cleanup)
        elif model_name == "Mask R-CNN":
            # Prefer weights uploaded next to the app as "maskrcnn_best.pt"
            if os.path.exists("maskrcnn_best.pt"):
                mrcnn_weights = "maskrcnn_best.pt"
            else:
                mrcnn_weights = os.path.join(_project_root(), "Mask_RCNN", "runs", "woven-sweep-5", "best.pt")
                if not os.path.exists(mrcnn_weights):
                    mrcnn_weights = "Mask_RCNN/runs/woven-sweep-5/best.pt"
            return run_mask_rcnn(img_rgb, mrcnn_weights)
        elif model_name == "Grounding DINO (Zero-Shot Detection)":
            return run_grounding_dino(img_rgb, text_prompt)
        elif model_name == "Grounded SAM (Zero-Shot Segmentation)":
            return run_grounded_sam(img_rgb, text_prompt)
        elif model_name == "Intelliarts Car Parts (Detectron2)":
            return run_intelliarts_car_parts(img_rgb)
        elif model_name == "SegFormer":
            return run_segformer(img_rgb, morph_cleanup=morph_cleanup)
        elif model_name == "BiRefNet":
            # Listed among the models in the UI but previously fell through to "not recognized"
            return run_birefnet(img_rgb)
        else:
            return img_rgb, None, "Model not recognized."
    except Exception as e:
        return img_rgb, None, f"Error: {str(e)}"
# ═══════════════════════════════════════════════════════════════════════════════
# Gradio UI
# ═══════════════════════════════════════════════════════════════════════════════
theme = gr.themes.Soft(primary_hue="blue", secondary_hue="indigo")

# Text query handed to the zero-shot (text-prompted) models.
_ZERO_SHOT_PROMPT = "car window. car glass. windshield."

# One entry per result panel, in display and execution order:
#   (section title shown before the panel or None, panel heading,
#    label prefix for the three widgets, runner taking the image)
# Every runner returns (overlay image, binary mask, stats text).
_PANELS = [
    ("## 🚀 Custom Trained Models", "### 1️⃣ SegFormer (Transformer)", "SegFormer",
     lambda img: run_segformer(img, morph_cleanup=False)),
    (None, "### 2️⃣ SegFormer + Morphological Cleanup (Holes Filled + Sharp Borders)", "SegFormer + Morph",
     lambda img: run_segformer(img, morph_cleanup=True)),
    (None, "### 3️⃣ YOLO11x-seg", "YOLO11x",
     lambda img: process_image(img, "YOLO11x-seg", "", False)),
    (None, "### 4️⃣ YOLOv8x-seg", "YOLO",
     lambda img: process_image(img, "YOLOv8x-seg", "", False)),
    (None, "### 5️⃣ Mask R-CNN (ResNet50-FPN)", "Mask R-CNN",
     lambda img: process_image(img, "Mask R-CNN", "", False)),
    (None, "### 6️⃣ BiRefNet (Boundary-Aware Model)", "BiRefNet",
     lambda img: run_birefnet(img)),
    (None, "### 7️⃣ SAM + YOLO (Strategy 1: Bbox + 5 Points)", "SAM+YOLO Strat 1",
     lambda img: process_image(img, "SAM + YOLO (Strategy 1: Bbox + 5 Points)", "", False)),
    (None, "### 8️⃣ SAM + YOLO (Strategy 2: Mask + 5 Points)", "SAM+YOLO Strat 2",
     lambda img: process_image(img, "SAM + YOLO (Strategy 2: Mask + 5 Points)", "", False)),
    (None, "### 9️⃣ SAM + YOLO (Strategy 3: Direct Mask Prompting)", "SAM+YOLO Strat 3",
     lambda img: process_image(img, "SAM + YOLO (Strategy 3: Direct Mask Prompting)", "", False)),
    ("## 🌍 Pretrained Zero-Shot Models", "### 🔟 Grounding DINO (Zero-Shot Detection)", "Grounding DINO",
     lambda img: process_image(img, "Grounding DINO (Zero-Shot Detection)", _ZERO_SHOT_PROMPT, False)),
    (None, "### 1️⃣1️⃣ Grounded SAM (Zero-Shot Segmentation)", "Grounded SAM",
     lambda img: process_image(img, "Grounded SAM (Zero-Shot Segmentation)", _ZERO_SHOT_PROMPT, False)),
    (None, "### 1️⃣2️⃣ Intelliarts Car Parts (Detectron2)", "Intelliarts Car Parts",
     lambda img: process_image(img, "Intelliarts Car Parts (Detectron2)", "", False)),
]

with gr.Blocks(theme=theme, title="Car Window Segmentation") as demo:
    gr.Markdown(
        "\n"
        "# Car Window Segmentation\n"
        "Compare your custom trained YOLOv8 model against state-of-the-art Zero-Shot models!\n"
    )

    # ── Comprehensive Evaluation tab ──
    with gr.Tab("Comprehensive Evaluation"):
        gr.Markdown("### Comprehensive Evaluation: Results from All Trained and Pretrained Models")
        # "10\\." keeps a literal backslash in the Markdown (same text as before)
        # without relying on the invalid escape sequence "\.".
        gr.Markdown(
            "**The following models will run and display their results below:**\n"
            "**Custom Trained Models:**\n"
            "1. SegFormer\n"
            "2. SegFormer + Morphological\n"
            "3. YOLO11x-seg\n"
            "4. YOLOv8x-seg\n"
            "5. Mask R-CNN\n"
            "6. BiRefNet\n"
            "7. SAM + YOLO (Strategy 1: Bbox + 5 Points)\n"
            "8. SAM + YOLO (Strategy 2: Mask + 5 Points)\n"
            "9. SAM + YOLO (Strategy 3: Direct Mask Prompting)\n"
            "**Pretrained Zero-Shot Models:**\n"
            "10\\. Grounding DINO\n"
            "11\\. Grounded SAM\n"
            "12\\. Intelliarts Car Parts\n"
            "**Our Findings:** SegFormer and YOLO11x deliver the best performance with significantly sharper edge precision.\n"
        )

        with gr.Row():
            input_image_seq = gr.Image(type="numpy", label="Upload Window Image")
        with gr.Row():
            submit_btn_seq = gr.Button("Run All Models", variant="primary", size="lg")
            stop_btn_seq = gr.Button("🛑 Stop Processing", variant="stop", size="lg")

        if mirror_examples:
            gr.Markdown("### Or click any example image below to load it:")
            compare_gallery = gr.Gallery(value=mirror_examples, columns=10, height=120, object_fit="cover", allow_preview=False, show_label=False)

            def load_compare_img(evt: gr.SelectData):
                # Return the path of the clicked example; it is loaded into the input image
                return mirror_examples[evt.index]

            compare_gallery.select(fn=load_compare_img, inputs=None, outputs=input_image_seq)

        # Build one result panel per model. The flat output list is ordered
        # [overlay, binary mask, stats] * len(_PANELS), matching run_all_models.
        panel_outputs = []
        for _section, _heading, _prefix, _runner in _PANELS:
            gr.Markdown("---")
            if _section is not None:
                gr.Markdown(_section)
            gr.Markdown(_heading)
            with gr.Row():
                _overlay_view = gr.Image(label=f"{_prefix} Overlay", interactive=False)
                _mask_view = gr.Image(label=f"{_prefix} Binary Mask", interactive=False, image_mode="L")
            _stats_view = gr.Textbox(label=f"{_prefix} Stats", interactive=False)
            panel_outputs.extend([_overlay_view, _mask_view, _stats_view])

        def run_all_models(img):
            """Run every model in turn, yielding the full output tuple after each one.

            The first yield marks every stats textbox as pending; each later
            yield fills in one more panel so results appear progressively.
            """
            n_outputs = 3 * len(_PANELS)
            if img is None:
                yield tuple([None] * n_outputs)
                return

            results = [None] * n_outputs
            # Stats textboxes sit at every third slot (index 2, 5, 8, ...)
            for stats_slot in range(2, n_outputs, 3):
                results[stats_slot] = "⏳ Processing..."
            yield tuple(results)

            for panel_idx, (_s, _h, _p, runner) in enumerate(_PANELS):
                overlay, mask, stats = runner(img)
                base = 3 * panel_idx
                results[base], results[base + 1], results[base + 2] = overlay, mask, stats
                yield tuple(results)

        run_event = submit_btn_seq.click(
            fn=run_all_models,
            inputs=[input_image_seq],
            outputs=panel_outputs,
        )
        # The stop button cancels the running generator
        stop_btn_seq.click(fn=None, inputs=None, outputs=None, cancels=[run_event])

if __name__ == "__main__":
    demo.launch()