""" Inference script for CLPRNet with PARSeq Tiny backbone. Two-stage inference: 1. Detection: CLPRNet backbone + detection head -> boxes (with NMS) 2. Recognition: Crop detected plates -> PARSeq Tiny -> plate strings """ from model_parseq import CLPRNetPARSeq, Tokenizer import torch import torchvision.transforms as transforms import torch.nn.functional as F import os import numpy as np import cv2 from PIL import Image, ImageDraw, ImageFont CHARACTER = Tokenizer.CHARSET DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") img_size = (1024, 1024) font_size = 30 try: font = ImageFont.truetype('resource/msyh.ttc', font_size, encoding='utf-8') except: font = ImageFont.load_default() # Load model model = CLPRNetPARSeq(max_label_length=8) model = model.to(DEVICE) model.load_state_dict(torch.load('resource/CLPRNet_PARSeq.pth', map_location=DEVICE)) model.eval() if not os.path.exists('output'): os.makedirs('output') tran = transforms.Compose([ transforms.ToPILImage(), transforms.ToTensor(), transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) ]) def IOU(box, other_boxes): box_area = (box[2] - box[0]) * (box[3] - box[1]) other_boxes_area = (other_boxes[:, 2] - other_boxes[:, 0]) * (other_boxes[:, 3] - other_boxes[:, 1]) x1 = torch.max(box[0], other_boxes[:, 0]) y1 = torch.max(box[1], other_boxes[:, 1]) x2 = torch.min(box[2], other_boxes[:, 2]) y2 = torch.min(box[3], other_boxes[:, 3]) Min = torch.zeros(1, device=box.device) w, h = torch.max(Min, x2 - x1), torch.max(Min, y2 - y1) overlap_area = w * h iou = overlap_area / (box_area + other_boxes_area - overlap_area + 1e-6) return iou def NMS(boxes, C=0.3): if len(boxes) == 0: return [] sort_boxes = boxes[boxes[:, 0].argsort(descending=True)] keep = [] while len(sort_boxes) > 0: ref_box = sort_boxes[0] keep.append(ref_box) if len(sort_boxes) > 1: other_boxes = sort_boxes[1:] sort_boxes = other_boxes[torch.where(IOU(ref_box[1:5], other_boxes[:, 1:5]) < C)] else: break return torch.stack(keep) def inference(src, image_list): grid = 64 mask_x = (np.array([[i for i in range(grid)]] * grid) + 0.5) * img_size[0] / grid mask_y = (np.array([[i] * grid for i in range(grid)]) + 0.5) * img_size[1] / grid mask = torch.from_numpy(np.stack([mask_x, mask_y], axis=2)) x_mask = mask[:, :, 0].to(DEVICE).unsqueeze_(dim=2) y_mask = mask[:, :, 1].to(DEVICE).unsqueeze_(dim=2) for img_name in image_list: print(img_name) org_img = cv2.imread(os.path.join(src, img_name)) # Normalize image (pad to square) height, width, _ = org_img.shape size = height if height > width else width img2 = np.zeros((size, size, 3)).astype("uint8") if height == size: y = 0 x = (size - width) // 2 else: x = 0 y = (size - height) // 2 img2[y:y + height, x:x + width, :] = org_img img = cv2.resize(img2, img_size) # Inference inputs = img[:, :, ::-1] # BGR -> RGB inputs = tran(inputs) inputs = inputs.unsqueeze(dim=0) inputs = inputs.to(DEVICE) with torch.no_grad(): # Stage 1: Detection only (no boxes provided) y_detection, _, at_lp, _ = model(inputs) # Stage 2: Extract detected boxes and recognize plates for index in range(y_detection.shape[0]): l, t, r, b, c = torch.split(y_detection[index, :, :, :5], 1, dim=-1) l = x_mask - l * inputs.shape[3] t = y_mask - t * inputs.shape[2] r = x_mask + r * inputs.shape[3] b = y_mask + b * inputs.shape[2] # Flatten and filter by confidence out = torch.flatten(torch.concat([c, l, t, r, b], dim=2), start_dim=0, end_dim=1) out = out[torch.where(out[:, 0] > 0.3)] if len(out) == 0: print(" No plates detected") continue out = NMS(out, 0.3) # Crop detected plates and recognize with PARSeq boxes_for_rec = [det[1:5] for det in out] boxes_tensor = torch.stack(boxes_for_rec) with torch.no_grad(): plate_texts, confidences = model.recognize_plates( inputs[index:index+1], [boxes_tensor] ) # Draw results preb_lurd_list = [] preb_pl_list = [] preb_c = [] for i, det in enumerate(out): lurd = torch.tensor([det[1], det[2], det[3], det[4]]).cpu().numpy() lurd[0] = lurd[0] * size / img_size[0] - x lurd[1] = lurd[1] * size / img_size[1] - y lurd[2] = lurd[2] * size / img_size[0] - x lurd[3] = lurd[3] * size / img_size[1] - y preb_lurd_list.append(lurd.astype('int32')) if i < len(plate_texts): preb_pl_list.append(plate_texts[i]) det_conf = float(det[0].cpu().numpy()) rec_conf = confidences[i] if i < len(confidences) else 0.0 preb_c.append(round(det_conf * rec_conf, 3)) else: preb_pl_list.append("???") preb_c.append(0.0) # Draw bounding boxes for i in preb_lurd_list: cv2.rectangle(org_img, i[:2], i[2:], (0, 0, 255), 2) org_img_rgb = org_img[:, :, ::-1] org_img_pil = Image.fromarray(org_img_rgb.astype('uint8')).convert('RGB') draw = ImageDraw.Draw(org_img_pil) for i in range(len(preb_pl_list)): label_text = f"{preb_pl_list[i]}_{preb_c[i]}" label_size = int(draw.textlength(label_text, font)) draw.rectangle( [(preb_lurd_list[i][0], preb_lurd_list[i][1] - font_size), (preb_lurd_list[i][0] + label_size, preb_lurd_list[i][1])], fill='red' ) draw.text( xy=(preb_lurd_list[i][0], preb_lurd_list[i][1] - int(font_size * 1.25)), text=label_text, fill=(255, 255, 255), font=font ) print(f" Plate: {preb_pl_list[i]}, Conf: {preb_c[i]}") org_img_pil.save('output/' + img_name) if __name__ == '__main__': src = 'image' inference(src, os.listdir(src))