import cv2
import torch
import torch.nn.functional as F
import torchvision.transforms as T
from torchvision.ops import nms

from model_architecture import ObjectDetectionModel

# Preprocessing for every frame fed to the network.
# FIX: std[0] was 0.299 (typo) — the canonical ImageNet std is 0.229.
transform = T.Compose([
    T.ToTensor(),
    T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])


def decode_target(grid_pred, S=None, image_size=384, conf_thresh=0.5,
                  iou_thresh=0.5, wh_relative_to="image"):
    """Decode raw grid predictions into boxes/labels/scores.

    Accepts either:
        - grid_pred: [S, S, A, 5+C]    (single image)
        - grid_pred: [B, S, S, A, 5+C] (batch)

    Args:
        grid_pred: raw network output (logits; activations applied here).
        S: grid size; inferred from the tensor when None.
        image_size: side length (px) of the square network input.
        conf_thresh: detections with objectness * class prob below this
            are dropped.
        iou_thresh: IoU threshold for (class-agnostic) NMS.
        wh_relative_to: "image" -> sigmoid(w/h) scaled by image_size;
            "cell" -> sigmoid(w/h) scaled by one cell's size.

    Returns:
        For a single image: a list of dicts
        {'bbox': [x1, y1, x2, y2], 'label': int, 'score': float}.
        For a batch: a list of such lists, one per image.

    Raises:
        ValueError: if grid_pred is neither 4- nor 5-dimensional.
    """
    single_image = False
    if grid_pred.dim() == 4:
        grid = grid_pred.unsqueeze(0)
        single_image = True
    elif grid_pred.dim() == 5:
        grid = grid_pred
    else:
        raise ValueError(f"Unexpected pred dim {grid_pred.dim()}, expected 4 or 5.")

    B = grid.shape[0]
    if S is None:
        S = grid.shape[1]
    C = grid.shape[-1] - 5  # channels beyond (x, y, w, h, obj) are class logits

    results_batch = []
    for b in range(B):
        preds = grid[b]
        dets = []
        cell_size = image_size / S
        for j in range(S):          # grid row
            for i in range(S):      # grid column
                for a in range(preds.shape[2]):  # anchor index
                    cell = preds[j, i, a]
                    if cell.numel() < 5:
                        continue
                    # x, y are offsets inside the cell; obj is objectness.
                    x_cell = torch.sigmoid(cell[0]).item()
                    y_cell = torch.sigmoid(cell[1]).item()
                    obj_p = torch.sigmoid(cell[4]).item()

                    if C > 0:
                        class_logits = cell[5:5 + C]
                        # FIX: .cpu() before .numpy() — .numpy() raises on
                        # CUDA tensors, so GPU inference used to crash here.
                        class_probs = F.softmax(class_logits, dim=0).detach().cpu().numpy()
                        class_id = int(class_probs.argmax())
                        class_p = float(class_probs[class_id])
                    else:
                        class_id = 0
                        class_p = 1.0

                    score = obj_p * class_p
                    if score < conf_thresh:
                        continue

                    # Cell-relative offsets -> absolute center coordinates.
                    x_abs = (i + x_cell) * cell_size
                    y_abs = (j + y_cell) * cell_size

                    # Width/height logits -> absolute sizes. (Simplified from
                    # the original torch.tensor(...).item() round-trip;
                    # identical math.)
                    w_rel = torch.sigmoid(cell[2]).item()
                    h_rel = torch.sigmoid(cell[3]).item()
                    if wh_relative_to == "cell":
                        w_abs = w_rel * cell_size
                        h_abs = h_rel * cell_size
                    else:
                        w_abs = w_rel * image_size
                        h_abs = h_rel * image_size

                    # Center/size -> corners, clamped to the image.
                    x1 = max(0.0, min(image_size, x_abs - w_abs / 2.0))
                    y1 = max(0.0, min(image_size, y_abs - h_abs / 2.0))
                    x2 = max(0.0, min(image_size, x_abs + w_abs / 2.0))
                    y2 = max(0.0, min(image_size, y_abs + h_abs / 2.0))
                    if x2 <= x1 or y2 <= y1:
                        continue  # degenerate box after clamping

                    dets.append({
                        "bbox": [x1, y1, x2, y2],
                        "label": int(class_id),
                        "score": float(score),
                    })

        if dets:
            # NOTE: class-agnostic NMS (boxes of different classes suppress
            # each other), matching the original behavior.
            boxes = torch.tensor([d["bbox"] for d in dets], dtype=torch.float32)
            scores = torch.tensor([d["score"] for d in dets], dtype=torch.float32)
            keep = nms(boxes, scores, iou_thresh).detach().tolist()
            dets = [dets[k] for k in keep]

        results_batch.append(dets)

    return results_batch[0] if single_image else results_batch


class ModelRunners:
    """Loads an ObjectDetectionModel checkpoint and runs webcam/video inference."""

    # Side length (px) of the square input the network was trained on.
    _NET_SIZE = 384

    def __init__(self, Model):
        """Build the architecture and load weights from the checkpoint path *Model*."""
        self.__architecture = ObjectDetectionModel(1)
        # FIX: map_location lets GPU-saved checkpoints load on CPU-only hosts.
        self.__checkpoint = torch.load(Model, map_location="cpu")
        self.__architecture.load_state_dict(self.__checkpoint)
        # FIX: inference mode — without eval(), dropout stays active and
        # BatchNorm keeps updating running stats during prediction.
        self.__architecture.eval()
        self.__threshold = 0.25
        self.__imageSize = (720, 480)
        self.__camera = 0

    def set_threshold(self):
        """Interactively set the confidence threshold (must be in (0, 1))."""
        print("Note : for low threshold score can make model so sensitive at prediction ")
        print("Note : for hight threshold score can make model so selective at prediction")
        conf_threshold = float(input("set threshold score System (0.1/0.9) : "))
        # FIX: validate the range the prompt itself advertises.
        if not 0.0 < conf_threshold < 1.0:
            raise ValueError("threshold must be between 0 and 1 (exclusive)")
        self.__threshold = conf_threshold

    def __image_procces(self, pred_diction):
        """Decode one image's raw grid prediction into detection dicts."""
        return decode_target(pred_diction, S=16, image_size=self._NET_SIZE,
                             conf_thresh=self.__threshold)

    def set_imagesize(self, size: tuple):
        """Set the display window size, e.g. (720, 480)."""
        if not isinstance(size, tuple):
            raise RuntimeError("the size parameters must be default (720,480)")
        self.__imageSize = size

    def set_webcam(self, cam: int):
        """Select camera index: 0 (internal) or 1 (external)."""
        if not isinstance(cam, int):
            raise RuntimeError("WebCam input must be int (0,1)")
        # FIX: also reject negative indices, not only cam > 1.
        if cam < 0 or cam > 1:
            raise RuntimeError("Webcam just aivable for 0 internal and 1 external")
        self.__camera = cam

    def __annotate_frame(self, frame, detections):
        """Draw detections (decoded at network scale) back onto the full-size frame."""
        H, W = frame.shape[:2]
        scale_x, scale_y = W / self._NET_SIZE, H / self._NET_SIZE
        for det in detections:
            x1, y1, x2, y2 = det["bbox"]
            x1, x2 = int(x1 * scale_x), int(x2 * scale_x)
            y1, y2 = int(y1 * scale_y), int(y2 * scale_y)
            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
            cv2.putText(frame, f"{det['label']} {det['score']:.2f}",
                        (x1, max(0, y1 - 5)),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

    def __run_stream(self, capture, device):
        """Shared read -> predict -> draw -> show loop for webcam and video.

        Runs until the stream ends or 'q' is pressed; always releases the
        capture and destroys windows.
        """
        try:
            while True:
                ret, frame = capture.read()
                if not ret:
                    break
                rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                resized = cv2.resize(rgb, (self._NET_SIZE, self._NET_SIZE))
                # FIX: move the input to the model's device — previously the
                # model was on CUDA but the tensor stayed on CPU, crashing
                # the forward pass on any GPU machine.
                tensor = transform(resized).unsqueeze(0).to(device)
                with torch.no_grad():
                    pred = self.__architecture(tensor)
                detections = self.__image_procces(pred[0])
                self.__annotate_frame(frame, detections)
                frame = cv2.resize(frame, self.__imageSize)
                cv2.imshow("Live show cam", frame)
                if cv2.waitKey(1) & 0xFF == ord('q'):
                    break
        finally:
            # FIX: release/destroy even if the loop raises mid-frame.
            capture.release()
            cv2.destroyAllWindows()

    def main_model(self):
        """Run live detection on the configured webcam until 'q' is pressed."""
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.__architecture.to(device)
        self.__run_stream(cv2.VideoCapture(self.__camera), device)

    def run_model_at_video(self, path):
        """Run detection on the video file at *path* until it ends or 'q' is pressed."""
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.__architecture.to(device)
        self.__run_stream(cv2.VideoCapture(path), device)