# PeopleDetector / image_processing.py
# (Hugging Face upload metadata: uploaded by Airin-chan, "Upload 3 files", revision a9c8060, verified)
import cv2
import torch
import torchvision.transforms as T
import torch.nn.functional as F
from torchvision.ops import nms
from model_architecture import ObjectDetectionModel
# Preprocessing pipeline: HWC uint8 image (or PIL) -> CHW float tensor in [0,1],
# then normalized with the standard ImageNet statistics.
# FIX: std[0] was 0.299 — a typo for the canonical ImageNet value 0.229
# (0.299 is the BT.601 luma coefficient, a common copy/paste mix-up).
# NOTE(review): if the checkpoint was actually *trained* with 0.299, keep the
# old value instead — confirm against the training pipeline.
transform = T.Compose([
    T.ToTensor(),
    T.Normalize(mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225])])
def decode_target(grid_pred, S=None, image_size=384, conf_thresh=0.5, iou_thresh=0.5, wh_relative_to="image"):
    """
    Decode raw grid predictions into per-image detection lists.

    Accepts either:
      - grid_pred: [S, S, A, 5+C]    (single image)
      - grid_pred: [B, S, S, A, 5+C] (batch)

    Each anchor cell encodes [tx, ty, tw, th, obj, class_logits...]; all of
    tx/ty/tw/th/obj are treated as logits and squashed with sigmoid.
    w/h are interpreted according to `wh_relative_to`:
      - "image": sigmoid(tw) * image_size
      - "cell":  sigmoid(tw) * (image_size / S)

    Args:
        grid_pred: prediction tensor (4-D single image or 5-D batch); may live
            on any device — it is detached and moved to CPU before decoding.
        S: grid size; inferred from the tensor when None.
        image_size: side length in pixels of the (square) network input.
        conf_thresh: detections with obj_prob * class_prob below this are dropped.
        iou_thresh: IoU threshold for per-image NMS.
        wh_relative_to: "image" or "cell" (see above).

    Returns:
        Single image: list of dicts {'bbox': [x1, y1, x2, y2], 'label': int, 'score': float}.
        Batch: one such list per image.

    Raises:
        ValueError: if grid_pred is neither 4-D nor 5-D.
    """
    single_image = False
    if grid_pred.dim() == 4:
        grid = grid_pred.unsqueeze(0)
        single_image = True
    elif grid_pred.dim() == 5:
        grid = grid_pred
    else:
        raise ValueError(f"Unexpected pred dim {grid_pred.dim()}, expected 4 or 5.")
    # FIX: decode on a detached CPU copy — the old code called .numpy() mid-loop,
    # which raises for CUDA tensors and tensors that require grad.
    grid = grid.detach().cpu()
    B = grid.shape[0]
    if S is None:
        S = grid.shape[1]
    C = grid.shape[-1] - 5
    results_batch = []
    for b in range(B):
        preds = grid[b]
        dets = []
        cell_size = image_size / S
        for j in range(S):          # row index (y)
            for i in range(S):      # column index (x)
                for a in range(preds.shape[2]):
                    cell = preds[j, i, a]
                    if cell.numel() < 5:
                        continue    # malformed cell; skip defensively
                    x_cell = torch.sigmoid(cell[0]).item()
                    y_cell = torch.sigmoid(cell[1]).item()
                    obj_p = torch.sigmoid(cell[4]).item()
                    if C > 0:
                        # Pure torch instead of .detach().numpy(): same result,
                        # no numpy round-trip.
                        class_probs = F.softmax(cell[5:5 + C], dim=0)
                        class_id = int(torch.argmax(class_probs).item())
                        class_p = float(class_probs[class_id].item())
                    else:
                        class_id = 0
                        class_p = 1.0
                    score = obj_p * class_p
                    if score < conf_thresh:
                        continue
                    # Cell-relative center -> absolute pixel coordinates.
                    x_abs = (i + x_cell) * cell_size
                    y_abs = (j + y_cell) * cell_size
                    # FIX: apply sigmoid directly to the tensor elements — the
                    # old code re-wrapped Python floats in torch.tensor() per box.
                    wh_scale = cell_size if wh_relative_to == "cell" else image_size
                    w_abs = torch.sigmoid(cell[2]).item() * wh_scale
                    h_abs = torch.sigmoid(cell[3]).item() * wh_scale
                    # Corner form, clipped to the image bounds.
                    x1 = max(0.0, min(image_size, x_abs - w_abs / 2.0))
                    y1 = max(0.0, min(image_size, y_abs - h_abs / 2.0))
                    x2 = max(0.0, min(image_size, x_abs + w_abs / 2.0))
                    y2 = max(0.0, min(image_size, y_abs + h_abs / 2.0))
                    if x2 <= x1 or y2 <= y1:
                        continue    # degenerate box after clipping
                    dets.append({
                        "bbox": [x1, y1, x2, y2],
                        "label": int(class_id),
                        "score": float(score),
                    })
        if dets:
            boxes = torch.tensor([d["bbox"] for d in dets], dtype=torch.float32)
            scores = torch.tensor([d["score"] for d in dets], dtype=torch.float32)
            keep = nms(boxes, scores, iou_thresh).tolist()
            dets = [dets[k] for k in keep]
        results_batch.append(dets)
    return results_batch[0] if single_image else results_batch
class ModelRunners:
    """Runs the people-detection model live on a webcam feed or on a video file.

    Frames are resized to the network's 384x384 input, decoded via
    decode_target, and drawn back onto the original-resolution frame.
    Press 'q' in the display window to stop either loop.
    """

    def __init__(self, Model):
        """Load the detector from a state-dict checkpoint at path `Model`."""
        self.__architecture = ObjectDetectionModel(1)
        # FIX: map_location='cpu' so a checkpoint saved on GPU still loads on
        # CPU-only hosts; the model is moved to the right device at run time.
        self.__checkpoint = torch.load(Model, map_location="cpu")
        self.__architecture.load_state_dict(self.__checkpoint)
        self.__threshold = 0.25         # confidence threshold passed to decode_target
        self.__imageSize = (720, 480)   # display window size (width, height)
        self.__camera = 0               # default webcam index (0 = internal)

    def set_threshold(self):
        """Prompt the user for a new confidence threshold in (0, 1)."""
        print("Note : a low threshold score can make the model very sensitive at prediction")
        print("Note : a high threshold score can make the model very selective at prediction")
        conf_threshold = float(input("set threshold score System (0.1/0.9) : "))
        # FIX: reject nonsensical thresholds instead of silently storing them.
        if not 0.0 < conf_threshold < 1.0:
            raise ValueError("threshold must be strictly between 0 and 1")
        self.__threshold = conf_threshold

    def __image_procces(self, pred_diction):
        # Decode one image's raw grid (16x16 cells, 384px input) into
        # bbox/label/score dicts using the current confidence threshold.
        return decode_target(pred_diction, S=16, image_size=384, conf_thresh=self.__threshold)

    def set_imagesize(self, size: tuple):
        """Set the display window size as a (width, height) tuple."""
        if not isinstance(size, tuple):
            raise RuntimeError("the size parameters must be default (720,480)")
        self.__imageSize = size

    def set_webcam(self, cam: int):
        """Select the webcam index: 0 = internal camera, 1 = external camera."""
        if not isinstance(cam, int):
            raise RuntimeError("WebCam input must be int (0,1)")
        if cam > 1:
            raise RuntimeError("Webcam is only available as 0 (internal) or 1 (external)")
        self.__camera = cam

    def __run_stream(self, capture):
        # Shared inference/display loop for both webcam and video-file sources
        # (previously duplicated verbatim in main_model / run_model_at_video).
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.__architecture.to(device)
        # FIX: inference mode — disables dropout and uses BN running stats.
        self.__architecture.eval()
        try:
            while True:
                ret, frame = capture.read()
                if not ret:
                    break   # end of stream / camera unplugged
                H, W = frame.shape[:2]
                img_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                img_resize = cv2.resize(img_rgb, (384, 384))
                # FIX: move the input to the model's device — the old code left
                # the tensor on CPU and crashed whenever CUDA was available.
                tensor = transform(img_resize).unsqueeze(0).to(device)
                with torch.no_grad():
                    pred = self.__architecture(tensor)
                result = self.__image_procces(pred[0])
                # Scale boxes from the 384x384 network space back to frame size.
                scaleX, scaleY = W / 384, H / 384
                for r in result:
                    xmin, ymin, xmax, ymax = r["bbox"]
                    xmin, xmax = int(xmin * scaleX), int(xmax * scaleX)
                    ymin, ymax = int(ymin * scaleY), int(ymax * scaleY)
                    conf = r["score"]
                    cv2.rectangle(frame, (xmin, ymin), (xmax, ymax), (0, 255, 0), 2)
                    cv2.putText(frame, f"{r['label']} {conf:.2f}",
                                (xmin, max(0, ymin - 5)),
                                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)
                frame = cv2.resize(frame, self.__imageSize)
                cv2.imshow("Live show cam", frame)
                if cv2.waitKey(1) & 0xFF == ord('q'):
                    break
        finally:
            # FIX: always release the capture and close windows, even if the
            # loop raises (previously skipped on any exception).
            capture.release()
            cv2.destroyAllWindows()

    def main_model(self):
        """Run live detection on the currently selected webcam."""
        self.__run_stream(cv2.VideoCapture(self.__camera))

    def run_model_at_video(self, path):
        """Run detection on the video file at `path`."""
        self.__run_stream(cv2.VideoCapture(path))