# Hugging Face Space file-viewer header (scrape residue, preserved as comments):
# Spaces: Sleeping
# Sleeping
# File size: 4,034 Bytes
import torch
from fastai.vision.all import load_learner
from ultralytics import YOLO
import cv2
from PIL import Image
import numpy as np
from collections import Counter
from pytorch_grad_cam import GradCAM
from pytorch_grad_cam.utils.image import show_cam_on_image
import numpy as np
from collections import defaultdict
# Inference runs on CPU only — no GPU is assumed in the deployment environment.
device = torch.device("cpu")
# Exported fastai classifier; presumably a real/fake face classifier given the
# face-crop usage below — TODO confirm against the training code.
learner = load_learner("best_model.pkl", cpu=True)
# YOLOv8-nano detector used to find faces in frames.
# NOTE(review): yolov8n.pt is a general COCO object detector, not a dedicated
# face detector — confirm this is intentional.
yolo_model = YOLO("yolov8n.pt")
def predict_video(video_path, max_frames=10):
    """Classify a video by majority vote over per-detection predictions.

    Reads up to ``max_frames`` frames from ``video_path``, detects
    faces/objects with the module-level YOLO model, classifies each crop
    with the module-level fastai learner, and aggregates the results.

    Args:
        video_path: Path to the input video file.
        max_frames: Maximum number of frames to sample from the start.

    Returns:
        ``(label, confidence)`` — the majority-vote class label and the
        mean confidence over the detections that voted for it, or
        ``("unknown", 0.0)`` when nothing could be classified.
    """
    cap = cv2.VideoCapture(video_path)
    preds_list = []
    conf_list = []
    frame_count = 0
    try:
        while cap.isOpened() and frame_count < max_frames:
            ret, frame = cap.read()
            if not ret:
                break
            results = yolo_model.predict(frame, conf=0.5, verbose=False)
            for box in results[0].boxes.xyxy:
                x1, y1, x2, y2 = map(int, box.tolist())
                face = frame[y1:y2, x1:x2]
                # Skip degenerate (zero-area) crops.
                if face.size == 0:
                    continue
                # Crop comes from OpenCV in BGR order; convert for PIL/fastai.
                img = Image.fromarray(cv2.cvtColor(face, cv2.COLOR_BGR2RGB))
                _, _, probs = learner.predict(img)
                pred_class = probs.argmax().item()
                preds_list.append(pred_class)
                conf_list.append(probs[pred_class].item())
            frame_count += 1
    finally:
        # Release the capture even if detection/classification raises
        # (the original leaked the VideoCapture on any exception).
        cap.release()
    if not preds_list:
        return "unknown", 0.0
    # Majority vote across all classified detections.
    final_pred = Counter(preds_list).most_common(1)[0][0]
    # Mean confidence restricted to detections that voted for the winner;
    # cast to a plain float rather than returning np.float64.
    final_conf = float(np.mean(
        [c for p, c in zip(preds_list, conf_list) if p == final_pred]
    ))
    return learner.dls.vocab[final_pred], final_conf
def predict_video_with_cam(video_path, max_frames=5):
    """Classify a video and render a Grad-CAM overlay for each detected face.

    For up to ``max_frames`` frames: detect with YOLO, classify the crop of
    the first detection with the fastai learner, and produce a Grad-CAM
    heatmap over that crop. The final prediction is the class with the
    highest mean confidence across frames.

    Args:
        video_path: Path to the input video file.
        max_frames: Maximum number of frames to examine.

    Returns:
        ``(results, label, confidence, frames_with_faces, max_frames)`` where
        ``results`` is a list of ``(label, conf, cam_image)`` per classified
        frame and ``frames_with_faces`` counts frames with >= 1 detection.
        Returns ``([], "unknown", 0.0, frames_with_faces, max_frames)`` when
        nothing could be classified.
    """
    def _last_conv_layer(model):
        # Grad-CAM needs a convolutional target layer; use the deepest Conv2d.
        for _, module in reversed(list(model.named_modules())):
            if isinstance(module, torch.nn.Conv2d):
                return module
        raise Exception("No Conv2d layer found")

    cap = cv2.VideoCapture(video_path)
    learner.model.eval()
    # Build the Grad-CAM extractor once. The original rebuilt it inside the
    # frame loop, re-registering model hooks on every face-bearing frame.
    cam = GradCAM(model=learner.model,
                  target_layers=[_last_conv_layer(learner.model)])
    results = []
    preds_list = []
    confs_list = []
    frame_index = 0
    frames_with_faces = 0
    try:
        while cap.isOpened() and frame_index < max_frames:
            ret, frame = cap.read()
            if not ret:
                break
            detections = yolo_model.predict(frame, conf=0.5, verbose=False)
            boxes = detections[0].boxes.xyxy
            if len(boxes) > 0:
                frames_with_faces += 1
                # Only the first detection in each frame is analysed.
                x1, y1, x2, y2 = map(int, boxes[0].tolist())
                face = frame[y1:y2, x1:x2]
                if face.size == 0:
                    # Degenerate crop: still count the frame toward max_frames.
                    # (The original `continue` skipped the counter, letting the
                    # loop scan past max_frames.)
                    frame_index += 1
                    continue
                # OpenCV crop is BGR; convert for PIL/fastai.
                img = Image.fromarray(cv2.cvtColor(face, cv2.COLOR_BGR2RGB))
                _, _, probs = learner.predict(img)
                pred_class = probs.argmax().item()
                conf = probs[pred_class].item()
                # Run the crop through the learner's own preprocessing so the
                # CAM input matches what the model actually saw.
                input_tensor = learner.dls.test_dl([img]).one_batch()[0]
                grayscale_cam = cam(input_tensor=input_tensor)[0]
                img_array = np.array(img.resize((224, 224))) / 255.0
                cam_image = show_cam_on_image(img_array, grayscale_cam,
                                              use_rgb=True)
                results.append((learner.dls.vocab[pred_class], conf, cam_image))
                preds_list.append(pred_class)
                confs_list.append(conf)
            frame_index += 1
    finally:
        # Release the capture even if detection/CAM computation raises.
        cap.release()
    if not preds_list:
        # Covers both "no detections at all" and "all crops were degenerate".
        # The original only checked frames_with_faces == 0 and crashed with
        # `max()` over an empty dict in the second case.
        return [], "unknown", 0.0, frames_with_faces, max_frames
    # Mean confidence per class; the winner is the class with the highest mean.
    class_scores = defaultdict(float)
    class_counts = defaultdict(int)
    for pred, conf in zip(preds_list, confs_list):
        class_scores[pred] += conf
        class_counts[pred] += 1
    weighted_avg = {c: class_scores[c] / class_counts[c] for c in class_scores}
    final_pred = max(weighted_avg, key=weighted_avg.get)
    return (results, learner.dls.vocab[final_pred], weighted_avg[final_pred],
            frames_with_faces, max_frames)