File size: 4,034 Bytes
59d7373
 
 
 
 
 
 
6763f66
 
 
0424e50
 
59d7373
 
 
aa38a5f
59d7373
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6763f66
 
0424e50
6763f66
 
0424e50
6763f66
0424e50
 
6763f66
0424e50
 
 
 
6763f66
 
 
 
 
 
 
 
0424e50
6763f66
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0424e50
aa38a5f
6763f66
 
 
0424e50
 
 
 
92b6df1
0424e50
92b6df1
0424e50
92b6df1
aa38a5f
 
0424e50
aa38a5f
 
 
0424e50
aa38a5f
 
 
0424e50
aa38a5f
 
 
 
92b6df1
aa38a5f
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
import torch
from fastai.vision.all import load_learner
from ultralytics import YOLO
import cv2
from PIL import Image
import numpy as np
from collections import Counter
from pytorch_grad_cam import GradCAM
from pytorch_grad_cam.utils.image import show_cam_on_image
import numpy as np
from collections import defaultdict

# Inference is pinned to CPU for portability; no CUDA assumptions below.
device = torch.device("cpu")

# Both models are loaded once at import time (side effect: reads files from
# the working directory — "best_model.pkl" and "yolov8n.pt" must exist there).
learner = load_learner("best_model.pkl", cpu=True)  # fastai classifier
yolo_model = YOLO("yolov8n.pt")  # YOLOv8-nano detector, used to find face boxes

def predict_video(video_path, max_frames=10):
    """Classify a video by majority vote over per-detection predictions.

    Reads up to ``max_frames`` frames from the start of the video, runs the
    YOLO detector on each, crops every detected box, classifies each crop
    with the fastai ``learner``, then majority-votes across all crops.

    Args:
        video_path: Path to a video file readable by ``cv2.VideoCapture``.
        max_frames: Maximum number of frames to sample (default 10).

    Returns:
        ``(label, confidence)`` — the winning class label from the learner's
        vocab and the mean confidence over the crops that voted for it, or
        ``("unknown", 0.0)`` when no usable detection was found.
    """
    cap = cv2.VideoCapture(video_path)
    frame_count = 0
    preds_list = []
    conf_list = []

    try:
        while cap.isOpened() and frame_count < max_frames:
            ret, frame = cap.read()
            if not ret:
                break

            results = yolo_model.predict(frame, conf=0.5, verbose=False)
            boxes = results[0].boxes.xyxy
            frame_h, frame_w = frame.shape[:2]

            for box in boxes:
                x1, y1, x2, y2 = map(int, box.tolist())
                # Clamp to frame bounds: YOLO boxes can poke slightly outside
                # the image, and a negative index would WRAP under numpy
                # slicing, yielding a garbage crop instead of an empty one.
                x1, y1 = max(0, x1), max(0, y1)
                x2, y2 = min(frame_w, x2), min(frame_h, y2)
                face = frame[y1:y2, x1:x2]
                if face.size == 0:
                    continue

                # OpenCV frames are BGR; PIL/fastai expect RGB.
                img = Image.fromarray(cv2.cvtColor(face, cv2.COLOR_BGR2RGB))

                _, _, probs = learner.predict(img)
                pred_class = probs.argmax().item()
                conf = probs[pred_class].item()

                preds_list.append(pred_class)
                conf_list.append(conf)

            frame_count += 1
    finally:
        # Release the capture even if YOLO / fastai raises mid-loop.
        cap.release()

    if not preds_list:
        return "unknown", 0.0

    # 🗳️ Majority Vote: winning class, with confidence averaged over the
    # crops that voted for it (cast to plain float, not np.float64).
    final_pred = Counter(preds_list).most_common(1)[0][0]
    final_conf = float(np.mean(
        [conf for pred, conf in zip(preds_list, conf_list) if pred == final_pred]
    ))
    label = learner.dls.vocab[final_pred]
    return label, final_conf

def predict_video_with_cam(video_path, max_frames=5):
    """Classify a video and render a Grad-CAM overlay for each analysed frame.

    For up to ``max_frames`` frames: take the first YOLO detection, classify
    the crop with ``learner``, and produce a Grad-CAM heatmap over the crop.
    The final label is the class with the highest mean confidence.

    Args:
        video_path: Path to a video file readable by ``cv2.VideoCapture``.
        max_frames: Maximum number of frames to read (default 5).

    Returns:
        ``(results, final_label, final_conf, frames_with_faces, max_frames)``
        where ``results`` is a list of ``(label, conf, cam_image)`` tuples.
        ``([], "unknown", 0.0, 0, max_frames)`` when no usable crop was found.
    """
    cap = cv2.VideoCapture(video_path)
    learner.model.eval()

    def get_last_conv_layer(m):
        # Walk modules back-to-front so the *last* Conv2d wins.
        for _, module in reversed(list(m.named_modules())):
            if isinstance(module, torch.nn.Conv2d):
                return module
        raise Exception("No Conv2d layer found")

    # The model is fixed for the whole video, so resolve the target layer and
    # build the GradCAM object ONCE (the original rebuilt both per frame).
    target_layer = get_last_conv_layer(learner.model)
    cam = GradCAM(model=learner.model, target_layers=[target_layer])

    results = []
    preds_list = []
    confs_list = []

    frame_index = 0
    frames_with_faces = 0

    try:
        while cap.isOpened() and frame_index < max_frames:
            ret, frame = cap.read()
            if not ret:
                break
            # Count every frame read up front; the original incremented at the
            # bottom of the loop and a `continue` skipped it, so more than
            # max_frames frames could be processed.
            frame_index += 1

            detections = yolo_model.predict(frame, conf=0.5, verbose=False)
            boxes = detections[0].boxes.xyxy
            if len(boxes) == 0:
                continue

            x1, y1, x2, y2 = map(int, boxes[0].tolist())
            face = frame[y1:y2, x1:x2]
            if face.size == 0:
                # Do NOT count an empty crop: the original incremented
                # frames_with_faces before this check, so a video with only
                # empty crops passed the zero-guard and max() below raised
                # ValueError on an empty dict.
                continue
            frames_with_faces += 1

            # OpenCV frames are BGR; PIL/fastai expect RGB.
            img = Image.fromarray(cv2.cvtColor(face, cv2.COLOR_BGR2RGB))
            _, _, probs = learner.predict(img)
            pred_class = probs.argmax().item()
            conf = probs[pred_class].item()

            # Re-run the crop through the learner's own pipeline so the CAM
            # input matches the training-time preprocessing.
            input_tensor = learner.dls.test_dl([img]).one_batch()[0]
            grayscale_cam = cam(input_tensor=input_tensor)[0]

            # assumes the CAM output is 224x224 — matches the resize here;
            # TODO(review): confirm against the learner's item transforms.
            img_array = np.array(img.resize((224, 224))) / 255.0
            cam_image = show_cam_on_image(img_array, grayscale_cam, use_rgb=True)

            label = learner.dls.vocab[pred_class]
            results.append((label, conf, cam_image))
            preds_list.append(pred_class)
            confs_list.append(conf)
    finally:
        # Release the capture even if YOLO / fastai / Grad-CAM raises.
        cap.release()

    if not preds_list:
        return [], "unknown", 0.0, 0, max_frames

    # Per-class mean confidence; the class with the highest mean wins.
    class_scores = defaultdict(float)
    class_counts = defaultdict(int)

    for pred, conf in zip(preds_list, confs_list):
        class_scores[pred] += conf
        class_counts[pred] += 1

    weighted_avg = {c: class_scores[c] / class_counts[c] for c in class_scores}
    final_pred = max(weighted_avg, key=weighted_avg.get)
    final_conf = weighted_avg[final_pred]
    final_label = learner.dls.vocab[final_pred]

    return results, final_label, final_conf, frames_with_faces, max_frames