nayan90k committed on
Commit
0fba8bd
·
verified ·
1 Parent(s): 159f25c

Upload 5 files

Browse files
Files changed (5) hide show
  1. app.py +121 -0
  2. best.pt +3 -0
  3. inference_core.py +363 -0
  4. labels.json +7 -0
  5. requirements.txt +6 -0
app.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ from __future__ import annotations
3
+
4
+ import os
5
+ from pathlib import Path
6
+
7
+ import cv2
8
+ import gradio as gr
9
+
10
+ from inference_core import MaskClassifierPyTorch, parse_class_thresholds
11
+
12
+
13
+ MODEL_PATH = Path("best.pt")
14
+
15
+ def _env_float(name: str, default: float) -> float:
16
+ value = os.getenv(name)
17
+ if value is None or value.strip() == "":
18
+ return default
19
+ return float(value)
20
+
21
+
22
+ classifier = MaskClassifierPyTorch(
23
+ model_path=MODEL_PATH,
24
+ use_mediapipe=True,
25
+ min_top_confidence=_env_float("MASK_MIN_TOP_CONFIDENCE", 0.0),
26
+ min_margin=_env_float("MASK_MIN_MARGIN", 0.0),
27
+ class_thresholds=parse_class_thresholds(os.getenv("MASK_CLASS_THRESHOLDS")),
28
+ reject_label="uncertain",
29
+ )
30
+
31
+
32
+ def predict_image(input_image):
33
+ if input_image is None:
34
+ return "No image provided", None
35
+
36
+ image_bgr = cv2.cvtColor(input_image, cv2.COLOR_RGB2BGR)
37
+ result = classifier.predict_from_bgr(image_bgr)
38
+ if not result["ok"]:
39
+ return result["error"], None
40
+
41
+ scores = result["scores"]
42
+ text = f"Label: {result['label']}\nConfidence: {result['confidence']:.4f}"
43
+ return text, scores
44
+
45
+
46
+ def predict_video(video_path: str, sample_every_n_frames: int):
47
+ if not video_path:
48
+ return "No video provided"
49
+
50
+ cap = cv2.VideoCapture(video_path)
51
+ if not cap.isOpened():
52
+ return "Could not open video"
53
+
54
+ frame_idx = 0
55
+ preds = []
56
+ while True:
57
+ ok, frame = cap.read()
58
+ if not ok:
59
+ break
60
+ if frame_idx % max(1, int(sample_every_n_frames)) == 0:
61
+ result = classifier.predict_from_bgr(frame)
62
+ if result["ok"]:
63
+ preds.append(result)
64
+ frame_idx += 1
65
+ cap.release()
66
+
67
+ if not preds:
68
+ return "No detectable face found in sampled frames"
69
+
70
+ counts = {}
71
+ conf_sum = {}
72
+ for p in preds:
73
+ label = p["label"]
74
+ counts[label] = counts.get(label, 0) + 1
75
+ conf_sum[label] = conf_sum.get(label, 0.0) + p["confidence"]
76
+
77
+ non_uncertain_counts = {k: v for k, v in counts.items() if k != "uncertain"}
78
+ if non_uncertain_counts:
79
+ top_label = max(non_uncertain_counts, key=non_uncertain_counts.get)
80
+ avg_conf = conf_sum[top_label] / counts[top_label]
81
+ else:
82
+ top_label = "uncertain"
83
+ avg_conf = conf_sum[top_label] / counts[top_label]
84
+
85
+ lines = [
86
+ f"Frames scanned: {frame_idx}",
87
+ f"Frames predicted: {len(preds)}",
88
+ f"Final label: {top_label}",
89
+ f"Avg confidence: {avg_conf:.4f}",
90
+ f"Label counts: {counts}",
91
+ ]
92
+ return "\n".join(lines)
93
+
94
+
95
+ with gr.Blocks(title="Face Mask Detection") as demo:
96
+ gr.Markdown("# Face Mask Detection (MobileNetV2 + ONNX INT8)")
97
+ gr.Markdown("Upload an image or video to run mask classification.")
98
+
99
+ with gr.Tab("Image"):
100
+ image_input = gr.Image(type="numpy", label="Input Image")
101
+ image_btn = gr.Button("Predict")
102
+ image_text = gr.Textbox(label="Result")
103
+ image_scores = gr.Label(label="Class Probabilities")
104
+ image_btn.click(
105
+ fn=predict_image, inputs=[image_input], outputs=[image_text, image_scores]
106
+ )
107
+
108
+ with gr.Tab("Video"):
109
+ video_input = gr.Video(label="Input Video")
110
+ frame_stride = gr.Slider(
111
+ minimum=1, maximum=60, value=15, step=1, label="Sample every N frames"
112
+ )
113
+ video_btn = gr.Button("Predict")
114
+ video_text = gr.Textbox(label="Result", lines=8)
115
+ video_btn.click(
116
+ fn=predict_video, inputs=[video_input, frame_stride], outputs=[video_text]
117
+ )
118
+
119
+
120
+ if __name__ == "__main__":
121
+ demo.launch()
best.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f58b6f7a07858598c15df1ed4595df96ad0936e224edb01b730c09fe90e58641
3
+ size 27063625
inference_core.py ADDED
@@ -0,0 +1,363 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ from __future__ import annotations
3
+
4
+ import json
5
+ from pathlib import Path
6
+
7
+ import cv2
8
+ import numpy as np
9
+ import torch
10
+ import torch.nn as nn
11
+ from torchvision import models
12
+
13
+
14
+ def parse_class_thresholds(spec: str | None) -> dict[str, float]:
15
+ if not spec:
16
+ return {}
17
+
18
+ thresholds: dict[str, float] = {}
19
+ items = [item.strip() for item in spec.split(",") if item.strip()]
20
+ for item in items:
21
+ if "=" not in item:
22
+ raise ValueError(
23
+ f"Invalid threshold item '{item}'. Expected format: class=value"
24
+ )
25
+ label, raw_value = item.split("=", 1)
26
+ label = label.strip()
27
+ if not label:
28
+ raise ValueError("Class label cannot be empty in class threshold spec")
29
+ value = float(raw_value.strip())
30
+ if value < 0.0 or value > 1.0:
31
+ raise ValueError(f"Threshold for '{label}' must be in [0, 1], got {value}")
32
+ thresholds[label] = value
33
+ return thresholds
34
+
35
+
36
+ class FaceDetector:
37
+ def __init__(self, use_mediapipe: bool = True, min_confidence: float = 0.5):
38
+ self._backend = "none"
39
+ self._detector = None
40
+ self._min_confidence = min_confidence
41
+
42
+ if use_mediapipe:
43
+ try:
44
+ import mediapipe as mp
45
+
46
+ self._detector = mp.solutions.face_detection.FaceDetection(
47
+ model_selection=1,
48
+ min_detection_confidence=min_confidence,
49
+ )
50
+ self._backend = "mediapipe"
51
+ except Exception:
52
+ self._detector = None
53
+
54
+ def close(self) -> None:
55
+ if self._backend == "mediapipe" and self._detector is not None:
56
+ self._detector.close()
57
+
58
+ def _largest_bbox(self, detections, width: int, height: int):
59
+ largest = None
60
+ largest_det = None
61
+ area_max = -1.0
62
+ for d in detections:
63
+ bbox = d.location_data.relative_bounding_box
64
+ w = max(0.0, bbox.width) * width
65
+ h = max(0.0, bbox.height) * height
66
+ area = w * h
67
+ if area > area_max:
68
+ area_max = area
69
+ largest = bbox
70
+ largest_det = d
71
+ return largest, largest_det
72
+
73
+ def detect_largest_face_with_meta(self, image_bgr: np.ndarray, margin: float = 0.2):
74
+ h, w = image_bgr.shape[:2]
75
+
76
+ if self._backend != "mediapipe" or self._detector is None:
77
+ meta = {"bbox": [0, 0, w, h], "keypoints": []}
78
+ return image_bgr, meta
79
+
80
+ rgb = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB)
81
+ result = self._detector.process(rgb)
82
+ if not result.detections:
83
+ return None, None
84
+
85
+ bbox, detection = self._largest_bbox(result.detections, w, h)
86
+ if bbox is None:
87
+ return None, None
88
+
89
+ x = bbox.xmin * w
90
+ y = bbox.ymin * h
91
+ bw = bbox.width * w
92
+ bh = bbox.height * h
93
+ cx = x + bw / 2.0
94
+ cy = y + bh / 2.0
95
+ side = max(bw, bh) * (1.0 + margin)
96
+
97
+ x1 = int(max(0, cx - side / 2.0))
98
+ y1 = int(max(0, cy - side / 2.0))
99
+ x2 = int(min(w, cx + side / 2.0))
100
+ y2 = int(min(h, cy + side / 2.0))
101
+ if x2 <= x1 or y2 <= y1:
102
+ return None, None
103
+
104
+ keypoints = []
105
+ if detection is not None:
106
+ for kp in detection.location_data.relative_keypoints:
107
+ keypoints.append([int(kp.x * w), int(kp.y * h)])
108
+
109
+ meta = {"bbox": [x1, y1, x2, y2], "keypoints": keypoints}
110
+ return image_bgr[y1:y2, x1:x2], meta
111
+
112
+ def detect_largest_face(
113
+ self, image_bgr: np.ndarray, margin: float = 0.2
114
+ ) -> np.ndarray | None:
115
+ crop, _ = self.detect_largest_face_with_meta(image_bgr=image_bgr, margin=margin)
116
+ return crop
117
+
118
+
119
+ class MaskClassifierONNX:
120
+ def __init__(
121
+ self,
122
+ model_path: Path,
123
+ labels_path: Path | None = None,
124
+ use_mediapipe: bool = True,
125
+ min_top_confidence: float = 0.0,
126
+ min_margin: float = 0.0,
127
+ class_thresholds: dict[str, float] | None = None,
128
+ reject_label: str = "uncertain",
129
+ ):
130
+ self.model_path = Path(model_path)
131
+ providers = ["CPUExecutionProvider"]
132
+ self.session = ort.InferenceSession(str(self.model_path), providers=providers)
133
+ self.input_name = self.session.get_inputs()[0].name
134
+ self.output_name = self.session.get_outputs()[0].name
135
+ self.class_names = self._load_class_names(labels_path)
136
+ self.detector = FaceDetector(use_mediapipe=use_mediapipe)
137
+ self.min_top_confidence = float(min_top_confidence)
138
+ self.min_margin = float(min_margin)
139
+ self.class_thresholds = dict(class_thresholds or {})
140
+ self.reject_label = reject_label
141
+
142
+ if self.min_top_confidence < 0.0 or self.min_top_confidence > 1.0:
143
+ raise ValueError("min_top_confidence must be in [0, 1]")
144
+ if self.min_margin < 0.0 or self.min_margin > 1.0:
145
+ raise ValueError("min_margin must be in [0, 1]")
146
+ for label, value in self.class_thresholds.items():
147
+ if value < 0.0 or value > 1.0:
148
+ raise ValueError(
149
+ f"class threshold for '{label}' must be in [0, 1], got {value}"
150
+ )
151
+
152
+ def _load_class_names(self, labels_path: Path | None) -> list[str]:
153
+ candidate = labels_path
154
+ if candidate is None:
155
+ candidate = self.model_path.with_suffix(".labels.json")
156
+ if candidate.exists():
157
+ payload = json.loads(candidate.read_text(encoding="utf-8"))
158
+ if isinstance(payload, list):
159
+ return payload
160
+ if isinstance(payload, dict) and "class_names" in payload:
161
+ return list(payload["class_names"])
162
+ return ["with_mask", "incorrect_mask", "without_mask"]
163
+
164
+ @staticmethod
165
+ def preprocess(image_bgr: np.ndarray, image_size: int = 224) -> np.ndarray:
166
+ img = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB)
167
+ img = cv2.resize(img, (image_size, image_size), interpolation=cv2.INTER_AREA)
168
+ arr = img.astype(np.float32) / 255.0
169
+ mean = np.array([0.485, 0.456, 0.406], dtype=np.float32)
170
+ std = np.array([0.229, 0.224, 0.225], dtype=np.float32)
171
+ arr = (arr - mean) / std
172
+ arr = np.transpose(arr, (2, 0, 1))
173
+ return np.expand_dims(arr, axis=0)
174
+
175
+ @staticmethod
176
+ def softmax(logits: np.ndarray) -> np.ndarray:
177
+ z = logits - np.max(logits, axis=1, keepdims=True)
178
+ exp = np.exp(z)
179
+ return exp / np.sum(exp, axis=1, keepdims=True)
180
+
181
+ def _apply_decision_policy(self, probs: np.ndarray) -> dict:
182
+ top_idx = int(np.argmax(probs))
183
+ top_label = self.class_names[top_idx]
184
+ top_conf = float(probs[top_idx])
185
+
186
+ if len(probs) > 1:
187
+ sorted_idx = np.argsort(probs)[::-1]
188
+ second_conf = float(probs[int(sorted_idx[1])])
189
+ margin = top_conf - second_conf
190
+ else:
191
+ margin = 1.0
192
+
193
+ if top_conf < self.min_top_confidence:
194
+ return {
195
+ "label": self.reject_label,
196
+ "decision_reason": "top_confidence_below_min",
197
+ "raw_label": top_label,
198
+ "raw_confidence": top_conf,
199
+ "margin": float(margin),
200
+ }
201
+
202
+ class_threshold = self.class_thresholds.get(top_label)
203
+ if class_threshold is not None and top_conf < class_threshold:
204
+ return {
205
+ "label": self.reject_label,
206
+ "decision_reason": "class_threshold_not_met",
207
+ "raw_label": top_label,
208
+ "raw_confidence": top_conf,
209
+ "margin": float(margin),
210
+ }
211
+
212
+ if margin < self.min_margin:
213
+ return {
214
+ "label": self.reject_label,
215
+ "decision_reason": "margin_below_min",
216
+ "raw_label": top_label,
217
+ "raw_confidence": top_conf,
218
+ "margin": float(margin),
219
+ }
220
+
221
+ return {
222
+ "label": top_label,
223
+ "decision_reason": "accepted",
224
+ "raw_label": top_label,
225
+ "raw_confidence": top_conf,
226
+ "margin": float(margin),
227
+ }
228
+
229
+ def predict_from_bgr(self, image_bgr: np.ndarray) -> dict:
230
+ face, meta = self.detector.detect_largest_face_with_meta(image_bgr, margin=0.2)
231
+ if face is None:
232
+ return {
233
+ "ok": False,
234
+ "error": "No face detected",
235
+ "label": None,
236
+ "confidence": None,
237
+ "scores": None,
238
+ "face_bbox": None,
239
+ "face_keypoints": None,
240
+ }
241
+
242
+ inp = self.preprocess(face)
243
+ logits = self.session.run([self.output_name], {self.input_name: inp})[0]
244
+ probs = self.softmax(logits)[0]
245
+ policy = self._apply_decision_policy(probs)
246
+
247
+ return {
248
+ "ok": True,
249
+ "label": policy["label"],
250
+ "confidence": policy["raw_confidence"],
251
+ "raw_label": policy["raw_label"],
252
+ "raw_confidence": policy["raw_confidence"],
253
+ "margin": policy["margin"],
254
+ "decision_reason": policy["decision_reason"],
255
+ "scores": {
256
+ name: float(probs[i]) for i, name in enumerate(self.class_names)
257
+ },
258
+ "face_bbox": meta.get("bbox") if meta else None,
259
+ "face_keypoints": meta.get("keypoints") if meta else None,
260
+ }
261
+
262
+
263
+ class MaskClassifierPyTorch:
264
+ def __init__(
265
+ self,
266
+ model_path,
267
+ labels_path = None,
268
+ use_mediapipe: bool = True,
269
+ min_top_confidence: float = 0.0,
270
+ min_margin: float = 0.0,
271
+ class_thresholds: dict = None,
272
+ reject_label: str = "uncertain",
273
+ ):
274
+ self.model_path = Path(model_path)
275
+ self.class_names = self._load_class_names(labels_path)
276
+ self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
277
+
278
+ self.model = models.mobilenet_v2(weights=None)
279
+ self.model.classifier[1] = nn.Linear(self.model.classifier[1].in_features, len(self.class_names))
280
+ state_dict = torch.load(self.model_path, map_location=self.device)
281
+ if 'model_state_dict' in state_dict:
282
+ state_dict = state_dict['model_state_dict']
283
+ self.model.load_state_dict(state_dict)
284
+ self.model.to(self.device)
285
+ self.model.eval()
286
+
287
+ self.detector = FaceDetector(use_mediapipe=use_mediapipe)
288
+ self.min_top_confidence = float(min_top_confidence)
289
+ self.min_margin = float(min_margin)
290
+ self.class_thresholds = dict(class_thresholds or {})
291
+ self.reject_label = reject_label
292
+
293
+ def _load_class_names(self, labels_path) -> list[str]:
294
+ candidate = labels_path
295
+ if candidate is None:
296
+ candidate = Path("labels.json")
297
+ if candidate.exists():
298
+ payload = json.loads(candidate.read_text(encoding="utf-8"))
299
+ if isinstance(payload, list):
300
+ return payload
301
+ if isinstance(payload, dict) and "class_names" in payload:
302
+ return list(payload["class_names"])
303
+ return ["with_mask", "incorrect_mask", "without_mask"]
304
+
305
+ @staticmethod
306
+ def preprocess(image_bgr: np.ndarray, image_size: int = 224) -> np.ndarray:
307
+ img = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB)
308
+ img = cv2.resize(img, (image_size, image_size), interpolation=cv2.INTER_AREA)
309
+ arr = img.astype(np.float32) / 255.0
310
+ mean = np.array([0.485, 0.456, 0.406], dtype=np.float32)
311
+ std = np.array([0.229, 0.224, 0.225], dtype=np.float32)
312
+ arr = (arr - mean) / std
313
+ arr = np.transpose(arr, (2, 0, 1))
314
+ return np.expand_dims(arr, axis=0)
315
+
316
+ def _apply_decision_policy(self, probs: np.ndarray) -> dict:
317
+ top_idx = int(np.argmax(probs))
318
+ top_label = self.class_names[top_idx]
319
+ top_conf = float(probs[top_idx])
320
+
321
+ if len(probs) > 1:
322
+ sorted_idx = np.argsort(probs)[::-1]
323
+ second_conf = float(probs[int(sorted_idx[1])])
324
+ margin = top_conf - second_conf
325
+ else:
326
+ margin = 1.0
327
+
328
+ if top_conf < self.min_top_confidence:
329
+ return {"label": self.reject_label, "raw_label": top_label, "raw_confidence": top_conf}
330
+
331
+ class_threshold = self.class_thresholds.get(top_label)
332
+ if class_threshold is not None and top_conf < class_threshold:
333
+ return {"label": self.reject_label, "raw_label": top_label, "raw_confidence": top_conf}
334
+
335
+ if margin < self.min_margin:
336
+ return {"label": self.reject_label, "raw_label": top_label, "raw_confidence": top_conf}
337
+
338
+ return {"label": top_label, "raw_label": top_label, "raw_confidence": top_conf}
339
+
340
+ def predict_from_bgr(self, image_bgr: np.ndarray) -> dict:
341
+ face, meta = self.detector.detect_largest_face_with_meta(image_bgr, margin=0.2)
342
+ if face is None:
343
+ return {"ok": False, "error": "No face detected"}
344
+
345
+ inp = self.preprocess(face)
346
+ tensor_inp = torch.from_numpy(inp).to(self.device).float()
347
+
348
+ with torch.no_grad():
349
+ outputs = self.model(tensor_inp)
350
+ probs = torch.nn.functional.softmax(outputs[0], dim=0).cpu().numpy()
351
+
352
+ policy = self._apply_decision_policy(probs)
353
+
354
+ scores = {}
355
+ for i, class_name in enumerate(self.class_names):
356
+ scores[class_name] = float(probs[i])
357
+
358
+ return {
359
+ "ok": True,
360
+ "label": policy["label"],
361
+ "confidence": policy["raw_confidence"],
362
+ "scores": scores
363
+ }
labels.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "class_names": [
3
+ "incorrect_mask",
4
+ "with_mask",
5
+ "without_mask"
6
+ ]
7
+ }
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ gradio
2
+ numpy
3
+ opencv-python-headless
4
+ mediapipe
5
+ torch
6
+ torchvision