# pvs_backend/src/components/phone_detector.py
# adnankhan-11 - "PVD System - Initial deployment" (commit d2885a7)
"""
This file runs phone detection on hand crops using YOLO.
It supports both your custom trained model and the fallback pretrained model.
The file only handles phone prediction logic, not posture logic or pose extraction.
This keeps runtime inference clean and matches your original project behavior.
"""
from pathlib import Path
import numpy as np
import torch
from src.components.phone_model_loader import PhoneModelLoader
from src.entity.config_entity import PhoneDetectorConfig
from src.utils.common import resolve_device
from src.utils.logger import get_logger
class PhoneDetector:
    """
    Detect phone presence inside a cropped hand frame.

    The class owns a ``PhoneModelLoader`` and reduces the raw model output for
    one crop to a short text label plus the best-scoring phone box.
    """

    def __init__(
        self,
        config: PhoneDetectorConfig,
        log_dir: Path | None = None,
        log_level: str = "INFO",
    ) -> None:
        """
        Store the config and build the logger, device and model loader.

        Args:
            config: Detector configuration. ``config.inference`` supplies the
                phone class indices and confidence thresholds read during
                inference.
            log_dir: Optional log directory, forwarded to the logger and to
                the model loader.
            log_level: Log level name, forwarded to the logger and to the
                model loader.
        """
        self.config = config
        self.logger = get_logger(
            self.__class__.__name__, log_dir=log_dir, level=log_level
        )
        # Input tensors are moved to this device before the model is called.
        self.device = resolve_device("auto")
        self.loader = PhoneModelLoader(
            config=config, log_dir=log_dir, log_level=log_level
        )

    def load(self, use_trained: bool = True):
        """
        Load requested YOLO model.

        Args:
            use_trained: Forwarded unchanged to ``PhoneModelLoader.get_model``
                (presumably True selects the custom-trained model and False
                the fallback one - confirm against the loader).

        Returns:
            Whatever ``PhoneModelLoader.get_model`` returns.
        """
        return self.loader.get_model(use_trained=use_trained)

    def detect_phone(
        self,
        model,
        frame_rgb: np.ndarray,
        use_trained: bool = True,
    ) -> tuple[str, np.ndarray | None]:
        """
        Run phone detection on one RGB hand crop.

        Args:
            model: Callable model, e.g. the object returned by ``load``. It is
                called with a single float tensor and must return a sequence
                whose first item exposes ``boxes.cls``, ``boxes.conf`` and
                ``boxes.data`` tensors.
            frame_rgb: Crop with exactly three dimensions. The code divides by
                255 and moves the last axis to the front, i.e. it treats the
                input as height x width x channels with 0-255 values.
            use_trained: Selects which phone class index and confidence
                threshold are read from ``config.inference`` (the
                ``*_trained`` pair when True, the ``*_fallback`` pair when
                False).

        Returns:
            - detection text (``"Phone: <confidence>"``), or ``""`` when no
              phone passes the threshold
            - relative xyxy phone box (first four values of the best box's row
              in ``boxes.data``) or None

        Raises:
            ValueError: If ``frame_rgb`` does not have exactly three dimensions.
        """
        if frame_rgb is None:
            return "", None
        if frame_rgb.ndim != 3:
            raise ValueError(
                f"Expected RGB frame with 3 dims, got shape: {frame_rgb.shape}"
            )
        if frame_rgb.size == 0:
            # A zero-area crop cannot contain a phone; skip the model call
            # instead of feeding it an empty tensor.
            return "", None

        # torch.from_numpy rejects arrays with negative strides (for example a
        # channel flip written as frame[..., ::-1]). ascontiguousarray returns
        # the input unchanged when it is already C-contiguous.
        pixels = np.ascontiguousarray(frame_rgb)
        tensor_frame = torch.from_numpy(pixels).float() / 255.0
        # (H, W, C) -> (1, C, H, W)
        tensor_frame = tensor_frame.permute(2, 0, 1).unsqueeze(0).to(self.device)

        results = model(tensor_frame)
        first_result = results[0]
        result_classes = first_result.boxes.cls.cpu().numpy().astype(np.int32)

        inference = self.config.inference
        if use_trained:
            phone_class_index = inference.trained_model_phone_class_index
            confidence_threshold = inference.confidence_threshold_trained
        else:
            phone_class_index = inference.fallback_model_phone_class_index
            confidence_threshold = inference.confidence_threshold_fallback

        # Build the class mask once and reuse it for every per-box array.
        phone_mask = result_classes == phone_class_index
        if not np.any(phone_mask):
            return "", None

        confidences = (
            first_result.boxes.conf.cpu().numpy().astype(np.float32)[phone_mask]
        )
        boxes = first_result.boxes.data.cpu().numpy().astype(np.float32)[phone_mask]

        # Only the highest-confidence phone box is considered.
        max_conf_index = int(np.argmax(confidences))
        best_confidence = float(confidences[max_conf_index])
        if best_confidence < confidence_threshold:
            return "", None

        detection_text = f"Phone: {best_confidence:.3f}"
        relative_xyxy = boxes[max_conf_index][:4]
        return detection_text, relative_xyxy

    def predict(self, frame_rgb: np.ndarray, use_trained: bool = True) -> dict:
        """
        Full phone detection entrypoint for one hand crop.

        Loads the requested model through ``load``, runs ``detect_phone`` and
        logs the resulting dictionary at INFO level.

        Args:
            frame_rgb: Crop passed straight to ``detect_phone``.
            use_trained: Passed to both ``load`` and ``detect_phone``.

        Returns:
            {
                "detected": bool,
                "text": str,
                "relative_xyxy": list or None,
                "used_trained_model": bool
            }
        """
        model = self.load(use_trained=use_trained)
        detection_text, relative_xyxy = self.detect_phone(
            model=model,
            frame_rgb=frame_rgb,
            use_trained=use_trained,
        )
        result = {
            "detected": relative_xyxy is not None,
            "text": detection_text,
            # Convert to a plain list so the result holds no numpy array.
            "relative_xyxy": (
                relative_xyxy.tolist() if relative_xyxy is not None else None
            ),
            "used_trained_model": use_trained,
        }
        self.logger.info("Phone detection result: %s", result)
        return result