Spaces:

adnankhan-11
/

pvs_backend

Sleeping

App Files Files Community

pvs_backend / src /components /phone_detector.py

adnankhan-11

PVD System - Initial deployment

d2885a7 about 1 month ago

raw

history blame contribute delete

4.23 kB

	"""
	This file runs phone detection on hand crops using YOLO.
	It supports both your custom trained model and the fallback pretrained model.
	The file only handles phone prediction logic, not posture logic or pose extraction.
	This keeps runtime inference clean and matches your original project behavior.
	"""

	from pathlib import Path

	import numpy as np
	import torch

	from src.components.phone_model_loader import PhoneModelLoader
	from src.entity.config_entity import PhoneDetectorConfig
	from src.utils.common import resolve_device
	from src.utils.logger import get_logger


	class PhoneDetector:
	    """
	    Detect phone presence inside a cropped hand frame.

	    The detector owns a ``PhoneModelLoader`` (used to obtain the YOLO model)
	    and reads class indices and confidence thresholds from
	    ``config.inference``. It only performs phone prediction; it does no
	    posture logic or pose extraction.
	    """

	    def __init__(
	        self,
	        config: PhoneDetectorConfig,
	        log_dir: Path | None = None,
	        log_level: str = "INFO",
	    ) -> None:
	        """
	        Store the configuration and build the logger and model loader.

	        Args:
	            config: Detector configuration. ``config.inference`` must expose
	                the class-index and confidence-threshold attributes read by
	                ``detect_phone``.
	            log_dir: Forwarded unchanged to ``get_logger`` and
	                ``PhoneModelLoader``.
	            log_level: Forwarded unchanged to ``get_logger`` and
	                ``PhoneModelLoader``.
	        """
	        self.config = config
	        self.logger = get_logger(
	            self.__class__.__name__, log_dir=log_dir, level=log_level
	        )

	        # Input tensors are moved to this device before inference.
	        # NOTE(review): presumably "auto" prefers an accelerator when one is
	        # available - confirm against resolve_device.
	        self.device = resolve_device("auto")
	        self.loader = PhoneModelLoader(
	            config=config, log_dir=log_dir, log_level=log_level
	        )

	    def load(self, use_trained: bool = True):
	        """
	        Load requested YOLO model.

	        Args:
	            use_trained: Forwarded to ``PhoneModelLoader.get_model``; selects
	                the custom trained model when True, otherwise the fallback.

	        Returns:
	            Whatever ``PhoneModelLoader.get_model`` returns.
	        """
	        return self.loader.get_model(use_trained=use_trained)

	    def detect_phone(
	        self,
	        model,
	        frame_rgb: np.ndarray,
	        use_trained: bool = True,
	    ) -> tuple[str, np.ndarray | None]:
	        """
	        Run phone detection on one RGB hand crop.

	        Args:
	            model: Callable model; it is invoked with a ``(1, C, H, W)`` float
	                tensor scaled by 1/255 and must return a sequence whose first
	                item has a ``boxes`` attribute with ``cls``, ``conf`` and
	                ``data`` tensors.
	            frame_rgb: Hand crop as a 3-dimensional array, channels last.
	                NOTE(review): assumed to be uint8 in [0, 255] - confirm with
	                the caller.
	            use_trained: Selects which class index and confidence threshold
	                are read from ``config.inference``.

	        Returns:
	            - detection text (``"Phone: <confidence>"``), or ``""`` when no
	              phone is reported
	            - relative xyxy phone box (first four values of the best box row),
	              or None

	        Raises:
	            ValueError: If ``frame_rgb`` does not have exactly 3 dimensions.
	        """
	        if frame_rgb is None:
	            return "", None

	        if frame_rgb.ndim != 3:
	            raise ValueError(
	                f"Expected RGB frame with 3 dims, got shape: {frame_rgb.shape}"
	            )

	        # A zero-sized crop holds no pixels to analyse; report "no phone"
	        # instead of letting the model fail on an empty tensor.
	        if frame_rgb.size == 0:
	            return "", None

	        # torch.from_numpy rejects arrays with negative strides (e.g. flipped
	        # or channel-reversed views); ascontiguousarray is a no-op for arrays
	        # that are already C-contiguous.
	        contiguous_frame = np.ascontiguousarray(frame_rgb)
	        tensor_frame = torch.from_numpy(contiguous_frame).float() / 255.0
	        # HWC -> CHW, then add the batch dimension.
	        tensor_frame = tensor_frame.permute(2, 0, 1).unsqueeze(0).to(self.device)

	        results = model(tensor_frame)
	        detected_boxes = results[0].boxes

	        result_classes = detected_boxes.cls.cpu().numpy().astype(np.int32)

	        inference_config = self.config.inference
	        if use_trained:
	            phone_class_index = inference_config.trained_model_phone_class_index
	            confidence_threshold = inference_config.confidence_threshold_trained
	        else:
	            phone_class_index = inference_config.fallback_model_phone_class_index
	            confidence_threshold = inference_config.confidence_threshold_fallback

	        # Compute the class mask once and reuse it for every per-box array.
	        phone_mask = result_classes == phone_class_index
	        if not np.any(phone_mask):
	            return "", None

	        confidences = (
	            detected_boxes.conf.cpu().numpy().astype(np.float32)[phone_mask]
	        )
	        phone_boxes = (
	            detected_boxes.data.cpu().numpy().astype(np.float32)[phone_mask]
	        )

	        # Keep only the most confident phone candidate.
	        max_conf_index = int(np.argmax(confidences))
	        best_confidence = float(confidences[max_conf_index])

	        if best_confidence < confidence_threshold:
	            return "", None

	        detection_text = f"Phone: {best_confidence:.3f}"
	        # The first four columns of a box row are the box coordinates.
	        relative_xyxy = phone_boxes[max_conf_index][:4]

	        return detection_text, relative_xyxy

	    def predict(self, frame_rgb: np.ndarray, use_trained: bool = True) -> dict:
	        """
	        Full phone detection entrypoint for one hand crop.

	        Loads the requested model through ``load``, runs ``detect_phone`` and
	        logs the packaged result at INFO level.

	        Returns:
	            {
	                "detected": bool,
	                "text": str,
	                "relative_xyxy": list or None,
	                "used_trained_model": bool
	            }
	        """
	        model = self.load(use_trained=use_trained)
	        detection_text, relative_xyxy = self.detect_phone(
	            model=model,
	            frame_rgb=frame_rgb,
	            use_trained=use_trained,
	        )

	        detected = relative_xyxy is not None
	        result = {
	            "detected": detected,
	            "text": detection_text,
	            # Convert to a plain list so the result is JSON-friendly.
	            "relative_xyxy": relative_xyxy.tolist() if detected else None,
	            "used_trained_model": use_trained,
	        }

	        self.logger.info("Phone detection result: %s", result)
	        return result