Spaces:

webnn
/

video-object-detection

Running

App Files Files Community

video-object-detection / src /main.js

ibelem's picture

Upload 11 files

b2edc6b verified 5 months ago

history blame contribute delete

11.3 kB

	import { AutoModel, AutoProcessor, RawImage } from "@huggingface/transformers";

	// Grab every DOM node the app reads from or writes to, looked up by element id.

	// Status read-outs
	const deviceLabel = document.querySelector("#device");
	const status = document.querySelector("#status");

	// Video surface: wrapper, box overlay, drawing canvas and the camera <video>
	const container = document.querySelector("#container");
	const overlay = document.querySelector("#overlay");
	const canvas = document.querySelector("#canvas");
	const video = document.querySelector("#video");

	// Sliders and the labels that echo their current values
	const thresholdSlider = document.querySelector("#threshold");
	const thresholdLabel = document.querySelector("#threshold-value");
	const sizeSlider = document.querySelector("#size");
	const sizeLabel = document.querySelector("#size-value");
	const scaleSlider = document.querySelector("#scale");
	const scaleLabel = document.querySelector("#scale-value");

	/**
	 * Resize the video element, the drawing canvas and the box overlay together.
	 *
	 * @param {number} width - Requested width in pixels; rounded to the nearest integer.
	 * @param {number} height - Requested height in pixels; rounded to the nearest integer.
	 */
	function setStreamSize(width, height) {
	  const roundedWidth = Math.round(width);
	  canvas.width = roundedWidth;
	  video.width = roundedWidth;

	  const roundedHeight = Math.round(height);
	  canvas.height = roundedHeight;
	  video.height = roundedHeight;

	  // Size the overlay from the canvas itself so boxes line up with the drawn frame.
	  Object.assign(overlay.style, {
	    width: `${canvas.width}px`,
	    height: `${canvas.height}px`,
	  });
	}

	// Show a loading message straight away; the model-loading and control set-up
	// code further down replaces this text as each step completes.
	status.textContent = "Loading model...";

	/**
	 * Resolve the inference settings from the optional `device` / `dtype` URL parameters.
	 *
	 * The device name is lower-cased but NOT validated here; the caller decides
	 * what to do with an unknown device.
	 *
	 * @param {?string} deviceParam - Requested device (case-insensitive). Empty or
	 *   missing values fall back to "webnn-gpu".
	 * @param {?string} dtypeParam - Requested dtype (case-insensitive). Anything other
	 *   than "fp16", "fp32" or "int8" is ignored.
	 * @returns {{device: string, dtype: string, sessionOptions: {logSeverityLevel: number}}}
	 *   The dtype defaults to "fp16" on WebNN devices and to "fp32" everywhere else.
	 */
	function getDeviceConfig(deviceParam, dtypeParam) {
	  const defaultDevice = 'webnn-gpu';
	  const defaultDtype = 'fp16';
	  const webnnDevices = ['webnn-gpu', 'webnn-cpu', 'webnn-npu'];
	  const supportedDtypes = ['fp16', 'fp32', 'int8'];

	  // `||` (not `??`) on purpose: an empty `?device=` should also use the default.
	  const device = (deviceParam || defaultDevice).toLowerCase();

	  const requestedDtype = dtypeParam ? dtypeParam.toLowerCase() : undefined;
	  const fallbackDtype = webnnDevices.includes(device) ? defaultDtype : 'fp32';
	  const dtype = supportedDtypes.includes(requestedDtype) ? requestedDtype : fallbackDtype;

	  // Forwarded to the runtime as session options; 0 is ONNX Runtime's most verbose log level.
	  const sessionOptions = { logSeverityLevel: 0 };
	  return { device, dtype, sessionOptions };
	}

	// Display names for every backend this demo can run on, keyed by device id.
	const DEVICE_LABELS = Object.freeze({
	  'webgpu': 'WebGPU',
	  'webnn-gpu': 'WebNN GPU',
	  'webnn-cpu': 'WebNN CPU',
	  'webnn-npu': 'WebNN NPU',
	});
	const FALLBACK_DEVICE = 'webnn-gpu';

	// The backend is chosen through the query string, e.g. `?device=webgpu&dtype=fp32`.
	const urlParams = new URLSearchParams(window.location.search);
	let { device, dtype, sessionOptions } = getDeviceConfig(urlParams.get('device'), urlParams.get('dtype'));

	if (!Object.hasOwn(DEVICE_LABELS, device)) {
	  const warning = `Unsupported device ${device}. Falling back to WebNN GPU.`;
	  status.textContent = warning;
	  // The status line is overwritten as soon as model loading starts, so keep a
	  // durable copy of the warning in the console as well.
	  console.warn(warning);

	  // Re-resolve the whole configuration for the fallback device so that the
	  // default dtype matches the device actually used, not the rejected one.
	  ({ device, dtype, sessionOptions } = getDeviceConfig(FALLBACK_DEVICE, urlParams.get('dtype')));
	}

	const deviceValue = DEVICE_LABELS[device];
	deviceLabel.textContent = deviceValue;

	// Fetch the detection model and its matching pre-processor.
	const model_id = "webnn/yolo12n";
	let model;
	let processor;

	try {
	  status.textContent = "Loading model...";

	  const modelOptions = {
	    device,
	    dtype,
	    session_options: sessionOptions,
	  };
	  model = await AutoModel.from_pretrained(model_id, modelOptions);
	  processor = await AutoProcessor.from_pretrained(model_id);

	  // Start with 640x640 frames, the input size the model expects.
	  processor.feature_extractor.size = { width: 640, height: 640 };

	  status.textContent = "Model loaded successfully!";
	} catch (err) {
	  console.error(err);
	  status.textContent = `Error: ${err.message}`;
	  status.style.color = "red";

	  // Re-throw so that module evaluation stops here; nothing below can work without a model.
	  throw err;
	}

	// Live-tunable settings. Each one mirrors a slider and is read by the render loop.
	let scale = 1; // multiplier applied to the native video resolution
	let threshold = 0.25; // minimum confidence for a detection to be shown
	let size = 640; // edge length, in pixels, that the processor resizes frames to

	scaleSlider.addEventListener("input", () => {
	  scale = Number(scaleSlider.value);
	  const scaledWidth = video.videoWidth * scale;
	  const scaledHeight = video.videoHeight * scale;
	  setStreamSize(scaledWidth, scaledHeight);
	  scaleLabel.textContent = scale;
	});

	thresholdSlider.addEventListener("input", () => {
	  threshold = Number(thresholdSlider.value);
	  thresholdLabel.textContent = threshold.toFixed(2);
	});

	sizeSlider.addEventListener("input", () => {
	  size = Number(sizeSlider.value);
	  processor.feature_extractor.size = { width: size, height: size };
	  sizeLabel.textContent = size;
	});

	// The sliders only become usable once their handlers are attached.
	for (const slider of [scaleSlider, thresholdSlider, sizeSlider]) {
	  slider.disabled = false;
	}

	status.textContent = "Ready";

	/**
	 * Palette for bounding boxes and their labels. A detection's colour is picked
	 * with `classId % COLOURS.length`, so classes beyond the 20th reuse colours.
	 */
	const COLOURS = [
	  "#EF4444", "#4299E1", "#059669", "#FBBF24", "#4B52B1",
	  "#7B3AC2", "#ED507A", "#1DD1A1", "#F3873A", "#4B5563",
	  "#DC2626", "#1852B4", "#18A35D", "#F59E0B", "#4059BE",
	  "#6027A5", "#D63D60", "#00AC9B", "#E64A19", "#272A34",
	];

	/**
	 * Draw one detection as an absolutely positioned box with a caption inside the overlay.
	 *
	 * The box is clipped to the frame: any part of it that lies outside
	 * `canvasWidth` x `canvasHeight` is cut off and the rest stays where the
	 * detection actually is. A box is always drawn at least 1px wide and tall.
	 *
	 * @param {{bbox: number[], score: number, class: number}} detection -
	 *   `bbox` is `[x, y, width, height]` (top-left origin) in canvas pixels.
	 * @param {number} canvasWidth - Width of the frame the box is drawn over.
	 * @param {number} canvasHeight - Height of the frame the box is drawn over.
	 * @returns {void} Nothing is drawn when the score is below the current `threshold`.
	 */
	function renderBox(detection, canvasWidth, canvasHeight) {
	  const { bbox, score, class: classId } = detection;
	  if (score < threshold) return; // Skip boxes with low confidence

	  const [x, y, width, height] = bbox;
	  const color = COLOURS[classId % COLOURS.length];

	  // Intersect the box with the frame. Clamping both edges (instead of moving
	  // the origin) keeps the visible part of the box at its true position.
	  const clamp = (value, max) => Math.min(Math.max(value, 0), max);
	  const left = clamp(x, canvasWidth);
	  const top = clamp(y, canvasHeight);
	  const right = clamp(x + width, canvasWidth);
	  const bottom = clamp(y + height, canvasHeight);
	  const visibleWidth = Math.max(1, right - left);
	  const visibleHeight = Math.max(1, bottom - top);

	  // Draw the box
	  const boxElement = document.createElement("div");
	  boxElement.className = "bounding-box";
	  Object.assign(boxElement.style, {
	    position: "absolute",
	    left: `${left}px`,
	    top: `${top}px`,
	    width: `${visibleWidth}px`,
	    height: `${visibleHeight}px`,
	    border: `2px solid ${color}`,
	    backgroundColor: "transparent",
	    pointerEvents: "none",
	    boxSizing: "border-box",
	  });

	  // Draw label; fall back to the numeric id when the model config has no name for it.
	  const className = model.config.id2label?.[classId] ?? `class ${classId}`;
	  const labelElement = document.createElement("span");
	  labelElement.textContent = `${className} (${(100 * score).toFixed(1)}%)`;
	  labelElement.className = "bounding-box-label";
	  Object.assign(labelElement.style, {
	    backgroundColor: color,
	    color: "white",
	    padding: "2px 6px",
	    fontSize: "12px",
	    position: "absolute",
	    top: "-22px",
	    left: "0px",
	    whiteSpace: "nowrap",
	    borderRadius: "2px",
	  });

	  boxElement.appendChild(labelElement);
	  overlay.appendChild(boxElement);
	}

	/**
	 * Intersection-over-union of two axis-aligned boxes.
	 *
	 * @param {number[]} boxA - `[x, y, width, height]` with a top-left origin.
	 * @param {number[]} boxB - `[x, y, width, height]` with a top-left origin.
	 * @returns {number} Overlap ratio in the range 0..1; 0 when the boxes do not
	 *   overlap or when neither box has any area.
	 */
	function calculateIoU(boxA, boxB) {
	  const [xA, yA, wA, hA] = boxA;
	  const [xB, yB, wB, hB] = boxB;

	  // Corners of the overlapping rectangle (empty when x2 <= x1 or y2 <= y1).
	  const x1 = Math.max(xA, xB);
	  const y1 = Math.max(yA, yB);
	  const x2 = Math.min(xA + wA, xB + wB);
	  const y2 = Math.min(yA + hA, yB + hB);

	  const intersection = Math.max(0, x2 - x1) * Math.max(0, y2 - y1);
	  const union = wA * hA + wB * hB - intersection;

	  // Two degenerate (zero-area) boxes would otherwise yield 0 / 0 = NaN.
	  return union > 0 ? intersection / union : 0;
	}

	/**
	 * Greedy, per-class non-maximum suppression.
	 *
	 * Detections are visited from highest to lowest score; one is kept unless an
	 * already-kept detection of the same class overlaps it by more than
	 * `iouThreshold`. The input array is left untouched.
	 *
	 * @param {Array<{bbox: number[], score: number, class: number}>} detections
	 * @param {number} [iouThreshold=0.5] - Overlap above which the lower-scoring box is dropped.
	 * @returns {Array<{bbox: number[], score: number, class: number}>} The surviving
	 *   detections (same object references), ordered by descending score.
	 */
	function applyNMS(detections, iouThreshold = 0.5) {
	  // Sort a copy: sorting in place would silently reorder the caller's array.
	  const ranked = [...detections].sort((a, b) => b.score - a.score);

	  const kept = [];
	  for (const candidate of ranked) {
	    // Only boxes of the same class compete with each other.
	    const isDuplicate = kept.some(
	      (winner) =>
	        winner.class === candidate.class &&
	        calculateIoU(winner.bbox, candidate.bbox) > iouThreshold,
	    );
	    if (!isDuplicate) {
	      kept.push(candidate);
	    }
	  }

	  return kept;
	}

	/**
	 * Turn the raw model output into on-screen boxes.
	 *
	 * Clears the overlay, picks the best class for every prediction, drops weak
	 * ones, runs NMS and renders what is left.
	 *
	 * @param {{tolist: function(): Array}} outputs - Model output; `tolist()` is
	 *   indexed as `[batch][4 + numClasses][numPredictions]`, where the first four
	 *   rows are centre x, centre y, width and height in model-input pixels.
	 * @param {number} canvasWidth - Width of the canvas the boxes are mapped onto.
	 * @param {number} canvasHeight - Height of the canvas the boxes are mapped onto.
	 * @param {number} [inputSize=size] - Edge length of the square model input. It
	 *   defaults to the value of the size slider, so boxes stay aligned when the
	 *   processor is not resizing frames to 640x640.
	 * @returns {number} Number of detections left after NMS.
	 */
	function processDetections(outputs, canvasWidth, canvasHeight, inputSize = size) {
	  // Clear previous detections
	  overlay.innerHTML = "";

	  const predictions = outputs.tolist()[0]; // Get the first batch
	  const numClasses = predictions.length - 4; // Subtract 4 for bbox coordinates
	  const numPredictions = predictions[0].length;

	  // Model-input pixels -> canvas pixels. Identical for every prediction, so computed once.
	  const scaleX = canvasWidth / inputSize;
	  const scaleY = canvasHeight / inputSize;

	  const detections = [];

	  for (let i = 0; i < numPredictions; i++) {
	    // Find the highest-scoring class for this prediction.
	    let maxScore = 0;
	    let maxClassIndex = -1;
	    for (let c = 0; c < numClasses; c++) {
	      const score = predictions[c + 4][i];
	      if (score > maxScore) {
	        maxScore = score;
	        maxClassIndex = c;
	      }
	    }

	    // No class scored above zero (this would otherwise pass a threshold of 0
	    // with class -1), or the best score is too weak.
	    if (maxClassIndex < 0 || maxScore < threshold) continue;

	    // Convert from centre coordinates to top-left coordinates on the canvas.
	    const boxWidth = predictions[2][i] * scaleX;
	    const boxHeight = predictions[3][i] * scaleY;
	    const xmin = predictions[0][i] * scaleX - boxWidth / 2;
	    const ymin = predictions[1][i] * scaleY - boxHeight / 2;

	    detections.push({
	      bbox: [xmin, ymin, boxWidth, boxHeight],
	      score: maxScore,
	      class: maxClassIndex,
	    });
	  }

	  // Apply Non-Maximum Suppression to remove duplicate detections
	  const filteredDetections = applyNMS(detections, 0.45); // Lower IoU threshold for better suppression

	  // Debug: Log detection info
	  if (filteredDetections.length > 0) {
	    console.log(`Found ${filteredDetections.length} detections:`,
	      filteredDetections.map((d) => ({
	        class: model.config.id2label[d.class],
	        score: d.score.toFixed(3),
	        bbox: d.bbox.map((v) => Math.round(v)),
	      })),
	    );
	  }

	  // Render filtered detections
	  for (const detection of filteredDetections) {
	    renderBox(detection, canvasWidth, canvasHeight);
	  }

	  return filteredDetections.length;
	}

	// Render-loop state
	let isProcessing = false; // true while one inference pass is in flight
	let previousTime; // performance.now() at the end of the previous pass
	const context = canvas.getContext("2d", { willReadFrequently: true });

	/**
	 * Run one detection pass on the frame currently drawn on the canvas, then
	 * update the overlay and the status line. Always clears `isProcessing`.
	 *
	 * @param {number} width - Canvas width in pixels at the time the frame was drawn.
	 * @param {number} height - Canvas height in pixels at the time the frame was drawn.
	 * @returns {Promise<void>} Never rejects; failures are logged and shown in the status line.
	 */
	async function runDetection(width, height) {
	  try {
	    // Read the current frame from the canvas
	    const pixelData = context.getImageData(0, 0, width, height).data;
	    const image = new RawImage(pixelData, width, height, 4);

	    // Process the image and run the model
	    const inputs = await processor(image);
	    const { outputs } = await model(inputs);

	    // Process detections and render boxes
	    const detectionCount = processDetections(outputs, width, height);

	    if (previousTime !== undefined) {
	      const fps = 1000 / (performance.now() - previousTime);
	      status.textContent = `FPS: ${fps.toFixed(2)} | Detections: ${detectionCount}`;
	    }
	    previousTime = performance.now();
	  } catch (error) {
	    console.error("Detection error:", error);
	    status.textContent = `Error: ${error.message}`;
	  } finally {
	    isProcessing = false;
	  }
	}

	/**
	 * Animation-frame callback: paint the latest video frame and, if no inference
	 * is running, start one. Re-schedules itself on every call.
	 */
	function updateCanvas() {
	  const { width, height } = canvas;

	  // getImageData() throws on a zero-sized canvas; wait until the stream size is known.
	  if (width > 0 && height > 0) {
	    context.drawImage(video, 0, 0, width, height);

	    if (!isProcessing) {
	      isProcessing = true;
	      // Intentionally not awaited: inference runs in the background while frames keep drawing.
	      void runDetection(width, height);
	    }
	  }

	  window.requestAnimationFrame(updateCanvas);
	}

	// Ask for camera access, then wire the stream into the page and start rendering.
	navigator.mediaDevices
	  .getUserMedia({ video: true })
	  .then((stream) => {
	    // Feed the camera into the <video> element.
	    video.srcObject = stream;
	    video.play();

	    // Size the video and canvas from the resolution the camera reports.
	    const [videoTrack] = stream.getVideoTracks();
	    const settings = videoTrack.getSettings();
	    const { width, height } = settings;
	    setStreamSize(width * scale, height * scale);

	    // Fit the container inside a 720x405 area while keeping the stream's aspect ratio.
	    const ar = width / height;
	    let cw;
	    let ch;
	    if (ar > 720 / 405) {
	      cw = 720;
	      ch = 720 / ar;
	    } else {
	      cw = 405 * ar;
	      ch = 405;
	    }
	    container.style.width = `${cw}px`;
	    container.style.height = `${ch}px`;

	    // Kick off the render loop.
	    window.requestAnimationFrame(updateCanvas);
	  })
	  .catch((error) => {
	    alert(error);
	  });