<!--
  yolo-detection-app / index.html
  MaxLeft · Add 3 files · 91f934f verified
  (repository header preserved as a comment so no stray text precedes the doctype)
-->
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>ONNX YOLO Segmentation Web Demo</title>
<!-- CDN dependencies: Tailwind (runtime JIT), ONNX Runtime Web, Font Awesome.
     NOTE(review): no SRI integrity hashes, and onnxruntime-web is unpinned —
     consider pinning versions for reproducible behavior. -->
<script src="https://cdn.tailwindcss.com"></script>
<script src="https://cdn.jsdelivr.net/npm/onnxruntime-web/dist/ort.min.js"></script>
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.0/css/all.min.css">
<style>
/* Bounding-box overlay element, positioned absolutely over the video feed. */
.detection-box {
position: absolute;
border: 2px solid #3B82F6;
background-color: rgba(59, 130, 246, 0.2);
display: flex;
flex-direction: column;
align-items: center;
justify-content: flex-end;
color: white;
font-weight: bold;
font-size: 12px;
}
/* Class-name pill rendered inside a detection box. */
.detection-label {
background-color: #3B82F6;
padding: 2px 5px;
border-radius: 3px;
margin-bottom: 2px;
}
/* Attention-drawing blue glow (2 s loop). */
.pulse {
animation: pulse 2s infinite;
}
@keyframes pulse {
0% {
box-shadow: 0 0 0 0 rgba(59, 130, 246, 0.7);
}
70% {
box-shadow: 0 0 0 10px rgba(59, 130, 246, 0);
}
100% {
box-shadow: 0 0 0 0 rgba(59, 130, 246, 0);
}
}
/* Wrapper establishing the positioning context for the video + overlays. */
#video-container {
position: relative;
width: 100%;
max-width: 640px;
margin: 0 auto;
border-radius: 8px;
overflow: hidden;
box-shadow: 0 10px 15px -3px rgba(0, 0, 0, 0.1), 0 4px 6px -2px rgba(0, 0, 0, 0.05);
}
#video, #canvas {
width: 100%;
height: auto;
display: block;
}
/* Stacking order: video (base) < #segmentation (z 5) < #canvas (z 10). */
#canvas {
position: absolute;
top: 0;
left: 0;
z-index: 10;
}
/* Semi-transparent mask layer drawn beneath the box/label canvas. */
#segmentation {
position: absolute;
top: 0;
left: 0;
z-index: 5;
opacity: 0.5;
}
/* Model-file drop target; .active is toggled by JS during drag-over. */
.dropzone {
border: 2px dashed #4B5563;
border-radius: 8px;
padding: 20px;
text-align: center;
cursor: pointer;
transition: all 0.3s;
}
.dropzone:hover {
border-color: #3B82F6;
background-color: rgba(59, 130, 246, 0.1);
}
.dropzone.active {
border-color: #3B82F6;
background-color: rgba(59, 130, 246, 0.2);
}
/* Status pills; modifier classes (ready/loading/error/disabled) are set by JS. */
.status-badge {
display: inline-flex;
align-items: center;
padding: 4px 8px;
border-radius: 9999px;
font-size: 12px;
font-weight: 600;
}
.status-badge.ready {
background-color: rgba(16, 185, 129, 0.2);
color: #10B981;
}
.status-badge.loading {
background-color: rgba(245, 158, 11, 0.2);
color: #F59E0B;
}
.status-badge.error {
background-color: rgba(239, 68, 68, 0.2);
color: #EF4444;
}
.status-badge.disabled {
background-color: rgba(75, 85, 99, 0.2);
color: #4B5563;
}
/* Scrolling monospace panel for timestamped log lines. */
.output-log {
font-family: 'Courier New', Courier, monospace;
background-color: rgba(31, 41, 55, 0.8);
border-radius: 8px;
padding: 16px;
max-height: 200px;
overflow-y: auto;
}
/* Per-class color legend built dynamically under the video. */
.legend {
display: flex;
flex-wrap: wrap;
gap: 8px;
margin-top: 8px;
}
.legend-item {
display: flex;
align-items: center;
font-size: 12px;
}
.legend-color {
width: 16px;
height: 16px;
border-radius: 3px;
margin-right: 4px;
}
/* Thin bar whose fill width encodes detection confidence (0-100%). */
.confidence-bar {
height: 4px;
background-color: #4B5563;
border-radius: 2px;
margin-top: 2px;
overflow: hidden;
}
.confidence-fill {
height: 100%;
background-color: #10B981;
}
/* Raw tensor dump panel; pre-wrap preserves the debug formatting. */
.debug-output {
font-family: 'Courier New', Courier, monospace;
background-color: rgba(31, 41, 55, 0.8);
border-radius: 8px;
padding: 16px;
max-height: 200px;
overflow-y: auto;
margin-top: 16px;
font-size: 12px;
white-space: pre-wrap;
}
</style>
</head>
<body class="bg-gray-900 text-gray-100 min-h-screen">
<div class="container mx-auto px-4 py-8">
<header class="text-center mb-8">
<h1 class="text-3xl md:text-4xl font-bold mb-2 text-blue-400">
<i class="fas fa-shapes mr-2"></i> YOLO Segmentation Web Demo
</h1>
<p class="text-gray-400 max-w-2xl mx-auto">
Real-time instance segmentation with YOLO ONNX models in your browser
</p>
</header>
<div class="max-w-4xl mx-auto">
<div class="grid grid-cols-1 md:grid-cols-2 gap-8">
<!-- Left column - Controls -->
<div class="space-y-6">
<!-- Model Selection: dropzone + hidden file input; status text/badge updated by JS -->
<div class="bg-gray-800 rounded-lg p-6 shadow-lg">
<h2 class="text-xl font-bold mb-4 text-blue-400">
<i class="fas fa-file-export mr-2"></i> Model Selection
</h2>
<div id="dropzone" class="dropzone mb-4">
<div class="flex flex-col items-center justify-center py-4">
<i class="fas fa-file-upload text-4xl text-blue-400 mb-2"></i>
<p class="text-gray-300">Drag & drop your YOLO ONNX model file here</p>
<p class="text-gray-400 text-sm mt-1">or click to browse</p>
<input type="file" id="modelFile" accept=".onnx" class="hidden" />
</div>
</div>
<div class="flex items-center justify-between">
<div>
<p id="modelStatusText" class="text-sm text-gray-400">No model selected</p>
<p id="modelSizeText" class="text-xs text-gray-500"></p>
</div>
<span id="modelStatusBadge" class="status-badge disabled">
<i class="fas fa-times-circle mr-1"></i> Not Loaded
</span>
</div>
</div>
<!-- Detection Settings: threshold sliders and mask toggle read live by the JS loop -->
<div class="bg-gray-800 rounded-lg p-6 shadow-lg">
<h2 class="text-xl font-bold mb-4 text-blue-400">
<i class="fas fa-sliders-h mr-2"></i> Detection Settings
</h2>
<div class="space-y-4">
<div>
<label for="confidenceThreshold" class="block text-sm font-medium text-gray-300 mb-1">
Confidence Threshold: <span id="confidenceValue">0.5</span>
</label>
<input type="range" id="confidenceThreshold" min="0" max="1" step="0.05" value="0.5"
class="w-full h-2 bg-gray-700 rounded-lg appearance-none cursor-pointer">
</div>
<div>
<label for="iouThreshold" class="block text-sm font-medium text-gray-300 mb-1">
IOU Threshold: <span id="iouValue">0.45</span>
</label>
<input type="range" id="iouThreshold" min="0" max="1" step="0.05" value="0.45"
class="w-full h-2 bg-gray-700 rounded-lg appearance-none cursor-pointer">
</div>
<div class="flex items-center justify-between">
<label for="showMasks" class="text-sm font-medium text-gray-300">
Show Segmentation Masks
</label>
<label class="relative inline-flex items-center cursor-pointer">
<input type="checkbox" id="showMasks" class="sr-only peer" checked>
<div class="w-11 h-6 bg-gray-700 peer-focus:outline-none rounded-full peer peer-checked:after:translate-x-full peer-checked:after:border-white after:content-[''] after:absolute after:top-[2px] after:left-[2px] after:bg-white after:border-gray-300 after:border after:rounded-full after:h-5 after:w-5 after:transition-all peer-checked:bg-blue-600"></div>
</label>
</div>
</div>
</div>
<!-- Webcam Controls: start button stays disabled until a model loads -->
<div class="bg-gray-800 rounded-lg p-6 shadow-lg">
<h2 class="text-xl font-bold mb-4 text-blue-400">
<i class="fas fa-video mr-2"></i> Webcam Controls
</h2>
<div class="flex flex-col space-y-4">
<button id="startBtn" class="bg-green-600 hover:bg-green-700 text-white font-bold py-3 px-6 rounded-lg flex items-center justify-center disabled:opacity-50 disabled:cursor-not-allowed" disabled>
<i class="fas fa-play mr-2"></i> Start Detection
</button>
<div class="flex items-center justify-between">
<div>
<p class="text-sm text-gray-400">Webcam Status</p>
</div>
<span id="webcamStatusBadge" class="status-badge disabled">
<i class="fas fa-times-circle mr-1"></i> Inactive
</span>
</div>
</div>
</div>
<!-- Performance Stats: counters refreshed once per second by detectionLoop -->
<div class="bg-gray-800 rounded-lg p-6 shadow-lg">
<h2 class="text-xl font-bold mb-4 text-blue-400">
<i class="fas fa-tachometer-alt mr-2"></i> Performance
</h2>
<div class="grid grid-cols-2 gap-4">
<div class="bg-gray-700 p-4 rounded-lg text-center">
<div class="text-2xl font-bold text-blue-400" id="fpsCounter">-</div>
<div class="text-gray-300 text-sm">FPS</div>
</div>
<div class="bg-gray-700 p-4 rounded-lg text-center">
<div class="text-2xl font-bold text-green-400" id="inferenceTime">-</div>
<div class="text-gray-300 text-sm">ms/inference</div>
</div>
</div>
</div>
</div>
<!-- Right column - Output -->
<div class="space-y-6">
<!-- Video Feed: placeholder is swapped for video + two stacked canvases
     (segmentation masks at z-index 5, boxes/labels at z-index 10) -->
<div class="bg-gray-800 rounded-lg p-6 shadow-lg">
<h2 class="text-xl font-bold mb-4 text-blue-400">
<i class="fas fa-eye mr-2"></i> Live Detection
</h2>
<div id="video-container" class="relative">
<div id="videoPlaceholder" class="bg-gray-700 rounded-lg flex items-center justify-center aspect-square">
<div class="text-center p-8">
<i class="fas fa-camera text-4xl text-gray-500 mb-4"></i>
<p class="text-gray-400">Webcam feed will appear here</p>
</div>
</div>
<video id="video" autoplay playsinline muted class="hidden"></video>
<canvas id="segmentation" class="hidden"></canvas>
<canvas id="canvas" class="hidden"></canvas>
</div>
<div id="detectionLegend" class="legend mt-4 hidden">
<!-- Legend items will be added dynamically -->
</div>
</div>
<!-- Output Log: timestamped status lines appended by log() -->
<div class="bg-gray-800 rounded-lg p-6 shadow-lg">
<h2 class="text-xl font-bold mb-4 text-blue-400">
<i class="fas fa-terminal mr-2"></i> Output Log
</h2>
<div class="output-log text-sm" id="log">
<p class="text-gray-400">Waiting for model to load...</p>
</div>
</div>
<!-- Debug Output: raw tensor shape/values written by debugLog() -->
<div class="bg-gray-800 rounded-lg p-6 shadow-lg">
<h2 class="text-xl font-bold mb-4 text-blue-400">
<i class="fas fa-bug mr-2"></i> Debug Output
</h2>
<div class="debug-output" id="debugOutput">
<p class="text-gray-400">Raw tensor output will appear here</p>
</div>
</div>
</div>
</div>
</div>
<footer class="mt-12 text-center text-gray-500 text-sm">
<p>Powered by ONNX Runtime Web - All processing happens in your browser</p>
</footer>
</div>
<script>
// ---- DOM handles (this script sits at the end of <body>, so every element
// referenced below already exists when these run) ----
const video = document.getElementById('video');
const canvas = document.getElementById('canvas');
const segmentationCanvas = document.getElementById('segmentation');
const ctx = canvas.getContext('2d');
const segCtx = segmentationCanvas.getContext('2d');
const startBtn = document.getElementById('startBtn');
const logElement = document.getElementById('log');
const debugOutput = document.getElementById('debugOutput');
const modelFileInput = document.getElementById('modelFile');
const dropzone = document.getElementById('dropzone');
const modelStatusText = document.getElementById('modelStatusText');
const modelSizeText = document.getElementById('modelSizeText');
const modelStatusBadge = document.getElementById('modelStatusBadge');
const webcamStatusBadge = document.getElementById('webcamStatusBadge');
const fpsCounter = document.getElementById('fpsCounter');
const inferenceTime = document.getElementById('inferenceTime');
const videoPlaceholder = document.getElementById('videoPlaceholder');
const videoContainer = document.getElementById('video-container');
const confidenceThreshold = document.getElementById('confidenceThreshold');
const iouThreshold = document.getElementById('iouThreshold');
const confidenceValue = document.getElementById('confidenceValue');
const iouValue = document.getElementById('iouValue');
const showMasks = document.getElementById('showMasks');
const detectionLegend = document.getElementById('detectionLegend');
// ---- App state ----
let session = null; // ort.InferenceSession once a model has been loaded
let modelBuffer = null; // raw ArrayBuffer of the uploaded .onnx file
let isRunning = false; // true while the detection loop is active
let frameCount = 0; // frames rendered since the last FPS sample
let lastFpsUpdate = 0; // performance.now() timestamp of the last FPS sample
let fps = 0; // most recent FPS estimate
let lastInferenceTime = 0; // duration (ms) of the most recent session.run()
let classColors = {}; // classId -> CSS color used for boxes/masks/legend
let classNames = {}; // classId -> label; filled by initClassNames()
// Append a timestamped message to the on-page output log and keep the
// panel scrolled to the newest entry. `message` may contain inline HTML.
function log(message) {
  const entry = document.createElement('p');
  const stamp = new Date().toLocaleTimeString();
  entry.innerHTML = `<span class="text-gray-500">[${stamp}]</span> ${message}`;
  logElement.appendChild(entry);
  logElement.scrollTop = logElement.scrollHeight;
}
// Write one plain-text line into the debug panel and auto-scroll to it.
function debugLog(message) {
  const line = document.createElement('div');
  line.textContent = message;
  debugOutput.appendChild(line);
  debugOutput.scrollTop = debugOutput.scrollHeight;
}
// Build a palette of `count` evenly-spaced HSL colors keyed by class index
// (0..count-1); hues are spread around the full color wheel.
function generateClassColors(count) {
  return Object.fromEntries(
    Array.from({ length: count }, (_, i) => {
      const hue = (i * 360 / count) % 360;
      return [i, `hsl(${hue}, 80%, 60%)`];
    })
  );
}
// Keep the numeric readout next to each threshold slider in sync with it.
const bindSliderReadout = (slider, readout) => {
  slider.addEventListener('input', () => {
    readout.textContent = slider.value;
  });
};
bindSliderReadout(confidenceThreshold, confidenceValue);
bindSliderReadout(iouThreshold, iouValue);
// Wire up the model dropzone: click-to-browse plus drag & drop visuals.
dropzone.addEventListener('click', () => modelFileInput.click());

// Suppress the browser's default drag/drop navigation behavior.
function preventDefaults(e) {
  e.preventDefault();
  e.stopPropagation();
}
// Toggle the highlighted drop-target style.
function highlight() {
  dropzone.classList.add('active');
}
function unhighlight() {
  dropzone.classList.remove('active');
}

// Table of drag events -> handlers, registered in the same order the
// original inline calls used (handleDrop is a hoisted declaration below).
const dragHandlers = {
  dragenter: [preventDefaults, highlight],
  dragover: [preventDefaults, highlight],
  dragleave: [preventDefaults, unhighlight],
  drop: [preventDefaults, unhighlight, handleDrop]
};
for (const [eventName, handlers] of Object.entries(dragHandlers)) {
  for (const handler of handlers) {
    dropzone.addEventListener(eventName, handler, false);
  }
}
// Handle model file selection via drag & drop.
function handleDrop(e) {
  loadIfOnnx(e.dataTransfer.files);
}
// ...and via the hidden <input type="file"> (click-to-browse).
modelFileInput.addEventListener('change', (e) => loadIfOnnx(e.target.files));

// Shared guard: forward the first selected file to the model loader only
// when its name ends in ".onnx".
function loadIfOnnx(files) {
  if (files.length > 0 && files[0].name.endsWith('.onnx')) {
    handleModelFile(files[0]);
  }
}
// Read a user-supplied .onnx file and initialize an ONNX Runtime Web
// session from it. Updates the status badge/labels throughout and enables
// the Start button only on success.
async function handleModelFile(file) {
  try {
    // Reflect the "loading" state in the UI while the file is read.
    modelStatusText.textContent = `Loading ${file.name}...`;
    modelSizeText.textContent = `(${(file.size/1e6).toFixed(1)} MB)`;
    modelStatusBadge.className = 'status-badge loading';
    modelStatusBadge.innerHTML = '<i class="fas fa-spinner fa-spin mr-1"></i> Loading';
    startBtn.disabled = true;
    // Read the file into an ArrayBuffer, then initialize asynchronously.
    const reader = new FileReader();
    reader.onload = async (ev) => {
      modelBuffer = ev.target.result;
      log(`Initializing ONNX session for ${file.name}`);
      try {
        // Prefer the WebGL execution provider; fall back to WASM.
        const sessionOptions = {
          executionProviders: ['webgl', 'wasm'],
          graphOptimizationLevel: 'all'
        };
        try {
          session = await ort.InferenceSession.create(modelBuffer, sessionOptions);
        } catch (webglError) {
          log(`WebGL backend failed, falling back to WASM: ${webglError.message}`);
          sessionOptions.executionProviders = ['wasm'];
          session = await ort.InferenceSession.create(modelBuffer, sessionOptions);
        }
        // Palette for drawing (COCO-style models expose 80 classes).
        classColors = generateClassColors(80);
        // Success path.
        modelStatusText.textContent = `Loaded: ${file.name}`;
        modelStatusBadge.className = 'status-badge ready';
        modelStatusBadge.innerHTML = '<i class="fas fa-check-circle mr-1"></i> Ready';
        startBtn.disabled = false;
        log(`Model loaded successfully with ${session.inputNames.length} inputs and ${session.outputNames.length} outputs`);
        // BUG FIX: ort-web's InferenceSession exposes only inputNames /
        // outputNames — the old `session.inputs[0].dims` access threw a
        // TypeError here and dumped every successful load into the error
        // path. Log the names instead.
        log(`Inputs: ${session.inputNames.join(', ')} | Outputs: ${session.outputNames.join(', ')}`);
        // Heuristic: segmentation exports carry a mask output.
        const isSegmentation = session.outputNames.some(name => name.includes('mask'));
        log(`Model type: ${isSegmentation ? 'Segmentation' : 'Detection'}`);
      } catch (error) {
        // BUG FIX: the old catch displayed a "Ready" badge and re-created
        // the session with an unguarded await (unhandled rejection). Report
        // the failure honestly and keep the Start button disabled.
        session = null;
        modelStatusText.textContent = `Failed to load ${file.name}`;
        modelStatusBadge.className = 'status-badge error';
        modelStatusBadge.innerHTML = '<i class="fas fa-exclamation-circle mr-1"></i> Error';
        startBtn.disabled = true;
        log(`Model initialization failed: ${error.message}`);
        console.error('Model initialization failed:', error);
      }
    };
    reader.onerror = (event) => {
      modelStatusText.textContent = `Error reading file`;
      modelStatusBadge.className = 'status-badge error';
      modelStatusBadge.innerHTML = '<i class="fas fa-exclamation-circle mr-1"></i> Error';
      log(`File read error: ${event.target.error}`);
    };
    reader.readAsArrayBuffer(file);
  } catch (error) {
    log(`Error handling model file: ${error.message}`);
    console.error(error);
  }
}
// Toggle webcam detection. First click: request the camera, size the
// canvases to the actual stream dimensions, and start the loop. Second
// click: stop the loop AND release the camera.
startBtn.addEventListener('click', async () => {
  if (isRunning) {
    // Stop detection.
    isRunning = false;
    // BUG FIX: the old stop path left the MediaStream running, so the
    // camera (and its indicator light) stayed on after "Stop".
    if (video.srcObject) {
      video.srcObject.getTracks().forEach(track => track.stop());
      video.srcObject = null;
    }
    startBtn.innerHTML = '<i class="fas fa-play mr-2"></i> Start Detection';
    startBtn.classList.remove('bg-red-600', 'hover:bg-red-700');
    startBtn.classList.add('bg-green-600', 'hover:bg-green-700');
    webcamStatusBadge.className = 'status-badge disabled';
    webcamStatusBadge.innerHTML = '<i class="fas fa-times-circle mr-1"></i> Inactive';
    log('Detection stopped');
    return;
  }
  try {
    // Request the camera; 640x640 "ideal" matches typical YOLO inputs but
    // the browser may deliver other dimensions.
    log('Requesting webcam access...');
    const stream = await navigator.mediaDevices.getUserMedia({
      video: {
        width: { ideal: 640 },
        height: { ideal: 640 },
        facingMode: 'environment'
      },
      audio: false
    });
    video.srcObject = stream;
    await video.play();
    // Poll until the video reports real dimensions before sizing canvases.
    await new Promise(resolve => {
      const checkDimensions = () => {
        if (video.videoWidth > 0 && video.videoHeight > 0) {
          resolve();
        } else {
          setTimeout(checkDimensions, 50);
        }
      };
      checkDimensions();
    });
    // Match both overlay canvases to the actual stream size.
    const videoWidth = video.videoWidth;
    const videoHeight = video.videoHeight;
    canvas.width = videoWidth;
    canvas.height = videoHeight;
    segmentationCanvas.width = videoWidth;
    segmentationCanvas.height = videoHeight;
    videoContainer.style.aspectRatio = `${videoWidth}/${videoHeight}`;
    // Swap the placeholder for the live layers.
    videoPlaceholder.classList.add('hidden');
    video.classList.remove('hidden');
    canvas.classList.remove('hidden');
    segmentationCanvas.classList.remove('hidden');
    detectionLegend.classList.remove('hidden');
    // Flip the button/badge into the "running" state and start the loop.
    isRunning = true;
    startBtn.innerHTML = '<i class="fas fa-stop mr-2"></i> Stop Detection';
    startBtn.classList.remove('bg-green-600', 'hover:bg-green-700');
    startBtn.classList.add('bg-red-600', 'hover:bg-red-700');
    webcamStatusBadge.className = 'status-badge ready';
    webcamStatusBadge.innerHTML = '<i class="fas fa-check-circle mr-1"></i> Active';
    log(`Webcam started (${videoWidth}x${videoHeight}) - beginning detection`);
    detectionLoop();
  } catch (error) {
    log(`Error accessing webcam: ${error.message}`);
    console.error(error);
    webcamStatusBadge.className = 'status-badge error';
    webcamStatusBadge.innerHTML = '<i class="fas fa-exclamation-circle mr-1"></i> Error';
  }
});
// Greedy non-maximum suppression over axis-aligned boxes.
// boxes: array of [x1, y1, x2, y2]; scores: parallel confidence array.
// Returns the indices of the kept boxes, highest score first.
function nonMaxSuppression(boxes, scores, iouThreshold) {
  const selectedIndices = [];
  const areas = boxes.map(box => (box[2] - box[0]) * (box[3] - box[1]));
  // Candidate indices sorted by score, descending.
  const scoreIndices = scores
    .map((score, index) => ({score, index}))
    .sort((a, b) => b.score - a.score)
    .map(obj => obj.index);
  while (scoreIndices.length > 0) {
    // Keep the best remaining candidate.
    const current = scoreIndices.shift();
    selectedIndices.push(current);
    const currentBox = boxes[current];
    // IoU between the kept box and each remaining candidate.
    const ious = scoreIndices.map(i => {
      const box = boxes[i];
      const x1 = Math.max(currentBox[0], box[0]);
      const y1 = Math.max(currentBox[1], box[1]);
      const x2 = Math.min(currentBox[2], box[2]);
      const y2 = Math.min(currentBox[3], box[3]);
      const intersection = Math.max(0, x2 - x1) * Math.max(0, y2 - y1);
      // BUG FIX: the union must use the candidate's area (areas[i]); the
      // old code indexed `areas[box]` with the box *array* itself, which
      // produced NaN and disabled suppression entirely.
      const union = areas[current] + areas[i] - intersection;
      return intersection / union;
    });
    // Drop candidates that overlap the kept box too much
    // (iterate backwards so splice indices stay valid).
    for (let i = ious.length - 1; i >= 0; i--) {
      if (ious[i] > iouThreshold) {
        scoreIndices.splice(i, 1);
      }
    }
  }
  return selectedIndices;
}
// Decode one raw model output into final detections (confidence filter +
// NMS), reading the slider thresholds live from the DOM. Returns an array
// of { box: [x1,y1,x2,y2] in display pixels, score, classId, mask|null }.
//
// NOTE(review): this decoder assumes the detection tensor is laid out as
// [batch, num_detections, 4 box + 1 objectness + num_classes] with
// normalized (x1, y1, x2, y2) corners. Standard YOLOv8 exports instead emit
// [batch, 4 + num_classes, num_anchors] (transposed, no objectness column,
// and cx/cy/w/h in input-pixel units) — confirm against the actual model,
// otherwise boxes will be misplaced or empty.
function processYoloOutput(output, imgWidth, imgHeight) {
const confThreshold = parseFloat(confidenceThreshold.value);
const iouThresh = parseFloat(iouThreshold.value);
// Detection head tensor; 'output0' is the conventional ort-web export name.
const outputTensor = output.output0;
const outputData = outputTensor.data;
// Clear previous debug output
debugOutput.innerHTML = '';
// Log raw tensor shape and first few values for inspection.
debugLog(`Output tensor shape: [${outputTensor.dims.join(', ')}]`);
debugLog(`First 20 values: ${Array.from(outputData.slice(0, 20)).map(v => v.toFixed(2)).join(', ')}`);
// Assumed layout: dims[1] = detections, dims[2] = per-detection features.
const numDetections = outputTensor.dims[1];
const numFeatures = outputTensor.dims[2];
debugLog(`Num detections: ${numDetections}, Num features: ${numFeatures}`);
// Extract boxes, scores, and class IDs
const boxes = [];
const scores = [];
const classIds = [];
for (let i = 0; i < numDetections; i++) {
const offset = i * numFeatures;
// Box corners, assumed already normalized to [0,1] — TODO confirm.
const x1 = outputData[offset];
const y1 = outputData[offset + 1];
const x2 = outputData[offset + 2];
const y2 = outputData[offset + 3];
// Value at offset+4 treated as the objectness confidence.
const conf = outputData[offset + 4];
// Find class with maximum probability
let maxScore = -1;
let classId = -1;
// NOTE(review): j starts at 4, so the objectness value at offset+4 is also
// included in the class argmax as "class 0" — if there really is an
// objectness column this should start at 5 (and classId = j - 5); confirm.
for (let j = 4; j < numFeatures; j++) {
const score = outputData[offset + j];
if (score > maxScore) {
maxScore = score;
classId = j - 4; // Subtract 4 because first 4 elements are box coordinates
}
}
// Final score = objectness * best class probability.
const finalScore = conf * maxScore;
// Filter by confidence threshold
if (finalScore > confThreshold) {
// Scale normalized box coordinates up to display-image pixels.
const scaledBox = [
x1 * imgWidth,
y1 * imgHeight,
x2 * imgWidth,
y2 * imgHeight
];
boxes.push(scaledBox);
scores.push(finalScore);
classIds.push(classId);
// Log detection details
debugLog(`Detection ${i}: [${scaledBox.map(v => v.toFixed(1)).join(', ')}] score=${finalScore.toFixed(2)} class=${classId}`);
}
}
// Apply non-max suppression
const selectedIndices = nonMaxSuppression(boxes, scores, iouThresh);
// Assemble kept detections; 'output1' (if present) is assumed to carry
// mask data — presumably prototype masks; verify shape against the model.
const detections = selectedIndices.map(idx => ({
box: boxes[idx],
score: scores[idx],
classId: classIds[idx],
mask: output.output1 ? getMaskForDetection(output.output1.data, idx, output.output1.dims) : null
}));
return detections;
}
// Build a per-pixel channel-argmax mask for one detection from a raw mask
// tensor laid out as [batch, channels, height, width] (flat array).
// Returns { data, width, height } where data[y * width + x] is the index of
// the channel holding the largest value at that pixel.
function getMaskForDetection(masksData, detectionIdx, maskShape) {
  const [, channels, rows, cols] = maskShape;
  const planeSize = rows * cols;
  // Offset of this detection's block of channel planes.
  const base = detectionIdx * channels * planeSize;
  const mask = new Array(planeSize).fill(0);
  for (let pixel = 0; pixel < planeSize; pixel++) {
    let bestChannel = 0;
    let bestValue = -Infinity;
    for (let c = 0; c < channels; c++) {
      const value = masksData[base + c * planeSize + pixel];
      if (value > bestValue) {
        bestValue = value;
        bestChannel = c;
      }
    }
    mask[pixel] = bestChannel;
  }
  return {
    data: mask,
    width: cols,
    height: rows
  };
}
// Render one frame's detections: the video frame plus boxes/labels on the
// main canvas, optional per-detection masks on the segmentation overlay,
// and a per-class legend beneath the video.
function drawDetections(detections, imgWidth, imgHeight) {
  // Clear both overlay layers.
  ctx.clearRect(0, 0, canvas.width, canvas.height);
  segCtx.clearRect(0, 0, segmentationCanvas.width, segmentationCanvas.height);
  // Paint the current video frame under the box overlay.
  ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
  // BUG FIX: set the label font BEFORE measuring text — the old code
  // measured with the default 10px font and only set 12px Arial afterwards,
  // so label backgrounds were sized too small.
  ctx.font = '12px Arial';
  // Rebuild the legend from scratch each frame.
  detectionLegend.innerHTML = '';
  const legendItems = new Set();
  detections.forEach(det => {
    const [x1, y1, x2, y2] = det.box;
    const width = x2 - x1;
    const height = y2 - y1;
    const className = classNames[det.classId] || `Class ${det.classId}`;
    const color = classColors[det.classId] || '#3B82F6';
    // One legend entry per class present in this frame.
    if (!legendItems.has(det.classId)) {
      legendItems.add(det.classId);
      const legendItem = document.createElement('div');
      legendItem.className = 'legend-item';
      legendItem.innerHTML = `
        <div class="legend-color" style="background-color: ${color};"></div>
        <span>${className}</span>
        <div class="confidence-bar">
          <div class="confidence-fill" style="width: ${det.score * 100}%;"></div>
        </div>
      `;
      detectionLegend.appendChild(legendItem);
    }
    // Draw the mask if available and enabled.
    if (det.mask && showMasks && showMasks.checked) {
      const mask = det.mask;
      const scaleX = width / mask.width;
      const scaleY = height / mask.height;
      // Off-screen canvas holding the mask at its native resolution.
      const maskCanvas = document.createElement('canvas');
      maskCanvas.width = mask.width;
      maskCanvas.height = mask.height;
      const maskCtx = maskCanvas.getContext('2d');
      // BUG FIX: class colors are hsl() strings, which hexToRgb could not
      // parse (masks rendered black). Resolve ANY CSS color to RGB by
      // painting one pixel and reading it back.
      maskCtx.fillStyle = color;
      maskCtx.fillRect(0, 0, 1, 1);
      const [r, g, b] = maskCtx.getImageData(0, 0, 1, 1).data;
      const maskImageData = maskCtx.createImageData(mask.width, mask.height);
      for (let i = 0; i < mask.data.length; i++) {
        if (mask.data[i] > 0) { // only paint non-background pixels
          const idx = i * 4;
          maskImageData.data[idx] = r;
          maskImageData.data[idx + 1] = g;
          maskImageData.data[idx + 2] = b;
          maskImageData.data[idx + 3] = 150; // semi-transparent overlay
        }
      }
      maskCtx.putImageData(maskImageData, 0, 0);
      // Scale the mask into the detection's box on the segmentation layer.
      segCtx.save();
      segCtx.translate(x1, y1);
      segCtx.scale(scaleX, scaleY);
      segCtx.drawImage(maskCanvas, 0, 0);
      segCtx.restore();
    }
    // Bounding box.
    ctx.strokeStyle = color;
    ctx.lineWidth = 2;
    ctx.strokeRect(x1, y1, width, height);
    // Label background sized to the measured text, then the text itself.
    const label = `${className} ${(det.score * 100).toFixed(1)}%`;
    const textWidth = ctx.measureText(label).width;
    ctx.fillStyle = color;
    ctx.fillRect(x1 - 2, y1 - 20, textWidth + 4, 20);
    ctx.fillStyle = 'white';
    ctx.fillText(label, x1, y1 - 5);
  });
}
// Convert a CSS color string to [r, g, b]. Accepts "#rrggbb" (with or
// without the leading '#') and — generalized, since generateClassColors
// produces HSL strings — "hsl(h, s%, l%)". Unrecognized input yields
// [0, 0, 0], matching the old fallback.
function hexToRgb(color) {
  const hex = /^#?([a-f\d]{2})([a-f\d]{2})([a-f\d]{2})$/i.exec(color);
  if (hex) {
    return [
      parseInt(hex[1], 16),
      parseInt(hex[2], 16),
      parseInt(hex[3], 16)
    ];
  }
  const hsl = /^hsl\(\s*([\d.]+)\s*,\s*([\d.]+)%\s*,\s*([\d.]+)%\s*\)$/i.exec(color);
  if (hsl) {
    const h = parseFloat(hsl[1]);
    const s = parseFloat(hsl[2]) / 100;
    const l = parseFloat(hsl[3]) / 100;
    // Standard HSL -> RGB conversion (chroma / intermediate / offset).
    const c = (1 - Math.abs(2 * l - 1)) * s;
    const x = c * (1 - Math.abs(((h / 60) % 2) - 1));
    const m = l - c / 2;
    let rgb;
    if (h < 60) rgb = [c, x, 0];
    else if (h < 120) rgb = [x, c, 0];
    else if (h < 180) rgb = [0, c, x];
    else if (h < 240) rgb = [0, x, c];
    else if (h < 300) rgb = [x, 0, c];
    else rgb = [c, 0, x];
    return rgb.map(v => Math.round((v + m) * 255));
  }
  return [0, 0, 0];
}
// Main per-frame loop: preprocess -> session.run -> decode -> draw.
// Reschedules itself via requestAnimationFrame until isRunning is cleared.
async function detectionLoop() {
if (!isRunning) return;
const startTime = performance.now(); // NOTE(review): unused — candidate for removal
try {
// Convert the current video frame into the model's input tensor.
const inputTensor = await preprocessFrame(video);
// Feed keyed by the model's first input name.
const feeds = { [session.inputNames[0]]: inputTensor };
const inferenceStart = performance.now();
const output = await session.run(feeds);
lastInferenceTime = performance.now() - inferenceStart;
// Decode the raw output into filtered, NMS'd detections.
const detections = processYoloOutput(output, video.videoWidth, video.videoHeight);
// Draw detections
drawDetections(detections, video.videoWidth, video.videoHeight);
// detections[0] is the highest-scoring one (NMS emits indices in
// descending score order).
if (detections.length > 0) {
const topDetection = detections[0];
const className = classNames[topDetection.classId] || `Class ${topDetection.classId}`;
log(`Detected ${detections.length} objects (top: ${className} @ ${(topDetection.score * 100).toFixed(1)}%)`);
}
// Update performance counters once per second.
frameCount++;
const now = performance.now();
if (now - lastFpsUpdate >= 1000) {
fps = frameCount * 1000 / (now - lastFpsUpdate);
frameCount = 0;
lastFpsUpdate = now;
// NOTE(review): the ms/inference readout also refreshes only inside this
// once-per-second branch, so it shows a sampled value, not an average.
fpsCounter.textContent = Math.round(fps);
inferenceTime.textContent = lastInferenceTime.toFixed(1);
}
} catch (error) {
// Errors are logged but the loop keeps running — the next frame may succeed.
log(`Detection error: ${error.message}`);
console.error(error);
}
// Schedule next frame
requestAnimationFrame(detectionLoop);
}
// Convert the current video frame into an ort.Tensor of shape
// [1, 3, height, width]: RGB, float32, values scaled to [0, 1], CHW layout.
//
// NOTE(review): there is no resize/letterbox step — the tensor takes the
// camera's native dimensions. Models exported with a fixed input size
// (commonly 640x640) will reject frames of any other size; confirm the
// model accepts dynamic spatial dims or add a resize here.
async function preprocessFrame(videoElement) {
// Stage the frame on an off-screen canvas to get at its pixels.
const tempCanvas = document.createElement('canvas');
tempCanvas.width = videoElement.videoWidth;
tempCanvas.height = videoElement.videoHeight;
const tempCtx = tempCanvas.getContext('2d');
// Draw video frame to canvas
tempCtx.drawImage(videoElement, 0, 0, tempCanvas.width, tempCanvas.height);
// Get image data
const imageData = tempCtx.getImageData(0, 0, tempCanvas.width, tempCanvas.height);
// 3 floats per pixel (alpha is dropped below).
const float32Data = new Float32Array(tempCanvas.width * tempCanvas.height * 3);
// Drop the alpha channel and scale 8-bit RGB to [0, 1].
for (let i = 0, j = 0; i < imageData.data.length; i += 4) {
float32Data[j++] = imageData.data[i] / 255.0; // R
float32Data[j++] = imageData.data[i + 1] / 255.0; // G
float32Data[j++] = imageData.data[i + 2] / 255.0; // B
}
// Convert from HWC (interleaved) to CHW (planar, channels-first).
const chwData = new Float32Array(float32Data.length);
const channelSize = tempCanvas.width * tempCanvas.height;
for (let c = 0; c < 3; ++c) {
for (let i = 0; i < channelSize; ++i) {
chwData[c * channelSize + i] = float32Data[i * 3 + c];
}
}
// Create tensor with shape [1, 3, height, width]
return new ort.Tensor('float32', chwData, [1, 3, tempCanvas.height, tempCanvas.width]);
}
// Populate the global classNames lookup with the 80 COCO class labels,
// keyed by class index (0-79) in the standard COCO order.
function initClassNames() {
  const cocoLabels = [
    'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train',
    'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign',
    'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
    'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella',
    'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard',
    'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard',
    'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork',
    'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange',
    'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair',
    'couch', 'potted plant', 'bed', 'dining table', 'toilet', 'tv',
    'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave',
    'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase',
    'scissors', 'teddy bear', 'hair drier', 'toothbrush'
  ];
  classNames = Object.fromEntries(cocoLabels.map((label, i) => [i, label]));
}
// Page lifecycle: seed the class-name table once the DOM is ready, and
// release the webcam when the page is torn down.
window.addEventListener('DOMContentLoaded', initClassNames);
window.addEventListener('beforeunload', () => {
  // No explicit ort session disposal is performed here; just stop the camera.
  const stream = video.srcObject;
  if (stream) {
    for (const track of stream.getTracks()) {
      track.stop();
    }
  }
});
</script>
<!-- Attribution badge. FIX: added rel="noopener noreferrer" to the
     target="_blank" links to prevent reverse tabnabbing. -->
<p style="border-radius: 8px; text-align: center; font-size: 12px; color: #fff; margin-top: 16px;position: fixed; left: 8px; bottom: 8px; z-index: 10; background: rgba(0, 0, 0, 0.8); padding: 4px 8px;">Made with <img src="https://enzostvs-deepsite.hf.space/logo.svg" alt="DeepSite Logo" style="width: 16px; height: 16px; vertical-align: middle;display:inline-block;margin-right:3px;filter:brightness(0) invert(1);"><a href="https://enzostvs-deepsite.hf.space" style="color: #fff;text-decoration: underline;" target="_blank" rel="noopener noreferrer">DeepSite</a> - 🧬 <a href="https://enzostvs-deepsite.hf.space?remix=MaxLeft/yolo-detection-app" style="color: #fff;text-decoration: underline;" target="_blank" rel="noopener noreferrer">Remix</a></p></body>
</html>