Spaces:

ashishninehertz
/

Camera_Interaction_App

Running

App Files Files Community

Camera_Interaction_App / index.html

ashishninehertz

update_space

d2ef7f6 about 1 year ago

raw

history blame contribute delete

14.3 kB

	<!DOCTYPE html>
	<html lang="en">
	<head>
	<meta charset="UTF-8" />
	<meta name="viewport" content="width=device-width, initial-scale=1.0" />
	<title>Vision AI Assistant</title>
	<style>
	:root {
	--primary: #4f46e5;
	--primary-light: #6366f1;
	--success: #10b981;
	--danger: #ef4444;
	--dark: #1e293b;
	--light: #f8fafc;
	--gray: #94a3b8;
	--border: #e2e8f0;
	}

	* {
	margin: 0;
	padding: 0;
	box-sizing: border-box;
	}

	body {
	font-family: 'Inter', -apple-system, BlinkMacSystemFont, sans-serif;
	background-color: #f1f5f9;
	color: var(--dark);
	min-height: 100vh;
	display: flex;
	flex-direction: column;
	align-items: center;
	padding: 2rem;
	}

	.app-container {
	width: 100%;
	max-width: 800px;
	display: flex;
	flex-direction: column;
	gap: 1.5rem;
	}

	h1 {
	color: var(--primary);
	margin-bottom: 0.5rem;
	text-align: center;
	}

	.subtitle {
	color: var(--gray);
	text-align: center;
	margin-bottom: 2rem;
	}

	.video-container {
	position: relative;
	width: 100%;
	aspect-ratio: 4/3;
	background-color: #000;
	border-radius: 12px;
	overflow: hidden;
	box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1);
	}

	#videoFeed {
	width: 100%;
	height: 100%;
	object-fit: cover;
	}

	.loading-overlay {
	position: absolute;
	top: 0;
	left: 0;
	width: 100%;
	height: 100%;
	display: none;
	flex-direction: column;
	justify-content: center;
	align-items: center;
	background-color: rgba(0, 0, 0, 0.7);
	color: white;
	font-weight: 600;
	}

	.loading-spinner {
	width: 40px;
	height: 40px;
	border: 4px solid rgba(255, 255, 255, 0.3);
	border-radius: 50%;
	border-top-color: white;
	animation: spin 1s ease-in-out infinite;
	margin-bottom: 1rem;
	}

	@keyframes spin {
	to { transform: rotate(360deg); }
	}

	.control-panel {
	background: white;
	border-radius: 12px;
	padding: 1.5rem;
	box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1);
	}

	.input-group {
	display: flex;
	flex-direction: column;
	gap: 0.5rem;
	margin-bottom: 1.5rem;
	}

	label {
	font-weight: 600;
	color: var(--dark);
	}

	textarea {
	width: 100%;
	padding: 0.75rem;
	border: 1px solid var(--border);
	border-radius: 8px;
	font-family: inherit;
	resize: vertical;
	min-height: 60px;
	}

	textarea:focus {
	outline: none;
	border-color: var(--primary-light);
	box-shadow: 0 0 0 2px rgba(99, 102, 241, 0.2);
	}

	.controls {
	display: flex;
	gap: 1rem;
	align-items: center;
	flex-wrap: wrap;
	}

	.interval-select {
	flex-grow: 1;
	display: flex;
	align-items: center;
	gap: 0.5rem;
	}

	select {
	padding: 0.5rem;
	border-radius: 8px;
	border: 1px solid var(--border);
	background-color: white;
	flex-grow: 1;
	}

	.btn {
	padding: 0.75rem 1.5rem;
	border-radius: 8px;
	font-weight: 600;
	cursor: pointer;
	border: none;
	transition: all 0.2s;
	}

	.btn-primary {
	background-color: var(--primary);
	color: white;
	}

	.btn-primary:hover {
	background-color: var(--primary-light);
	}

	.btn-danger {
	background-color: var(--danger);
	color: white;
	}

	.btn-danger:hover {
	background-color: #dc2626;
	}

	.status-indicator {
	display: flex;
	align-items: center;
	gap: 0.5rem;
	margin-top: 1rem;
	font-size: 0.875rem;
	color: var(--gray);
	}

	.status-dot {
	width: 10px;
	height: 10px;
	border-radius: 50%;
	background-color: var(--gray);
	}

	.status-dot.active {
	background-color: var(--success);
	}

	.status-dot.processing {
	background-color: var(--primary);
	animation: pulse 1.5s infinite;
	}

	@keyframes pulse {
	0% { opacity: 1; }
	50% { opacity: 0.5; }
	100% { opacity: 1; }
	}

	.hidden {
	display: none;
	}

	canvas {
	display: none;
	}
	</style>
	</head>
	<body>
	<div class="app-container">
	<h1>Vision AI Assistant</h1>
	<p class="subtitle">Ask questions about what your camera sees</p>

	<div class="video-container">
	<video id="videoFeed" autoplay playsinline></video>
	<div class="loading-overlay" id="loadingOverlay">
	<div class="loading-spinner"></div>
	<span id="loadingText">Loading AI Model</span>
	</div>
	</div>

	<div class="control-panel">
	<div class="input-group">
	<label for="instructionText">Your Question</label>
	<textarea
	id="instructionText"
	placeholder="Ask me about what you see..."
	>What do you see?</textarea>
	</div>

	<div class="input-group">
	<label for="responseText">AI Response</label>
	<textarea
	id="responseText"
	readonly
	placeholder="The AI response will appear here..."
	></textarea>
	</div>

	<div class="controls">
	<div class="interval-select">
	<label for="intervalSelect">Interval:</label>
	<select id="intervalSelect">
	<option value="0" selected>Manual</option>
	<option value="100">100ms</option>
	<option value="250">250ms</option>
	<option value="500">500ms</option>
	<option value="1000">1s</option>
	<option value="2000">2s</option>
	</select>
	</div>

	<button id="startButton" class="btn btn-primary">Start</button>
	</div>

	<div class="status-indicator">
	<div class="status-dot" id="statusDot"></div>
	<span id="statusText">Initializing</span>
	</div>
	</div>
	</div>

	<canvas id="canvas" class="hidden"></canvas>

	<script type="module">
	import {
	AutoProcessor,
	AutoModelForVision2Seq,
	RawImage,
	} from "https://cdn.jsdelivr.net/npm/@huggingface/transformers/dist/transformers.min.js";

	const video = document.getElementById("videoFeed");
	const canvas = document.getElementById("canvas");
	const instructionText = document.getElementById("instructionText");
	const responseText = document.getElementById("responseText");
	const intervalSelect = document.getElementById("intervalSelect");
	const startButton = document.getElementById("startButton");
	const loadingOverlay = document.getElementById("loadingOverlay");
	const loadingText = document.getElementById("loadingText");
	const statusDot = document.getElementById("statusDot");
	const statusText = document.getElementById("statusText");

	let stream;
	let isProcessing = false;
	let processor, model;

	function updateStatus(text, state) {
	statusText.textContent = text;
	statusDot.className = 'status-dot';
	if (state === 'active') {
	statusDot.classList.add('active');
	} else if (state === 'processing') {
	statusDot.classList.add('processing');
	}
	}

	async function initModel() {
	const modelId = "HuggingFaceTB/SmolVLM-500M-Instruct";
	loadingOverlay.style.display = "flex";
	loadingText.textContent = "Loading processor...";
	updateStatus("Loading processor...", "processing");

	try {
	processor = await AutoProcessor.from_pretrained(modelId);
	loadingText.textContent = "Loading model...";
	updateStatus("Loading model...", "processing");

	// Modified model configuration to use FP32 instead of FP16
	model = await AutoModelForVision2Seq.from_pretrained(modelId, {
	dtype: {
	embed_tokens: "fp32", // Changed from fp16 to fp32
	vision_encoder: "q4",
	decoder_model_merged: "q4",
	},
	device: "webgpu",
	});

	loadingOverlay.style.display = "none";
	updateStatus("Ready", "active");
	responseText.value = "Model loaded successfully. Ready to start.";
	} catch (error) {
	console.error("Model loading error:", error);
	loadingText.textContent = "Model loading failed";
	responseText.value = `Error loading model: ${error.message}`;
	updateStatus("Error loading model", "");
	throw error;
	}
	}

	async function initCamera() {
	try {
	updateStatus("Accessing camera...", "processing");
	stream = await navigator.mediaDevices.getUserMedia({
	video: true,
	audio: false,
	});
	video.srcObject = stream;
	responseText.value = "Camera access granted. Ready to start.";
	} catch (err) {
	console.error("Error accessing camera:", err);
	responseText.value = `Error accessing camera: ${err.name} - ${err.message}. Please ensure permissions are granted and you are on HTTPS or localhost.`;
	updateStatus("Camera error", "");
	alert(
	`Error accessing camera: ${err.name}. Make sure you've granted permission and are on HTTPS or localhost.`
	);
	}
	}

	function captureImage() {
	if (!stream \|\| !video.videoWidth) {
	console.warn("Video stream not ready for capture.");
	return null;
	}
	canvas.width = video.videoWidth;
	canvas.height = video.videoHeight;
	const context = canvas.getContext("2d", { willReadFrequently: true });
	context.drawImage(video, 0, 0, canvas.width, canvas.height);
	const frame = context.getImageData(0, 0, canvas.width, canvas.height);
	return new RawImage(frame.data, frame.width, frame.height, 4);
	}

	async function runLocalVisionInference(imgElement, instruction) {
	const messages = [
	{
	role: "user",
	content: [{ type: "image" }, { type: "text", text: instruction }],
	},
	];
	const text = processor.apply_chat_template(messages, {
	add_generation_prompt: true,
	});
	const inputs = await processor(text, [imgElement], {
	do_image_splitting: false,
	});
	const generatedIds = await model.generate({
	...inputs,
	max_new_tokens: 100,
	});
	const output = processor.batch_decode(
	generatedIds.slice(null, [inputs.input_ids.dims.at(-1), null]),
	{ skip_special_tokens: true }
	);
	return output[0].trim();
	}

	async function sendData() {
	if (!isProcessing) return;
	const instruction = instructionText.value;
	const rawImg = captureImage();
	if (!rawImg) {
	responseText.value = "Capture failed";
	return;
	}
	try {
	const reply = await runLocalVisionInference(rawImg, instruction);
	responseText.value = reply;
	} catch (e) {
	console.error(e);
	responseText.value = `Error: ${e.message}`;
	}
	}

	function sleep(ms) {
	return new Promise((resolve) => setTimeout(resolve, ms));
	}

	async function processingLoop() {
	const intervalMs = parseInt(intervalSelect.value, 10);
	while (isProcessing) {
	await sendData();
	if (!isProcessing) break;
	await sleep(intervalMs);
	}
	}

	function handleStart() {
	if (!stream) {
	responseText.value = "Camera not available. Cannot start.";
	alert("Camera not available. Please grant permission first.");
	return;
	}
	isProcessing = true;
	startButton.textContent = "Stop";
	startButton.classList.remove("btn-primary");
	startButton.classList.add("btn-danger");
	instructionText.disabled = true;
	intervalSelect.disabled = true;
	responseText.value = "Processing started...";
	updateStatus("Processing...", "processing");
	processingLoop();
	}

	function handleStop() {
	isProcessing = false;
	startButton.textContent = "Start";
	startButton.classList.remove("btn-danger");
	startButton.classList.add("btn-primary");
	instructionText.disabled = false;
	intervalSelect.disabled = false;
	if (responseText.value.startsWith("Processing started...")) {
	responseText.value = "Processing stopped.";
	}
	updateStatus("Ready", "active");
	}

	startButton.addEventListener("click", () => {
	if (isProcessing) {
	handleStop();
	} else {
	handleStart();
	}
	});

	window.addEventListener("DOMContentLoaded", async () => {
	if (!navigator.gpu) {
	responseText.value = "WebGPU is not available in this browser. Please try Chrome or Edge with WebGPU enabled.";
	updateStatus("WebGPU not available", "");
	startButton.disabled = true;
	return;
	}

	await initModel();
	await initCamera();
	});

	window.addEventListener("beforeunload", () => {
	if (stream) {
	stream.getTracks().forEach((track) => track.stop());
	}
	});
	</script>
	</body>
	</html>