Spaces:
Running
Running
| <html lang="en"> | |
| <head> | |
| <meta charset="UTF-8" /> | |
| <meta name="viewport" content="width=device-width, initial-scale=1.0" /> | |
| <title>Qwen3.5-0.8B WebGPU demo</title> | |
| <style> | |
| body { | |
| font-family: "Inter", "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif; | |
| display: flex; | |
| flex-direction: column; | |
| align-items: center; | |
| justify-content: center; | |
| gap: 12px; | |
| padding: 14px; | |
| margin: 0; | |
| min-height: 100vh; | |
| min-height: 100svh; | |
| box-sizing: border-box; | |
| background: linear-gradient(180deg, #f8fafc 0%, #eef2ff 100%); | |
| color: #0f172a; | |
| } | |
| h1 { | |
| margin: 0; | |
| font-size: 2rem; | |
| font-weight: 700; | |
| letter-spacing: -0.02em; | |
| color: #111827; | |
| text-align: center; | |
| } | |
/* Card that holds the instruction row and the response row, stacked vertically.
   Previously split across two `.io-areas` rules; merged so each property is
   declared once (the earlier `gap: 12px` was always overridden by `gap: 8px`). */
.io-areas {
  display: flex;
  flex-direction: column;
  align-items: center;
  gap: 8px;
  width: min(92vw, 760px);
  background: rgba(255, 255, 255, 0.9);
  border: 1px solid #e5e7eb;
  padding: 10px 0;
  border-radius: 12px;
  box-shadow: 0 8px 24px rgba(15, 23, 42, 0.08);
}
| .row-main { | |
| display: grid; | |
| grid-template-columns: minmax(0, 1fr) auto; | |
| width: min(100%, 720px); | |
| gap: 10px; | |
| align-items: flex-end; | |
| } | |
| .field-group { | |
| display: flex; | |
| flex-direction: column; | |
| gap: 4px; | |
| align-items: stretch; | |
| } | |
| .instruction-group { | |
| min-width: 0; | |
| } | |
| .response-group { | |
| display: grid; | |
| grid-template-columns: minmax(0, 1fr) auto; | |
| width: min(100%, 720px); | |
| gap: 10px; | |
| align-items: flex-end; | |
| max-width: none; | |
| } | |
| textarea { | |
| width: 100%; | |
| min-height: 2.2em; | |
| padding: 7px 9px; | |
| border: 1px solid #d1d5db; | |
| border-radius: 8px; | |
| font-size: 13px; | |
| line-height: 1.35; | |
| color: #111827; | |
| background-color: #ffffff; | |
| box-sizing: border-box; | |
| } | |
| textarea:focus, | |
| select:focus { | |
| outline: none; | |
| border-color: #6366f1; | |
| box-shadow: 0 0 0 3px rgba(99, 102, 241, 0.15); | |
| } | |
| textarea[readonly] { | |
| background-color: #f9fafb; | |
| } | |
| .wide-textarea { | |
| width: 100%; | |
| } | |
| .single-line { | |
| resize: none; | |
| overflow: hidden; | |
| } | |
| .response-area { | |
| min-height: calc(1.35em * 3 + 14px); | |
| max-height: calc(1.35em * 3 + 14px); | |
| resize: none; | |
| overflow-y: auto; | |
| } | |
| .control-group { | |
| display: flex; | |
| align-items: center; | |
| justify-content: flex-end; | |
| gap: 8px; | |
| } | |
| .control-spacer { | |
| width: 84px; | |
| visibility: hidden; | |
| } | |
| #videoFeed { | |
| display: block; | |
| width: 100%; | |
| height: 100%; | |
| border-radius: 10px; | |
| object-fit: cover; | |
| } | |
| #videoContainer { | |
| position: relative; | |
| width: min(92vw, 640px); | |
| aspect-ratio: 4 / 3; | |
| border: 1px solid #cbd5e1; | |
| background-color: #000; | |
| border-radius: 12px; | |
| margin: 0 auto; | |
| overflow: hidden; | |
| box-shadow: 0 12px 30px rgba(15, 23, 42, 0.16); | |
| } | |
| #loadingOverlay { | |
| position: absolute; | |
| top: 0; | |
| left: 0; | |
| width: 100%; | |
| height: 100%; | |
| display: none; | |
| justify-content: center; | |
| align-items: center; | |
| background-color: rgba(0, 0, 0, 0.7); | |
| z-index: 10; | |
| border-radius: 10px; | |
| color: #ffffff; | |
| font-size: 1.1rem; | |
| font-weight: 600; | |
| backdrop-filter: blur(1px); | |
| } | |
| #prefillIndicator { | |
| position: absolute; | |
| top: 10px; | |
| right: 10px; | |
| display: none; | |
| align-items: center; | |
| gap: 8px; | |
| z-index: 11; | |
| padding: 7px 10px; | |
| border-radius: 999px; | |
| background: rgba(17, 24, 39, 0.72); | |
| color: #fff; | |
| font-size: 12px; | |
| font-weight: 600; | |
| backdrop-filter: blur(2px); | |
| } | |
| #prefillIndicator.active { | |
| display: inline-flex; | |
| } | |
/* Source toggle pill in the top-left corner of the video.
   Invisible and non-interactive until the video container is hovered or focused. */
#sourceToggleButton {
  position: absolute;
  top: 10px;
  left: 10px;
  z-index: 12;
  padding: 6px 10px;
  border: 1px solid rgba(255, 255, 255, 0.28);
  border-radius: 999px;
  background: rgba(17, 24, 39, 0.72);
  color: #fff;
  font-size: 12px;
  font-weight: 600;
  cursor: pointer;
  backdrop-filter: blur(2px);
  opacity: 0;
  pointer-events: none;
  transition: opacity 0.18s ease;
}
#sourceToggleButton:hover:not(:disabled) {
  background: rgba(31, 41, 55, 0.82);
}
#sourceToggleButton:disabled {
  cursor: not-allowed;
}
#videoContainer:hover #sourceToggleButton,
#videoContainer:focus-within #sourceToggleButton {
  opacity: 1;
  pointer-events: auto;
}
/* The dimmed state must be at least as specific as the reveal rule above and
   come after it. A bare `#sourceToggleButton:disabled { opacity }` loses to the
   reveal rule on hover and also un-hides the button when it is not hovered. */
#videoContainer:hover #sourceToggleButton:disabled,
#videoContainer:focus-within #sourceToggleButton:disabled {
  opacity: 0.45;
}
| .spinner { | |
| width: 12px; | |
| height: 12px; | |
| border: 2px solid rgba(255, 255, 255, 0.35); | |
| border-top-color: #ffffff; | |
| border-radius: 50%; | |
| animation: spin 0.8s linear infinite; | |
| } | |
| @keyframes spin { | |
| to { | |
| transform: rotate(360deg); | |
| } | |
| } | |
| #startButton { | |
| min-width: 84px; | |
| padding: 8px 14px; | |
| font-size: 14px; | |
| font-weight: 600; | |
| cursor: pointer; | |
| border: none; | |
| border-radius: 8px; | |
| color: white; | |
| transition: | |
| transform 0.1s ease, | |
| box-shadow 0.2s ease; | |
| } | |
| #startButton:hover:not(:disabled) { | |
| transform: translateY(-1px); | |
| box-shadow: 0 6px 16px rgba(15, 23, 42, 0.2); | |
| } | |
| #startButton.start { | |
| background-color: #16a34a; | |
| } | |
| #startButton.stop { | |
| background-color: #dc2626; | |
| } | |
| label { | |
| font-weight: 600; | |
| color: #374151; | |
| font-size: 13px; | |
| } | |
| .hidden { | |
| display: none; | |
| } | |
| @media (max-width: 640px) { | |
| body { | |
| padding: 10px; | |
| } | |
| h1 { | |
| font-size: 1.2rem; | |
| text-align: center; | |
| } | |
| .row-main { | |
| grid-template-columns: 1fr; | |
| align-items: stretch; | |
| } | |
| .response-group { | |
| grid-template-columns: 1fr; | |
| align-items: stretch; | |
| } | |
| .control-group { | |
| justify-content: flex-start; | |
| } | |
| .control-spacer { | |
| display: none; | |
| } | |
| } | |
| </style> | |
| </head> | |
| <body> | |
<h1>Qwen3.5-0.8B WebGPU demo</h1>
<!-- Video preview with its overlays: source toggle, per-frame spinner, model-loading veil. -->
<div id="videoContainer">
  <video id="videoFeed" autoplay playsinline></video>
  <button id="sourceToggleButton" type="button">Use video file</button>
  <input id="videoFileInput" type="file" accept="video/*" class="hidden" />
  <div id="prefillIndicator">
    <span class="spinner"></span>
    <span>Processing image</span>
  </div>
  <div id="loadingOverlay">Loading model (~850MB)...</div>
</div>
<!-- Off-screen canvas the script draws video frames into before inference. -->
<canvas id="canvas" class="hidden"></canvas>
<div class="io-areas">
  <div class="row-main">
    <div class="field-group instruction-group">
      <label for="instructionText">Instruction:</label>
      <textarea
        id="instructionText"
        class="wide-textarea single-line"
        name="Instruction"
        rows="1"
      ></textarea>
    </div>
    <div class="control-group">
      <!-- Explicit type, matching #sourceToggleButton: a bare <button> defaults to submit. -->
      <button id="startButton" type="button" class="start">Start</button>
    </div>
  </div>
  <div class="field-group response-group">
    <div class="field-group">
      <label for="responseText">Response:</label>
      <textarea
        id="responseText"
        class="wide-textarea response-area"
        name="Response"
        rows="3"
        readonly
        placeholder="Response will appear here..."
      ></textarea>
    </div>
    <div class="control-spacer" aria-hidden="true"></div>
  </div>
</div>
| <script type="module"> | |
| import { | |
| AutoProcessor, | |
| Qwen3_5ForConditionalGeneration, | |
| RawImage, | |
| TextStreamer, | |
| } from "https://cdn.jsdelivr.net/npm/@huggingface/transformers@4.0.0-next.5"; | |
// Model identifier handed to from_pretrained() for both the processor and the model.
const MODEL_ID = "onnx-community/Qwen3.5-0.8B-ONNX";
// Frames wider than this are scaled down before being captured (see captureImage).
const CAPTURE_MAX_WIDTH = 800;

// DOM handles, resolved once in the same order as the ids listed below.
const [
  video,
  canvas,
  instructionText,
  responseText,
  startButton,
  loadingOverlay,
  prefillIndicator,
  sourceToggleButton,
  videoFileInput,
] = [
  "videoFeed",
  "canvas",
  "instructionText",
  "responseText",
  "startButton",
  "loadingOverlay",
  "prefillIndicator",
  "sourceToggleButton",
  "videoFileInput",
].map((id) => document.getElementById(id));

// Default prompt shown in the instruction box.
instructionText.value = "Briefly describe what you see (2 sentences max).";

// --- Mutable module state ---
let stream = null; // MediaStream from getUserMedia while the webcam is the source
let fileObjectUrl = null; // object URL of the chosen video file, if any
let isProcessing = false; // true while the capture/inference loop should keep running
let isModelReady = false; // set once initModel() has completed
let sourceMode = "webcam"; // "webcam" or "file"
let processor = null; // result of AutoProcessor.from_pretrained
let model = null; // result of Qwen3_5ForConditionalGeneration.from_pretrained
/**
 * Reports whether the <video> element currently exposes non-zero frame dimensions.
 * @returns {boolean} true when both videoWidth and videoHeight are positive.
 */
function hasVideoFrame() {
  const { videoWidth, videoHeight } = video;
  return videoWidth > 0 && videoHeight > 0;
}
/**
 * Reports whether the currently selected source has something attached:
 * a `src` on the video element in file mode, otherwise a webcam stream.
 * @returns {boolean}
 */
function hasActiveInput() {
  const activeSource = sourceMode === "file" ? video.src : stream;
  return Boolean(activeSource);
}
/**
 * Sets the toggle button label to the source the user would switch TO.
 */
function updateSourceToggleButton() {
  let label = "Use webcam";
  if (sourceMode === "webcam") {
    label = "Use video file";
  }
  sourceToggleButton.textContent = label;
}
/**
 * Enables the Start button only when the model is ready and an input source is attached.
 */
function updateStartAvailability() {
  const canStart = isModelReady && hasActiveInput();
  setStartButtonEnabled(canStart);
}
/**
 * Replaces the whole content of the response textarea.
 * @param {string} text - Text to display.
 */
function setResponse(text) {
  responseText.value = text;
}
/**
 * Shows or hides the model-loading overlay on top of the video.
 * @param {boolean} isLoading - true shows the overlay (display: flex), false hides it.
 */
function setLoading(isLoading) {
  if (isLoading) {
    loadingOverlay.style.display = "flex";
  } else {
    loadingOverlay.style.display = "none";
  }
}
/**
 * Turns the "Processing image" indicator on or off by toggling its `active` class.
 * @param {boolean} isProcessingImage - Desired visibility of the indicator.
 */
function setPrefillProcessing(isProcessingImage) {
  const { classList } = prefillIndicator;
  classList.toggle("active", isProcessingImage);
}
/**
 * Enables or disables the source toggle button.
 * @param {boolean} disabled - true disables the button.
 */
function setControlsDisabled(disabled) {
  sourceToggleButton.disabled = disabled;
}
/**
 * Switches the main button between its "Start" and "Stop" appearance.
 * @param {string} mode - "start" selects the Start look; any other value selects Stop.
 */
function setStartButtonMode(mode) {
  const showStart = mode === "start";
  startButton.textContent = showStart ? "Start" : "Stop";
  // Exactly one of the two classes is present afterwards.
  startButton.classList.toggle("stop", !showStart);
  startButton.classList.toggle("start", showStart);
}
/**
 * Enables or disables the Start/Stop button and adjusts its inline opacity and cursor.
 * @param {boolean} enabled - true makes the button clickable.
 */
function setStartButtonEnabled(enabled) {
  startButton.disabled = !enabled;
  const { style } = startButton;
  if (enabled) {
    style.opacity = "1";
    style.cursor = "pointer";
  } else {
    style.opacity = "0.6";
    style.cursor = "not-allowed";
  }
}
/**
 * Loads the processor and the model for MODEL_ID and stores them in the
 * module-level `processor` and `model` variables, reporting progress in the
 * response box.
 *
 * The loading overlay is shown for the duration of the call and is hidden in a
 * `finally`, so it never stays up when a download or session creation fails.
 *
 * @returns {Promise<void>}
 * @throws Re-throws whatever either `from_pretrained` call rejects with.
 */
async function initModel() {
  setLoading(true);
  try {
    setResponse("Loading processor...");
    processor = await AutoProcessor.from_pretrained(MODEL_ID);
    setResponse("Processor loaded. Loading model...");
    model = await Qwen3_5ForConditionalGeneration.from_pretrained(MODEL_ID, {
      // Per-component precision selection passed through to the library.
      dtype: {
        embed_tokens: "q4",
        vision_encoder: "fp16",
        decoder_model_merged: "q4",
      },
      device: "webgpu",
    });
    setResponse("Model loaded. Initializing camera...");
  } finally {
    setLoading(false);
  }
}
/**
 * Requests a video-only webcam stream and makes it the active source.
 *
 * On success any file object URL is revoked, the stream is attached to the
 * video element, `sourceMode` becomes "webcam" and the UI is refreshed.
 * On failure the error is logged, shown in the response box and in an alert.
 *
 * @returns {Promise<boolean>} true when the camera is attached, false on error.
 */
async function initCamera() {
  const constraints = { video: true, audio: false };
  try {
    stream = await navigator.mediaDevices.getUserMedia(constraints);

    // Drop the file-based source, if one was in use.
    if (fileObjectUrl) {
      URL.revokeObjectURL(fileObjectUrl);
      fileObjectUrl = null;
    }
    video.removeAttribute("src");
    video.srcObject = stream;

    sourceMode = "webcam";
    updateSourceToggleButton();
    updateStartAvailability();
    setResponse("Camera access granted. Ready to start.");
    return true;
  } catch (err) {
    console.error("Error accessing camera:", err);
    const detailedMessage = `Error accessing camera: ${err.name} - ${err.message}. Please ensure permissions are granted and you are on HTTPS or localhost.`;
    setResponse(detailedMessage);
    const alertMessage = `Error accessing camera: ${err.name}. Make sure you've granted permission and are on HTTPS or localhost.`;
    alert(alertMessage);
    return false;
  }
}
/**
 * Stops every track of the current webcam stream and clears `stream`.
 * Does nothing when no stream is held.
 */
function stopWebcamStream() {
  if (!stream) return;
  for (const track of stream.getTracks()) {
    track.stop();
  }
  stream = null;
}
/**
 * Makes a user-selected video file the active source.
 *
 * Stops the webcam, replaces any previous object URL, points the video element
 * at the new URL (looping and muted) and tries to start playback. A rejected
 * `play()` is only logged as a warning.
 *
 * @param {File} file - The chosen file; a falsy value makes the call a no-op.
 * @returns {Promise<void>}
 */
async function switchToVideoFile(file) {
  if (!file) return;

  stopWebcamStream();
  if (fileObjectUrl) {
    URL.revokeObjectURL(fileObjectUrl);
  }
  fileObjectUrl = URL.createObjectURL(file);
  sourceMode = "file";

  // Properties are assigned in listed order: detach the stream first, then set the file URL.
  Object.assign(video, {
    srcObject: null,
    src: fileObjectUrl,
    loop: true,
    muted: true,
  });

  try {
    await video.play();
  } catch (err) {
    console.warn("Autoplay blocked for video file:", err);
  }

  updateSourceToggleButton();
  updateStartAvailability();
  setResponse(`Using video file: ${file.name}`);
}
/**
 * Switches the source back to the webcam. When the camera cannot be
 * initialised, the Start button state is recomputed from whatever source remains.
 * @returns {Promise<void>}
 */
async function switchToWebcam() {
  if (await initCamera()) return;
  updateStartAvailability();
}
/**
 * Draws the current video frame onto the hidden canvas and wraps it as a RawImage.
 *
 * The frame is scaled down (never up) so its width does not exceed
 * CAPTURE_MAX_WIDTH; both canvas dimensions are at least 1 pixel.
 *
 * @returns {RawImage|null} The captured frame, or null when the video has no frame yet.
 */
function captureImage() {
  if (!hasVideoFrame()) {
    console.warn("Video stream not ready for capture.");
    return null;
  }

  const { videoWidth, videoHeight } = video;
  const ratio = Math.min(1, CAPTURE_MAX_WIDTH / videoWidth);
  canvas.width = Math.max(1, Math.round(videoWidth * ratio));
  canvas.height = Math.max(1, Math.round(videoHeight * ratio));

  const ctx = canvas.getContext("2d", { willReadFrequently: true });
  ctx.imageSmoothingEnabled = false;
  ctx.drawImage(video, 0, 0, canvas.width, canvas.height);

  return RawImage.fromCanvas(canvas);
}
/**
 * Runs one image+text generation and streams the output into the response box.
 *
 * Builds a single user turn (image followed by the instruction), preprocesses
 * it, then calls `model.generate` with greedy decoding and at most 128 new
 * tokens. The "Processing image" indicator is shown until the first streamed
 * chunk arrives and is always cleared when generation ends or throws.
 *
 * @param {RawImage} imgElement - Captured frame to describe.
 * @param {string} instruction - Prompt text entered by the user.
 * @returns {Promise<void>}
 */
async function runInference(imgElement, instruction) {
  const userTurn = {
    role: "user",
    content: [{ type: "image" }, { type: "text", text: instruction }],
  };
  const prompt = processor.apply_chat_template([userTurn], {
    add_generation_prompt: true,
    tokenizer_kwargs: { enable_thinking: false },
  });

  const startedAt = performance.now();
  const inputs = await processor(prompt, [imgElement]);
  const elapsedMs = performance.now() - startedAt;
  console.log(`Preprocessing time: ${elapsedMs.toFixed(2)} ms`);

  let firstChunkSeen = false;
  // The first chunk replaces the previous response; later chunks are appended.
  const onChunk = (token) => {
    if (firstChunkSeen) {
      responseText.value += token;
      return;
    }
    setPrefillProcessing(false);
    responseText.value = token.trimStart();
    firstChunkSeen = true;
  };
  const streamer = new TextStreamer(processor.tokenizer, {
    skip_prompt: true,
    skip_special_tokens: true,
    callback_function: onChunk,
  });

  setPrefillProcessing(true);
  try {
    await model.generate({
      ...inputs,
      do_sample: false,
      max_new_tokens: 128,
      streamer,
    });
  } finally {
    setPrefillProcessing(false);
  }
}
/**
 * Performs one iteration of the processing loop: capture a frame and run
 * inference on it with the current instruction text.
 *
 * Never rejects. Capture errors and inference errors are both logged and shown
 * in the response box, so the un-awaited processing loop cannot die with an
 * unhandled rejection while the UI still shows "Stop".
 *
 * After any failure the function waits briefly before returning. Without that
 * pause a persistently failing capture makes the caller's `while` loop spin on
 * microtasks only, which starves the event loop and freezes the page
 * (including the Stop button).
 *
 * @returns {Promise<void>}
 */
async function sendData() {
  if (!isProcessing) return;

  const FAILURE_BACKOFF_MS = 250;
  const instruction = instructionText.value;
  let failed = false;

  try {
    const rawImg = captureImage();
    if (rawImg) {
      await runInference(rawImg, instruction);
    } else {
      setResponse("Capture failed");
      failed = true;
    }
  } catch (e) {
    console.error(e);
    // Fall back to the thrown value itself when it is not an Error object.
    setResponse(`Error: ${e?.message ?? e}`);
    failed = true;
  }

  if (failed) {
    // Yield a real task so input events and rendering can run before the next attempt.
    await new Promise((resolve) => setTimeout(resolve, FAILURE_BACKOFF_MS));
  }
}
/**
 * Repeats sendData() back-to-back for as long as `isProcessing` stays true.
 * The flag is checked before every iteration.
 * @returns {Promise<void>}
 */
async function processingLoop() {
  for (;;) {
    if (!isProcessing) break;
    await sendData();
  }
}
/**
 * Starts continuous processing if every precondition holds.
 *
 * Checks, in order: model and processor loaded, an input source attached, not
 * already running, and a video frame available. A failed check writes a
 * message to the response box (the "already running" check is silent) and
 * returns. Otherwise the UI is switched to "Stop" and the loop is launched
 * without being awaited.
 */
function handleStart() {
  const demoReady = isModelReady && processor && model;
  if (!demoReady) {
    setResponse("Demo is not ready yet. Please wait.");
    return;
  }
  if (!hasActiveInput()) {
    setResponse("No video source selected. Use webcam or choose a video file.");
    return;
  }
  if (isProcessing) return;
  if (!hasVideoFrame()) {
    setResponse("Video is not ready yet.");
    return;
  }

  isProcessing = true;
  setStartButtonMode("stop");
  setControlsDisabled(true);
  setResponse("Processing started...");
  // Fire-and-forget: the loop runs until handleStop() clears isProcessing.
  void processingLoop();
}
/**
 * Stops continuous processing and restores the idle UI.
 * The response text is replaced with "Processing stopped." only when it still
 * begins with the start-up placeholder, so streamed output is kept.
 */
function handleStop() {
  isProcessing = false;
  setStartButtonMode("start");
  setControlsDisabled(false);
  setPrefillProcessing(false);

  const stillShowingPlaceholder = responseText.value.startsWith("Processing started...");
  if (stillShowingPlaceholder) {
    setResponse("Processing stopped.");
  }
}
// The main button acts as Start or Stop depending on whether the loop is running.
startButton.addEventListener("click", () => {
  const action = isProcessing ? handleStop : handleStart;
  action();
});
// In file mode go back to the webcam; in webcam mode open the file picker
// (the actual switch then happens in the file input's change handler).
sourceToggleButton.addEventListener("click", async () => {
  if (sourceMode !== "webcam") {
    await switchToWebcam();
    return;
  }
  videoFileInput.click();
});
// Switch to the first selected file, then clear the input's value.
videoFileInput.addEventListener("change", async ({ target: input }) => {
  const selected = input.files?.[0];
  if (selected) {
    await switchToVideoFile(selected);
    input.value = "";
  }
});
// Re-evaluate the Start button whenever the video element has loaded frame data.
video.addEventListener("loadeddata", () => updateStartAvailability());
// Page bootstrap: verify WebGPU support, load the model, then attach the camera.
window.addEventListener("DOMContentLoaded", async () => {
  updateSourceToggleButton();
  setStartButtonEnabled(false);

  if (!navigator.gpu) {
    const warningElement = document.createElement("p");
    warningElement.textContent = "WebGPU is not available in this browser.";
    warningElement.style.color = "red";
    warningElement.style.textAlign = "center";
    // Insert AFTER the video container rather than inside it: the container
    // clips overflow and the video is sized to 100% of its height, so a
    // paragraph placed after the video inside the container is cut off.
    video.parentElement.after(warningElement);
    setResponse("WebGPU is not available in this browser.");
    return;
  }

  try {
    await initModel();
    isModelReady = true;
    await initCamera();
    updateStartAvailability();
  } catch (error) {
    console.error("Initialization error:", error);
    setLoading(false);
    setResponse(`Initialization failed: ${error.message}`);
    alert(`Initialization failed: ${error.message}`);
  }
});
// Release the camera and any file object URL when the page is being unloaded.
window.addEventListener("beforeunload", () => {
  stopWebcamStream();
  if (!fileObjectUrl) return;
  URL.revokeObjectURL(fileObjectUrl);
});
| </script> | |
| </body> | |
| </html> | |