| <!DOCTYPE html> |
| <html> |
| <head> |
| <meta charset="UTF-8"> |
| <title>SmolVLM Benchmark Demo</title> |
| <style> |
| body { font-family: Arial, sans-serif; margin: 20px; } |
| fieldset { margin-bottom: 20px; padding: 10px; } |
| legend { font-weight: bold; } |
| label { display: block; margin-top: 5px; } |
| input, select { margin-bottom: 5px; width: 100%; max-width: 400px; } |
| table { border-collapse: collapse; margin-top: 20px; width: 100%; max-width: 600px; } |
| th, td { border: 1px solid #ccc; padding: 8px; text-align: left; } |
| button { padding: 10px 20px; } |
| .model-results { margin-bottom: 40px; } |
| </style> |
| </head> |
| <body> |
| <h1>SmolVLM Benchmark Demo</h1> |
| |
| |
| <fieldset id="model-options"> |
| <legend>Model Options (Note: Benchmarking all three SmolVLM models by default)</legend> |
| <label for="model-id">Select Model ID:</label> |
| <select id="model-id" disabled> |
| <option value="hf-internal-testing/tiny-random-Idefics3ForConditionalGeneration">hf-internal-testing/tiny-random-Idefics3ForConditionalGeneration</option> |
| <option value="HuggingFaceTB/SmolVLM-256M-Instruct" selected>HuggingFaceTB/SmolVLM-256M-Instruct</option> |
| <option value="HuggingFaceTB/SmolVLM-500M-Instruct">HuggingFaceTB/SmolVLM-500M-Instruct</option> |
| <option value="HuggingFaceTB/SmolVLM-Instruct">HuggingFaceTB/SmolVLM-Instruct</option> |
| </select> |
|
|
| <label for="decoder-dtype">Decoder (decoder_model_merged) dtype:</label> |
| <select id="decoder-dtype"> |
| <option value="fp32">fp32</option> |
| <option value="fp16">fp16</option> |
| <option value="q8">q8</option> |
| <option value="q4" selected>q4</option> |
| <option value="q4f16">q4f16</option> |
| </select> |
| |
| <label for="embed-dtype">Embed Tokens dtype:</label> |
| <select id="embed-dtype"> |
| <option value="fp32">fp32</option> |
| <option value="fp16">fp16</option> |
| <option value="q8">q8</option> |
| <option value="q4" selected>q4</option> |
| <option value="q4f16">q4f16</option> |
| </select> |
| |
| <label for="vision-dtype">Vision Encoder dtype:</label> |
| <select id="vision-dtype"> |
| <option value="fp32">fp32</option> |
| <option value="fp16">fp16</option> |
| <option value="q8">q8</option> |
| <option value="q4" selected>q4</option> |
| <option value="q4f16">q4f16</option> |
| </select> |
| </fieldset> |
|
|
| |
| <fieldset id="hardware-options"> |
| <legend>Hardware Options</legend> |
| <label for="device">Select Device:</label> |
| <select id="device"> |
| <option value="wasm">wasm</option> |
| <option value="webgpu" selected>webgpu</option> |
| </select> |
| </fieldset> |
|
|
| |
| <fieldset id="benchmark-options"> |
| <legend>Benchmark Options</legend> |
| <label for="image-url">Image URL:</label> |
| <input type="text" id="image-url" value="https://huggingface.co/spaces/merve/chameleon-7b/resolve/main/bee.jpg"> |
|
|
| <label for="do-split">Do Image Splitting (do_image_splitting)</label> |
| <input type="checkbox" id="do-split" checked> |
|
|
| <label for="max-tokens">Number of Tokens to Generate:</label> |
| <input type="number" id="max-tokens" value="32"> |
|
|
| <label for="num-runs">Number of Runs:</label> |
| <input type="number" id="num-runs" value="3"> |
| </fieldset> |
|
|
| <button id="start-benchmark">Start Benchmark</button> |
|
|
| <div id="results"></div> |
|
|
| <script type="module"> |
| import { |
| AutoProcessor, |
| AutoModelForVision2Seq, |
| load_image, |
| TextStreamer, |
| } from "https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.4.2"; |
| |
class SmolVLM {
  // Lazily-created singleton: at most one processor/model pair is resident.
  static model = null;
  static processor = null;
  static model_id = null;

  /**
   * Returns the [processor, model] pair for `modelId`, loading both on first
   * use. Requesting a different model id first disposes the previously
   * loaded model so only one model occupies memory at a time.
   *
   * @param {string} modelId - Hugging Face model id to load.
   * @param {{decoder: string, embed: string, vision: string}} dtypeSettings
   *   Per-component quantization dtypes (e.g. "q4", "fp16").
   * @param {string} device - Execution backend ("wasm" or "webgpu").
   * @param {string} revision - Model repo revision (branch or PR ref).
   * @returns {Promise<[object, object]>} The resolved [processor, model].
   */
  static async getInstance(modelId, dtypeSettings, device, revision) {
    if (this.model_id !== modelId) {
      // Release the previous model's resources before switching.
      await this.model?.dispose();
      this.model = null;
      this.processor = null;
      this.model_id = modelId;
    }

    // Load each half only if it is not already cached.
    this.processor ??= await AutoProcessor.from_pretrained(modelId);

    if (this.model === null) {
      this.model = await AutoModelForVision2Seq.from_pretrained(modelId, {
        dtype: {
          embed_tokens: dtypeSettings.embed,
          vision_encoder: dtypeSettings.vision,
          decoder_model_merged: dtypeSettings.decoder,
        },
        device,
        revision,
      });
    }

    return [this.processor, this.model];
  }
}
| |
/**
 * Benchmarks all three SmolVLM checkpoints back to back and renders a
 * per-model results table (per-run latency in ms and tokens/second).
 * Reads all settings from the form controls; appends output to #results.
 */
async function runBenchmark() {
  const startButton = document.getElementById("start-benchmark");
  // Disable the trigger so a second click cannot start a concurrent run
  // that would fight over the SmolVLM singleton.
  startButton.disabled = true;
  // Model/hardware choices must not change once a model has been loaded.
  document.getElementById("model-options").disabled = true;
  document.getElementById("hardware-options").disabled = true;
  const resultsDiv = document.getElementById("results");
  resultsDiv.innerHTML = "";

  try {
    // Pinned revisions: the two smaller checkpoints ship their ONNX weights
    // on PR refs rather than `main` — presumably until those PRs merge.
    const modelIds = {
      "HuggingFaceTB/SmolVLM-256M-Instruct": "refs/pr/11",
      "HuggingFaceTB/SmolVLM-500M-Instruct": "refs/pr/9",
      "HuggingFaceTB/SmolVLM-Instruct": "main",
    };

    const decoder_dtype = document.getElementById("decoder-dtype").value || "q4";
    const embed_dtype = document.getElementById("embed-dtype").value || "q4";
    const vision_dtype = document.getElementById("vision-dtype").value || "q4";
    const device = document.getElementById("device").value;
    const imageUrl = document.getElementById("image-url").value;
    // Parse with an explicit radix; fall back to defaults on empty/NaN input.
    const maxTokens = Number.parseInt(document.getElementById("max-tokens").value, 10) || 32;
    const numRuns = Number.parseInt(document.getElementById("num-runs").value, 10) || 3;
    const doImageSplitting = document.getElementById("do-split").checked;

    const dtypeSettings = { decoder: decoder_dtype, embed: embed_dtype, vision: vision_dtype };
    const image = await load_image(imageUrl);

    for (const [modelId, revision] of Object.entries(modelIds)) {
      const modelShortName = modelId.split("/").pop();
      const modelSection = document.createElement("div");
      modelSection.className = "model-results";
      modelSection.innerHTML = `<h2>Benchmarking ${modelShortName}</h2><p id="status-${modelShortName}">Loading...</p><pre id="bar-${modelShortName}">▯▯▯▯▯</pre>`;
      resultsDiv.appendChild(modelSection);

      const status = document.getElementById(`status-${modelShortName}`);
      const bar = document.getElementById(`bar-${modelShortName}`);

      try {
        status.innerText = "Loading processor and model...";
        const [processor, model] = await SmolVLM.getInstance(modelId, dtypeSettings, device, revision);

        // One throwaway 1-token generation so shader/kernel compilation and
        // weight upload are not billed to the first measured run.
        status.innerText = "Warming up...";
        const messages = [{
          role: "user",
          content: [
            { type: "image" },
            { type: "text", text: "Can you describe this image?" },
          ],
        }];
        const text = processor.apply_chat_template(messages, { add_generation_prompt: true });
        const inputs = await processor(text, [image], { do_image_splitting: doImageSplitting });
        await model.generate({ ...inputs, max_new_tokens: 1 });

        let totalTime = 0;
        let totalTps = 0;
        const runsResults = [];

        for (let i = 0; i < numRuns; ++i) {
          status.innerText = `Running benchmark... (${i + 1}/${numRuns})`;
          bar.innerText = createProgressBar(i + 1, numRuns);
          const start = performance.now();

          const text = processor.apply_chat_template(messages, { add_generation_prompt: true });
          const inputs = await processor(text, [image], { do_image_splitting: doImageSplitting });

          // Tokens/second is measured from the FIRST generated token, so it
          // excludes prefill time; the first callback's value is overwritten
          // by subsequent tokens.
          let numTokens = 0;
          let startTime;
          let tps = 0;
          const token_callback_function = () => {
            startTime = startTime || performance.now();
            tps = (numTokens++ / (performance.now() - startTime)) * 1000;
          };
          const streamer = new TextStreamer(processor.tokenizer, {
            skip_prompt: true,
            skip_special_tokens: true,
            token_callback_function,
          });
          // min_new_tokens pins the output length so every run performs the
          // same amount of decode work.
          await model.generate({
            ...inputs,
            max_new_tokens: maxTokens,
            min_new_tokens: maxTokens,
            streamer,
          });
          const elapsed = performance.now() - start;

          totalTime += elapsed;
          totalTps += tps;
          runsResults.push({
            run: i + 1,
            time: elapsed.toFixed(2),
            tps: tps.toFixed(2),
          });
        }

        const avgTime = (totalTime / numRuns).toFixed(2);
        const avgTps = (totalTps / numRuns).toFixed(2);
        status.innerText = "✅ Done!";
        bar.innerText = createProgressBar(numRuns, numRuns);

        let tableHtml = "<table>";
        tableHtml += "<tr><th>Run</th><th>Execution Time (ms)</th><th>Tokens per Second</th></tr>";
        runsResults.forEach((r) => {
          tableHtml += `<tr><td>${r.run}</td><td>${r.time}</td><td>${r.tps}</td></tr>`;
        });
        tableHtml += `<tr><td><strong>Average</strong></td><td><strong>${avgTime}</strong></td><td><strong>${avgTps}</strong></td></tr>`;
        tableHtml += "</table>";
        // insertAdjacentHTML appends without re-parsing (and thereby
        // recreating) the section's existing child nodes, unlike
        // `innerHTML +=`.
        modelSection.insertAdjacentHTML("beforeend", tableHtml);
      } catch (e) {
        // Surface the failure inline and continue with the remaining models.
        status.innerText = "❌ Error: " + e.toString();
      }
    }
  } finally {
    // Always let the user start another run, even if setup (e.g. the image
    // load) failed before the per-model error handling could catch it.
    startButton.disabled = false;
  }
}
| |
/**
 * Renders a textual progress bar, e.g. createProgressBar(2, 5) -> "▮▮▯▯▯".
 * @param {number} current - Number of completed steps (filled cells).
 * @param {number} total - Total number of steps.
 * @returns {string} `current` filled cells followed by the remaining empty cells.
 */
function createProgressBar(current, total) {
  return "▮".repeat(current) + "▯".repeat(total - current);
}
| |
// Kick off the benchmark when the user clicks the start button.
const startBenchmarkButton = document.getElementById("start-benchmark");
startBenchmarkButton.addEventListener("click", runBenchmark);
| </script> |
|
|
| </body> |
| </html> |
|
|
|
|