| <!DOCTYPE html> |
| <html> |
| <head> |
| <meta charset="UTF-8"> |
| <title>SmolVLM Benchmark Demo</title> |
| <style> |
| body { font-family: Arial, sans-serif; margin: 20px; } |
| fieldset { margin-bottom: 20px; padding: 10px; } |
| legend { font-weight: bold; } |
| label { display: block; margin-top: 5px; } |
| input, select { margin-bottom: 5px; width: 100%; max-width: 400px; } |
| table { border-collapse: collapse; margin-top: 20px; width: 100%; max-width: 600px; } |
| th, td { border: 1px solid #ccc; padding: 8px; text-align: left; } |
| button { padding: 10px 20px; } |
| .model-results { margin-bottom: 40px; } |
| </style> |
| </head> |
| <body> |
| <h1>SmolVLM Benchmark Demo</h1> |
| |
| |
| <fieldset id="model-options"> |
| <legend>Model Options (Note: Benchmarking all three SmolVLM models by default)</legend> |
| <label for="model-id">Select Model ID:</label> |
| <select id="model-id" disabled> |
| <option value="hf-internal-testing/tiny-random-Idefics3ForConditionalGeneration">hf-internal-testing/tiny-random-Idefics3ForConditionalGeneration</option> |
| <option value="HuggingFaceTB/SmolVLM-256M-Instruct" selected>HuggingFaceTB/SmolVLM-256M-Instruct</option> |
| <option value="HuggingFaceTB/SmolVLM-500M-Instruct">HuggingFaceTB/SmolVLM-500M-Instruct</option> |
| <option value="HuggingFaceTB/SmolVLM-Instruct">HuggingFaceTB/SmolVLM-Instruct</option> |
| </select> |
|
|
| <label for="decoder-dtype">Decoder (decoder_model_merged) dtype:</label> |
| <select id="decoder-dtype"> |
| <option value="fp32">fp32</option> |
| <option value="fp16">fp16</option> |
| <option value="q8">q8</option> |
| <option value="q4" selected>q4</option> |
| <option value="q4f16">q4f16</option> |
| </select> |
| |
| <label for="embed-dtype">Embed Tokens dtype:</label> |
| <select id="embed-dtype"> |
| <option value="fp32">fp32</option> |
| <option value="fp16">fp16</option> |
| <option value="q8">q8</option> |
| <option value="q4" selected>q4</option> |
| <option value="q4f16">q4f16</option> |
| </select> |
| |
| <label for="vision-dtype">Vision Encoder dtype:</label> |
| <select id="vision-dtype"> |
| <option value="fp32">fp32</option> |
| <option value="fp16">fp16</option> |
| <option value="q8">q8</option> |
| <option value="q4" selected>q4</option> |
| <option value="q4f16">q4f16</option> |
| </select> |
| </fieldset> |
|
|
| |
| <fieldset id="hardware-options"> |
| <legend>Hardware Options</legend> |
| <label for="device">Select Device:</label> |
| <select id="device"> |
| <option value="wasm">wasm</option> |
| <option value="webgpu" selected>webgpu</option> |
| </select> |
| </fieldset> |
|
|
| |
| <fieldset id="benchmark-options"> |
| <legend>Benchmark Options</legend> |
| <label for="image-url">Image URL:</label> |
| <input type="text" id="image-url" value="https://huggingface.co/spaces/merve/chameleon-7b/resolve/main/bee.jpg"> |
|
|
| <label for="do-split">Do Image Splitting (do_image_splitting)</label> |
| <input type="checkbox" id="do-split" checked> |
|
|
| <label for="max-tokens">Number of Tokens to Generate:</label> |
| <input type="number" id="max-tokens" value="32"> |
|
|
| <label for="num-runs">Number of Runs:</label> |
| <input type="number" id="num-runs" value="3"> |
| </fieldset> |
|
|
| <button id="start-benchmark">Start Benchmark</button> |
|
|
| <div id="results"></div> |
|
|
| <script type="module"> |
| import { |
| AutoProcessor, |
| AutoModelForVision2Seq, |
| load_image, |
| TextStreamer, |
| } from "https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.4.2"; |
| |
class SmolVLM {
  // Lazily-created singleton: at most one processor/model pair is resident.
  static model = null;
  static processor = null;
  static model_id = null;

  /**
   * Returns the [processor, model] pair for `modelId`, loading both on first
   * use. Requesting a different model id first disposes the previously
   * loaded model so only one model occupies memory at a time.
   *
   * @param {string} modelId - Hugging Face model id to load.
   * @param {{decoder: string, embed: string, vision: string}} dtypeSettings
   *   Per-component quantization dtypes (e.g. "q4", "fp16").
   * @param {string} device - Execution backend ("wasm" or "webgpu").
   * @param {string} revision - Model repo revision (branch or PR ref).
   * @returns {Promise<[object, object]>} The resolved [processor, model].
   */
  static async getInstance(modelId, dtypeSettings, device, revision) {
    if (this.model_id !== modelId) {
      // Release the previous model's resources before switching.
      await this.model?.dispose();
      this.model = null;
      this.processor = null;
      this.model_id = modelId;
    }

    // Load each half only if it is not already cached.
    this.processor ??= await AutoProcessor.from_pretrained(modelId);

    if (this.model === null) {
      this.model = await AutoModelForVision2Seq.from_pretrained(modelId, {
        dtype: {
          embed_tokens: dtypeSettings.embed,
          vision_encoder: dtypeSettings.vision,
          decoder_model_merged: dtypeSettings.decoder,
        },
        device,
        revision,
      });
    }

    return [this.processor, this.model];
  }
}
| |
/**
 * Benchmarks all three SmolVLM checkpoints back to back and renders a
 * per-model results table (per-run latency in ms and tokens/second).
 * Reads all settings from the form controls; appends output to #results.
 */
async function runBenchmark() {
  const startButton = document.getElementById("start-benchmark");
  // Disable the trigger so a second click cannot start a concurrent run
  // that would fight over the SmolVLM singleton.
  startButton.disabled = true;
  // Model/hardware choices must not change once a model has been loaded.
  document.getElementById("model-options").disabled = true;
  document.getElementById("hardware-options").disabled = true;
  const resultsDiv = document.getElementById("results");
  resultsDiv.innerHTML = "";

  try {
    // Pinned revisions: the two smaller checkpoints ship their ONNX weights
    // on PR refs rather than `main` — presumably until those PRs merge.
    const modelIds = {
      "HuggingFaceTB/SmolVLM-256M-Instruct": "refs/pr/11",
      "HuggingFaceTB/SmolVLM-500M-Instruct": "refs/pr/9",
      "HuggingFaceTB/SmolVLM-Instruct": "main",
    };

    const decoder_dtype = document.getElementById("decoder-dtype").value || "q4";
    const embed_dtype = document.getElementById("embed-dtype").value || "q4";
    const vision_dtype = document.getElementById("vision-dtype").value || "q4";
    const device = document.getElementById("device").value;
    const imageUrl = document.getElementById("image-url").value;
    // Parse with an explicit radix; fall back to defaults on empty/NaN input.
    const maxTokens = Number.parseInt(document.getElementById("max-tokens").value, 10) || 32;
    const numRuns = Number.parseInt(document.getElementById("num-runs").value, 10) || 3;
    const doImageSplitting = document.getElementById("do-split").checked;

    const dtypeSettings = { decoder: decoder_dtype, embed: embed_dtype, vision: vision_dtype };
    const image = await load_image(imageUrl);

    for (const [modelId, revision] of Object.entries(modelIds)) {
      const modelShortName = modelId.split("/").pop();
      const modelSection = document.createElement("div");
      modelSection.className = "model-results";
      modelSection.innerHTML = `<h2>Benchmarking ${modelShortName}</h2><p id="status-${modelShortName}">Loading...</p><pre id="bar-${modelShortName}">▯▯▯▯▯</pre>`;
      resultsDiv.appendChild(modelSection);

      const status = document.getElementById(`status-${modelShortName}`);
      const bar = document.getElementById(`bar-${modelShortName}`);

      try {
        status.innerText = "Loading processor and model...";
        const [processor, model] = await SmolVLM.getInstance(modelId, dtypeSettings, device, revision);

        // One throwaway 1-token generation so shader/kernel compilation and
        // weight upload are not billed to the first measured run.
        status.innerText = "Warming up...";
        const messages = [{
          role: "user",
          content: [
            { type: "image" },
            { type: "text", text: "Can you describe this image?" },
          ],
        }];
        const text = processor.apply_chat_template(messages, { add_generation_prompt: true });
        const inputs = await processor(text, [image], { do_image_splitting: doImageSplitting });
        await model.generate({ ...inputs, max_new_tokens: 1 });

        let totalTime = 0;
        let totalTps = 0;
        const runsResults = [];

        for (let i = 0; i < numRuns; ++i) {
          status.innerText = `Running benchmark... (${i + 1}/${numRuns})`;
          bar.innerText = createProgressBar(i + 1, numRuns);
          const start = performance.now();

          const text = processor.apply_chat_template(messages, { add_generation_prompt: true });
          const inputs = await processor(text, [image], { do_image_splitting: doImageSplitting });

          // Tokens/second is measured from the FIRST generated token, so it
          // excludes prefill time; the first callback's value is overwritten
          // by subsequent tokens.
          let numTokens = 0;
          let startTime;
          let tps = 0;
          const token_callback_function = () => {
            startTime = startTime || performance.now();
            tps = (numTokens++ / (performance.now() - startTime)) * 1000;
          };
          const streamer = new TextStreamer(processor.tokenizer, {
            skip_prompt: true,
            skip_special_tokens: true,
            token_callback_function,
          });
          // min_new_tokens pins the output length so every run performs the
          // same amount of decode work.
          await model.generate({
            ...inputs,
            max_new_tokens: maxTokens,
            min_new_tokens: maxTokens,
            streamer,
          });
          const elapsed = performance.now() - start;

          totalTime += elapsed;
          totalTps += tps;
          runsResults.push({
            run: i + 1,
            time: elapsed.toFixed(2),
            tps: tps.toFixed(2),
          });
        }

        const avgTime = (totalTime / numRuns).toFixed(2);
        const avgTps = (totalTps / numRuns).toFixed(2);
        status.innerText = "✅ Done!";
        bar.innerText = createProgressBar(numRuns, numRuns);

        let tableHtml = "<table>";
        tableHtml += "<tr><th>Run</th><th>Execution Time (ms)</th><th>Tokens per Second</th></tr>";
        runsResults.forEach((r) => {
          tableHtml += `<tr><td>${r.run}</td><td>${r.time}</td><td>${r.tps}</td></tr>`;
        });
        tableHtml += `<tr><td><strong>Average</strong></td><td><strong>${avgTime}</strong></td><td><strong>${avgTps}</strong></td></tr>`;
        tableHtml += "</table>";
        // insertAdjacentHTML appends without re-parsing (and thereby
        // recreating) the section's existing child nodes, unlike
        // `innerHTML +=`.
        modelSection.insertAdjacentHTML("beforeend", tableHtml);
      } catch (e) {
        // Surface the failure inline and continue with the remaining models.
        status.innerText = "❌ Error: " + e.toString();
      }
    }
  } finally {
    // Always let the user start another run, even if setup (e.g. the image
    // load) failed before the per-model error handling could catch it.
    startButton.disabled = false;
  }
}
| |
/**
 * Renders a textual progress bar, e.g. createProgressBar(2, 5) -> "▮▮▯▯▯".
 * @param {number} current - Number of completed steps (filled cells).
 * @param {number} total - Total number of steps.
 * @returns {string} `current` filled cells followed by the remaining empty cells.
 */
function createProgressBar(current, total) {
  return "▮".repeat(current) + "▯".repeat(total - current);
}
| |
// Kick off the benchmark when the user clicks the start button.
const startBenchmarkButton = document.getElementById("start-benchmark");
startBenchmarkButton.addEventListener("click", runBenchmark);
| </script> |
|
|
| </body> |
| </html> |
|
|
|
|