<!doctype html>
<html lang="en">
<head>
  <meta charset="utf-8" />
  <meta name="viewport" content="width=device-width,initial-scale=1" />
  <title>WebGPU · Transformers.js · Image Captioning</title>
  <style>
    body { font: 16px/1.45 system-ui, sans-serif; margin: 24px auto; max-width: 900px; padding: 0 16px; }
    .card { border: 1px solid #4443; border-radius: 12px; padding: 16px; }
    .log { white-space: pre-wrap; background: #111; color: #0f0; padding: 12px; border-radius: 8px; min-height: 80px; }
    img { max-width: 100%; border-radius: 8px; margin-top: 10px; }
    .muted { opacity: .75; font-size: 14px; }
    button, input { font: inherit; }
  </style>
</head>
<body>
  <h2>Image → Text in your browser (Transformers.js + WebGPU)</h2>
  <p id="env">Probing environment…</p>

  <div class="card">
    <h3>Caption an image (file upload)</h3>
    <input id="file" type="file" accept="image/*" />
    <button id="run" disabled>Caption</button>
    <div><img id="preview" alt="preview will appear here" /></div>
    <h4>Output</h4>
    <div id="log" class="log">Loading model…</div>
    <p class="muted">
      Model: <code>Xenova/vit-gpt2-image-captioning</code><br />
      Backend: <span id="backend">…</span>
    </p>
  </div>

  <script type="module">
| | const envEl = document.getElementById('env'); |
| | const fileEl = document.getElementById('file'); |
| | const runBtn = document.getElementById('run'); |
| | const logEl = document.getElementById('log'); |
| | const imgEl = document.getElementById('preview'); |
| | const backendEl = document.getElementById('backend'); |
| | |
| | |
| | const hasWebGPU = 'gpu' in navigator; |
| | let device = hasWebGPU ? 'webgpu' : 'wasm'; |
| | backendEl.textContent = device.toUpperCase(); |
| | envEl.textContent = hasWebGPU ? '✅ WebGPU detected (will fallback if slow)…' : '⚠️ Using WASM (CPU).'; |
| | |
| | |
| | const { pipeline } = await import('https://cdn.jsdelivr.net/npm/@huggingface/transformers@3'); |
| | |
| | |
| | const LOAD_TIMEOUT_MS = 25000; |
| | let captioner; |
| | |
| | async function buildPipeline(targetDevice) { |
| | logEl.textContent = `Loading model… device=${targetDevice}`; |
| | backendEl.textContent = targetDevice.toUpperCase(); |
| | return await pipeline('image-to-text', 'Xenova/vit-gpt2-image-captioning', { device: targetDevice }); |
| | } |
| | |
| | try { |
| | if (device === 'webgpu') { |
| | const webgpuPromise = buildPipeline('webgpu'); |
| | const timeout = new Promise((_, rej) => setTimeout(() => rej(new Error('webgpu-timeout')), LOAD_TIMEOUT_MS)); |
| | captioner = await Promise.race([webgpuPromise, timeout]); |
| | } else { |
| | captioner = await buildPipeline('wasm'); |
| | } |
| | } catch (e) { |
| | if (hasWebGPU && (e.message === 'webgpu-timeout' || String(e).toLowerCase().includes('webgpu'))) { |
| | envEl.textContent = '⚠️ WebGPU load slow/failed → falling back to WASM.'; |
| | device = 'wasm'; |
| | captioner = await buildPipeline('wasm'); |
| | } else { |
| | logEl.textContent = 'Error loading model: ' + e; |
| | throw e; |
| | } |
| | } |
| | |
| | logEl.textContent = `Model ready · device=${device}`; |
| | runBtn.disabled = false; |
| | |
| | |
| | let imgDataURL = null; |
| | |
| | fileEl.addEventListener('change', () => { |
| | logEl.textContent = 'Image selected. Preparing preview…'; |
| | const f = fileEl.files?.[0]; |
| | if (!f) { logEl.textContent = 'No file chosen.'; return; } |
| | |
| | |
| | if (!f.type.startsWith('image/')) { |
| | logEl.textContent = `Unsupported file type: ${f.type || 'unknown'}. Use JPG/PNG.`; |
| | return; |
| | } |
| | |
| | const reader = new FileReader(); |
| | reader.onerror = () => { |
| | logEl.textContent = 'Failed to read file. Try another image.'; |
| | }; |
| | reader.onload = async () => { |
| | imgDataURL = reader.result; |
| | imgEl.src = imgDataURL; |
| | try { |
| | |
| | if (imgEl.decode) await imgEl.decode(); |
| | logEl.textContent = 'Preview ready. Click “Caption”.'; |
| | } catch { |
| | logEl.textContent = 'Could not decode image. Try a JPG/PNG under ~5 MB.'; |
| | } |
| | }; |
| | reader.readAsDataURL(f); |
| | }); |
| | |
| | |
| | |
| | runBtn.addEventListener('click', async () => { |
| | if (!captioner) return; |
| | if (!imgDataURL) { logEl.textContent = 'Pick an image first.'; return; } |
| | logEl.textContent = 'Running…'; |
| | try { |
| | const out = await captioner(imgDataURL, { |
| | max_new_tokens: 48, |
| | num_beams: 5, |
| | do_sample: false, |
| | no_repeat_ngram_size: 3 |
| | }); |
| | logEl.textContent = out[0].generated_text; |
| | } catch (e) { |
| | logEl.textContent = 'Inference error: ' + e; |
| | console.error(e); |
| | } |
| | }); |
  </script>
</body>
</html>