<!doctype html>
<html>
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width,initial-scale=1" />
<title>WebGPU · Transformers.js · Image Captioning</title>
<style>
body { font: 16px/1.45 system-ui, sans-serif; margin: 24px auto; max-width: 900px; padding: 0 16px; }
.card { border:1px solid #4443; border-radius:12px; padding:16px; }
.log { white-space:pre-wrap; background:#111; color:#0f0; padding:12px; border-radius:8px; min-height:80px; }
img { max-width:100%; border-radius:8px; margin-top:10px; }
.muted { opacity:.75; font-size:14px; }
button,input { font:inherit; }
</style>
</head>
<body>
<h2>Image → Text in your browser (Transformers.js + WebGPU)</h2>
<p id="env">Probing environment…</p>
<div class="card">
<h3>Caption an image (file upload)</h3>
<input id="file" type="file" accept="image/*" />
<button id="run" disabled>Caption</button>
<div><img id="preview" alt="preview will appear here" /></div>
<h4>Output</h4>
<div id="log" class="log">Loading model…</div>
<p class="muted">
Model: <code>Xenova/vit-gpt2-image-captioning</code><br />
Backend: <span id="backend">…</span>
</p>
</div>
<script type="module">
// Cached DOM handles used by the loader and both event handlers below.
const envEl = document.getElementById('env');
const fileEl = document.getElementById('file');
const runBtn = document.getElementById('run');
const logEl = document.getElementById('log');
const imgEl = document.getElementById('preview');
const backendEl = document.getElementById('backend');
// Prefer WebGPU; fall back to WASM if unavailable/slow
const hasWebGPU = 'gpu' in navigator; // feature-detect only; adapter quality is not probed here
let device = hasWebGPU ? 'webgpu' : 'wasm'; // may be downgraded to 'wasm' by the watchdog below
backendEl.textContent = device.toUpperCase();
envEl.textContent = hasWebGPU ? '✅ WebGPU detected (will fallback if slow)…' : '⚠️ Using WASM (CPU).';
// Load Transformers.js v3
const { pipeline } = await import('https://cdn.jsdelivr.net/npm/@huggingface/transformers@3');
// Watchdog: if WebGPU load takes too long, retry on WASM
const LOAD_TIMEOUT_MS = 25000; // max time to wait for the WebGPU pipeline before retrying on WASM
let captioner; // assigned once a pipeline finishes loading; run handler bails if still undefined
/**
 * Construct the image-to-text pipeline on the requested backend, updating
 * the status line and backend badge while the model loads.
 * @param {string} dev - Target backend: 'webgpu' or 'wasm'.
 * @returns {Promise<Function>} The ready captioning pipeline.
 */
async function buildPipeline(dev) {
  backendEl.textContent = dev.toUpperCase();
  logEl.textContent = `Loading model… device=${dev}`;
  const task = 'image-to-text';
  const model = 'Xenova/vit-gpt2-image-captioning';
  return pipeline(task, model, { device: dev });
}
// Load the model, racing the WebGPU build against a watchdog timer; on
// timeout (or any WebGPU-flavored failure) retry once on the WASM backend.
try {
  if (device === 'webgpu') {
    const webgpuPromise = buildPipeline('webgpu');
    // If the timeout wins the race, this promise's eventual rejection would
    // otherwise be unobserved → unhandled-rejection noise. Mark it handled.
    webgpuPromise.catch(() => {});
    let watchdogId;
    const timeout = new Promise((_, rej) => {
      watchdogId = setTimeout(() => rej(new Error('webgpu-timeout')), LOAD_TIMEOUT_MS);
    });
    try {
      captioner = await Promise.race([webgpuPromise, timeout]);
    } finally {
      // Fix: the original never cleared this timer, so the timeout promise
      // rejected ~25 s after a successful load with no handler attached.
      clearTimeout(watchdogId);
    }
  } else {
    captioner = await buildPipeline('wasm');
  }
} catch (e) {
  // Only fall back when WebGPU exists and the failure looks WebGPU-related;
  // e?.message is null-safe in case a non-Error value was thrown.
  if (hasWebGPU && (e?.message === 'webgpu-timeout' || String(e).toLowerCase().includes('webgpu'))) {
    envEl.textContent = '⚠️ WebGPU load slow/failed → falling back to WASM.';
    device = 'wasm';
    captioner = await buildPipeline('wasm');
  } else {
    logEl.textContent = 'Error loading model: ' + e;
    throw e;
  }
}
logEl.textContent = `Model ready · device=${device}`;
runBtn.disabled = false;
// ---------- Robust file load (FileReader → data URL, with checks) ----------
// ---------- Robust file load (FileReader → data URL, with checks) ----------
// Holds the base64 data URL of the last successfully read image; the run
// handler refuses to start while this is still null.
let imgDataURL = null;
fileEl.addEventListener('change', () => {
  logEl.textContent = 'Image selected. Preparing preview…';
  const selected = fileEl.files?.[0];
  if (!selected) {
    logEl.textContent = 'No file chosen.';
    return;
  }
  // Some Android cameras save HEIC/HEIF which many browsers can’t decode.
  if (!selected.type.startsWith('image/')) {
    logEl.textContent = `Unsupported file type: ${selected.type || 'unknown'}. Use JPG/PNG.`;
    return;
  }
  const fileReader = new FileReader();
  fileReader.addEventListener('error', () => {
    logEl.textContent = 'Failed to read file. Try another image.';
  });
  fileReader.addEventListener('load', async () => {
    imgDataURL = fileReader.result; // base64 data URL
    imgEl.src = imgDataURL;
    try {
      // ensure it decoded before we allow run
      if (imgEl.decode) await imgEl.decode();
      logEl.textContent = 'Preview ready. Click “Caption”.';
    } catch {
      logEl.textContent = 'Could not decode image. Try a JPG/PNG under ~5 MB.';
    }
  });
  fileReader.readAsDataURL(selected);
});
// --------------------------------------------------------------------------
// Run captioning (beam search for better captions)
// Run captioning (beam search for better captions)
runBtn.addEventListener('click', async () => {
  if (!captioner) return;
  if (!imgDataURL) {
    logEl.textContent = 'Pick an image first.';
    return;
  }
  logEl.textContent = 'Running…';
  // Deterministic beam search: 5 beams, no sampling, trigram-repeat blocking.
  const generationOptions = {
    max_new_tokens: 48,
    num_beams: 5,
    do_sample: false,
    no_repeat_ngram_size: 3,
  };
  try {
    const [first] = await captioner(imgDataURL, generationOptions);
    logEl.textContent = first.generated_text;
  } catch (e) {
    console.error(e);
    logEl.textContent = 'Inference error: ' + e;
  }
});
</script>
</body>
</html>