Javedalam committed on
Commit
9a9bb4d
·
verified ·
1 Parent(s): 50dd43a

Update index.html

Browse files
Files changed (1) hide show
  1. index.html +40 -12
index.html CHANGED
@@ -3,7 +3,7 @@
3
  <head>
4
  <meta charset="utf-8" />
5
  <meta name="viewport" content="width=device-width,initial-scale=1" />
6
- <title>WebGPU · Transformers.js · Image Captioning</title>
7
  <style>
8
  body { font: 16px/1.45 system-ui, sans-serif; margin: 24px auto; max-width: 900px; padding: 0 16px; }
9
  .card { border:1px solid #4443; border-radius:12px; padding:16px; }
@@ -25,7 +25,7 @@
25
  <h4>Output</h4>
26
  <div id="log" class="log">Loading model…</div>
27
  <p class="muted">
28
- Model: <code>Xenova/vit-gpt2-image-captioning</code><br />
29
  Backend: <span id="backend">…</span>
30
  </p>
31
  </div>
@@ -40,24 +40,46 @@
40
 
41
  // Prefer WebGPU; fall back to WASM
42
  const hasWebGPU = 'gpu' in navigator;
43
- const device = hasWebGPU ? 'webgpu' : 'wasm';
44
  backendEl.textContent = device.toUpperCase();
45
- envEl.textContent = hasWebGPU ? '✅ WebGPU detected' : '⚠️ Using WASM (CPU)';
 
46
 
47
  // Load Transformers.js v3
48
  const { pipeline } = await import('https://cdn.jsdelivr.net/npm/@huggingface/transformers@3');
49
 
50
- // Build captioning pipeline with a model that fetches cleanly
 
51
  let captioner;
 
 
 
 
 
 
 
52
  try {
53
- captioner = await pipeline('image-to-text', 'Xenova/vit-gpt2-image-captioning', { device });
54
- logEl.textContent = `Model ready · device=${device}`;
55
- runBtn.disabled = false;
 
 
 
 
56
  } catch (e) {
57
- logEl.textContent = 'Error loading model: ' + e;
58
- console.error(e);
 
 
 
 
 
 
59
  }
60
 
 
 
 
61
  // Preview selected image
62
  let imgURL = null;
63
  fileEl.addEventListener('change', () => {
@@ -68,13 +90,18 @@
68
  imgEl.src = imgURL;
69
  });
70
 
71
- // Run captioning
72
  runBtn.addEventListener('click', async () => {
73
  if (!captioner) return;
74
  if (!imgURL) { logEl.textContent = 'Pick an image first.'; return; }
75
  logEl.textContent = 'Running…';
76
  try {
77
- const out = await captioner(imgURL); // [{ generated_text }]
 
 
 
 
 
78
  logEl.textContent = out[0].generated_text;
79
  } catch (e) {
80
  logEl.textContent = 'Inference error: ' + e;
@@ -85,3 +112,4 @@
85
  </body>
86
  </html>
87
 
 
 
3
  <head>
4
  <meta charset="utf-8" />
5
  <meta name="viewport" content="width=device-width,initial-scale=1" />
6
+ <title>WebGPU · Transformers.js · Better Image Captioning</title>
7
  <style>
8
  body { font: 16px/1.45 system-ui, sans-serif; margin: 24px auto; max-width: 900px; padding: 0 16px; }
9
  .card { border:1px solid #4443; border-radius:12px; padding:16px; }
 
25
  <h4>Output</h4>
26
  <div id="log" class="log">Loading model…</div>
27
  <p class="muted">
28
+ Model: <code>Xenova/blip-image-captioning-large</code><br />
29
  Backend: <span id="backend">…</span>
30
  </p>
31
  </div>
 
40
 
41
  // Prefer WebGPU; fall back to WASM
42
  const hasWebGPU = 'gpu' in navigator;
43
+ let device = hasWebGPU ? 'webgpu' : 'wasm';
44
  backendEl.textContent = device.toUpperCase();
45
+ envEl.textContent = hasWebGPU ? '✅ WebGPU detected (will fallback if slow)'
46
+ : '⚠️ Using WASM (CPU).';
47
 
48
  // Load Transformers.js v3
49
  const { pipeline } = await import('https://cdn.jsdelivr.net/npm/@huggingface/transformers@3');
50
 
51
+ // Watchdog: if WebGPU load takes too long, retry on WASM
52
+ const LOAD_TIMEOUT_MS = 30000; // 30s
53
  let captioner;
54
+
55
+ async function buildPipeline(targetDevice) {
56
+ logEl.textContent = `Loading model… device=${targetDevice}`;
57
+ backendEl.textContent = targetDevice.toUpperCase();
58
+ return await pipeline('image-to-text', 'Xenova/blip-image-captioning-large', { device: targetDevice });
59
+ }
60
+
61
  try {
62
+ if (device === 'webgpu') {
63
+ const webgpuPromise = buildPipeline('webgpu');
64
+ const timeout = new Promise((_, rej) => setTimeout(() => rej(new Error('webgpu-timeout')), LOAD_TIMEOUT_MS));
65
+ captioner = await Promise.race([webgpuPromise, timeout]);
66
+ } else {
67
+ captioner = await buildPipeline('wasm');
68
+ }
69
  } catch (e) {
70
+ if (hasWebGPU && (e.message === 'webgpu-timeout' || String(e).includes('webgpu'))) {
71
+ envEl.textContent = '⚠️ WebGPU load slow/failed → falling back to WASM.';
72
+ device = 'wasm';
73
+ captioner = await buildPipeline('wasm');
74
+ } else {
75
+ logEl.textContent = 'Error loading model: ' + e;
76
+ throw e;
77
+ }
78
  }
79
 
80
+ logEl.textContent = `Model ready · device=${device}`;
81
+ runBtn.disabled = false;
82
+
83
  // Preview selected image
84
  let imgURL = null;
85
  fileEl.addEventListener('change', () => {
 
90
  imgEl.src = imgURL;
91
  });
92
 
93
+ // Run captioning with better decoding (beam search)
94
  runBtn.addEventListener('click', async () => {
95
  if (!captioner) return;
96
  if (!imgURL) { logEl.textContent = 'Pick an image first.'; return; }
97
  logEl.textContent = 'Running…';
98
  try {
99
+ const out = await captioner(imgURL, {
100
+ max_new_tokens: 48,
101
+ num_beams: 5,
102
+ do_sample: false,
103
+ no_repeat_ngram_size: 3
104
+ }); // → [{ generated_text }]
105
  logEl.textContent = out[0].generated_text;
106
  } catch (e) {
107
  logEl.textContent = 'Inference error: ' + e;
 
112
  </body>
113
  </html>
114
 
115
+