Spaces:

Javedalam
/

transformersjs-webgpu-captioning

Running

App Files Community

Javedalam committed on Aug 31, 2025

Commit

8c05aeb

verified ·

1 Parent(s): de5ca6f

Update index.html

Browse files

Files changed (1) hide show

index.html +9 -21

index.html CHANGED Viewed

@@ -3,7 +3,7 @@
 <head>
   <meta charset="utf-8" />
   <meta name="viewport" content="width=device-width,initial-scale=1" />
-  <title>WebGPU · Transformers.js · Better Image Captioning</title>
   <style>
     body { font: 16px/1.45 system-ui, sans-serif; margin: 24px auto; max-width: 900px; padding: 0 16px; }
     .card { border:1px solid #4443; border-radius:12px; padding:16px; }
@@ -25,7 +25,7 @@
     <h4>Output</h4>
     <div id="log" class="log">Loading model…</div>
     <p class="muted">
-      Model: <code>Xenova/blip-image-captioning-large</code><br />
       Backend: <span id="backend">…</span>
     </p>
   </div>
@@ -38,37 +38,23 @@
     const imgEl  = document.getElementById('preview');
     const backendEl = document.getElementById('backend');
-    // Prefer WebGPU; fall back to WASM
     const hasWebGPU = 'gpu' in navigator;
     let device = hasWebGPU ? 'webgpu' : 'wasm';
     backendEl.textContent = device.toUpperCase();
     envEl.textContent = hasWebGPU ? '✅ WebGPU detected (will fallback if slow)…' : '⚠️ Using WASM (CPU).';
-    // --- SINGLE FIX: force ?download=1 on HF "resolve" URLs to avoid 401s ---
-    const _fetch = window.fetch.bind(window);
-    window.fetch = (url, opts) => {
-      try {
-        const u = new URL(url);
-        if (u.hostname === 'huggingface.co' && u.pathname.includes('/resolve/')) {
-          if (!u.searchParams.has('download')) u.searchParams.set('download', '1');
-          url = u.toString();
-        }
-      } catch {}
-      return _fetch(url, opts);
-    };
-    // -----------------------------------------------------------------------
     // Load Transformers.js v3
     const { pipeline } = await import('https://cdn.jsdelivr.net/npm/@huggingface/transformers@3');
     // Watchdog: if WebGPU load takes too long, retry on WASM
-    const LOAD_TIMEOUT_MS = 30000;
     let captioner;
     async function buildPipeline(targetDevice) {
       logEl.textContent = `Loading model… device=${targetDevice}`;
       backendEl.textContent = targetDevice.toUpperCase();
-      return await pipeline('image-to-text', 'Xenova/blip-image-captioning-large', { device: targetDevice });
     }
     try {
@@ -80,7 +66,8 @@
         captioner = await buildPipeline('wasm');
       }
     } catch (e) {
-      if (hasWebGPU && (e.message === 'webgpu-timeout' || String(e).includes('webgpu'))) {
         envEl.textContent = '⚠️ WebGPU load slow/failed → falling back to WASM.';
         device = 'wasm';
         captioner = await buildPipeline('wasm');
@@ -103,7 +90,7 @@
       imgEl.src = imgURL;
     });
-    // Run captioning (beam search for better quality)
     runBtn.addEventListener('click', async () => {
       if (!captioner) return;
       if (!imgURL) { logEl.textContent = 'Pick an image first.'; return; }
@@ -126,3 +113,4 @@
 </html>

 <head>
   <meta charset="utf-8" />
   <meta name="viewport" content="width=device-width,initial-scale=1" />
+  <title>WebGPU · Transformers.js · Image Captioning (Works)</title>
   <style>
     body { font: 16px/1.45 system-ui, sans-serif; margin: 24px auto; max-width: 900px; padding: 0 16px; }
     .card { border:1px solid #4443; border-radius:12px; padding:16px; }
     <h4>Output</h4>
     <div id="log" class="log">Loading model…</div>
     <p class="muted">
+      Model: <code>Xenova/vit-gpt2-image-captioning</code><br />
       Backend: <span id="backend">…</span>
     </p>
   </div>
     const imgEl  = document.getElementById('preview');
     const backendEl = document.getElementById('backend');
+    // Prefer WebGPU; fall back to WASM if unavailable/slow
     const hasWebGPU = 'gpu' in navigator;
     let device = hasWebGPU ? 'webgpu' : 'wasm';
     backendEl.textContent = device.toUpperCase();
     envEl.textContent = hasWebGPU ? '✅ WebGPU detected (will fallback if slow)…' : '⚠️ Using WASM (CPU).';
     // Load Transformers.js v3
     const { pipeline } = await import('https://cdn.jsdelivr.net/npm/@huggingface/transformers@3');
     // Watchdog: if WebGPU load takes too long, retry on WASM
+    const LOAD_TIMEOUT_MS = 25000;
     let captioner;
     async function buildPipeline(targetDevice) {
       logEl.textContent = `Loading model… device=${targetDevice}`;
       backendEl.textContent = targetDevice.toUpperCase();
+      return await pipeline('image-to-text', 'Xenova/vit-gpt2-image-captioning', { device: targetDevice });
     }
     try {
         captioner = await buildPipeline('wasm');
       }
     } catch (e) {
+      // Fallback once to WASM if WebGPU fails or stalls
+      if (hasWebGPU && (e.message === 'webgpu-timeout' || String(e).toLowerCase().includes('webgpu'))) {
         envEl.textContent = '⚠️ WebGPU load slow/failed → falling back to WASM.';
         device = 'wasm';
         captioner = await buildPipeline('wasm');
       imgEl.src = imgURL;
     });
+    // Run captioning (beam search for better captions)
     runBtn.addEventListener('click', async () => {
       if (!captioner) return;
       if (!imgURL) { logEl.textContent = 'Pick an image first.'; return; }
 </html>