<!doctype html>
<html>
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width,initial-scale=1" />
<title>WebGPU · Transformers.js · Image Captioning</title>
<style>
body { font: 16px/1.45 system-ui, sans-serif; margin: 24px auto; max-width: 900px; padding: 0 16px; }
.card { border:1px solid #4443; border-radius:12px; padding:16px; }
.log { white-space:pre-wrap; background:#111; color:#0f0; padding:12px; border-radius:8px; min-height:80px; }
img { max-width:100%; border-radius:8px; margin-top:10px; }
.muted { opacity:.75; font-size:14px; }
button,input { font:inherit; }
</style>
</head>
<body>
<h2>Image → Text in your browser (Transformers.js + WebGPU)</h2>
<p id="env">Probing environment…</p>
<div class="card">
<h3>Caption an image (file upload)</h3>
<input id="file" type="file" accept="image/*" />
<button id="run" disabled>Caption</button>
<div><img id="preview" alt="preview will appear here" /></div>
<h4>Output</h4>
<div id="log" class="log">Loading model…</div>
<p class="muted">
Model: <code>Xenova/vit-gpt2-image-captioning</code><br />
Backend: <span id="backend">…</span>
</p>
</div>
<script type="module">
// Cached DOM handles used by the loader and both event handlers below.
const envEl = document.getElementById('env');
const fileEl = document.getElementById('file');
const runBtn = document.getElementById('run');
const logEl = document.getElementById('log');
const imgEl = document.getElementById('preview');
const backendEl = document.getElementById('backend');
// Prefer WebGPU; fall back to WASM if unavailable/slow
const hasWebGPU = 'gpu' in navigator; // feature-detect only; adapter quality is not probed here
let device = hasWebGPU ? 'webgpu' : 'wasm'; // may be downgraded to 'wasm' by the watchdog below
backendEl.textContent = device.toUpperCase();
envEl.textContent = hasWebGPU ? '✅ WebGPU detected (will fallback if slow)…' : '⚠️ Using WASM (CPU).';
// Load Transformers.js v3
const { pipeline } = await import('https://cdn.jsdelivr.net/npm/@huggingface/transformers@3');
// Watchdog: if WebGPU load takes too long, retry on WASM
const LOAD_TIMEOUT_MS = 25000; // max time to wait for the WebGPU pipeline before retrying on WASM
let captioner; // assigned once a pipeline finishes loading; run handler bails if still undefined
/**
 * Construct the image-to-text pipeline on the requested backend, updating
 * the status line and backend badge while the model loads.
 * @param {string} dev - Target backend: 'webgpu' or 'wasm'.
 * @returns {Promise<Function>} The ready captioning pipeline.
 */
async function buildPipeline(dev) {
  backendEl.textContent = dev.toUpperCase();
  logEl.textContent = `Loading model… device=${dev}`;
  const task = 'image-to-text';
  const model = 'Xenova/vit-gpt2-image-captioning';
  return pipeline(task, model, { device: dev });
}
// Load the model, racing the WebGPU build against a watchdog timer; on
// timeout (or any WebGPU-flavored failure) retry once on the WASM backend.
try {
  if (device === 'webgpu') {
    const webgpuPromise = buildPipeline('webgpu');
    // If the timeout wins the race, this promise's eventual rejection would
    // otherwise be unobserved → unhandled-rejection noise. Mark it handled.
    webgpuPromise.catch(() => {});
    let watchdogId;
    const timeout = new Promise((_, rej) => {
      watchdogId = setTimeout(() => rej(new Error('webgpu-timeout')), LOAD_TIMEOUT_MS);
    });
    try {
      captioner = await Promise.race([webgpuPromise, timeout]);
    } finally {
      // Fix: the original never cleared this timer, so the timeout promise
      // rejected ~25 s after a successful load with no handler attached.
      clearTimeout(watchdogId);
    }
  } else {
    captioner = await buildPipeline('wasm');
  }
} catch (e) {
  // Only fall back when WebGPU exists and the failure looks WebGPU-related;
  // e?.message is null-safe in case a non-Error value was thrown.
  if (hasWebGPU && (e?.message === 'webgpu-timeout' || String(e).toLowerCase().includes('webgpu'))) {
    envEl.textContent = '⚠️ WebGPU load slow/failed → falling back to WASM.';
    device = 'wasm';
    captioner = await buildPipeline('wasm');
  } else {
    logEl.textContent = 'Error loading model: ' + e;
    throw e;
  }
}
logEl.textContent = `Model ready · device=${device}`;
runBtn.disabled = false;
// ---------- Robust file load (FileReader → data URL, with checks) ----------
// ---------- Robust file load (FileReader → data URL, with checks) ----------
// Holds the base64 data URL of the last successfully read image; the run
// handler refuses to start while this is still null.
let imgDataURL = null;
fileEl.addEventListener('change', () => {
  logEl.textContent = 'Image selected. Preparing preview…';
  const selected = fileEl.files?.[0];
  if (!selected) {
    logEl.textContent = 'No file chosen.';
    return;
  }
  // Some Android cameras save HEIC/HEIF which many browsers can’t decode.
  if (!selected.type.startsWith('image/')) {
    logEl.textContent = `Unsupported file type: ${selected.type || 'unknown'}. Use JPG/PNG.`;
    return;
  }
  const fileReader = new FileReader();
  fileReader.addEventListener('error', () => {
    logEl.textContent = 'Failed to read file. Try another image.';
  });
  fileReader.addEventListener('load', async () => {
    imgDataURL = fileReader.result; // base64 data URL
    imgEl.src = imgDataURL;
    try {
      // ensure it decoded before we allow run
      if (imgEl.decode) await imgEl.decode();
      logEl.textContent = 'Preview ready. Click “Caption”.';
    } catch {
      logEl.textContent = 'Could not decode image. Try a JPG/PNG under ~5 MB.';
    }
  });
  fileReader.readAsDataURL(selected);
});
// --------------------------------------------------------------------------
// Run captioning (beam search for better captions)
// Run captioning (beam search for better captions)
runBtn.addEventListener('click', async () => {
  if (!captioner) return;
  if (!imgDataURL) {
    logEl.textContent = 'Pick an image first.';
    return;
  }
  logEl.textContent = 'Running…';
  // Deterministic beam search: 5 beams, no sampling, trigram-repeat blocking.
  const generationOptions = {
    max_new_tokens: 48,
    num_beams: 5,
    do_sample: false,
    no_repeat_ngram_size: 3,
  };
  try {
    const [first] = await captioner(imgDataURL, generationOptions);
    logEl.textContent = first.generated_text;
  } catch (e) {
    console.error(e);
    logEl.textContent = 'Inference error: ' + e;
  }
});
</script>
</body>
</html>