Javedalam committed on
Commit
9a9bb4d
·
verified ·
1 Parent(s): 50dd43a

Update index.html

Browse files
Files changed (1) hide show
  1. index.html +40 -12
index.html CHANGED
@@ -3,7 +3,7 @@
3
  <head>
4
  <meta charset="utf-8" />
5
  <meta name="viewport" content="width=device-width,initial-scale=1" />
6
- <title>WebGPU · Transformers.js · Image Captioning</title>
7
  <style>
8
  body { font: 16px/1.45 system-ui, sans-serif; margin: 24px auto; max-width: 900px; padding: 0 16px; }
9
  .card { border:1px solid #4443; border-radius:12px; padding:16px; }
@@ -25,7 +25,7 @@
25
  <h4>Output</h4>
26
  <div id="log" class="log">Loading model…</div>
27
  <p class="muted">
28
- Model: <code>Xenova/vit-gpt2-image-captioning</code><br />
29
  Backend: <span id="backend">…</span>
30
  </p>
31
  </div>
@@ -40,24 +40,46 @@
40
 
41
  // Prefer WebGPU; fall back to WASM
42
  const hasWebGPU = 'gpu' in navigator;
43
- const device = hasWebGPU ? 'webgpu' : 'wasm';
44
  backendEl.textContent = device.toUpperCase();
45
- envEl.textContent = hasWebGPU ? '✅ WebGPU detected' : '⚠️ Using WASM (CPU)';
 
46
 
47
  // Load Transformers.js v3
48
  const { pipeline } = await import('https://cdn.jsdelivr.net/npm/@huggingface/transformers@3');
49
 
50
- // Build captioning pipeline with a model that fetches cleanly
 
51
  let captioner;
 
 
 
 
 
 
 
52
  try {
53
- captioner = await pipeline('image-to-text', 'Xenova/vit-gpt2-image-captioning', { device });
54
- logEl.textContent = `Model ready · device=${device}`;
55
- runBtn.disabled = false;
 
 
 
 
56
  } catch (e) {
57
- logEl.textContent = 'Error loading model: ' + e;
58
- console.error(e);
 
 
 
 
 
 
59
  }
60
 
 
 
 
61
  // Preview selected image
62
  let imgURL = null;
63
  fileEl.addEventListener('change', () => {
@@ -68,13 +90,18 @@
68
  imgEl.src = imgURL;
69
  });
70
 
71
- // Run captioning
72
  runBtn.addEventListener('click', async () => {
73
  if (!captioner) return;
74
  if (!imgURL) { logEl.textContent = 'Pick an image first.'; return; }
75
  logEl.textContent = 'Running…';
76
  try {
77
- const out = await captioner(imgURL); // [{ generated_text }]
 
 
 
 
 
78
  logEl.textContent = out[0].generated_text;
79
  } catch (e) {
80
  logEl.textContent = 'Inference error: ' + e;
@@ -85,3 +112,4 @@
85
  </body>
86
  </html>
87
 
 
 
3
  <head>
4
  <meta charset="utf-8" />
5
  <meta name="viewport" content="width=device-width,initial-scale=1" />
6
+ <title>WebGPU · Transformers.js · Better Image Captioning</title>
7
  <style>
8
  body { font: 16px/1.45 system-ui, sans-serif; margin: 24px auto; max-width: 900px; padding: 0 16px; }
9
  .card { border:1px solid #4443; border-radius:12px; padding:16px; }
 
25
  <h4>Output</h4>
26
  <div id="log" class="log">Loading model…</div>
27
  <p class="muted">
28
+ Model: <code>Xenova/blip-image-captioning-large</code><br />
29
  Backend: <span id="backend">…</span>
30
  </p>
31
  </div>
 
40
 
41
  // Prefer WebGPU; fall back to WASM
42
  const hasWebGPU = 'gpu' in navigator;
43
+ let device = hasWebGPU ? 'webgpu' : 'wasm';
44
  backendEl.textContent = device.toUpperCase();
45
+ envEl.textContent = hasWebGPU ? '✅ WebGPU detected (will fallback if slow)'
46
+ : '⚠️ Using WASM (CPU).';
47
 
48
  // Load Transformers.js v3
49
  const { pipeline } = await import('https://cdn.jsdelivr.net/npm/@huggingface/transformers@3');
50
 
51
+ // Watchdog: if WebGPU load takes too long, retry on WASM
52
+ const LOAD_TIMEOUT_MS = 30000; // 30s
53
  let captioner;
54
+
55
+ async function buildPipeline(targetDevice) {
56
+ logEl.textContent = `Loading model… device=${targetDevice}`;
57
+ backendEl.textContent = targetDevice.toUpperCase();
58
+ return await pipeline('image-to-text', 'Xenova/blip-image-captioning-large', { device: targetDevice });
59
+ }
60
+
61
  try {
62
+ if (device === 'webgpu') {
63
+ const webgpuPromise = buildPipeline('webgpu');
64
+ const timeout = new Promise((_, rej) => setTimeout(() => rej(new Error('webgpu-timeout')), LOAD_TIMEOUT_MS));
65
+ captioner = await Promise.race([webgpuPromise, timeout]);
66
+ } else {
67
+ captioner = await buildPipeline('wasm');
68
+ }
69
  } catch (e) {
70
+ if (hasWebGPU && (e.message === 'webgpu-timeout' || String(e).includes('webgpu'))) {
71
+ envEl.textContent = '⚠️ WebGPU load slow/failed → falling back to WASM.';
72
+ device = 'wasm';
73
+ captioner = await buildPipeline('wasm');
74
+ } else {
75
+ logEl.textContent = 'Error loading model: ' + e;
76
+ throw e;
77
+ }
78
  }
79
 
80
+ logEl.textContent = `Model ready · device=${device}`;
81
+ runBtn.disabled = false;
82
+
83
  // Preview selected image
84
  let imgURL = null;
85
  fileEl.addEventListener('change', () => {
 
90
  imgEl.src = imgURL;
91
  });
92
 
93
+ // Run captioning with better decoding (beam search)
94
  runBtn.addEventListener('click', async () => {
95
  if (!captioner) return;
96
  if (!imgURL) { logEl.textContent = 'Pick an image first.'; return; }
97
  logEl.textContent = 'Running…';
98
  try {
99
+ const out = await captioner(imgURL, {
100
+ max_new_tokens: 48,
101
+ num_beams: 5,
102
+ do_sample: false,
103
+ no_repeat_ngram_size: 3
104
+ }); // → [{ generated_text }]
105
  logEl.textContent = out[0].generated_text;
106
  } catch (e) {
107
  logEl.textContent = 'Inference error: ' + e;
 
112
  </body>
113
  </html>
114
 
115
+