lex-interviewer-chat

Sleeping

File size: 5,505 Bytes

5477c5d

<!DOCTYPE html>
<html>
<head>
  <title>ONNX WebGPU Test</title>
  <script type="module">
    import { pipeline, TextStreamer } from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers@4.0.0-next.8/dist/transformers.min.js';

    // Echo a message to the browser console and append it, followed by a
    // newline, to the text of the element whose id is "log".
    const log = (msg) => {
      console.log(msg);
      const logEl = document.getElementById('log');
      logEl.textContent = logEl.textContent + (msg + '\n');
    };

    // Model ids handed to pipeline(), keyed by the name the page's buttons pass
    // to runTest(). Frozen so an accidental write cannot silently change which
    // model a button loads.
    const MODELS = Object.freeze({
      reference: 'onnx-community/NVIDIA-Nemotron-3-Nano-4B-BF16-ONNX',
      finetuned: 'bobber/lex-interviewer-nemotron-4b-grpo-v12',
    });

    // Literal markers searched for in the streamed model output.
    const THINK_OPEN_TAG = '<think>';
    const THINK_CLOSE_TAG = '</think>';
    const END_OF_TURN_TOKEN = '<|im_end|>';

    /**
     * Replays streamed chunks through the think-tag splitting logic and reports
     * what ends up as visible content versus reasoning.
     *
     * NOTE(review): every chunk is fully consumed before the next one is read,
     * so a tag that arrives split across two chunks is not detected. This
     * presumably mirrors the chat app's parser on purpose — confirm before
     * changing it here.
     *
     * @param {string[]} chunks - Streamed text pieces, in arrival order.
     * @param {boolean} enableThinking - When true, '<think>' is prepended to the
     *   first chunk that is not skipped (presumably because the prompt already
     *   opened the tag — TODO confirm against the chat template).
     * @returns {{content: string, reasoning: string, inThink: boolean}} Text
     *   outside think tags, text inside them, and whether a tag was left open.
     */
    function simulateThinkParser(chunks, enableThinking) {
      let isFirst = true;
      let inThink = false;
      let reasoning = '';
      let content = '';
      let buf = '';
      for (const chunk of chunks) {
        // Empty chunks and a bare end-of-turn token carry no text to classify.
        if (!chunk || chunk === END_OF_TURN_TOKEN) continue;
        let text = chunk;
        if (isFirst) {
          if (enableThinking) text = THINK_OPEN_TAG + text;
          isFirst = false;
        }
        buf += text;
        while (buf.length > 0) {
          if (inThink) {
            const closeAt = buf.indexOf(THINK_CLOSE_TAG);
            if (closeAt === -1) {
              reasoning += buf;
              buf = '';
              break;
            }
            reasoning += buf.slice(0, closeAt);
            buf = buf.slice(closeAt + THINK_CLOSE_TAG.length);
            inThink = false;
            continue;
          }
          const openAt = buf.indexOf(THINK_OPEN_TAG);
          if (openAt === -1) {
            content += buf;
            buf = '';
            break;
          }
          content += buf.slice(0, openAt);
          buf = buf.slice(openAt + THINK_OPEN_TAG.length);
          inThink = true;
        }
      }
      return { content, reasoning, inThink };
    }

    /**
     * Loads the model selected by `modelKey` on WebGPU, generates one reply
     * with enable_thinking set to true and then to false, and logs diagnostics
     * about think tags in the streamed output plus the result of the simulated
     * parser.
     *
     * Failures (unknown key, no WebGPU, load or generation errors) are logged
     * and end the run; they are not rethrown.
     *
     * @param {string} modelKey - A key of MODELS ('reference' or 'finetuned').
     * @returns {Promise<void>}
     */
    window.runTest = async (modelKey) => {
      document.getElementById('log').textContent = '';
      const statusEl = document.getElementById('status');

      // An unknown key would otherwise pass `undefined` to pipeline().
      if (!Object.hasOwn(MODELS, modelKey)) {
        log(`❌ Unknown model key: ${modelKey}`);
        return;
      }
      const modelId = MODELS[modelKey];
      log(`Testing: ${modelId}`);
      log(`Device: webgpu`);

      // Check WebGPU
      if (!navigator.gpu) { log('❌ No WebGPU!'); return; }
      const adapter = await navigator.gpu.requestAdapter();
      log(`GPU: ${adapter ? (adapter.info?.description || adapter.name || 'adapter found') : 'no adapter'}`);
      if (!adapter) {
        // Stop before the large download: the webgpu device cannot be used.
        log('❌ No WebGPU adapter available; skipping model download.');
        statusEl.textContent = 'No WebGPU adapter';
        return;
      }

      log('Loading pipeline (this downloads ~2.5GB)...');
      statusEl.textContent = 'Downloading model...';

      let gen;
      try {
        gen = await pipeline('text-generation', modelId, {
          dtype: 'q4',
          device: 'webgpu',
          progress_callback: (p) => {
            if (p.status !== 'progress') return;
            // A missing or zero total would otherwise render as "NaN%".
            if (!(p.total > 0)) {
              statusEl.textContent = 'Downloading model...';
              return;
            }
            const pct = Math.round((p.loaded / p.total) * 100);
            statusEl.textContent = `Downloading: ${pct}%`;
          },
        });
      } catch (e) {
        log(`❌ Pipeline error: ${e?.message ?? e}`);
        statusEl.textContent = 'Model load failed';
        return;
      }
      statusEl.textContent = 'Model loaded!';
      log('Model loaded ✓');

      // The same conversation is used for both passes.
      const messages = [
        { role: 'system', content: 'You are an AI interviewer. Ask one question at a time.' },
        { role: 'user', content: "I think neural networks are simple." },
      ];

      try {
        // Run once with thinking enabled and once with it disabled.
        for (const enableThinking of [true, false]) {
          log(`\n=== enable_thinking: ${enableThinking} ===`);

          // Collect every streamed piece verbatim, special tokens included,
          // so the tag checks below see exactly what the model emitted.
          const allChunks = [];
          const streamer = new TextStreamer(gen.tokenizer, {
            skip_prompt: true,
            skip_special_tokens: false,
            callback_function: (output) => {
              allChunks.push(output);
            },
          });

          log('Generating...');
          try {
            await gen(messages, {
              max_new_tokens: 512,
              do_sample: false,
              // NOTE(review): hard-coded token ids; presumably the tokenizer's
              // end-of-sequence and end-of-turn ids — confirm.
              eos_token_id: [2, 11],
              streamer,
              tokenizer_encode_kwargs: { enable_thinking: enableThinking },
            });
          } catch (e) {
            // Without this the rejection is unhandled and the status goes stale.
            log(`❌ Generation error: ${e?.message ?? e}`);
            statusEl.textContent = 'Generation failed';
            return;
          }

          const fullText = allChunks.join('');
          const closeAt = fullText.indexOf(THINK_CLOSE_TAG);
          log(`Total chunks: ${allChunks.length}`);
          log(`Total chars: ${fullText.length}`);
          log(`Contains </think>: ${closeAt !== -1}`);
          log(`Contains <|im_end|>: ${fullText.includes(END_OF_TURN_TOKEN)}`);

          log(`First 3 chunks: ${allChunks.slice(0, 3).map(c => JSON.stringify(c)).join(', ')}`);
          log(`Last 3 chunks: ${allChunks.slice(-3).map(c => JSON.stringify(c)).join(', ')}`);

          if (closeAt !== -1) {
            const afterThink = fullText
              .slice(closeAt + THINK_CLOSE_TAG.length)
              .replaceAll(END_OF_TURN_TOKEN, '')
              .trim();
            log(`Content after </think>: ${JSON.stringify(afterThink.slice(0, 200))}`);
          } else {
            log(`❌ No </think> found!`);
            log(`Full output (last 300): ${JSON.stringify(fullText.slice(-300))}`);
          }

          // Simulate the parser
          const { content, reasoning, inThink } = simulateThinkParser(allChunks, enableThinking);
          log(`Parser result: content=${JSON.stringify(content.trim().slice(0, 200))}`);
          log(`Parser result: reasoning_length=${reasoning.length}`);
          log(`Parser result: still_in_think=${inThink}`);
          log(`Would show "No response": ${!content.trim()}`);
        }

        log('\n✅ Test complete!');
        statusEl.textContent = 'Test complete!';
      } finally {
        // Release the model so testing the other one does not stack a second
        // copy on top of it. Optional call in case this build lacks dispose().
        await gen.dispose?.();
      }
    };
  </script>
</head>
<body style="font-family: monospace; padding: 20px; background: #1a1a1a; color: #eee;">
  <h2>ONNX WebGPU Think-Tag Test</h2>
  <!-- Status line; the module script rewrites its text while a test runs. -->
  <p id="status">Ready</p>
  <!-- Each button calls window.runTest with a key of the script's MODELS map. -->
  <button onclick="runTest('reference')" style="padding: 10px 20px; margin: 5px;">Test Reference Model</button>
  <button onclick="runTest('finetuned')" style="padding: 10px 20px; margin: 5px;">Test Fine-tuned Model</button>
  <hr>
  <!-- Log output; the script's log() appends one line per message here. -->
  <pre id="log" style="white-space: pre-wrap; max-height: 80vh; overflow-y: auto;"></pre>
</body>
</html>