/**
 * Model loader and inference using transformers.js v4.2.0 + WebGPU.
 * Follows the proven pattern from webml-community/Qwen3.5-WebGPU.
 * Loads Qwen3.5-VL 0.8B ONNX from mshz88/FADA-Mobile-ONNX.
 */

// Repository id passed as the first argument to every from_pretrained() call in this module.
const MODEL_ID = "mshz88/FADA-Mobile-ONNX";
// Commit hash passed as the `revision` option to every from_pretrained() call.
const MODEL_REVISION = "2936611f4ad147e0cbe03e3884de8a42c5cc42b9"; // Pin to specific commit, bypass CDN cache
// URL that loadModel(), runInference() dynamically import() to obtain the transformers.js exports.
const TRANSFORMERS_CDN = "https://cdn.jsdelivr.net/npm/@huggingface/transformers@4.2.0";

// Module-level singletons. loadModel() fills them in and resets all of them
// to null when loading fails; the other exports only read them.
let processor = null;
let model = null;
let tokenizer = null;
// Promise of the load started by loadModel(); non-null while (and after) a load has been started.
let loadingPromise = null;
// The `device` argument of the current/most recent load ("webgpu" or "wasm"), or null.
let activeDevice = null;

/**
 * Summarise the loader state for UI code.
 *
 * @returns {"ready"|"loading"|"idle"} "ready" once model, processor and
 *   tokenizer are all set; otherwise "loading" while a load promise exists,
 *   and "idle" when nothing has been started.
 */
export function getLoadingStatus() {
  const everythingLoaded = Boolean(model && processor && tokenizer);
  if (everythingLoaded) {
    return "ready";
  }
  return loadingPromise ? "loading" : "idle";
}

/**
 * Return the device recorded by loadModel().
 *
 * @returns {string|null} The `device` argument loadModel() stored just before
 *   it started loading the model weights, or null if no load has reached that
 *   point or the last load failed (loadModel() resets it to null on failure).
 */
export function getActiveDevice() {
  return activeDevice;
}

/**
 * Query the high-performance GPU adapter and report it for diagnostics.
 * Best-effort: every failure is logged as a warning and never aborts loading.
 *
 * @param {object} gpu - The `navigator.gpu` entry point.
 * @param {function} [onProgress] - Receives `{ status: "info", note }` with the adapter name.
 * @returns {Promise<void>}
 */
async function reportGpuAdapter(gpu, onProgress) {
  try {
    const adapter = await gpu.requestAdapter({
      powerPreference: "high-performance",
    });
    if (!adapter) {
      console.warn("[FADA] No high-performance GPU adapter found");
      return;
    }

    // `adapter.info` is the current WebGPU API. `requestAdapterInfo()` was
    // removed from the spec, so it is only used as a fallback on older browsers.
    let info = adapter.info;
    if (!info && typeof adapter.requestAdapterInfo === "function") {
      info = await adapter.requestAdapterInfo();
    }
    if (!info) {
      console.warn("[FADA] GPU adapter does not expose adapter info");
      return;
    }

    console.log("[FADA] GPU adapter selected:", info.vendor, info.architecture, info.device, info.description);
    onProgress?.({ status: "info", note: `GPU: ${info.description || info.vendor || "discrete GPU"}` });
    // Store globally for diagnostics. `globalThis` instead of `window` so this
    // also works when the module runs inside a Web Worker.
    globalThis.__FADA_GPU_INFO = info;
  } catch (e) {
    console.warn("[FADA] Could not query GPU adapter:", e);
  }
}

/**
 * Ask transformers.js / ONNX Runtime to prefer the high-performance GPU.
 * Only settings objects that already exist on `env` are touched.
 *
 * @param {object} env - The `env` export of transformers.js.
 */
function preferHighPerformanceGpu(env) {
  try {
    if (env?.backends?.onnx?.webgpu) {
      env.backends.onnx.webgpu.powerPreference = "high-performance";
      console.log("[FADA] Set env.backends.onnx.webgpu.powerPreference = high-performance");
    }
    // Also try the top-level webgpu settings
    if (env?.webgpu) {
      env.webgpu.powerPreference = "high-performance";
    }
  } catch (e) {
    console.warn("[FADA] Could not set env power preference:", e);
  }
}

/**
 * Load processor, tokenizer and model into the module-level singletons.
 *
 * Calling it again after a successful load is a no-op; calling it while a load
 * is in flight returns the same in-flight promise (the `device` of the later
 * call is ignored). If loading fails, all module state is reset so a later
 * call can retry, and the original error is rethrown.
 *
 * On WebGPU the vision encoder is loaded in fp16 first; when the error message
 * indicates missing fp16/shader support the model is loaded again with an
 * fp32 vision encoder.
 *
 * @param {function} [onProgress] - Progress callback. Receives objects with a
 *   `status` of "info", "loading", "downloading", "compiling" or "ready".
 * @param {string} [device="webgpu"] - "webgpu" or "wasm"
 * @returns {Promise<void>} Resolves once everything is loaded.
 * @throws Rethrows whatever the transformers.js import or from_pretrained() calls throw.
 */
export async function loadModel(onProgress, device = "webgpu") {
  if (model && processor && tokenizer) return;
  if (loadingPromise) return loadingPromise;

  loadingPromise = (async () => {
    const {
      Qwen3_5ForConditionalGeneration,
      AutoProcessor,
      AutoTokenizer,
      env,
    } = await import(TRANSFORMERS_CDN);

    // Force high-performance GPU (discrete NVIDIA over integrated Intel).
    // Read `navigator` through globalThis so a missing global cannot throw.
    const gpu = globalThis.navigator?.gpu;
    if (device === "webgpu" && gpu) {
      await reportGpuAdapter(gpu, onProgress);
      preferHighPerformanceGpu(env);
    }

    onProgress?.({ status: "loading", file: "processor" });
    processor = await AutoProcessor.from_pretrained(MODEL_ID, {
      revision: MODEL_REVISION,
    });
    console.log("[FADA] Processor loaded");

    onProgress?.({ status: "loading", file: "tokenizer" });
    tokenizer = await AutoTokenizer.from_pretrained(MODEL_ID, {
      revision: MODEL_REVISION,
    });
    console.log("[FADA] Tokenizer loaded");

    onProgress?.({ status: "loading", file: "model" });
    console.log(`[FADA] Loading model on device=${device}...`);
    activeDevice = device;

    // Vision encoder dtype: fp16 on WebGPU, fp32 on WASM
    const visionDtype = device === "wasm" ? "fp32" : "fp16";

    // Options for from_pretrained(); only the vision encoder dtype varies
    // between the first attempt and the fp32 fallback.
    const buildModelConfig = (vDtype) => ({
      revision: MODEL_REVISION,
      dtype: {
        embed_tokens: "q4",
        vision_encoder: vDtype,
        decoder_model_merged: "q4",
      },
      device: device,
      progress_callback: (info) => {
        // Forward only byte-level download progress to the caller.
        if (info.status === "progress") {
          onProgress?.({
            status: "downloading",
            file: info.file || "",
            progress: info.progress || 0,
            loaded: info.loaded || 0,
            total: info.total || 0,
          });
        }
      },
    });

    try {
      onProgress?.({
        status: "compiling",
        file: "model",
        note: "Creating GPU session & compiling shaders (2-5 min)...",
      });
      model = await Qwen3_5ForConditionalGeneration.from_pretrained(
        MODEL_ID,
        buildModelConfig(visionDtype)
      );
    } catch (err) {
      // Fallback: if fp16 not supported, retry with fp32 vision encoder
      const message = err?.message?.toLowerCase() ?? "";
      const isFp16Error =
        visionDtype === "fp16" &&
        (message.includes("does not support fp16") || message.includes("shader"));

      if (!isFp16Error) throw err;

      console.warn("[FADA] FP16 not supported, retrying with FP32 vision encoder...");
      onProgress?.({
        status: "loading",
        file: "model",
        note: "FP16 not supported, loading vision encoder in FP32 mode...",
      });
      model = await Qwen3_5ForConditionalGeneration.from_pretrained(
        MODEL_ID,
        buildModelConfig("fp32")
      );
    }

    console.log("[FADA] Model loaded successfully on", device);
    onProgress?.({ status: "ready" });
  })();

  try {
    await loadingPromise;
  } catch (e) {
    // Reset everything so getLoadingStatus() reports "idle" and a retry is possible.
    loadingPromise = null;
    model = null;
    processor = null;
    tokenizer = null;
    activeDevice = null;
    throw e;
  }
}

/**
 * Scale a width/height pair down so neither side exceeds `maxDim`, keeping the
 * aspect ratio. Sizes already within the limit are returned unchanged.
 * Scaled sides are clamped to at least 1px so extreme aspect ratios
 * (e.g. 10000x1) never round down to an empty canvas.
 */
function scaleToMaxDim(width, height, maxDim) {
  if (width <= maxDim && height <= maxDim) return { width, height };
  const scale = maxDim / Math.max(width, height);
  return {
    width: Math.max(1, Math.round(width * scale)),
    height: Math.max(1, Math.round(height * scale)),
  };
}

/**
 * Convert a DOM image/canvas into a RawImage, downscaling it to `maxDim`.
 * Anything that is not an HTMLImageElement/HTMLCanvasElement is returned as-is.
 */
async function toRawImage(image, RawImage, maxDim) {
  // `typeof` guards: the DOM classes do not exist in Web Workers, where a
  // bare `instanceof HTMLImageElement` would throw a ReferenceError.
  const isImageElement =
    typeof HTMLImageElement !== "undefined" && image instanceof HTMLImageElement;
  const isCanvasElement =
    typeof HTMLCanvasElement !== "undefined" && image instanceof HTMLCanvasElement;
  if (!isImageElement && !isCanvasElement) return image;

  const origW = image.naturalWidth || image.width;
  const origH = image.naturalHeight || image.height;
  if (!origW || !origH) {
    throw new Error(
      "Image has no pixel data (width or height is 0). Wait for it to finish loading before running inference."
    );
  }

  // Resize to limit GPU memory
  const { width: processWidth, height: processHeight } = scaleToMaxDim(origW, origH, maxDim);

  // Create resized canvas and export as data URL for RawImage.read()
  const resizedCanvas = document.createElement("canvas");
  resizedCanvas.width = processWidth;
  resizedCanvas.height = processHeight;
  const resizedCtx = resizedCanvas.getContext("2d");
  resizedCtx.drawImage(image, 0, 0, processWidth, processHeight);
  const dataURL = resizedCanvas.toDataURL("image/png");

  // Use RawImage.read() — the proven way (matches webml-community demo)
  const rawImage = await RawImage.read(dataURL);
  console.log(`[FADA] Image loaded via RawImage.read(): ${origW}x${origH} -> ${processWidth}x${processHeight}, channels=${rawImage.channels}`);
  return rawImage;
}

/**
 * Make sure the generation prompt ends with the pre-closed thinking block
 * (<think>\n\n</think>\n\n) the model was fine-tuned with.
 * 1. No <think> after the last assistant marker -> append the block.
 * 2. Open <think> without </think> -> close it.
 * 3. Already pre-closed, or no assistant marker at all -> unchanged.
 */
function ensurePreClosedThinking(text) {
  const assistantIdx = text.lastIndexOf("<|im_start|>assistant");
  if (assistantIdx === -1) return text;

  const afterAssistant = text.slice(assistantIdx);
  if (!afterAssistant.includes("<think>")) {
    console.warn("[FADA] Template missing <think> block - appending pre-closed thinking");
    return text.trimEnd() + "\n<think>\n\n</think>\n\n";
  }
  if (!afterAssistant.includes("</think>")) {
    console.warn("[FADA] Template produced open <think> - patching to pre-closed format");
    return text.replace(/<think>\s*$/, "<think>\n\n</think>\n\n");
  }
  return text;
}

/** Log what the processor produced for the image (pixel values, grid, image pad tokens). */
function logVisionDiagnostics(inputs) {
  console.log("[FADA-VISION] === Vision Pipeline Diagnostic ===");
  console.log("[FADA-VISION] Inputs keys:", Object.keys(inputs));
  if (inputs.pixel_values) {
    console.log("[FADA-VISION] pixel_values shape:", inputs.pixel_values.dims);
    console.log("[FADA-VISION] pixel_values dtype:", inputs.pixel_values.type);
    const pv = inputs.pixel_values.data;
    const sample = Array.from(pv.slice(0, 10));
    console.log("[FADA-VISION] pixel_values first 10:", sample);
    console.log("[FADA-VISION] pixel_values has NaN:", sample.some((v) => Number.isNaN(v)));
    console.log("[FADA-VISION] pixel_values all zero:", sample.every((v) => v === 0));
    // Only the first 1000 values are sampled, which keeps the spread call small.
    const sampleLarge = Array.from(pv.slice(0, Math.min(1000, pv.length)));
    console.log("[FADA-VISION] pixel_values min/max:", Math.min(...sampleLarge), Math.max(...sampleLarge));
  } else {
    console.log("[FADA-VISION] \u2717 NO pixel_values in inputs! Image was NOT processed!");
  }
  if (inputs.image_grid_thw) {
    console.log("[FADA-VISION] image_grid_thw shape:", inputs.image_grid_thw.dims);
    console.log("[FADA-VISION] image_grid_thw data:", Array.from(inputs.image_grid_thw.data).map(Number));
  } else {
    console.log("[FADA-VISION] \u2717 NO image_grid_thw! Vision metadata missing!");
  }
  // Check if input_ids contain image pad tokens; two candidate ids are probed.
  if (inputs.input_ids) {
    const ids = Array.from(inputs.input_ids.data).map((t) => (typeof t === "bigint" ? Number(t) : t));
    const imageTokenCount = ids.filter((t) => t === 151655).length;
    const imageTokenCount2 = ids.filter((t) => t === 248056).length;
    console.log("[FADA-VISION] input_ids contain", imageTokenCount, "image tokens (151655) and", imageTokenCount2, "tokens (248056)");
    console.log("[FADA-VISION] input_ids length:", ids.length);
    console.log("[FADA-VISION] First 30 input tokens:", ids.slice(0, 30));
    const firstImageIdx = ids.indexOf(151655);
    const firstImageIdx2 = ids.indexOf(248056);
    const imgIdx = firstImageIdx >= 0 ? firstImageIdx : firstImageIdx2;
    if (imgIdx >= 0) {
      console.log("[FADA-VISION] Image tokens start at index:", imgIdx);
      console.log("[FADA-VISION] Tokens around image:", ids.slice(Math.max(0, imgIdx - 3), imgIdx + 10));
    } else {
      console.log("[FADA-VISION] \u2717 NO image pad tokens found in input_ids! Template may be wrong.");
    }
  }
  console.log("[FADA-VISION] === End Vision Diagnostic ===");
}

/** Log the prompt, template text and tensor shapes right before generation starts. */
function logPreGenerationDiagnostics(prompt, text, inputs) {
  console.log("[FADA-DEBUG] === Pre-Generation Diagnostics ===");
  console.log("[FADA-DEBUG] Prompt:", prompt);
  console.log("[FADA-DEBUG] Template text (first 500 chars):", typeof text, text?.substring?.(0, 500));
  console.log("[FADA-DEBUG] Template text (last 200 chars):", text?.slice?.(-200));
  console.log("[FADA-DEBUG] Input IDs shape:", inputs.input_ids?.dims);
  console.log("[FADA-DEBUG] Input IDs first 10:", Array.from(inputs.input_ids?.data?.slice(0, 10) || []).map(Number));
  console.log("[FADA-DEBUG] Input IDs last 10:", Array.from(inputs.input_ids?.data?.slice(-10) || []).map(Number));
  console.log("[FADA-DEBUG] Pixel values shape:", inputs.pixel_values?.dims);
  console.log("[FADA-DEBUG] Image grid thw:", inputs.image_grid_thw?.dims, inputs.image_grid_thw?.data ? Array.from(inputs.image_grid_thw.data).map(Number) : "N/A");
  console.log("[FADA-DEBUG] Attention mask shape:", inputs.attention_mask?.dims);
  // Decode last 20 input tokens to verify template ending
  try {
    const lastInputTokens = Array.from(inputs.input_ids.data.slice(-20)).map(Number);
    const decodedEnd = tokenizer.decode(lastInputTokens, { skip_special_tokens: false });
    console.log("[FADA-DEBUG] Last 20 input tokens decoded:", JSON.stringify(decodedEnd));
  } catch (e) {
    console.warn("[FADA-DEBUG] Could not decode last input tokens:", e);
  }
  console.log("[FADA-DEBUG] === Starting Generation ===");
}

/**
 * Convert the tensor returned by model.generate() into a plain array of
 * numeric token ids for the first (only) sequence in the batch.
 */
function tensorToTokenIds(output) {
  const toNumber = (t) => (typeof t === "bigint" ? Number(t) : t);
  try {
    if (output.tolist) {
      // Could be [[tok1, tok2, ...]] (2D) or [tok1, tok2, ...] (1D)
      const listed = output.tolist();
      const flat = Array.isArray(listed[0]) ? listed[0] : listed;
      return flat.map((t) => toNumber(t));
    }
    const firstBatch = output[0];
    const allTokens = firstBatch?.tolist
      ? firstBatch.tolist()
      : Array.from(firstBatch?.data || firstBatch || output.data || []);
    return allTokens.map((t) => toNumber(t));
  } catch (e) {
    console.warn("[FADA] Tensor conversion fallback:", e);
    // Fallback: try direct Array.from on output data
    const raw = output.data || output;
    return Array.from(raw).map((t) => toNumber(t));
  }
}

/** Log what came back from generation, with extra detail when nothing was generated. */
function logPostGenerationDiagnostics(output, outputTokens, newTokens, inputLen) {
  const outputDims = output.dims || [];
  console.log("[FADA-DEBUG] === Post-Generation Analysis ===");
  console.log(`[FADA-DEBUG] Output raw dims: ${JSON.stringify(outputDims)}`);
  console.log(`[FADA-DEBUG] Output type: ${typeof output}, constructor: ${output?.constructor?.name}`);
  console.log(`[FADA-DEBUG] outputTokens.length: ${outputTokens.length}, inputLen: ${inputLen}`);
  console.log(`[FADA] Output tensor dims: ${JSON.stringify(outputDims)}, outputTokens.length: ${outputTokens.length}, inputLen: ${inputLen}`);

  if (newTokens.length > 0) {
    console.log(`[FADA] First 20 new token IDs: [${newTokens.slice(0, 20).join(", ")}]`);
    const rawDecoded = tokenizer.decode(newTokens.slice(0, 30), { skip_special_tokens: false });
    console.log(`[FADA] First 30 tokens decoded (with special): ${JSON.stringify(rawDecoded)}`);
    console.log(`[FADA-DEBUG] Decoded (skip_special=true): ${JSON.stringify(tokenizer.decode(newTokens.slice(0, 30), { skip_special_tokens: true }))}`);
    return;
  }

  // Zero new tokens means outputTokens.length <= inputLen.
  console.warn("[FADA-DEBUG] *** CRITICAL: 0 new tokens generated! ***");
  console.log(`[FADA-DEBUG] Output length === Input length? ${outputTokens.length === inputLen}`);
  console.log(`[FADA-DEBUG] Output length: ${outputTokens.length} vs Input length: ${inputLen}`);
  console.log(`[FADA-DEBUG] Last 10 output token IDs: [${outputTokens.slice(-10).join(", ")}]`);
  if (outputTokens.length > 0) {
    const lastFew = tokenizer.decode(outputTokens.slice(-10), { skip_special_tokens: false });
    console.log(`[FADA-DEBUG] Last 10 tokens decoded (no skip): ${JSON.stringify(lastFew)}`);
  }
  if (outputTokens.length < inputLen) {
    console.log(`[FADA-DEBUG] STRANGE: output shorter than input! Model may have truncated.`);
  } else {
    console.log(`[FADA-DEBUG] Output exactly equals input - model generated EOS immediately or nothing at all.`);
    const lastToken = outputTokens[outputTokens.length - 1];
    console.log(`[FADA-DEBUG] Last token ID: ${lastToken}, decoded: ${JSON.stringify(tokenizer.decode([lastToken], { skip_special_tokens: false }))}`);
  }
}

/** Strip <think>...</think> blocks and stray "assistant" role prefixes from decoded text. */
function cleanGeneratedText(decoded) {
  // Ensure we always work on a string (safeguard against unexpected return types)
  let cleaned = typeof decoded === "string" ? decoded : String(decoded || "");
  cleaned = cleaned.replace(/<think>[\s\S]*?<\/think>/g, "").trim();
  const roleIdx = cleaned.indexOf("\nassistant\n");
  if (roleIdx !== -1) cleaned = cleaned.slice(0, roleIdx);
  if (cleaned.startsWith("assistant\n")) cleaned = cleaned.slice("assistant\n".length);
  return cleaned.trim();
}

/**
 * Run inference on a single image with a text prompt.
 * Uses the community pattern: processor.apply_chat_template + model.generate + tokenizer.decode
 * No manual KV cache logic — transformers.js handles it internally.
 *
 * @param {HTMLImageElement|HTMLCanvasElement|RawImage} image - DOM images and
 *   canvases are downscaled and converted to a RawImage; anything else is
 *   passed to the processor unchanged.
 * @param {string} prompt - User text placed after the image in the chat message.
 * @param {object} [opts]
 * @param {number} [opts.maxNewTokens=1024] - Upper bound on generated tokens.
 * @param {number} [opts.temperature=0.1] - Sampling temperature; 0 or less disables sampling.
 * @param {number} [opts.maxImageDim=672] - Longest image side, in pixels, after downscaling.
 * @param {function} [opts.onToken] - Accepted for compatibility but currently
 *   NOT used: generation is not streamed.
 * @returns {Promise<string>} Generated text with thinking blocks and role prefixes removed.
 * @throws {Error} "Model not loaded" if loadModel() has not completed; a
 *   "GPU ran out of memory" error (original error in `cause`) on device loss.
 */
export async function runInference(image, prompt, opts = {}) {
  if (!model || !processor || !tokenizer) throw new Error("Model not loaded");

  const { RawImage } = await import(TRANSFORMERS_CDN);

  const maxImageDim =
    typeof opts.maxImageDim === "number" && opts.maxImageDim > 0 ? opts.maxImageDim : 672;
  const rawImage = await toRawImage(image, RawImage, maxImageDim);

  const maxNewTokens = opts.maxNewTokens || 1024;
  const temperature = opts.temperature ?? 0.1;

  // Build messages array matching training format:
  // The model was trained WITHOUT a system message, just user + assistant.
  const messages = [
    {
      role: "user",
      content: [
        { type: "image" },
        { type: "text", text: prompt },
      ],
    },
  ];

  // Apply chat template with enable_thinking explicitly disabled, so the
  // generation prompt ends with the pre-closed <think>\n\n</think>\n\n block.
  console.log("[FADA] Building inputs...");
  const templateOpts = {
    add_generation_prompt: true,
    enable_thinking: false,
  };

  let text;
  if (processor.apply_chat_template) {
    text = processor.apply_chat_template(messages, templateOpts);
  } else if (tokenizer.apply_chat_template) {
    text = tokenizer.apply_chat_template(messages, {
      ...templateOpts,
      tokenize: false,
    });
  } else {
    // Manual fallback: must include the pre-closed <think> block
    text = `<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>${prompt}<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n`;
  }
  text = ensurePreClosedThinking(text);

  // Debug: log the template output (first and last parts)
  console.log("[FADA] Template text (first 120):", JSON.stringify(text.slice(0, 120)));
  console.log("[FADA] Template text (last 80):", JSON.stringify(text.slice(-80)));

  // Call processor with SINGLE image (not array!) — matches webml-community demo pattern
  const inputs = await processor(text, rawImage);
  const inputLen = inputs.input_ids.dims[1];
  console.log(
    `[FADA] Input tokens: ${inputLen}, generating (max_new_tokens=${maxNewTokens}, temp=${temperature}, device=${activeDevice})...`
  );

  // Diagnostics are logging only; a failure in them must not abort inference.
  try {
    logVisionDiagnostics(inputs);
    logPreGenerationDiagnostics(prompt, text, inputs);
  } catch (e) {
    console.warn("[FADA-DEBUG] Pre-generation diagnostics failed:", e);
  }

  // Generate — transformers.js handles KV cache internally
  const useSampling = temperature > 0;
  const t0 = performance.now();
  let output;
  try {
    output = await model.generate({
      ...inputs,
      max_new_tokens: maxNewTokens,
      temperature: useSampling ? temperature : undefined,
      do_sample: useSampling,
      top_p: 0.95,
    });
  } catch (err) {
    console.error("[FADA-DEBUG] Generation error:", err);
    const message = String(err?.message ?? "");
    const deviceLost = message.includes("Device") && message.includes("lost");
    if (deviceLost || message.includes("mapAsync")) {
      // Keep the original error reachable for debugging via `cause`.
      throw new Error(
        "GPU ran out of memory. Try a smaller image or switch to CPU (WASM) mode.",
        { cause: err }
      );
    }
    throw err;
  }
  const genSeconds = (performance.now() - t0) / 1000;
  const genTime = genSeconds.toFixed(1);
  console.log("[FADA-DEBUG] Generation completed in", genTime, "seconds");

  // Decode only generated tokens (skip input)
  const outputTokens = tensorToTokenIds(output);
  const newTokens = outputTokens.slice(inputLen);
  const numGenerated = newTokens.length;

  try {
    logPostGenerationDiagnostics(output, outputTokens, newTokens, inputLen);
  } catch (e) {
    console.warn("[FADA-DEBUG] Post-generation diagnostics failed:", e);
  }

  const decoded = tokenizer.decode(newTokens, {
    skip_special_tokens: true,
  });

  // Use the numeric duration: the formatted string "0.0" is truthy and would
  // otherwise lead to a division by zero.
  const tokensPerSecond = genSeconds > 0 ? numGenerated / genSeconds : 0;
  console.log(
    `[FADA] Generated ${numGenerated} tokens in ${genTime}s (${tokensPerSecond.toFixed(1)} tok/s)`
  );

  const textOutput = cleanGeneratedText(decoded);
  if (numGenerated > 0 && textOutput.length === 0) {
    console.warn("[FADA] Generated tokens but cleaned output is empty. Raw decoded:", JSON.stringify(decoded?.slice(0, 200)));
  }

  return textOutput;
}

/**
 * Text-only diagnostic test - runs inference WITHOUT any image.
 * If this produces output, the decoder works and the issue is in the vision pipeline.
 * If this also produces 0 tokens, the issue is in the decoder/generation loop.
 *
 * The prompt is built the same way runInference() builds it: thinking is
 * disabled and the generation prompt always ends with the pre-closed
 * <think>\n\n</think>\n\n block.
 *
 * @returns {Promise<string>} A human-readable result starting with
 *   "TEXT-ONLY TEST PASSED" or "TEXT-ONLY TEST FAILED".
 * @throws {Error} "Model not loaded" if the model or tokenizer is missing;
 *   errors thrown by tokenization or model.generate() propagate unchanged.
 */
export async function testTextOnly() {
  if (!model || !tokenizer) throw new Error("Model not loaded");

  console.log("[FADA-TEST] === TEXT-ONLY DIAGNOSTIC ===");

  // Simple text-only message (no image)
  const messages = [
    { role: "system", content: "You are a helpful assistant." },
    { role: "user", content: "Say hello and tell me what you can do." },
  ];

  let text;
  try {
    text = tokenizer.apply_chat_template(messages, {
      add_generation_prompt: true,
      enable_thinking: false,
      tokenize: false,
    });
  } catch (e) {
    console.warn("[FADA-TEST] apply_chat_template failed, using manual template:", e);
    text = `<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nSay hello and tell me what you can do.<|im_end|>\n<|im_start|>assistant\n`;
  }

  // Guarantee the pre-closed thinking block: append it when missing, close it
  // when the template left an open <think>.
  const preClosedThinking = "<think>\n\n</think>\n\n";
  const generationPart = text.slice(Math.max(0, text.lastIndexOf("<|im_start|>assistant")));
  if (!generationPart.includes("<think>")) {
    text += preClosedThinking;
  } else if (!generationPart.includes("</think>")) {
    text = text.replace(/<think>\s*$/, preClosedThinking);
  }

  console.log("[FADA-TEST] Template:", text);

  const inputs = tokenizer(text, { return_tensors: "pt" });
  const inputLen = inputs.input_ids.dims[1];
  console.log("[FADA-TEST] Input IDs shape:", inputs.input_ids.dims);
  console.log("[FADA-TEST] Input length:", inputLen);

  const t0 = performance.now();
  const output = await model.generate({
    ...inputs,
    max_new_tokens: 50,
    temperature: 0.7,
    do_sample: true,
    top_p: 0.9,
  });
  const elapsed = ((performance.now() - t0) / 1000).toFixed(1);

  // Convert the output tensor into plain numeric token ids.
  const toNumber = (t) => (typeof t === "bigint" ? Number(t) : t);
  let outputTokens;
  try {
    if (output.tolist) {
      const list = output.tolist();
      outputTokens = (Array.isArray(list[0]) ? list[0] : list).map((t) => toNumber(t));
    } else {
      const firstBatch = output[0];
      const allTokens = firstBatch.tolist ? firstBatch.tolist() : Array.from(firstBatch.data || firstBatch);
      outputTokens = allTokens.map((t) => toNumber(t));
    }
  } catch (e) {
    // A conversion failure says nothing about the decoder, so report it as
    // its own failure instead of as "0 tokens generated".
    console.error("[FADA-TEST] Output conversion error:", e);
    return `TEXT-ONLY TEST FAILED: could not read generated tokens from the output tensor (${e?.message ?? e}).`;
  }

  const newTokens = outputTokens.slice(inputLen);

  console.log("[FADA-TEST] Output total tokens:", outputTokens.length);
  console.log("[FADA-TEST] New tokens:", newTokens.length);
  console.log("[FADA-TEST] Time:", elapsed, "s");

  if (newTokens.length > 0) {
    console.log("[FADA-TEST] First 20 tokens:", newTokens.slice(0, 20));
    const decoded = tokenizer.decode(newTokens, { skip_special_tokens: true });
    console.log("[FADA-TEST] \u2713 DECODED:", decoded);
    return `TEXT-ONLY TEST PASSED (${newTokens.length} tokens in ${elapsed}s): ${decoded}`;
  }

  // Name the device the model actually runs on; "WebGPU" stays the wording
  // for the default device.
  let deviceLabel = "WebGPU";
  if (activeDevice && activeDevice !== "webgpu") {
    deviceLabel = String(activeDevice).toUpperCase();
  }

  console.log(`[FADA-TEST] \u2717 0 tokens generated - DECODER ITSELF IS BROKEN ON ${deviceLabel.toUpperCase()}`);
  console.log("[FADA-TEST] Last 5 tokens of output:", outputTokens.slice(-5));
  const lastDecoded = tokenizer.decode(outputTokens.slice(-5), { skip_special_tokens: false });
  console.log("[FADA-TEST] Last tokens decoded:", lastDecoded);
  return `TEXT-ONLY TEST FAILED: 0 tokens in ${elapsed}s. Decoder broken on ${deviceLabel}.`;
}