{ "source_model": "nvidia/LocateAnything-3B", "approach": "surgery removes fp32 embedding Gather; language graph takes inputs_embeds; embedding table shipped as group-wise INT4 for a browser-side custom gather/dequant", "embedding_int4": { "block_size": 32, "max_abs_diff_vs_fp32": 0.01708984375, "mean_rel_err_vs_fp32": 0.10263473320935858, "packed_MB": 156.3, "scales_MB": 19.5 }, "parity_sample": "person.jpg / category 'person' (slow/AR)", "next_token_argmax": { "torch": 151672, "onnx_fp_embed4": 151672, "onnx_int4_embed4": 151672, "match": true }, "logit_max_abs_diff": { "vision_vs_torch": 0.0024852752685546875, "language_fp_embed4_vs_torch": 0.9800300598144531, "language_int4_vs_fp": 12.58173942565918, "language_int4_vs_torch": 12.641683578491211 }, "latency_cpu_s": { "vision_onnx_cpu": 1.5138812065124512, "language_fp_onnx_cpu": 4.6505208015441895, "language_int4_onnx_cpu": 9.6680269241333 }, "notes": "Last-token (next-token) argmax matches torch exactly. Earlier prefill-tail positions diverge under INT4 and are unused in slow/AR generation. Vision ships fp32 (Gemm/dynamic linears not INT4-able; fp16 export blocked by custom RoPE casts)." }