{
  "source_model": "nvidia/LocateAnything-3B",
  "approach": "surgery removes fp32 embedding Gather; language graph takes inputs_embeds; embedding table shipped as group-wise INT4 for a browser-side custom gather/dequant",
  "embedding_int4": {
    "block_size": 32,
    "max_abs_diff_vs_fp32": 0.01708984375,
    "mean_rel_err_vs_fp32": 0.10263473320935858,
    "packed_MB": 156.3,
    "scales_MB": 19.5
  },
  "parity_sample": "person.jpg / category 'person' (slow/AR)",
  "next_token_argmax": {
    "torch": 151672,
    "onnx_fp_embed4": 151672,
    "onnx_int4_embed4": 151672,
    "match": true
  },
  "logit_max_abs_diff": {
    "vision_vs_torch": 0.0024852752685546875,
    "language_fp_embed4_vs_torch": 0.9800300598144531,
    "language_int4_vs_fp": 12.58173942565918,
    "language_int4_vs_torch": 12.641683578491211
  },
  "latency_cpu_s": {
    "vision_onnx_cpu": 1.5138812065124512,
    "language_fp_onnx_cpu": 4.6505208015441895,
    "language_int4_onnx_cpu": 9.6680269241333
  },
  "notes": "Last-token (next-token) argmax matches torch exactly. Earlier prefill-tail positions diverge under INT4 and are unused in slow/AR generation. Vision ships fp32 (Gemm/dynamic linears not INT4-able; fp16 export blocked by custom RoPE casts)."
}