Slim build: 4-bit embedding gather (1.25GB fp32 -> 176MB INT4), INT4 language tail

79ce74e verified 2 days ago

1.25 kB

	{
	"source_model": "nvidia/LocateAnything-3B",
	"approach": "Graph surgery removes the fp32 embedding Gather; the language graph takes inputs_embeds; the embedding table is shipped as group-wise INT4 for a custom browser-side gather/dequant.",
	"embedding_int4": {
	"block_size": 32,
	"max_abs_diff_vs_fp32": 0.01708984375,
	"mean_rel_err_vs_fp32": 0.10263473320935858,
	"packed_MB": 156.3,
	"scales_MB": 19.5
	},
	"parity_sample": "person.jpg / category 'person' (slow/AR)",
	"next_token_argmax": {
	"torch": 151672,
	"onnx_fp_embed4": 151672,
	"onnx_int4_embed4": 151672,
	"match": true
	},
	"logit_max_abs_diff": {
	"vision_vs_torch": 0.0024852752685546875,
	"language_fp_embed4_vs_torch": 0.9800300598144531,
	"language_int4_vs_fp": 12.58173942565918,
	"language_int4_vs_torch": 12.641683578491211
	},
	"latency_cpu_s": {
	"vision_onnx_cpu": 1.5138812065124512,
	"language_fp_onnx_cpu": 4.6505208015441895,
	"language_int4_onnx_cpu": 9.6680269241333
	},
	"notes": "The last-token (next-token) argmax matches torch exactly. Earlier prefill-tail positions diverge under INT4, but they are unused in slow/AR generation. Vision ships as fp32 (its Gemm/dynamic linears cannot be quantized to INT4, and fp16 export is blocked by custom RoPE casts)."
	}