{ "_method": "HF generate(), greedy, attn_implementation=sdpa, no repetition penalty, no n-gram blocking", "_note": "vLLM/TGI/SGLang typically achieve 2-5x higher tok/s at small batches due to CUDA graphs and optimized kernels. Use vLLM for production deployment.", "bs1_tok_per_sec": 82, "bs4_tok_per_sec": 323, "bs8_tok_per_sec": 618, "bs32_tok_per_sec": 2154, "ttft_ms_bs1": 14, "peak_mem_gb_bf16": 1.17, "peak_mem_gb_int8": 0.66 }