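{
  "_method": "Hugging Face Transformers generate(), greedy decoding, attn_implementation=sdpa, no repetition penalty, no n-gram blocking",
  "_note": "The figures in this file are a baseline measured with Hugging Face generate(). Dedicated inference servers (vLLM, TGI, SGLang) typically achieve 2-5x higher tok/s at small batch sizes due to CUDA graphs and optimized kernels. Use vLLM for production deployment.",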
  "bs1_tok_per_sec": 82,
  "bs4_tok_per_sec": 323,
  "bs8_tok_per_sec": 618,
  "bs32_tok_per_sec": 2154,
  "ttft_ms_bs1": 14,
  "peak_mem_gb_bf16": 1.17,
  "peak_mem_gb_int8": 0.66
}