File size: 7,736 Bytes
58b76f1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
Implementation,Impl ID,Workload,Batch,Seq Length,Heads,Head Dim,Dtype,Mean (ms),P10 (ms),P50 (ms),P90 (ms),Reps,Peak Mem (MB),Backend,Family
Flash (PyTorch SDPA),torch_flash_ma,flux_L128,1,1152,24,128,bfloat16,0.407123202085495,0.40537598729133606,0.40755200386047363,0.407584011554718,5,83.38,FLASH,torch-sdpa
Flash (PyTorch SDPA),torch_flash_ma,flux_L256,1,1280,24,128,bfloat16,0.5235007882118226,0.5212159752845764,0.5232639908790588,0.523360013961792,5,90.62,FLASH,torch-sdpa
Flash (PyTorch SDPA),torch_flash_ma,flux_L320,1,1344,24,128,bfloat16,0.545849597454071,0.5418559908866882,0.5468159914016724,0.5469120144844055,5,95.06,FLASH,torch-sdpa
Flash (PyTorch SDPA),torch_flash_ma,flux_L384,1,1408,24,128,bfloat16,0.5892416119575501,0.5867519974708557,0.5888000130653381,0.5888000130653381,5,99.88,FLASH,torch-sdpa
Flash (PyTorch SDPA),torch_flash_ma,flux_L448,1,1472,24,128,bfloat16,0.6449280023574829,0.6430720090866089,0.6442239880561829,0.6450240015983582,5,103.81,FLASH,torch-sdpa
Flash (PyTorch SDPA),torch_flash_ma,flux_L512,1,1536,24,128,bfloat16,0.6823423862457275,0.6777600049972534,0.6809599995613098,0.6818559765815735,5,109.12,FLASH,torch-sdpa
MemEff (PyTorch SDPA),torch_mem_eff,flux_L128,1,1152,24,128,bfloat16,0.48371200561523436,0.4821760058403015,0.4833280146121979,0.4853760004043579,5,83.38,EFFICIENT,torch-sdpa
MemEff (PyTorch SDPA),torch_mem_eff,flux_L256,1,1280,24,128,bfloat16,0.6268800020217895,0.6246399879455566,0.6266880035400391,0.6286720037460327,5,90.62,EFFICIENT,torch-sdpa
MemEff (PyTorch SDPA),torch_mem_eff,flux_L320,1,1344,24,128,bfloat16,0.699776005744934,0.6973440051078796,0.7004160284996033,0.7004479765892029,5,95.94,EFFICIENT,torch-sdpa
MemEff (PyTorch SDPA),torch_mem_eff,flux_L384,1,1408,24,128,bfloat16,0.8333312034606933,0.8284159898757935,0.8325120210647583,0.8376320004463196,5,100.0,EFFICIENT,torch-sdpa
MemEff (PyTorch SDPA),torch_mem_eff,flux_L448,1,1472,24,128,bfloat16,0.9533439993858337,0.9502720236778259,0.9512959718704224,0.9572479724884033,5,103.81,EFFICIENT,torch-sdpa
MemEff (PyTorch SDPA),torch_mem_eff,flux_L512,1,1536,24,128,bfloat16,1.0066367864608765,1.0024960041046143,1.0045440196990967,1.0097919702529907,5,109.12,EFFICIENT,torch-sdpa
xFormers,xformers_meff,flux_L128,1,1152,24,128,bfloat16,0.3452928066253662,0.3389439880847931,0.3461120128631592,0.3461120128631592,5,83.38,memory_efficient,xformers
xFormers,xformers_meff,flux_L256,1,1280,24,128,bfloat16,0.41234560012817384,0.40959998965263367,0.41280001401901245,0.41286399960517883,5,90.62,memory_efficient,xformers
xFormers,xformers_meff,flux_L320,1,1344,24,128,bfloat16,0.4366208016872406,0.4310399889945984,0.4331519901752472,0.4362240135669708,5,95.06,memory_efficient,xformers
xFormers,xformers_meff,flux_L384,1,1408,24,128,bfloat16,0.4450624048709869,0.4359680116176605,0.44361600279808044,0.447488009929657,5,99.88,memory_efficient,xformers
xFormers,xformers_meff,flux_L448,1,1472,24,128,bfloat16,0.4750400006771088,0.4711039960384369,0.47513601183891296,0.4763199985027313,5,103.81,memory_efficient,xformers
xFormers,xformers_meff,flux_L512,1,1536,24,128,bfloat16,0.5009407997131348,0.49663999676704407,0.4997119903564453,0.5038080215454102,5,109.12,memory_efficient,xformers
Compiled (default),torch_flash_compiled_default,flux_L128,1,1152,24,128,bfloat16,0.3856383919715881,0.3563520014286041,0.35942399501800537,0.3624959886074066,5,83.38,FLASH,torch-sdpa
Compiled (default),torch_flash_compiled_default,flux_L256,1,1280,24,128,bfloat16,0.4982912003993988,0.4926080107688904,0.49663999676704407,0.5017600059509277,5,90.62,FLASH,torch-sdpa
Compiled (default),torch_flash_compiled_default,flux_L320,1,1344,24,128,bfloat16,0.5369919896125793,0.5335040092468262,0.5366079807281494,0.5386239886283875,5,95.25,FLASH,torch-sdpa
Compiled (default),torch_flash_compiled_default,flux_L384,1,1408,24,128,bfloat16,0.5841408014297486,0.5775359869003296,0.5868800282478333,0.5877760052680969,5,99.88,FLASH,torch-sdpa
Compiled (default),torch_flash_compiled_default,flux_L448,1,1472,24,128,bfloat16,0.6184704065322876,0.6072319746017456,0.6113280057907104,0.6144000291824341,5,103.81,FLASH,torch-sdpa
Compiled (default),torch_flash_compiled_default,flux_L512,1,1536,24,128,bfloat16,0.6428672075271606,0.6399999856948853,0.6430720090866089,0.6430720090866089,5,109.12,FLASH,torch-sdpa
Compiled (max-autotune),torch_flash_compiled_max_autotune,flux_L128,1,1152,24,128,bfloat16,0.40020479559898375,0.3665919899940491,0.3768320083618164,0.41171199083328247,5,81.75,FLASH,torch-sdpa
Compiled (max-autotune),torch_flash_compiled_max_autotune,flux_L256,1,1280,24,128,bfloat16,0.5535807967185974,0.5160959959030151,0.5489599704742432,0.5631359815597534,5,92.88,FLASH,torch-sdpa
Compiled (max-autotune),torch_flash_compiled_max_autotune,flux_L320,1,1344,24,128,bfloat16,0.6143999934196472,0.562175989151001,0.6144000291824341,0.6318079829216003,5,95.13,FLASH,torch-sdpa
Compiled (max-autotune),torch_flash_compiled_max_autotune,flux_L384,1,1408,24,128,bfloat16,0.6754495978355408,0.6512640118598938,0.6584320068359375,0.6799359917640686,5,97.13,FLASH,torch-sdpa
Compiled (max-autotune),torch_flash_compiled_max_autotune,flux_L448,1,1472,24,128,bfloat16,0.7210752129554748,0.6973119974136353,0.7014080286026001,0.7229440212249756,5,99.0,FLASH,torch-sdpa
Compiled (max-autotune),torch_flash_compiled_max_autotune,flux_L512,1,1536,24,128,bfloat16,0.7735359907150269,0.7485439777374268,0.7557439804077148,0.7710719704627991,5,101.63,FLASH,torch-sdpa
HF Kernels Flash Attn,hf_kernels_flash_attn,flux_L128,1,1152,24,128,bfloat16,0.2456959992647171,0.24371199309825897,0.24566400051116943,0.2457599937915802,5,83.38,flash-attn,hf-kernels
HF Kernels Flash Attn,hf_kernels_flash_attn,flux_L256,1,1280,24,128,bfloat16,0.3215551972389221,0.3164159953594208,0.319487988948822,0.32051199674606323,5,90.62,flash-attn,hf-kernels
HF Kernels Flash Attn,hf_kernels_flash_attn,flux_L320,1,1344,24,128,bfloat16,0.3384703993797302,0.33670398592948914,0.33792001008987427,0.33983999490737915,5,95.06,flash-attn,hf-kernels
HF Kernels Flash Attn,hf_kernels_flash_attn,flux_L384,1,1408,24,128,bfloat16,0.3510208010673523,0.3481599986553192,0.3491840064525604,0.35225600004196167,5,99.88,flash-attn,hf-kernels
HF Kernels Flash Attn,hf_kernels_flash_attn,flux_L448,1,1472,24,128,bfloat16,0.3829823970794678,0.38095998764038086,0.3829759955406189,0.3840000033378601,5,103.81,flash-attn,hf-kernels
HF Kernels Flash Attn,hf_kernels_flash_attn,flux_L512,1,1536,24,128,bfloat16,0.4259391903877258,0.4227519929409027,0.4249599874019623,0.4259839951992035,5,109.12,flash-attn,hf-kernels
HF Kernels Flash Attn3,hf_kernels_flash_attn3,flux_L128,1,1152,24,128,bfloat16,0.2755008041858673,0.26736000180244446,0.27561599016189575,0.27955201268196106,5,83.38,flash-attn3,hf-kernels
HF Kernels Flash Attn3,hf_kernels_flash_attn3,flux_L256,1,1280,24,128,bfloat16,0.3397440016269684,0.3368000090122223,0.3399679958820343,0.34191998839378357,5,90.62,flash-attn3,hf-kernels
HF Kernels Flash Attn3,hf_kernels_flash_attn3,flux_L320,1,1344,24,128,bfloat16,0.36019839644432067,0.3563520014286041,0.3604480028152466,0.36137598752975464,5,95.06,flash-attn3,hf-kernels
HF Kernels Flash Attn3,hf_kernels_flash_attn3,flux_L384,1,1408,24,128,bfloat16,0.37342079877853396,0.3718400001525879,0.37379199266433716,0.3746879994869232,5,99.88,flash-attn3,hf-kernels
HF Kernels Flash Attn3,hf_kernels_flash_attn3,flux_L448,1,1472,24,128,bfloat16,0.4024448037147522,0.3993600010871887,0.4014720022678375,0.4034560024738312,5,103.81,flash-attn3,hf-kernels
HF Kernels Flash Attn3,hf_kernels_flash_attn3,flux_L512,1,1536,24,128,bfloat16,0.4305088043212891,0.4270080029964447,0.4291520118713379,0.4331519901752472,5,109.12,flash-attn3,hf-kernels