lhallee committed on
Commit
d72bb80
·
verified ·
1 Parent(s): aec951b

Upload entrypoint_setup.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. entrypoint_setup.py +22 -0
entrypoint_setup.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch._inductor.config as inductor_config
3
+ import torch._dynamo as dynamo
4
+
5
+ # Enable TensorFloat32 tensor cores for float32 matmul (Ampere+ GPUs)
6
+ # Provides significant speedup with minimal precision loss
7
+ torch.set_float32_matmul_precision('high')
8
+
9
+ # Enable TF32 for matrix multiplications and cuDNN operations
10
+ torch.backends.cuda.matmul.allow_tf32 = True
11
+ torch.backends.cudnn.allow_tf32 = True
12
+
13
+ # Enable cuDNN autotuner - finds fastest algorithms for your hardware
14
+ # Best when input sizes are consistent; may slow down first iterations
15
+ torch.backends.cudnn.benchmark = True
16
+
17
+ # Deterministic operations off for speed (set True if reproducibility needed)
18
+ torch.backends.cudnn.deterministic = False
19
+ inductor_config.max_autotune_gemm_backends = "ATEN,CUTLASS,FBGEMM"
20
+
21
+ dynamo.config.capture_scalar_outputs = True
22
+ torch._dynamo.config.recompile_limit = 64