icarus112 committed on
Commit
19dee83
·
verified ·
1 Parent(s): 278f184

fix(htm): set NON_PORTABLE_CLUSTER_SIZE_ALLOWED=1 for cluster_size=16

Browse files
Files changed (1) hide show
  1. overlay/htm_rust/src/gpu/fused.rs +12 -0
overlay/htm_rust/src/gpu/fused.rs CHANGED
@@ -244,6 +244,18 @@ impl FusedState {
244
  result::module::get_function(module, CString::new("htm_fused_step_batched").unwrap())
245
  }?;
246
 
 
 
 
 
 
 
 
 
 
 
 
 
247
  // Probe SM count.
248
  let sm_count = match dev.attribute(
249
  cudarc::driver::sys::CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
 
244
  result::module::get_function(module, CString::new("htm_fused_step_batched").unwrap())
245
  }?;
246
 
247
+ // Cluster size 16 on Hopper is "non-portable" (> 8 requires opt-in).
248
+ // Must set CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED=1 on
249
+ // every launched kernel function, otherwise cuLaunchKernelEx rejects
250
+ // the cluster dim with CUDA_ERROR_INVALID_CLUSTER_SIZE.
251
+ unsafe {
252
+ let attr = sys::CUfunction_attribute::CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED;
253
+ // Ignore errors: older CUDA may lack the attribute, in which case
254
+ // only portable sizes (<= 8) work — plan_fused_launch caps at 8.
255
+ let _ = sys::lib().cuFuncSetAttribute(function, attr, 1);
256
+ let _ = sys::lib().cuFuncSetAttribute(function_batched, attr, 1);
257
+ }
258
+
259
  // Probe SM count.
260
  let sm_count = match dev.attribute(
261
  cudarc::driver::sys::CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,