Spaces:
Runtime error
Runtime error
fix(htm): set NON_PORTABLE_CLUSTER_SIZE_ALLOWED=1 for cluster_size=16
Browse files
overlay/htm_rust/src/gpu/fused.rs
CHANGED
|
@@ -244,6 +244,18 @@ impl FusedState {
|
|
| 244 |
result::module::get_function(module, CString::new("htm_fused_step_batched").unwrap())
|
| 245 |
}?;
|
| 246 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 247 |
// Probe SM count.
|
| 248 |
let sm_count = match dev.attribute(
|
| 249 |
cudarc::driver::sys::CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
|
|
|
|
| 244 |
result::module::get_function(module, CString::new("htm_fused_step_batched").unwrap())
|
| 245 |
}?;
|
| 246 |
|
| 247 |
+
// Cluster size 16 on Hopper is "non-portable" (> 8 requires opt-in).
|
| 248 |
+
// Must set CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED=1 on
|
| 249 |
+
// every launched kernel function, otherwise cuLaunchKernelEx rejects
|
| 250 |
+
// the cluster dim with CUDA_ERROR_INVALID_CLUSTER_SIZE.
|
| 251 |
+
unsafe {
|
| 252 |
+
let attr = sys::CUfunction_attribute::CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED;
|
| 253 |
+
// Ignore errors: older CUDA drivers may not support this attribute, in which case
|
| 254 |
+
// only portable sizes (<= 8) work — plan_fused_launch caps at 8.
|
| 255 |
+
let _ = sys::lib().cuFuncSetAttribute(function, attr, 1);
|
| 256 |
+
let _ = sys::lib().cuFuncSetAttribute(function_batched, attr, 1);
|
| 257 |
+
}
|
| 258 |
+
|
| 259 |
// Probe SM count.
|
| 260 |
let sm_count = match dev.attribute(
|
| 261 |
cudarc::driver::sys::CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
|