| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| #include <hip/hip_runtime.h> |
| #include <Python.h> |
| #include <vector> |
| #include <iostream> |
| #include <string> |
| #include <cstdint> |
|
|
| |
| #define HIP_SAFE_CALL(call) do { \ |
| hipError_t err = (call); \ |
| if (err != hipSuccess) { \ |
| const char* errStr = hipGetErrorString(err); \ |
| PyErr_Format(PyExc_RuntimeError, "HIP Error: %s at %s:%d", errStr, __FILE__, __LINE__); \ |
| return NULL; \ |
| } \ |
| } while(0) |
|
|
| #define HIP_SAFE_CALL_VOID(call) do { \ |
| hipError_t err = (call); \ |
| if (err != hipSuccess) { \ |
| fprintf(stderr, "HIP Error: %s at %s:%d\n", hipGetErrorString(err), __FILE__, __LINE__); \ |
| } \ |
| } while(0) |
|
|
| |
| static PyObject* get_hardware_info(PyObject* self, PyObject* args) { |
| int deviceId = 0; |
| hipError_t err = hipGetDevice(&deviceId); |
| if (err != hipSuccess) { |
| return PyUnicode_FromString("AMD ROCm (Device Not Found)"); |
| } |
|
|
| hipDeviceProp_t prop; |
| err = hipGetDeviceProperties(&prop, deviceId); |
| if (err != hipSuccess) { |
| return PyUnicode_FromString("AMD ROCm (Properties Unavailable)"); |
| } |
|
|
| |
| std::string info = std::string(prop.name) + " [Arch " + |
| std::to_string(prop.major) + "." + std::to_string(prop.minor) + ", " + |
| std::to_string(prop.totalGlobalMem / (1024*1024)) + " MB VRAM]"; |
| |
| return PyUnicode_FromString(info.c_str()); |
| } |
|
|
| |
| |
| |
| static int32_t *d_rocm_base = nullptr; |
| static int32_t *d_rocm_check = nullptr; |
| static int32_t *d_rocm_values = nullptr; |
| static uint32_t rocm_trie_size = 0; |
| static bool rocm_loaded = false; |
| static bool rocm_initialized = false; |
|
|
| |
| static void cleanup_rocm_memory(void) { |
| if (d_rocm_base) { hipFree(d_rocm_base); d_rocm_base = nullptr; } |
| if (d_rocm_check) { hipFree(d_rocm_check); d_rocm_check = nullptr; } |
| if (d_rocm_values) { hipFree(d_rocm_values); d_rocm_values = nullptr; } |
| rocm_loaded = false; |
| rocm_trie_size = 0; |
| } |
|
|
| |
| |
| |
| __global__ void tokenize_kernel_hip( |
| const int32_t* __restrict__ base, // Cached in L1 Texture Cache |
| const int32_t* __restrict__ check, // Cached in L1 Texture Cache |
| const int32_t* __restrict__ values, // Cached in L1 Texture Cache |
| const char* __restrict__ text_pool, // Massive contiguous char buffer |
| const int* __restrict__ offsets, // Start/End indices for each string |
| int* out_tokens, // Flattened Output Buffer |
| int* out_counts, // Token count per sentence |
| int n_sentences, |
| int max_capacity, // Hard limit on tokens per sequence (e.g., 2048) |
| uint32_t trie_sz |
| ) { |
| |
| |
| int idx = blockIdx.x * blockDim.x + threadIdx.x; |
| |
| |
| if (idx >= n_sentences) return; |
|
|
| |
| |
| int start = offsets[idx]; |
| int end = offsets[idx+1]; |
| int len = end - start; |
| |
| |
| |
| |
| int count = 0; |
| int write_ptr = idx * max_capacity; |
|
|
| int pos = 0; |
| |
| |
| |
| while (pos < len && count < max_capacity) { |
| int best_token = 1; |
| int best_len = 0; |
| int curr = 0; |
| |
| |
| |
| |
| for (int i = pos; i < len && i < pos + 128; ++i) { |
| unsigned char c = (unsigned char)text_pool[start + i]; |
| |
| |
| |
| int next = base[curr] + c; |
| |
| |
| if (next >= 0 && (uint32_t)next < trie_sz && check[next] == curr) { |
| curr = next; |
| |
| |
| int val = values[curr]; |
| |
| if (val != -1) { |
| best_token = val; |
| best_len = (i - pos) + 1; |
| } |
| } else { |
| break; |
| } |
| } |
| |
| |
| out_tokens[write_ptr + count] = best_token; |
| count++; |
| pos += (best_len > 0) ? best_len : 1; |
| } |
| |
| |
| out_counts[idx] = count; |
| } |
|
|
| |
| static PyObject* init_rocm_device(void) { |
| if (rocm_initialized) { |
| Py_RETURN_TRUE; |
| } |
| |
| int device_count = 0; |
| hipError_t err = hipGetDeviceCount(&device_count); |
| if (err != hipSuccess || device_count == 0) { |
| PyErr_SetString(PyExc_RuntimeError, "No ROCm/HIP devices available"); |
| return NULL; |
| } |
| |
| |
| err = hipSetDevice(0); |
| if (err != hipSuccess) { |
| PyErr_Format(PyExc_RuntimeError, "Failed to set HIP device: %s", hipGetErrorString(err)); |
| return NULL; |
| } |
| |
| |
| void* dummy = nullptr; |
| err = hipMalloc(&dummy, 1); |
| if (err != hipSuccess) { |
| PyErr_Format(PyExc_RuntimeError, "Failed to initialize HIP context: %s", hipGetErrorString(err)); |
| return NULL; |
| } |
| hipFree(dummy); |
| |
| rocm_initialized = true; |
| Py_RETURN_TRUE; |
| } |
|
|
| |
| |
| static PyObject* load_rocm(PyObject* self, PyObject* args) { |
| PyObject* py_bytes; |
| if (!PyArg_ParseTuple(args, "O", &py_bytes)) return NULL; |
| |
| if (!PyBytes_Check(py_bytes)) { |
| PyErr_SetString(PyExc_TypeError, "Expected bytes object"); |
| return NULL; |
| } |
|
|
| |
| if (!rocm_initialized) { |
| PyObject* init_result = init_rocm_device(); |
| if (init_result == NULL) { |
| return NULL; |
| } |
| Py_DECREF(init_result); |
| } |
|
|
| |
| Py_ssize_t total_len = PyBytes_Size(py_bytes); |
| if (total_len < 12) { |
| PyErr_SetString(PyExc_ValueError, "DAT file too small (< 12 bytes)"); |
| return NULL; |
| } |
|
|
| const char* raw = PyBytes_AsString(py_bytes); |
| |
| |
| uint32_t sz = 0; |
| memcpy(&sz, raw + 8, sizeof(uint32_t)); |
| |
| |
| if (sz == 0) { |
| PyErr_SetString(PyExc_ValueError, "Trie size is 0"); |
| return NULL; |
| } |
| if (sz > (1u << 24)) { |
| PyErr_SetString(PyExc_ValueError, "Trie size exceeds maximum (16M entries)"); |
| return NULL; |
| } |
|
|
| size_t array_bytes = sz * sizeof(int32_t); |
| size_t required_bytes = 12 + (array_bytes * 3); |
| |
| if ((size_t)total_len < required_bytes) { |
| PyErr_Format(PyExc_ValueError, |
| "DAT file incomplete. Need %zu bytes, got %zd", |
| required_bytes, total_len); |
| return NULL; |
| } |
|
|
| |
| cleanup_rocm_memory(); |
|
|
| |
| hipError_t err; |
| |
| err = hipMalloc((void**)&d_rocm_base, array_bytes); |
| if (err != hipSuccess) { |
| cleanup_rocm_memory(); |
| PyErr_Format(PyExc_RuntimeError, "hipMalloc d_rocm_base failed: %s", hipGetErrorString(err)); |
| return NULL; |
| } |
| |
| err = hipMalloc((void**)&d_rocm_check, array_bytes); |
| if (err != hipSuccess) { |
| cleanup_rocm_memory(); |
| PyErr_Format(PyExc_RuntimeError, "hipMalloc d_rocm_check failed: %s", hipGetErrorString(err)); |
| return NULL; |
| } |
|
|
| err = hipMalloc((void**)&d_rocm_values, array_bytes); |
| if (err != hipSuccess) { |
| cleanup_rocm_memory(); |
| PyErr_Format(PyExc_RuntimeError, "hipMalloc d_rocm_values failed: %s", hipGetErrorString(err)); |
| return NULL; |
| } |
|
|
| |
| const char* data_ptr = raw + 12; |
| |
| err = hipMemcpy(d_rocm_base, data_ptr, array_bytes, hipMemcpyHostToDevice); |
| if (err != hipSuccess) { |
| cleanup_rocm_memory(); |
| PyErr_Format(PyExc_RuntimeError, "hipMemcpy d_rocm_base failed: %s", hipGetErrorString(err)); |
| return NULL; |
| } |
| |
| err = hipMemcpy(d_rocm_check, data_ptr + array_bytes, array_bytes, hipMemcpyHostToDevice); |
| if (err != hipSuccess) { |
| cleanup_rocm_memory(); |
| PyErr_Format(PyExc_RuntimeError, "hipMemcpy d_rocm_check failed: %s", hipGetErrorString(err)); |
| return NULL; |
| } |
| |
| err = hipMemcpy(d_rocm_values, data_ptr + (array_bytes * 2), array_bytes, hipMemcpyHostToDevice); |
| if (err != hipSuccess) { |
| cleanup_rocm_memory(); |
| PyErr_Format(PyExc_RuntimeError, "hipMemcpy d_rocm_values failed: %s", hipGetErrorString(err)); |
| return NULL; |
| } |
| |
| |
| err = hipDeviceSynchronize(); |
| if (err != hipSuccess) { |
| cleanup_rocm_memory(); |
| PyErr_Format(PyExc_RuntimeError, "hipDeviceSynchronize failed: %s", hipGetErrorString(err)); |
| return NULL; |
| } |
| |
| rocm_trie_size = sz; |
| rocm_loaded = true; |
| |
| |
| char msg[256]; |
| snprintf(msg, sizeof(msg), "Loaded %u entries (%.2f MB) to AMD GPU", |
| sz, (array_bytes * 3) / (1024.0 * 1024.0)); |
| return PyUnicode_FromString(msg); |
| } |
|
|
| |
| |
| static PyObject* tokenize_batch_rocm(PyObject* self, PyObject* args) { |
| PyObject* list_obj; |
| if (!PyArg_ParseTuple(args, "O", &list_obj)) return NULL; |
| |
| if (!PyList_Check(list_obj)) { |
| PyErr_SetString(PyExc_TypeError, "Expected list of strings"); |
| return NULL; |
| } |
| |
| Py_ssize_t n = PyList_Size(list_obj); |
| if (n == 0) return PyList_New(0); |
|
|
| |
| if (!rocm_loaded || !d_rocm_base || !d_rocm_check || !d_rocm_values) { |
| PyErr_SetString(PyExc_RuntimeError, "ROCm engine not loaded. Call load_rocm() first."); |
| return NULL; |
| } |
|
|
| |
| |
| |
| std::vector<char> pool; |
| std::vector<int> offsets; |
| offsets.reserve(n + 1); |
| |
| size_t total_chars = 0; |
| for (Py_ssize_t i = 0; i < n; ++i) { |
| PyObject* s = PyList_GetItem(list_obj, i); |
| if (!PyUnicode_Check(s)) { |
| PyErr_SetString(PyExc_TypeError, "List must contain only strings"); |
| return NULL; |
| } |
| |
| Py_ssize_t len; |
| const char* p = PyUnicode_AsUTF8AndSize(s, &len); |
| if (!p) return NULL; |
| |
| offsets.push_back((int)total_chars); |
| pool.insert(pool.end(), p, p + len); |
| total_chars += len; |
| } |
| offsets.push_back((int)total_chars); |
|
|
| |
| size_t avg_len = total_chars / n; |
| int max_tok = (int)(avg_len * 2 + 64); |
| if (max_tok > 4096) max_tok = 4096; |
| if (max_tok < 64) max_tok = 64; |
|
|
| |
| char *d_text = nullptr; |
| int *d_offsets = nullptr, *d_out = nullptr, *d_counts = nullptr; |
| hipError_t err; |
| |
| err = hipMalloc((void**)&d_text, pool.size()); |
| if (err != hipSuccess) { |
| PyErr_Format(PyExc_RuntimeError, "hipMalloc d_text failed: %s", hipGetErrorString(err)); |
| return NULL; |
| } |
| |
| err = hipMalloc((void**)&d_offsets, offsets.size() * sizeof(int)); |
| if (err != hipSuccess) { |
| hipFree(d_text); |
| PyErr_Format(PyExc_RuntimeError, "hipMalloc d_offsets failed: %s", hipGetErrorString(err)); |
| return NULL; |
| } |
| |
| err = hipMalloc((void**)&d_out, n * max_tok * sizeof(int)); |
| if (err != hipSuccess) { |
| hipFree(d_text); hipFree(d_offsets); |
| PyErr_Format(PyExc_RuntimeError, "hipMalloc d_out failed: %s", hipGetErrorString(err)); |
| return NULL; |
| } |
| |
| err = hipMalloc((void**)&d_counts, n * sizeof(int)); |
| if (err != hipSuccess) { |
| hipFree(d_text); hipFree(d_offsets); hipFree(d_out); |
| PyErr_Format(PyExc_RuntimeError, "hipMalloc d_counts failed: %s", hipGetErrorString(err)); |
| return NULL; |
| } |
|
|
| |
| hipMemset(d_out, 0, n * max_tok * sizeof(int)); |
| hipMemset(d_counts, 0, n * sizeof(int)); |
|
|
| |
| hipMemcpy(d_text, pool.data(), pool.size(), hipMemcpyHostToDevice); |
| hipMemcpy(d_offsets, offsets.data(), offsets.size() * sizeof(int), hipMemcpyHostToDevice); |
|
|
| |
| |
| |
| int threads = 256; |
| int blocks = ((int)n + threads - 1) / threads; |
| |
| |
| hipLaunchKernelGGL(tokenize_kernel_hip, dim3(blocks), dim3(threads), 0, 0, |
| d_rocm_base, d_rocm_check, d_rocm_values, |
| d_text, d_offsets, d_out, d_counts, (int)n, max_tok, rocm_trie_size |
| ); |
|
|
| |
| err = hipGetLastError(); |
| if (err != hipSuccess) { |
| hipFree(d_text); hipFree(d_offsets); hipFree(d_out); hipFree(d_counts); |
| PyErr_Format(PyExc_RuntimeError, "Kernel launch failed: %s", hipGetErrorString(err)); |
| return NULL; |
| } |
|
|
| |
| err = hipDeviceSynchronize(); |
| if (err != hipSuccess) { |
| hipFree(d_text); hipFree(d_offsets); hipFree(d_out); hipFree(d_counts); |
| PyErr_Format(PyExc_RuntimeError, "Kernel execution failed: %s", hipGetErrorString(err)); |
| return NULL; |
| } |
|
|
| |
| std::vector<int> h_out(n * max_tok); |
| std::vector<int> h_counts(n); |
| |
| hipMemcpy(h_out.data(), d_out, h_out.size() * sizeof(int), hipMemcpyDeviceToHost); |
| hipMemcpy(h_counts.data(), d_counts, n * sizeof(int), hipMemcpyDeviceToHost); |
|
|
| |
| PyObject* result = PyList_New(n); |
| for (Py_ssize_t i = 0; i < n; ++i) { |
| int c = h_counts[i]; |
| PyObject* sub = PyList_New(c); |
| int row_ptr = (int)i * max_tok; |
| for (int k = 0; k < c; ++k) { |
| PyObject* val = PyLong_FromLong(h_out[row_ptr + k]); |
| PyList_SetItem(sub, k, val); |
| } |
| PyList_SetItem(result, i, sub); |
| } |
| |
| |
| hipFree(d_text); hipFree(d_offsets); hipFree(d_out); hipFree(d_counts); |
| |
| |
| PyObject* meta = PyDict_New(); |
| PyDict_SetItemString(meta, "sentences", PyLong_FromSsize_t(n)); |
| PyDict_SetItemString(meta, "max_tokens_per_sentence", PyLong_FromLong(max_tok)); |
| |
| PyObject* full_result = PyTuple_New(2); |
| PyTuple_SetItem(full_result, 0, result); |
| PyTuple_SetItem(full_result, 1, meta); |
| |
| return full_result; |
| } |
|
|
| |
| static void module_cleanup(void* module) { |
| cleanup_rocm_memory(); |
| } |
|
|
| |
| static PyMethodDef RocmMethods[] = { |
| {"load_rocm", load_rocm, METH_VARARGS, "Load DAT into AMD VRAM"}, |
| {"tokenize_batch_rocm", tokenize_batch_rocm, METH_VARARGS, "HIP Kernel Execute"}, |
| {"get_hardware_info", get_hardware_info, METH_VARARGS, "Get AMD GPU Telemetry"}, |
| {NULL, NULL, 0, NULL} |
| }; |
|
|
| static struct PyModuleDef rocm_module = { |
| PyModuleDef_HEAD_INIT, |
| "crayon_rocm", |
| "XERV Crayon AMD HIP Backend v4.3.0 - Production Grade", |
| -1, |
| RocmMethods, |
| NULL, NULL, NULL, |
| module_cleanup |
| }; |
|
|
| PyMODINIT_FUNC PyInit_crayon_rocm(void) { |
| return PyModule_Create(&rocm_module); |
| } |
|
|