#pragma once #include "tensor.hpp" #include #include #include #include #include namespace newnet { namespace backend { // =========================================================================== // BACKEND: All math operations live here. // // TODAY: Multi-threaded CPU (pthreads) // TOMORROW: Replace this file with SYCL/AdaptiveCpp kernels. // Neural network code (layers, graph, optimizer) does NOT change. // // To port to GPU: // 1. Copy this file to backend_sycl.hpp // 2. Replace each function body with a SYCL kernel // 3. Include backend_sycl.hpp instead of backend.hpp // Done. Everything else stays the same. // =========================================================================== // --- Matrix multiply: C = A * B --- // A is [M x K], B is [K x N], C is [M x N] inline void matmul(const float* A, const float* B, float* C, int M, int N, int K) { // Zero output first std::fill(C, C + M * N, 0.0f); int num_threads = std::thread::hardware_concurrency(); if (num_threads == 0) num_threads = 4; // For small matrices, single-threaded (thread overhead > compute) if (M * N * K < 50000) { for (int i = 0; i < M; i++) { for (int k = 0; k < K; k++) { float a_ik = A[i * K + k]; for (int j = 0; j < N; j++) { C[i * N + j] += a_ik * B[k * N + j]; } } } return; } // Multi-threaded: partition rows of output across cores // No locks needed — each thread writes to different rows std::vector threads; int rows_per_thread = M / num_threads; int remainder = M % num_threads; int row_start = 0; for (int t = 0; t < num_threads; t++) { int row_end = row_start + rows_per_thread + (t < remainder ? 1 : 0); threads.emplace_back([A, B, C, N, K, row_start, row_end]() { for (int i = row_start; i < row_end; i++) { for (int k = 0; k < K; k++) { float a_ik = A[i * K + k]; for (int j = 0; j < N; j++) { C[i * N + j] += a_ik * B[k * N + j]; } } } }); row_start = row_end; } for (auto& t : threads) t.join(); } // --- Element-wise add: C = A + B --- inline void add(const float* A, const float* B, float* C, int size) { for (int i = 0; i < size; i++) { C[i] = A[i] + B[i]; } } // --- Element-wise multiply: C = A * B (Hadamard product) --- inline void multiply(const float* A, const float* B, float* C, int size) { for (int i = 0; i < size; i++) { C[i] = A[i] * B[i]; } } // --- Scale: C = A * scalar --- inline void scale(const float* A, float scalar, float* C, int size) { for (int i = 0; i < size; i++) { C[i] = A[i] * scalar; } } // --- ReLU forward: out = max(0, in) --- inline void relu_forward(const float* in, float* out, int size) { for (int i = 0; i < size; i++) { out[i] = in[i] > 0.0f ? in[i] : 0.0f; } } // --- ReLU backward: grad_in = grad_out * (input > 0 ? 1 : 0) --- inline void relu_backward(const float* grad_out, const float* input, float* grad_in, int size) { for (int i = 0; i < size; i++) { grad_in[i] = input[i] > 0.0f ? grad_out[i] : 0.0f; } } // --- Sigmoid forward: out = 1 / (1 + exp(-in)) --- inline void sigmoid_forward(const float* in, float* out, int size) { for (int i = 0; i < size; i++) { out[i] = 1.0f / (1.0f + std::exp(-in[i])); } } // --- Sigmoid backward: grad_in = grad_out * out * (1 - out) --- inline void sigmoid_backward(const float* grad_out, const float* output, float* grad_in, int size) { for (int i = 0; i < size; i++) { grad_in[i] = grad_out[i] * output[i] * (1.0f - output[i]); } } // --- Sum columns: for each column j, sum all rows → result is [1 x N] --- inline void sum_columns(const float* A, float* out, int M, int N) { std::fill(out, out + N, 0.0f); for (int i = 0; i < M; i++) { for (int j = 0; j < N; j++) { out[j] += A[i * N + j]; } } } // --- Transpose: B = A^T --- // A is [M x N], B is [N x M] inline void transpose(const float* A, float* B, int M, int N) { for (int i = 0; i < M; i++) { for (int j = 0; j < N; j++) { B[j * M + i] = A[i * N + j]; } } } }} // namespace newnet::backend