// newnet/core/backend.hpp
// (Hosting-page residue, kept as a comment: uploaded by notRaphael,
//  "Upload core/backend.hpp", commit 0a73bb1.)
#pragma once
#include "tensor.hpp"
#include <algorithm>
#include <cmath>
#include <functional>
#include <system_error>
#include <thread>
#include <vector>
namespace newnet { namespace backend {
// ===========================================================================
// BACKEND: All math operations live here.
//
// TODAY: CPU implementation; matmul is multi-threaded via std::thread.
// TOMORROW: Replace this file with SYCL/AdaptiveCpp kernels.
// Neural network code (layers, graph, optimizer) does NOT change.
//
// To port to GPU:
// 1. Copy this file to backend_sycl.hpp
// 2. Replace each function body with a SYCL kernel
// 3. Include backend_sycl.hpp instead of backend.hpp
// Done. Everything else stays the same.
// ===========================================================================
// --- Matrix multiply: C = A * B ---
// A is [M x K], B is [K x N], C is [M x N]; all are indexed row-major.
//
// C is zero-filled before A and B are read, so C must not overlap either input.
// Non-positive M or N is a no-op. K <= 0 leaves C filled with zeros.
//
// Problems with M*N*K < 50000 run on the calling thread (thread start-up would
// cost more than the arithmetic). Larger problems split the rows of C into
// contiguous bands, one per worker. Each worker writes only its own rows, so
// no locking is needed. The calling thread computes the last band itself, and
// if a worker thread cannot be started the remaining rows are also computed on
// the calling thread instead of aborting.
inline void matmul(const float* A, const float* B, float* C,
                   int M, int N, int K) {
    // Nothing to write; also keeps the pointer arithmetic below well-defined.
    if (M <= 0 || N <= 0) return;

    // 64-bit sizes: M*N, M*N*K and i*K+k can exceed INT_MAX for large matrices.
    const long long rows = M;
    const long long cols = N;
    const long long inner = K > 0 ? K : 0;
    const long long cells = rows * cols;

    // Zero output first: the kernel below accumulates into C.
    std::fill(C, C + cells, 0.0f);
    if (inner == 0) return;

    // Computes rows [row_begin, row_end) of C. The i-k-j loop order walks B and
    // C along their rows, and each C element accumulates in increasing k.
    const auto multiply_rows = [A, B, C, cols, inner](long long row_begin,
                                                      long long row_end) {
        for (long long i = row_begin; i < row_end; ++i) {
            const float* a_row = A + i * inner;
            float* c_row = C + i * cols;
            for (long long k = 0; k < inner; ++k) {
                const float a_ik = a_row[k];
                const float* b_row = B + k * cols;
                for (long long j = 0; j < cols; ++j) {
                    c_row[j] += a_ik * b_row[j];
                }
            }
        }
    };

    // Equivalent to M*N*K < 50000, staged so the product cannot overflow.
    constexpr long long kSerialWorkLimit = 50000;
    if (cells < kSerialWorkLimit && cells * inner < kSerialWorkLimit) {
        multiply_rows(0, rows);
        return;
    }

    // hardware_concurrency() may report 0 when the value is unknown.
    const unsigned hw = std::thread::hardware_concurrency();
    long long workers = (hw == 0) ? 4 : static_cast<long long>(hw);
    // Never create more bands than there are rows to hand out.
    if (workers > rows) workers = rows;

    const long long base = rows / workers;
    const long long extra = rows % workers;  // first `extra` bands get one more row

    std::vector<std::thread> threads;
    threads.reserve(static_cast<std::size_t>(workers - 1));

    long long row_start = 0;
    for (long long t = 0; t + 1 < workers; ++t) {
        const long long row_end = row_start + base + (t < extra ? 1 : 0);
        try {
            threads.emplace_back(multiply_rows, row_start, row_end);
        } catch (const std::system_error&) {
            // Could not start another thread: stop spawning and let the
            // calling thread finish everything from row_start onward.
            break;
        }
        row_start = row_end;
    }

    // Last band (plus any rows left over after a failed spawn).
    multiply_rows(row_start, rows);

    for (auto& worker : threads) worker.join();
}
// --- Element-wise add: C[i] = A[i] + B[i] for i in [0, size) ---
// Elements are processed in increasing index order; size <= 0 does nothing.
inline void add(const float* A, const float* B, float* C, int size) {
    int idx = 0;
    while (idx < size) {
        const float lhs = A[idx];
        const float rhs = B[idx];
        C[idx] = lhs + rhs;
        ++idx;
    }
}
// --- Element-wise multiply (Hadamard product): C[i] = A[i] * B[i] ---
// Walks the three arrays front to back; size <= 0 does nothing.
inline void multiply(const float* A, const float* B, float* C, int size) {
    if (size <= 0) return;

    const float* lhs = A;
    const float* rhs = B;
    float* const dst_end = C + size;
    for (float* dst = C; dst != dst_end; ++dst, ++lhs, ++rhs) {
        *dst = (*lhs) * (*rhs);
    }
}
// --- Scale: C[i] = A[i] * scalar for i in [0, size) ---
// Counts down the number of elements left while stepping forward through
// both arrays; size <= 0 does nothing.
inline void scale(const float* A, float scalar, float* C, int size) {
    const float* src = A;
    float* dst = C;
    for (int remaining = size; remaining > 0; --remaining) {
        *dst = *src * scalar;
        ++src;
        ++dst;
    }
}
// --- ReLU forward: out[i] = in[i] if in[i] > 0, otherwise 0 ---
// Anything that does not compare greater than zero (negatives, zeros, NaN)
// is written as +0.0f.
inline void relu_forward(const float* in, float* out, int size) {
    for (int pos = 0; pos < size; ++pos) {
        const float value = in[pos];
        if (value > 0.0f) {
            out[pos] = value;
        } else {
            out[pos] = 0.0f;
        }
    }
}
// --- ReLU backward: pass the gradient through where the input was positive ---
// grad_in[i] = grad_out[i] when input[i] > 0, otherwise 0.
inline void relu_backward(const float* grad_out, const float* input,
                          float* grad_in, int size) {
    int pos = 0;
    while (pos < size) {
        float passed = 0.0f;  // gradient is blocked unless the unit was active
        if (input[pos] > 0.0f) {
            passed = grad_out[pos];
        }
        grad_in[pos] = passed;
        ++pos;
    }
}
// --- Sigmoid forward: out[i] = 1 / (1 + exp(-in[i])) ---
// Evaluated in single precision using the float overload of std::exp.
inline void sigmoid_forward(const float* in, float* out, int size) {
    for (int pos = 0; pos < size; ++pos) {
        const float decay = std::exp(-in[pos]);
        const float denom = 1.0f + decay;
        out[pos] = 1.0f / denom;
    }
}
// --- Sigmoid backward: grad_in[i] = grad_out[i] * s * (1 - s) ---
// where s = output[i] is the value the forward pass produced.
inline void sigmoid_backward(const float* grad_out, const float* output,
                             float* grad_in, int size) {
    int pos = 0;
    while (pos < size) {
        const float s = output[pos];
        const float upstream_times_s = grad_out[pos] * s;
        grad_in[pos] = upstream_times_s * (1.0f - s);
        ++pos;
    }
}
// --- Sum columns: out[j] = sum over rows i of A[i][j] → result is [1 x N] ---
// A is [M x N], indexed row-major. out is cleared first, then each row of A
// is added into it in order from the first row to the last.
inline void sum_columns(const float* A, float* out, int M, int N) {
    std::fill(out, out + N, 0.0f);

    const float* row = A;
    for (int r = 0; r < M; ++r, row += N) {
        for (int c = 0; c < N; ++c) {
            out[c] += row[c];
        }
    }
}
// --- Transpose: B = A^T ---
// A is [M x N], B is [N x M], both indexed row-major.
// A is read front to back; element (r, c) of A lands at (c, r) of B.
inline void transpose(const float* A, float* B, int M, int N) {
    const float* src = A;
    for (int r = 0; r < M; ++r) {
        for (int c = 0; c < N; ++c) {
            B[c * M + r] = *src;
            ++src;
        }
    }
}
}} // namespace newnet::backend