newnet / core /backend.hpp

Upload core/backend.hpp

0a73bb1 verified 18 days ago

4.61 kB

	#pragma once
	#include "tensor.hpp"
	#include <thread>
	#include <vector>
	#include <cmath>
	#include <algorithm>
	#include <functional>

	namespace newnet { namespace backend {

	// ===========================================================================
	// BACKEND: All math operations live here.
	//
	// TODAY: Multi-threaded CPU (std::thread; currently only matmul is parallelized)
	// TOMORROW: Replace this file with SYCL/AdaptiveCpp kernels.
	// Neural network code (layers, graph, optimizer) does NOT change.
	//
	// To port to GPU:
	// 1. Copy this file to backend_sycl.hpp
	// 2. Replace each function body with a SYCL kernel
	// 3. Include backend_sycl.hpp instead of backend.hpp
	// Done. Everything else stays the same.
	// ===========================================================================

	// --- Matrix multiply: C = A * B ---
	// A is [M x K], B is [K x N], C is [M x N]; all are indexed row-major.
	//
	// C is overwritten: it is zero-filled first and then accumulated into, so it
	// must not overlap A or B. If M or N is non-positive nothing is written;
	// if K is non-positive C is left zero-filled.
	//
	// Small problems run on the calling thread. Larger ones split the rows of C
	// across at most min(hardware threads, M) workers; no locks are needed
	// because each worker writes a disjoint range of rows.
	inline void matmul(const float* A, const float* B, float* C,
	                   int M, int N, int K) {
	    if (M <= 0 || N <= 0) return;  // empty output: nothing to write

	    // Index with std::size_t so that i * N + j cannot overflow int on
	    // large matrices.
	    const std::size_t rows = static_cast<std::size_t>(M);
	    const std::size_t cols = static_cast<std::size_t>(N);
	    const std::size_t inner = K > 0 ? static_cast<std::size_t>(K) : 0;

	    // Zero output first; the kernel below accumulates into it.
	    std::fill(C, C + rows * cols, 0.0f);
	    if (inner == 0) return;

	    // Computes rows [row_begin, row_end) of C. The i-k-j loop order keeps
	    // the innermost loop walking B and C contiguously.
	    const auto multiply_rows = [A, B, C, cols, inner](std::size_t row_begin,
	                                                      std::size_t row_end) {
	        for (std::size_t i = row_begin; i < row_end; ++i) {
	            float* c_row = C + i * cols;
	            for (std::size_t k = 0; k < inner; ++k) {
	                const float a_ik = A[i * inner + k];
	                const float* b_row = B + k * cols;
	                for (std::size_t j = 0; j < cols; ++j) {
	                    c_row[j] += a_ik * b_row[j];
	                }
	            }
	        }
	    };

	    // For small matrices, single-threaded (thread overhead > compute).
	    // The product is evaluated in 64 bits and in two steps so that it can
	    // never overflow: out_elems * K is only formed when out_elems < 50000.
	    const long long small_work_limit = 50000;
	    const long long out_elems = static_cast<long long>(M) * N;
	    if (out_elems < small_work_limit && out_elems * K < small_work_limit) {
	        multiply_rows(0, rows);
	        return;
	    }

	    unsigned hw_threads = std::thread::hardware_concurrency();
	    if (hw_threads == 0) hw_threads = 4;  // count unavailable: use a fallback

	    // Never use more workers than there are rows; extra ones would be idle.
	    const std::size_t workers = std::min<std::size_t>(hw_threads, rows);
	    const std::size_t base_rows = rows / workers;
	    const std::size_t extra_rows = rows % workers;  // first chunks get +1 row

	    std::vector<std::thread> threads;
	    threads.reserve(workers - 1);

	    std::size_t row_begin = 0;
	    try {
	        // Spawn workers - 1 threads; the calling thread handles the last chunk.
	        for (std::size_t w = 0; w + 1 < workers; ++w) {
	            const std::size_t row_end =
	                row_begin + base_rows + (w < extra_rows ? 1 : 0);
	            threads.emplace_back(multiply_rows, row_begin, row_end);
	            row_begin = row_end;
	        }
	        multiply_rows(row_begin, rows);
	    } catch (...) {
	        // A failed thread launch must not destroy joinable threads
	        // (that would call std::terminate): join them, then propagate.
	        for (auto& t : threads) t.join();
	        throw;
	    }

	    for (auto& t : threads) t.join();
	}

	// --- Element-wise add: C[i] = A[i] + B[i] for i in [0, size) ---
	// Elements are processed in increasing index order; a non-positive size
	// does nothing.
	inline void add(const float* A, const float* B, float* C, int size) {
	    int idx = 0;
	    while (idx < size) {
	        const float lhs = A[idx];
	        const float rhs = B[idx];
	        C[idx] = lhs + rhs;
	        ++idx;
	    }
	}

	// --- Element-wise multiply (Hadamard product): C[i] = A[i] * B[i] ---
	// Walks the three arrays front to back for `size` elements; a non-positive
	// size does nothing.
	inline void multiply(const float* A, const float* B, float* C, int size) {
	    for (int remaining = size; remaining > 0; --remaining) {
	        const float product = (*A) * (*B);
	        *C = product;
	        ++A;
	        ++B;
	        ++C;
	    }
	}

	// --- Scale: C[i] = A[i] * scalar for i in [0, size) ---
	// A non-positive size does nothing.
	inline void scale(const float* A, float scalar, float* C, int size) {
	    const auto scaled = [scalar](float value) { return value * scalar; };
	    for (int pos = 0; pos < size; ++pos) {
	        C[pos] = scaled(A[pos]);
	    }
	}

	// --- ReLU forward: out[i] = in[i] if in[i] > 0, otherwise 0 ---
	// Any input that does not compare greater than zero (negatives, zero, NaN)
	// produces 0.0f.
	inline void relu_forward(const float* in, float* out, int size) {
	    for (int idx = 0; idx < size; ++idx) {
	        const float x = in[idx];
	        float activated = 0.0f;
	        if (x > 0.0f) {
	            activated = x;
	        }
	        out[idx] = activated;
	    }
	}

	// --- ReLU backward: grad_in[i] = grad_out[i] where input[i] > 0, else 0 ---
	// `input` is the value that was fed to the forward pass; the gradient is
	// passed through only for strictly positive inputs.
	inline void relu_backward(const float* grad_out, const float* input,
	                          float* grad_in, int size) {
	    for (int idx = 0; idx < size; ++idx) {
	        if (!(input[idx] > 0.0f)) {
	            grad_in[idx] = 0.0f;  // inactive unit: gradient is blocked
	            continue;
	        }
	        grad_in[idx] = grad_out[idx];
	    }
	}

	// --- Sigmoid forward: out[i] = 1 / (1 + exp(-in[i])) ---
	// Evaluated element by element in single precision.
	inline void sigmoid_forward(const float* in, float* out, int size) {
	    int idx = 0;
	    while (idx < size) {
	        const float decay = std::exp(-in[idx]);
	        out[idx] = 1.0f / (1.0f + decay);
	        ++idx;
	    }
	}

	// --- Sigmoid backward: grad_in[i] = grad_out[i] * y * (1 - y) ---
	// where y = output[i] is the value the forward pass produced.
	inline void sigmoid_backward(const float* grad_out, const float* output,
	                             float* grad_in, int size) {
	    for (int idx = 0; idx < size; ++idx) {
	        const float y = output[idx];
	        const float one_minus_y = 1.0f - y;
	        // Same left-to-right association as (grad_out * y) * (1 - y).
	        grad_in[idx] = grad_out[idx] * y * one_minus_y;
	    }
	}

	// --- Sum columns: out[j] = sum over rows i of A[i][j]; result is [1 x N] ---
	// A is [M x N], indexed row-major. `out` is zeroed first and rows are then
	// accumulated from the first to the last.
	inline void sum_columns(const float* A, float* out, int M, int N) {
	    std::fill(out, out + N, 0.0f);
	    const float* row = A;
	    for (int r = 0; r < M; ++r) {
	        for (int c = 0; c < N; ++c) {
	            out[c] += row[c];
	        }
	        row += N;  // advance to the start of the next row
	    }
	}

	// --- Transpose: B = A^T ---
	// A is [M x N], B is [N x M], both indexed row-major. A is read front to
	// back and each element is written to its mirrored position in B.
	inline void transpose(const float* A, float* B, int M, int N) {
	    const float* src = A;
	    for (int r = 0; r < M; ++r) {
	        for (int c = 0; c < N; ++c) {
	            B[c * M + r] = *src;
	            ++src;
	        }
	    }
	}

	}} // namespace newnet::backend