| #pragma once |
| #include "tensor.hpp" |
| #include <thread> |
| #include <vector> |
| #include <cmath> |
| #include <algorithm> |
| #include <functional> |
|
|
| namespace newnet { namespace backend { |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
/// Dense matrix product C = A * B on row-major float buffers.
///
/// @param A  Left operand; element (i, k) is read from A[i * K + k].
/// @param B  Right operand; element (k, j) is read from B[k * N + j].
/// @param C  Output; element (i, j) is written to C[i * N + j]. The first
///           M * N floats are zeroed and then accumulated into, so any
///           previous contents are discarded.
/// @param M  Number of rows of A and C.
/// @param N  Number of columns of B and C.
/// @param K  Number of columns of A / rows of B.
///
/// Problems with M * N * K < 50000 are computed on the calling thread.
/// Larger ones split the rows of C into contiguous chunks, one per worker,
/// using at most std::thread::hardware_concurrency() workers (4 when that
/// reports 0) and never more workers than rows. Workers write disjoint rows
/// of C, so the only synchronization is the final join.
///
/// If M or N is not positive the function returns without touching C. If K
/// is not positive C is only zero-filled. If a worker thread cannot be
/// started, the already-running workers are joined and the exception is
/// rethrown.
inline void matmul(const float* A, const float* B, float* C,
                   int M, int N, int K) {
    // Offsets and element counts are formed in a 64-bit type so that large
    // matrices do not overflow `int` arithmetic.
    using wide = long long;
    constexpr wide kParallelThreshold = 50000;

    if (M <= 0 || N <= 0) return;

    const wide cells = static_cast<wide>(M) * N;
    std::fill(C, C + cells, 0.0f);

    // i-k-j kernel over the row range [row_begin, row_end): the innermost
    // loop walks a row of B and a row of C contiguously.
    const auto multiply_rows = [A, B, C, N, K](int row_begin, int row_end) {
        for (int i = row_begin; i < row_end; ++i) {
            const float* a_row = A + static_cast<wide>(i) * K;
            float* c_row = C + static_cast<wide>(i) * N;
            for (int k = 0; k < K; ++k) {
                const float a_ik = a_row[k];
                const float* b_row = B + static_cast<wide>(k) * N;
                for (int j = 0; j < N; ++j) {
                    c_row[j] += a_ik * b_row[j];
                }
            }
        }
    };

    // Equivalent to M * N * K < kParallelThreshold, but evaluated without
    // forming the triple product (which can exceed even 64 bits).
    const bool run_inline =
        K <= 0 || cells < (kParallelThreshold + K - 1) / K;
    if (run_inline) {
        multiply_rows(0, M);
        return;
    }

    unsigned hw_threads = std::thread::hardware_concurrency();
    if (hw_threads == 0) hw_threads = 4;

    // Never use more workers than there are rows to hand out.
    const int workers =
        static_cast<int>(std::min<wide>(static_cast<wide>(hw_threads), M));
    const int base_rows = M / workers;
    const int extra_rows = M % workers;  // first `extra_rows` chunks get +1 row

    std::vector<std::thread> threads;
    threads.reserve(static_cast<std::vector<std::thread>::size_type>(workers - 1));

    int row_begin = 0;
    try {
        // All chunks but the last go to spawned threads.
        for (int t = 0; t + 1 < workers; ++t) {
            const int row_end = row_begin + base_rows + (t < extra_rows ? 1 : 0);
            threads.emplace_back(multiply_rows, row_begin, row_end);
            row_begin = row_end;
        }
    } catch (...) {
        // Destroying a joinable std::thread calls std::terminate, so join
        // whatever was started before propagating the failure.
        for (auto& worker : threads) worker.join();
        throw;
    }

    // The calling thread handles the final chunk instead of idling in join().
    multiply_rows(row_begin, M);

    for (auto& worker : threads) worker.join();
}
|
|
| |
/// Element-wise sum: C[i] = A[i] + B[i] for every i in [0, size).
/// Does nothing when size is zero or negative.
inline void add(const float* A, const float* B, float* C, int size) {
    const float* lhs = A;
    const float* rhs = B;
    float* dst = C;
    // Walk the three buffers in lockstep, front to back.
    for (int remaining = size; remaining > 0; --remaining) {
        *dst = *lhs + *rhs;
        ++lhs;
        ++rhs;
        ++dst;
    }
}
|
|
| |
/// Element-wise (Hadamard) product: C[i] = A[i] * B[i] for i in [0, size).
/// Does nothing when size is zero or negative.
inline void multiply(const float* A, const float* B, float* C, int size) {
    int idx = 0;
    while (idx < size) {
        const float product = A[idx] * B[idx];
        C[idx] = product;
        ++idx;
    }
}
|
|
| |
/// Multiplies every element by a constant: C[i] = A[i] * scalar for
/// i in [0, size). Does nothing when size is zero or negative.
inline void scale(const float* A, float scalar, float* C, int size) {
    const float* src = A;
    float* dst = C;
    for (int left = size; left > 0; --left, ++src, ++dst) {
        *dst = *src * scalar;
    }
}
|
|
| |
/// ReLU activation: out[i] = in[i] when in[i] > 0, otherwise 0, for
/// i in [0, size). Any value that does not compare greater than zero
/// (including negative zero and NaN) maps to +0.
inline void relu_forward(const float* in, float* out, int size) {
    for (int pos = 0; pos < size; ++pos) {
        const float value = in[pos];
        if (value > 0.0f) {
            out[pos] = value;
        } else {
            out[pos] = 0.0f;
        }
    }
}
|
|
| |
/// ReLU gradient: grad_in[i] = grad_out[i] where input[i] > 0, and 0
/// elsewhere, for i in [0, size).
///
/// @param grad_out  Gradient with respect to the ReLU output.
/// @param input     Values the ReLU was applied to (the gate is input > 0).
/// @param grad_in   Receives the gradient with respect to the ReLU input.
/// @param size      Number of elements processed; nothing happens if <= 0.
inline void relu_backward(const float* grad_out, const float* input,
                          float* grad_in, int size) {
    int idx = 0;
    while (idx < size) {
        const bool active = input[idx] > 0.0f;
        if (active) {
            grad_in[idx] = grad_out[idx];
        } else {
            grad_in[idx] = 0.0f;
        }
        ++idx;
    }
}
|
|
| |
/// Logistic sigmoid: out[i] = 1 / (1 + exp(-in[i])) for i in [0, size).
/// Does nothing when size is zero or negative.
inline void sigmoid_forward(const float* in, float* out, int size) {
    const float* src = in;
    float* dst = out;
    for (int remaining = size; remaining > 0; --remaining) {
        const float decay = std::exp(-*src);
        *dst = 1.0f / (1.0f + decay);
        ++src;
        ++dst;
    }
}
|
|
| |
/// Sigmoid gradient expressed through the forward result:
/// grad_in[i] = grad_out[i] * output[i] * (1 - output[i]) for i in [0, size).
///
/// @param grad_out  Gradient with respect to the sigmoid output.
/// @param output    Values used as y in the y * (1 - y) factor.
/// @param grad_in   Receives the gradient with respect to the sigmoid input.
/// @param size      Number of elements processed; nothing happens if <= 0.
inline void sigmoid_backward(const float* grad_out, const float* output,
                             float* grad_in, int size) {
    for (int pos = 0; pos < size; ++pos) {
        const float y = output[pos];
        const float upstream = grad_out[pos];
        // Same left-to-right product as (g * y) * (1 - y).
        grad_in[pos] = upstream * y * (1.0f - y);
    }
}
|
|
| |
/// Column sums of a row-major matrix: out[j] = sum over i of A[i * N + j].
///
/// @param A    Input matrix; element (i, j) is read from A[i * N + j].
/// @param out  Receives N sums; overwritten (zeroed, then accumulated).
/// @param M    Number of rows. If M <= 0 the output is all zeros.
/// @param N    Number of columns. If N <= 0 nothing is read or written.
inline void sum_columns(const float* A, float* out, int M, int N) {
    // A non-positive N would hand std::fill an empty or inverted range.
    if (N <= 0) return;

    std::fill(out, out + N, 0.0f);
    for (int i = 0; i < M; ++i) {
        // Row offset is formed in 64 bits: i * N can exceed INT_MAX for
        // large matrices, and signed int overflow is undefined behaviour.
        const float* row = A + static_cast<long long>(i) * N;
        for (int j = 0; j < N; ++j) {
            out[j] += row[j];
        }
    }
}
|
|
| |
| |
/// Out-of-place transpose of a row-major matrix: B[j * M + i] = A[i * N + j].
///
/// @param A  Input, read as M rows by N columns.
/// @param B  Output, written as N rows by M columns. Each element of A is
///           read immediately before the matching element of B is written,
///           so passing overlapping buffers does not produce a transpose.
/// @param M  Number of rows of A. Nothing happens if M <= 0.
/// @param N  Number of columns of A. Nothing happens if N <= 0.
inline void transpose(const float* A, float* B, int M, int N) {
    // Offsets are formed in 64 bits: i * N and j * M can exceed INT_MAX for
    // large matrices, and signed int overflow is undefined behaviour.
    using wide = long long;
    for (int i = 0; i < M; ++i) {
        const float* src_row = A + static_cast<wide>(i) * N;
        for (int j = 0; j < N; ++j) {
            B[static_cast<wide>(j) * M + i] = src_row[j];
        }
    }
}
|
|
| }} |
|
|