#include #include // --------- activations ---------- static inline float relu_f(float x) { return (x > 0.0f) ? x : 0.0f; } // Numerically-stable sigmoid static inline float sigmoid_f(float x) { if (x >= 0.0f) { float z = expf(-x); return 1.0f / (1.0f + z); } else { float z = expf(x); return z / (1.0f + z); } } // --------- linear layer ---------- // y[n, out_dim] = sum_i x[n, in_dim] * W[out_dim, in_dim] + b[out_dim] static void linear_forward( const float *x, // [N, in_dim] float *y, // [N, out_dim] const float *W, // [out_dim, in_dim] row-major: W[o*in_dim + i] const float *b, // [out_dim] int N, int in_dim, int out_dim ) { for (int n = 0; n < N; ++n) { const float *xn = x + (size_t)n * (size_t)in_dim; float *yn = y + (size_t)n * (size_t)out_dim; for (int o = 0; o < out_dim; ++o) { const float *Wo = W + (size_t)o * (size_t)in_dim; float acc = b ? b[o] : 0.0f; // dot(xn, Wo) for (int i = 0; i < in_dim; ++i) { acc += xn[i] * Wo[i]; } yn[o] = acc; } } } // In-place ReLU on a [N, dim] tensor static void relu_inplace(float *x, int N, int dim) { size_t total = (size_t)N * (size_t)dim; for (size_t idx = 0; idx < total; ++idx) { x[idx] = relu_f(x[idx]); } } // In-place Sigmoid on a [N, dim] tensor static void sigmoid_inplace(float *x, int N, int dim) { size_t total = (size_t)N * (size_t)dim; for (size_t idx = 0; idx < total; ++idx) { x[idx] = sigmoid_f(x[idx]); } } // --------- the requested block ---------- // // input/output buffer "out" starts as [N, D] and ends as [N, 1]. 
// Caller must provide temporary buffers:
//   tmp1: [N, D/2]
//   tmp2: [N, D/8]
//
// Forward pass of the scoring tail, equivalent to:
//   out = sigmoid(Linear4(relu(Linear3(relu(Linear2(out))))) + bias_scalar)
// where bias_scalar corresponds to a learned Parameter(torch.zeros(1)).
void score_tail_forward(
    float *out,              // IN: [N, D], OUT: [N, 1]
    int N, int D,            // D must be divisible by 8
    // Linear2 params: (D -> D/2)
    const float *W2,         // [D/2, D]
    const float *b2,         // [D/2]
    // Linear3 params: (D/2 -> D/8)
    const float *W3,         // [D/8, D/2]
    const float *b3,         // [D/8]
    // Linear4 params: (D/8 -> 1)
    const float *W4,         // [1, D/8] row-major, i.e. just [D/8]
    const float *b4,         // [1], optional — NULL means no Linear4 bias
    float bias_scalar,       // extra learned scalar added before the sigmoid
    // workspaces
    float *tmp1,             // [N, D/2]
    float *tmp2              // [N, D/8]
) {
    const int half   = D / 2;
    const int eighth = D / 8;

    // Linear2 + ReLU: [N, D] -> [N, D/2]
    linear_forward(out, tmp1, W2, b2, N, D, half);
    relu_inplace(tmp1, N, half);

    // Linear3 + ReLU: [N, D/2] -> [N, D/8]
    linear_forward(tmp1, tmp2, W3, b3, N, half, eighth);
    relu_inplace(tmp2, N, eighth);

    // Linear4 + scalar bias: [N, D/8] -> [N, 1], written back into out.
    // W4 is treated as a flat [D/8] weight row.
    for (int row = 0; row < N; ++row) {
        const float *feat = tmp2 + (size_t)row * (size_t)eighth;
        float sum = b4 ? b4[0] : 0.0f;   // optional Linear4 bias
        for (int k = 0; k < eighth; ++k) {
            sum += feat[k] * W4[k];
        }
        out[row] = sum + bias_scalar;
    }

    // Final activation squashes each score into (0, 1).
    sigmoid_inplace(out, N, 1);
}