// TaskCLIP / asic.c
#include <math.h>
#include <stddef.h>
// --------- activations ----------
static inline float relu_f(float x) {
    return (x > 0.0f) ? x : 0.0f;
}

// Numerically-stable sigmoid
static inline float sigmoid_f(float x) {
    if (x >= 0.0f) {
        float z = expf(-x);
        return 1.0f / (1.0f + z);
    } else {
        float z = expf(x);
        return z / (1.0f + z);
    }
}
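
// Why the split on sign: both branches pass a non-positive argument to
// expf(), so the exponential can only underflow toward 0.0f, never
// overflow to +inf. A naive 1.0f / (1.0f + expf(-x)) would overflow for
// very negative x (e.g. x = -100 gives expf(100) = inf) and raise
// floating-point overflow, even though the limit is well defined.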
// --------- linear layer ----------
// y[n][o] = sum_i x[n][i] * W[o][i] + b[o]
static void linear_forward(
    const float *x,   // [N, in_dim]
    float *y,         // [N, out_dim]
    const float *W,   // [out_dim, in_dim] row-major: W[o*in_dim + i]
    const float *b,   // [out_dim]; may be NULL for a bias-free layer
    int N,
    int in_dim,
    int out_dim
) {
    for (int n = 0; n < N; ++n) {
        const float *xn = x + (size_t)n * (size_t)in_dim;
        float *yn = y + (size_t)n * (size_t)out_dim;
        for (int o = 0; o < out_dim; ++o) {
            const float *Wo = W + (size_t)o * (size_t)in_dim;
            float acc = b ? b[o] : 0.0f;
            // dot(xn, Wo)
            for (int i = 0; i < in_dim; ++i) {
                acc += xn[i] * Wo[i];
            }
            yn[o] = acc;
        }
    }
}

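// Usage sketch (hypothetical 1x2 input through a 2 -> 3 layer; the weights
// and bias below are illustrative values, not model parameters):
void linear_forward_demo(void) {
    const float x[2] = { 1.0f, 2.0f };       // [N=1, in_dim=2]
    const float W[6] = { 0.5f, -1.0f,        // row 0
                         2.0f,  0.0f,        // row 1
                        -0.5f,  1.5f };      // row 2 => [out_dim=3, in_dim=2]
    const float b[3] = { 0.1f, 0.2f, 0.3f };
    float y[3];                              // [N=1, out_dim=3]
    linear_forward(x, y, W, b, 1, 2, 3);
    // Expected (up to float rounding):
    //   y[0] = 0.5*1 + (-1.0)*2 + 0.1 = -1.4
    //   y[1] = 2.0*1 +   0.0*2  + 0.2 =  2.2
    //   y[2] = -0.5*1 + 1.5*2   + 0.3 =  2.8
}
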
// In-place ReLU on a [N, dim] tensor
static void relu_inplace(float *x, int N, int dim) {
    size_t total = (size_t)N * (size_t)dim;
    for (size_t idx = 0; idx < total; ++idx) {
        x[idx] = relu_f(x[idx]);
    }
}

// In-place Sigmoid on a [N, dim] tensor
static void sigmoid_inplace(float *x, int N, int dim) {
    size_t total = (size_t)N * (size_t)dim;
    for (size_t idx = 0; idx < total; ++idx) {
        x[idx] = sigmoid_f(x[idx]);
    }
}

// --------- score tail ----------
//
// Implements: Linear(D -> D/2) -> ReLU -> Linear(D/2 -> D/8) -> ReLU
//             -> Linear(D/8 -> 1) + bias -> Sigmoid
//
// The buffer "out" is reused for input and output: it starts as [N, D]
// and ends as [N, 1].
//
// The caller must provide temporary buffers:
//   tmp1: [N, D/2]
//   tmp2: [N, D/8]
//
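// Per row x = out[n, :], the computed score is
//
//   s_n = sigmoid( W4 . relu( W3 . relu( W2 . x + b2 ) + b3 ) + b4 + bias_scalar )
//
// where "." denotes a row-major matrix-vector product.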
void score_tail_forward(
    float *out,          // IN: [N, D], OUT: [N, 1]
    int N,
    int D,               // must be divisible by 8
    // Linear2 params: (D -> D/2)
    const float *W2,     // [D/2, D]
    const float *b2,     // [D/2]
    // Linear3 params: (D/2 -> D/8)
    const float *W3,     // [D/8, D/2]
    const float *b3,     // [D/8]
    // Linear4 params: (D/8 -> 1)
    const float *W4,     // [1, D/8] (or just [D/8])
    const float *b4,     // [1] (optional; PyTorch Linear has a bias by default)
    float bias_scalar,   // the extra Parameter(torch.zeros(1)) scalar
    // workspaces
    float *tmp1,         // [N, D/2]
    float *tmp2          // [N, D/8]
) {
    const int D2 = D / 2;
    const int D8 = D / 8;
    // output = Linear2(output)
    linear_forward(out, tmp1, W2, b2, N, D, D2);
    // output = ReLU(output)
    relu_inplace(tmp1, N, D2);
    // output = Linear3(output)
    linear_forward(tmp1, tmp2, W3, b3, N, D2, D8);
    // output = ReLU(output)
    relu_inplace(tmp2, N, D8);
    // output = Linear4(output) + bias; Linear4 produces [N, 1].
    // Treat W4 as [1, D8] row-major => W4[i] for i in [0..D8-1].
    for (int n = 0; n < N; ++n) {
        const float *xn = tmp2 + (size_t)n * (size_t)D8;
        float acc = 0.0f;
        // Linear4 bias term (if present)
        if (b4) acc += b4[0];
        // dot(xn, W4[0, :])
        for (int i = 0; i < D8; ++i) {
            acc += xn[i] * W4[i];
        }
        // add the scalar bias parameter
        acc += bias_scalar;
        // write back into out as [N, 1]
        out[n] = acc;
    }
    // output = Sigmoid(output)
    sigmoid_inplace(out, N, 1);
}
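
// Usage sketch (hypothetical sizes, zero-filled placeholder weights rather
// than a trained checkpoint): with all-zero parameters every logit is 0,
// so every score comes out as sigmoid(0) = 0.5.
#include <stdlib.h>  // calloc/free for the demo only
int score_tail_demo(void) {
    const int N = 4, D = 32;                  // D divisible by 8
    const int D2 = D / 2, D8 = D / 8;
    float *out  = calloc((size_t)N * (size_t)D,   sizeof(float)); // input features
    float *tmp1 = calloc((size_t)N * (size_t)D2,  sizeof(float));
    float *tmp2 = calloc((size_t)N * (size_t)D8,  sizeof(float));
    float *W2   = calloc((size_t)D2 * (size_t)D,  sizeof(float));
    float *b2   = calloc((size_t)D2,              sizeof(float));
    float *W3   = calloc((size_t)D8 * (size_t)D2, sizeof(float));
    float *b3   = calloc((size_t)D8,              sizeof(float));
    float *W4   = calloc((size_t)D8,              sizeof(float));
    float *b4   = calloc(1,                       sizeof(float));
    int ok = out && tmp1 && tmp2 && W2 && b2 && W3 && b3 && W4 && b4;
    if (ok) {
        score_tail_forward(out, N, D, W2, b2, W3, b3, W4, b4, 0.0f, tmp1, tmp2);
        ok = (out[0] == 0.5f);                // sigmoid(0) == 0.5 exactly
    }
    free(b4); free(W4); free(b3); free(W3); free(b2); free(W2);
    free(tmp2); free(tmp1); free(out);
    return ok ? 0 : 1;
}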