notRaphael
/

newnet

ml-intern

Model card Files Files and versions

xet

Community

notRaphael committed 18 days ago

Commit

cb41883

verified ·

1 Parent(s): 7f7a754

Upload layers/dense.hpp

Browse files

Files changed (1) hide show

layers/dense.hpp +171 -0

layers/dense.hpp ADDED Viewed

	@@ -0,0 +1,171 @@

+#pragma once
+#include "layer.hpp"
+#include "../core/backend.hpp"
+#include <cassert>
+#include <string>
+#include <utility>
+#include <vector>
+namespace newnet {
+// Fully connected layer: output = activation(input * weights + bias).
+//
+// forward() stores its input, pre-activation and output on the instance, and
+// backward() reads those cached values, so backward() always refers to the most
+// recent forward() call made on the same object.
+class Dense : public Layer {
+public:
+    Tensor weights;         // [in_features x out_features]
+    Tensor bias;            // [1 x out_features]
+    std::string activation; // "relu" or "sigmoid"; any other value (e.g. "none") applies no activation
+    // Cached from forward pass — needed by backward (chain rule requires these values)
+    Tensor cached_input;    // the input x that was passed to forward()
+    Tensor cached_preact;   // result before activation: x*W + b (read by the ReLU backward)
+    Tensor cached_output;   // result after activation (read by the sigmoid backward)
+    // Builds a layer mapping in_features columns to out_features columns.
+    //
+    //   in_features:  number of columns forward() expects in its input
+    //   out_features: number of columns forward() produces
+    //   activation_:  "relu", "sigmoid" or "none"; unrecognized names are not
+    //                 rejected and behave like "none"
+    Dense(int in_features, int out_features, std::string activation_ = "none")
+        : activation(std::move(activation_)) {
+        // Xavier initialization — prevents vanishing/exploding gradients
+        weights = Tensor::xavier(in_features, out_features);
+        // Mark the weights trainable explicitly (mirroring the bias below) instead of
+        // relying on whatever flag Tensor::xavier leaves behind. Done before
+        // init_grad() so both parameters are set up in the same order.
+        weights.requires_grad = true;
+        weights.init_grad();
+        // Bias initialized to zero
+        bias = Tensor({1, out_features}, 0.0f);
+        bias.requires_grad = true;
+        bias.init_grad();
+    }
+    // Forward: output = activation(input * weights + bias)
+    //
+    // Math:
+    //   input:  [batch x in_f]
+    //   weights:[in_f x out_f]
+    //   output: [batch x out_f]
+    //
+    //   Step 1: preact = input @ weights
+    //   Step 2: preact += bias (same bias row added to every sample)
+    //   Step 3: output = activation(preact)
+    //
+    // Side effects: overwrites cached_input, cached_preact and cached_output.
+    // The feature-count check is an assert, so it is compiled out under NDEBUG.
+    Tensor forward(const Tensor& input) override {
+        const int batch = input.rows();
+        const int in_f = input.cols();
+        const int out_f = weights.cols();
+        assert(in_f == weights.rows() && "Input features must match weight rows");
+        // Cache input for backward pass
+        cached_input = input;
+        // Step 1: matmul → preact = input * weights
+        Tensor preact({batch, out_f});
+        backend::matmul(
+            input.data.data(), weights.data.data(), preact.data.data(),
+            batch, out_f, in_f
+        );
+        // Step 2: add bias → preact[i][j] += bias[j] for each sample i
+        for (int i = 0; i < batch; i++) {
+            float* row = &preact.data[i * out_f];
+            backend::add(row, bias.data.data(), row, out_f);
+        }
+        cached_preact = preact;
+        // Step 3: apply activation function
+        Tensor output({batch, out_f});
+        if (activation == "relu") {
+            backend::relu_forward(preact.data.data(), output.data.data(), batch * out_f);
+        } else if (activation == "sigmoid") {
+            backend::sigmoid_forward(preact.data.data(), output.data.data(), batch * out_f);
+        } else {
+            // No (or unrecognized) activation: output is the pre-activation itself
+            output = preact;
+        }
+        cached_output = output;
+        return output;
+    }
+    // Backward: receives grad of loss w.r.t. this layer's OUTPUT
+    //           returns grad of loss w.r.t. this layer's INPUT (to pass to previous layer)
+    //
+    // Also computes:
+    //   grad_weights (accumulated into weights.grad)
+    //   grad_bias (accumulated into bias.grad)
+    //
+    // Derivations (pen and paper, done ONCE by human):
+    //   y = x * W + b
+    //   dL/dW = x^T * dL/dy        (how loss changes when weights change)
+    //   dL/db = sum_over_batch(dL/dy)  (how loss changes when bias changes)
+    //   dL/dx = dL/dy * W^T         (how loss changes when input changes — pass backward)
+    //
+    // Precondition: forward() has been called, and grad_output has the same
+    // [batch x out_f] shape as the output it returned. All shapes are taken from
+    // the cached forward input, so a mismatched grad_output would otherwise be
+    // read out of bounds by the backend kernels; the assert below catches that
+    // in builds without NDEBUG.
+    Tensor backward(const Tensor& grad_output) override {
+        const int batch = cached_input.rows();
+        const int in_f = cached_input.cols();
+        const int out_f = weights.cols();
+        assert(grad_output.rows() == batch && grad_output.cols() == out_f &&
+               "grad_output shape must match the last forward() output");
+        // Step 1: gradient through activation function
+        Tensor grad_preact({batch, out_f});
+        if (activation == "relu") {
+            // ReLU derivative: 1 if input > 0, else 0 (uses the cached pre-activation)
+            backend::relu_backward(
+                grad_output.data.data(), cached_preact.data.data(),
+                grad_preact.data.data(), batch * out_f
+            );
+        } else if (activation == "sigmoid") {
+            // Sigmoid derivative: output * (1 - output) (uses the cached output)
+            backend::sigmoid_backward(
+                grad_output.data.data(), cached_output.data.data(),
+                grad_preact.data.data(), batch * out_f
+            );
+        } else {
+            // Identity activation: gradient passes through unchanged
+            grad_preact = grad_output;
+        }
+        // Step 2: grad_weights = input^T * grad_preact
+        //   cached_input: [batch x in_f]
+        //   input^T:      [in_f x batch]
+        //   grad_preact:  [batch x out_f]
+        //   result:       [in_f x out_f] — same shape as weights
+        Tensor input_T({in_f, batch});
+        backend::transpose(
+            cached_input.data.data(), input_T.data.data(),
+            batch, in_f
+        );
+        const int weight_count = in_f * out_f;
+        std::vector<float> gw(weight_count, 0.0f);
+        backend::matmul(
+            input_T.data.data(), grad_preact.data.data(), gw.data(),
+            in_f, out_f, batch
+        );
+        // Accumulate (+=) not overwrite — supports gradient accumulation across batches
+        for (int i = 0; i < weight_count; i++) {
+            weights.grad[i] += gw[i];
+        }
+        // Step 3: grad_bias = sum of grad_preact along batch dimension
+        //   For each output feature j: grad_bias[j] = sum_i(grad_preact[i][j])
+        std::vector<float> gb(out_f, 0.0f);
+        backend::sum_columns(grad_preact.data.data(), gb.data(), batch, out_f);
+        for (int j = 0; j < out_f; j++) {
+            bias.grad[j] += gb[j];
+        }
+        // Step 4: grad_input = grad_preact * weights^T
+        //   grad_preact: [batch x out_f]
+        //   weights^T:   [out_f x in_f]
+        //   result:      [batch x in_f] — pass to previous layer
+        Tensor weights_T({out_f, in_f});
+        backend::transpose(
+            weights.data.data(), weights_T.data.data(),
+            in_f, out_f
+        );
+        Tensor grad_input({batch, in_f});
+        backend::matmul(
+            grad_preact.data.data(), weights_T.data.data(), grad_input.data.data(),
+            batch, in_f, out_f
+        );
+        return grad_input;
+    }
+    // Returns non-owning pointers to the trainable tensors, weights first, then
+    // bias. The pointers refer to members of this object.
+    std::vector<Tensor*> parameters() override {
+        return {&weights, &bias};
+    }
+};
+} // namespace newnet