notRaphael committed on
Commit
cb41883
·
verified ·
1 Parent(s): 7f7a754

Upload layers/dense.hpp

Browse files
Files changed (1) hide show
  1. layers/dense.hpp +171 -0
layers/dense.hpp ADDED
@@ -0,0 +1,171 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
#include "layer.hpp"
#include "../core/backend.hpp"
#include <cassert>
#include <string>
#include <utility>
#include <vector>
5
+
6
+ namespace newnet {
7
+
8
+ class Dense : public Layer {
9
+ public:
10
+ Tensor weights; // [in_features x out_features]
11
+ Tensor bias; // [1 x out_features]
12
+ std::string activation; // "relu", "sigmoid", "none"
13
+
14
+ // Cached from forward pass — needed by backward (chain rule requires these values)
15
+ Tensor cached_input; // the input x that was passed to forward()
16
+ Tensor cached_preact; // result before activation: x*W + b
17
+ Tensor cached_output; // result after activation (needed for sigmoid backward)
18
+
19
+ Dense(int in_features, int out_features, std::string activation_ = "none")
20
+ : activation(activation_) {
21
+ // Xavier initialization — prevents vanishing/exploding gradients
22
+ weights = Tensor::xavier(in_features, out_features);
23
+ weights.init_grad();
24
+
25
+ // Bias initialized to zero
26
+ bias = Tensor({1, out_features}, 0.0f);
27
+ bias.requires_grad = true;
28
+ bias.init_grad();
29
+ }
30
+
31
+ // Forward: output = activation(input * weights + bias)
32
+ //
33
+ // Math:
34
+ // input: [batch x in_f]
35
+ // weights:[in_f x out_f]
36
+ // output: [batch x out_f]
37
+ //
38
+ // Step 1: preact = input @ weights + bias
39
+ // Step 2: output = activation(preact)
40
+ //
41
+ Tensor forward(const Tensor& input) override {
42
+ int batch = input.rows();
43
+ int in_f = input.cols();
44
+ int out_f = weights.cols();
45
+
46
+ assert(in_f == weights.rows() && "Input features must match weight rows");
47
+
48
+ // Cache input for backward pass
49
+ cached_input = input;
50
+
51
+ // Step 1: matmul → preact = input * weights
52
+ Tensor preact({batch, out_f});
53
+ backend::matmul(
54
+ input.data.data(), weights.data.data(), preact.data.data(),
55
+ batch, out_f, in_f
56
+ );
57
+
58
+ // Step 2: add bias → preact[i][j] += bias[j] for each sample i
59
+ for (int i = 0; i < batch; i++) {
60
+ backend::add(
61
+ &preact.data[i * out_f], bias.data.data(),
62
+ &preact.data[i * out_f], out_f
63
+ );
64
+ }
65
+
66
+ cached_preact = preact;
67
+
68
+ // Step 3: apply activation function
69
+ Tensor output({batch, out_f});
70
+ if (activation == "relu") {
71
+ backend::relu_forward(preact.data.data(), output.data.data(), batch * out_f);
72
+ } else if (activation == "sigmoid") {
73
+ backend::sigmoid_forward(preact.data.data(), output.data.data(), batch * out_f);
74
+ } else {
75
+ output = preact;
76
+ }
77
+
78
+ cached_output = output;
79
+ return output;
80
+ }
81
+
82
+ // Backward: receives grad of loss w.r.t. this layer's OUTPUT
83
+ // returns grad of loss w.r.t. this layer's INPUT (to pass to previous layer)
84
+ //
85
+ // Also computes:
86
+ // grad_weights (accumulated into weights.grad)
87
+ // grad_bias (accumulated into bias.grad)
88
+ //
89
+ // Derivations (pen and paper, done ONCE by human):
90
+ // y = x * W + b
91
+ // dL/dW = x^T * dL/dy (how loss changes when weights change)
92
+ // dL/db = sum_over_batch(dL/dy) (how loss changes when bias changes)
93
+ // dL/dx = dL/dy * W^T (how loss changes when input changes — pass backward)
94
+ //
95
+ Tensor backward(const Tensor& grad_output) override {
96
+ int batch = cached_input.rows();
97
+ int in_f = cached_input.cols();
98
+ int out_f = weights.cols();
99
+
100
+ // Step 1: gradient through activation function
101
+ Tensor grad_preact({batch, out_f});
102
+ if (activation == "relu") {
103
+ // ReLU derivative: 1 if input > 0, else 0
104
+ backend::relu_backward(
105
+ grad_output.data.data(), cached_preact.data.data(),
106
+ grad_preact.data.data(), batch * out_f
107
+ );
108
+ } else if (activation == "sigmoid") {
109
+ // Sigmoid derivative: output * (1 - output)
110
+ backend::sigmoid_backward(
111
+ grad_output.data.data(), cached_output.data.data(),
112
+ grad_preact.data.data(), batch * out_f
113
+ );
114
+ } else {
115
+ grad_preact = grad_output;
116
+ }
117
+
118
+ // Step 2: grad_weights = input^T * grad_preact
119
+ // cached_input: [batch x in_f]
120
+ // input^T: [in_f x batch]
121
+ // grad_preact: [batch x out_f]
122
+ // result: [in_f x out_f] — same shape as weights
123
+ Tensor input_T({in_f, batch});
124
+ backend::transpose(
125
+ cached_input.data.data(), input_T.data.data(),
126
+ batch, in_f
127
+ );
128
+
129
+ std::vector<float> gw(in_f * out_f, 0.0f);
130
+ backend::matmul(
131
+ input_T.data.data(), grad_preact.data.data(), gw.data(),
132
+ in_f, out_f, batch
133
+ );
134
+ // Accumulate (+=) not overwrite — supports gradient accumulation across batches
135
+ for (int i = 0; i < in_f * out_f; i++) {
136
+ weights.grad[i] += gw[i];
137
+ }
138
+
139
+ // Step 3: grad_bias = sum of grad_preact along batch dimension
140
+ // For each output feature j: grad_bias[j] = sum_i(grad_preact[i][j])
141
+ std::vector<float> gb(out_f, 0.0f);
142
+ backend::sum_columns(grad_preact.data.data(), gb.data(), batch, out_f);
143
+ for (int i = 0; i < out_f; i++) {
144
+ bias.grad[i] += gb[i];
145
+ }
146
+
147
+ // Step 4: grad_input = grad_preact * weights^T
148
+ // grad_preact: [batch x out_f]
149
+ // weights^T: [out_f x in_f]
150
+ // result: [batch x in_f] — pass to previous layer
151
+ Tensor weights_T({out_f, in_f});
152
+ backend::transpose(
153
+ weights.data.data(), weights_T.data.data(),
154
+ in_f, out_f
155
+ );
156
+
157
+ Tensor grad_input({batch, in_f});
158
+ backend::matmul(
159
+ grad_preact.data.data(), weights_T.data.data(), grad_input.data.data(),
160
+ batch, in_f, out_f
161
+ );
162
+
163
+ return grad_input;
164
+ }
165
+
166
+ std::vector<Tensor*> parameters() override {
167
+ return {&weights, &bias};
168
+ }
169
+ };
170
+
171
+ } // namespace newnet