#pragma once
#include "../core/tensor.hpp"

#include <cassert>
#include <cmath>
#include <cstddef>
#include <vector>
namespace newnet {
// Stochastic Gradient Descent
// The simplest optimizer: param = param - learning_rate * gradient
class SGD {
public:
float lr;
SGD(float learning_rate) : lr(learning_rate) {}
// Update all parameters using their accumulated gradients
void step(std::vector<Tensor*> params) {
for (Tensor* p : params) {
assert(p->data.size() == p->grad.size());
for (int i = 0; i < (int)p->data.size(); i++) {
p->data[i] -= lr * p->grad[i];
}
}
}
// Zero all gradients — MUST call before each forward/backward pass
// Otherwise gradients accumulate across batches (which is sometimes desired
// for gradient accumulation, but usually not)
void zero_grad(std::vector<Tensor*> params) {
for (Tensor* p : params) {
p->zero_grad();
}
}
};
// Adam optimizer (Kingma & Ba, 2014)
// Adaptive learning rate per parameter using first and second moment estimates
class Adam {
public:
float lr;
float beta1; // exponential decay rate for first moment (default 0.9)
float beta2; // exponential decay rate for second moment (default 0.999)
float epsilon; // numerical stability (default 1e-8)
int t; // timestep counter
// First moment (mean of gradients) and second moment (mean of squared gradients)
// One vector per parameter tensor
std::vector<std::vector<float>> m; // first moment
std::vector<std::vector<float>> v; // second moment
bool initialized;
Adam(float learning_rate = 0.001f, float b1 = 0.9f, float b2 = 0.999f, float eps = 1e-8f)
: lr(learning_rate), beta1(b1), beta2(b2), epsilon(eps), t(0), initialized(false) {}
void step(std::vector<Tensor*> params) {
// Lazy initialization of moment vectors
if (!initialized) {
for (auto* p : params) {
m.push_back(std::vector<float>(p->data.size(), 0.0f));
v.push_back(std::vector<float>(p->data.size(), 0.0f));
}
initialized = true;
}
t++;
for (int idx = 0; idx < (int)params.size(); idx++) {
Tensor* p = params[idx];
for (int i = 0; i < (int)p->data.size(); i++) {
// Update biased first moment estimate
m[idx][i] = beta1 * m[idx][i] + (1.0f - beta1) * p->grad[i];
// Update biased second moment estimate
v[idx][i] = beta2 * v[idx][i] + (1.0f - beta2) * p->grad[i] * p->grad[i];
// Bias-corrected estimates
float m_hat = m[idx][i] / (1.0f - std::pow(beta1, t));
float v_hat = v[idx][i] / (1.0f - std::pow(beta2, t));
// Update parameter
p->data[i] -= lr * m_hat / (std::sqrt(v_hat) + epsilon);
}
}
}
void zero_grad(std::vector<Tensor*> params) {
for (Tensor* p : params) {
p->zero_grad();
}
}
};
} // namespace newnet