#pragma once #include "../core/tensor.hpp" #include namespace newnet { // Stochastic Gradient Descent // The simplest optimizer: param = param - learning_rate * gradient class SGD { public: float lr; SGD(float learning_rate) : lr(learning_rate) {} // Update all parameters using their accumulated gradients void step(std::vector params) { for (Tensor* p : params) { assert(p->data.size() == p->grad.size()); for (int i = 0; i < (int)p->data.size(); i++) { p->data[i] -= lr * p->grad[i]; } } } // Zero all gradients — MUST call before each forward/backward pass // Otherwise gradients accumulate across batches (which is sometimes desired // for gradient accumulation, but usually not) void zero_grad(std::vector params) { for (Tensor* p : params) { p->zero_grad(); } } }; // Adam optimizer (Kingma & Ba, 2014) // Adaptive learning rate per parameter using first and second moment estimates class Adam { public: float lr; float beta1; // exponential decay rate for first moment (default 0.9) float beta2; // exponential decay rate for second moment (default 0.999) float epsilon; // numerical stability (default 1e-8) int t; // timestep counter // First moment (mean of gradients) and second moment (mean of squared gradients) // One vector per parameter tensor std::vector> m; // first moment std::vector> v; // second moment bool initialized; Adam(float learning_rate = 0.001f, float b1 = 0.9f, float b2 = 0.999f, float eps = 1e-8f) : lr(learning_rate), beta1(b1), beta2(b2), epsilon(eps), t(0), initialized(false) {} void step(std::vector params) { // Lazy initialization of moment vectors if (!initialized) { for (auto* p : params) { m.push_back(std::vector(p->data.size(), 0.0f)); v.push_back(std::vector(p->data.size(), 0.0f)); } initialized = true; } t++; for (int idx = 0; idx < (int)params.size(); idx++) { Tensor* p = params[idx]; for (int i = 0; i < (int)p->data.size(); i++) { // Update biased first moment estimate m[idx][i] = beta1 * m[idx][i] + (1.0f - beta1) * p->grad[i]; // Update biased second moment estimate v[idx][i] = beta2 * v[idx][i] + (1.0f - beta2) * p->grad[i] * p->grad[i]; // Bias-corrected estimates float m_hat = m[idx][i] / (1.0f - std::pow(beta1, t)); float v_hat = v[idx][i] / (1.0f - std::pow(beta2, t)); // Update parameter p->data[i] -= lr * m_hat / (std::sqrt(v_hat) + epsilon); } } } void zero_grad(std::vector params) { for (Tensor* p : params) { p->zero_grad(); } } }; } // namespace newnet