notRaphael
/

newnet

Model card Files Files and versions

newnet / graph /optimizer.hpp

notRaphael's picture

Upload graph/optimizer.hpp

30bd5c8 verified 17 days ago

history blame contribute delete

3.17 kB

	#pragma once
	#include <cassert>
	#include <cmath>
	#include <cstddef>
	#include <vector>

	#include "../core/tensor.hpp"

	namespace newnet {

	// Stochastic Gradient Descent
	// The simplest optimizer: param = param - learning_rate * gradient
	class SGD {
	public:
	float lr;

	SGD(float learning_rate) : lr(learning_rate) {}

	// Update all parameters using their accumulated gradients
	void step(std::vector<Tensor*> params) {
	for (Tensor* p : params) {
	assert(p->data.size() == p->grad.size());
	for (int i = 0; i < (int)p->data.size(); i++) {
	p->data[i] -= lr * p->grad[i];
	}
	}
	}

	// Zero all gradients — MUST call before each forward/backward pass
	// Otherwise gradients accumulate across batches (which is sometimes desired
	// for gradient accumulation, but usually not)
	void zero_grad(std::vector<Tensor*> params) {
	for (Tensor* p : params) {
	p->zero_grad();
	}
	}
	};

	// Adam optimizer (Kingma & Ba, 2014)
	// Adaptive learning rate per parameter using first and second moment estimates
	class Adam {
	public:
	float lr;
	float beta1; // exponential decay rate for first moment (default 0.9)
	float beta2; // exponential decay rate for second moment (default 0.999)
	float epsilon; // numerical stability (default 1e-8)
	int t; // timestep counter

	// First moment (mean of gradients) and second moment (mean of squared gradients)
	// One vector per parameter tensor
	std::vector<std::vector<float>> m; // first moment
	std::vector<std::vector<float>> v; // second moment
	bool initialized;

	Adam(float learning_rate = 0.001f, float b1 = 0.9f, float b2 = 0.999f, float eps = 1e-8f)
	: lr(learning_rate), beta1(b1), beta2(b2), epsilon(eps), t(0), initialized(false) {}

	void step(std::vector<Tensor*> params) {
	// Lazy initialization of moment vectors
	if (!initialized) {
	for (auto* p : params) {
	m.push_back(std::vector<float>(p->data.size(), 0.0f));
	v.push_back(std::vector<float>(p->data.size(), 0.0f));
	}
	initialized = true;
	}

	t++;

	for (int idx = 0; idx < (int)params.size(); idx++) {
	Tensor* p = params[idx];
	for (int i = 0; i < (int)p->data.size(); i++) {
	// Update biased first moment estimate
	m[idx][i] = beta1 * m[idx][i] + (1.0f - beta1) * p->grad[i];
	// Update biased second moment estimate
	v[idx][i] = beta2 * v[idx][i] + (1.0f - beta2) * p->grad[i] * p->grad[i];

	// Bias-corrected estimates
	float m_hat = m[idx][i] / (1.0f - std::pow(beta1, t));
	float v_hat = v[idx][i] / (1.0f - std::pow(beta2, t));

	// Update parameter
	p->data[i] -= lr * m_hat / (std::sqrt(v_hat) + epsilon);
	}
	}
	}

	void zero_grad(std::vector<Tensor*> params) {
	for (Tensor* p : params) {
	p->zero_grad();
	}
	}
	};

	} // namespace newnet