notRaphael
/

newnet

ml-intern

Model card Files Files and versions

xet

Community

notRaphael committed 17 days ago

Commit

30bd5c8

verified ·

1 Parent(s): 329394e

Upload graph/optimizer.hpp

Browse files

Files changed (1) hide show

graph/optimizer.hpp +91 -0

graph/optimizer.hpp ADDED Viewed

	@@ -0,0 +1,91 @@

+#pragma once
+#include "../core/tensor.hpp"
+#include <cassert>
+#include <cmath>
+#include <cstddef>
+#include <vector>
+namespace newnet {
+// Stochastic Gradient Descent
+// The simplest optimizer: param = param - learning_rate * gradient
+// Keeps no per-parameter state; the learning rate is the only hyperparameter.
+class SGD {
+public:
+    float lr;  // step size multiplied into every gradient element
+    // learning_rate: factor applied to each gradient element before it is
+    // subtracted from the matching parameter element.
+    SGD(float learning_rate) : lr(learning_rate) {}
+    // Update all parameters in place using their accumulated gradients:
+    //   data[i] -= lr * grad[i]   for every element of every tensor in params.
+    // Each tensor's grad must hold as many elements as its data; this is only
+    // checked by assert, i.e. not in builds that define NDEBUG.
+    void step(std::vector<Tensor*> params) {
+        for (Tensor* p : params) {
+            assert(p->data.size() == p->grad.size());
+            // Index with size_t: narrowing size() to int would truncate the
+            // element count for very large tensors.
+            const std::size_t count = p->data.size();
+            for (std::size_t i = 0; i < count; ++i) {
+                p->data[i] -= lr * p->grad[i];
+            }
+        }
+    }
+    // Zero all gradients — MUST call before each forward/backward pass
+    // Otherwise gradients accumulate across batches (which is sometimes desired
+    // for gradient accumulation, but usually not)
+    // Delegates to Tensor::zero_grad() on every tensor in params.
+    void zero_grad(std::vector<Tensor*> params) {
+        for (Tensor* p : params) {
+            p->zero_grad();
+        }
+    }
+};
+// Adam optimizer (Kingma & Ba, 2014)
+// Adaptive learning rate per parameter using first and second moment estimates
+// Moment state is stored by position: m[k] and v[k] belong to params[k], so
+// step() should receive the same tensors in the same order on every call.
+class Adam {
+public:
+    float lr;
+    float beta1;    // exponential decay rate for first moment (default 0.9)
+    float beta2;    // exponential decay rate for second moment (default 0.999)
+    float epsilon;  // numerical stability (default 1e-8)
+    int t;          // timestep counter, incremented once per step() call
+    // First moment (mean of gradients) and second moment (mean of squared gradients)
+    // One vector per parameter tensor
+    std::vector<std::vector<float>> m;  // first moment
+    std::vector<std::vector<float>> v;  // second moment
+    bool initialized;  // false until the first step(); clearing it discards m and v
+    Adam(float learning_rate = 0.001f, float b1 = 0.9f, float b2 = 0.999f, float eps = 1e-8f)
+        : lr(learning_rate), beta1(b1), beta2(b2), epsilon(eps), t(0), initialized(false) {}
+    // Apply one Adam update in place to every tensor in params.
+    // Each tensor's grad must hold as many elements as its data (checked by
+    // assert only, i.e. not in builds that define NDEBUG).
+    void step(std::vector<Tensor*> params) {
+        // A cleared flag means "start over": drop any stale moments instead of
+        // appending a second set of slots behind the old ones.
+        if (!initialized) {
+            m.clear();
+            v.clear();
+            initialized = true;
+        }
+        // Lazy allocation of one moment slot per tensor. Also covers a params
+        // list whose length changed since the previous call, which would
+        // otherwise index past the end of m / v.
+        if (m.size() != params.size()) {
+            m.resize(params.size());
+        }
+        if (v.size() != params.size()) {
+            v.resize(params.size());
+        }
+        t++;
+        // The bias-correction denominators depend only on t, so compute them
+        // once per step rather than once per element. `auto` keeps the type
+        // std::pow returns, so the per-element arithmetic is unchanged.
+        const auto bias1 = 1.0f - std::pow(beta1, t);
+        const auto bias2 = 1.0f - std::pow(beta2, t);
+        for (std::size_t idx = 0; idx < params.size(); ++idx) {
+            Tensor* p = params[idx];
+            assert(p->data.size() == p->grad.size());
+            const std::size_t count = p->data.size();
+            std::vector<float>& m_k = m[idx];
+            std::vector<float>& v_k = v[idx];
+            // Fresh slot, or the tensor changed size: restart its moments at zero.
+            if (m_k.size() != count) {
+                m_k.assign(count, 0.0f);
+            }
+            if (v_k.size() != count) {
+                v_k.assign(count, 0.0f);
+            }
+            for (std::size_t i = 0; i < count; ++i) {
+                const auto g = p->grad[i];
+                // Update biased first moment estimate
+                m_k[i] = beta1 * m_k[i] + (1.0f - beta1) * g;
+                // Update biased second moment estimate
+                v_k[i] = beta2 * v_k[i] + (1.0f - beta2) * g * g;
+                // Bias-corrected estimates
+                const float m_hat = m_k[i] / bias1;
+                const float v_hat = v_k[i] / bias2;
+                // Update parameter
+                p->data[i] -= lr * m_hat / (std::sqrt(v_hat) + epsilon);
+            }
+        }
+    }
+    // Zero the gradients of every tensor in params by delegating to
+    // Tensor::zero_grad(). Does not touch the moment estimates or t.
+    void zero_grad(std::vector<Tensor*> params) {
+        for (Tensor* p : params) {
+            p->zero_grad();
+        }
+    }
+};
+} // namespace newnet