notRaphael committed on
Commit
30bd5c8
·
verified ·
1 Parent(s): 329394e

Upload graph/optimizer.hpp

Browse files
Files changed (1) hide show
  1. graph/optimizer.hpp +91 -0
graph/optimizer.hpp ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
#include "../core/tensor.hpp"

#include <cassert>
#include <cmath>
#include <cstddef>
#include <vector>
4
+
5
+ namespace newnet {
6
+
7
+ // Stochastic Gradient Descent
8
+ // The simplest optimizer: param = param - learning_rate * gradient
9
+ class SGD {
10
+ public:
11
+ float lr;
12
+
13
+ SGD(float learning_rate) : lr(learning_rate) {}
14
+
15
+ // Update all parameters using their accumulated gradients
16
+ void step(std::vector<Tensor*> params) {
17
+ for (Tensor* p : params) {
18
+ assert(p->data.size() == p->grad.size());
19
+ for (int i = 0; i < (int)p->data.size(); i++) {
20
+ p->data[i] -= lr * p->grad[i];
21
+ }
22
+ }
23
+ }
24
+
25
+ // Zero all gradients — MUST call before each forward/backward pass
26
+ // Otherwise gradients accumulate across batches (which is sometimes desired
27
+ // for gradient accumulation, but usually not)
28
+ void zero_grad(std::vector<Tensor*> params) {
29
+ for (Tensor* p : params) {
30
+ p->zero_grad();
31
+ }
32
+ }
33
+ };
34
+
35
+ // Adam optimizer (Kingma & Ba, 2014)
36
+ // Adaptive learning rate per parameter using first and second moment estimates
37
+ class Adam {
38
+ public:
39
+ float lr;
40
+ float beta1; // exponential decay rate for first moment (default 0.9)
41
+ float beta2; // exponential decay rate for second moment (default 0.999)
42
+ float epsilon; // numerical stability (default 1e-8)
43
+ int t; // timestep counter
44
+
45
+ // First moment (mean of gradients) and second moment (mean of squared gradients)
46
+ // One vector per parameter tensor
47
+ std::vector<std::vector<float>> m; // first moment
48
+ std::vector<std::vector<float>> v; // second moment
49
+ bool initialized;
50
+
51
+ Adam(float learning_rate = 0.001f, float b1 = 0.9f, float b2 = 0.999f, float eps = 1e-8f)
52
+ : lr(learning_rate), beta1(b1), beta2(b2), epsilon(eps), t(0), initialized(false) {}
53
+
54
+ void step(std::vector<Tensor*> params) {
55
+ // Lazy initialization of moment vectors
56
+ if (!initialized) {
57
+ for (auto* p : params) {
58
+ m.push_back(std::vector<float>(p->data.size(), 0.0f));
59
+ v.push_back(std::vector<float>(p->data.size(), 0.0f));
60
+ }
61
+ initialized = true;
62
+ }
63
+
64
+ t++;
65
+
66
+ for (int idx = 0; idx < (int)params.size(); idx++) {
67
+ Tensor* p = params[idx];
68
+ for (int i = 0; i < (int)p->data.size(); i++) {
69
+ // Update biased first moment estimate
70
+ m[idx][i] = beta1 * m[idx][i] + (1.0f - beta1) * p->grad[i];
71
+ // Update biased second moment estimate
72
+ v[idx][i] = beta2 * v[idx][i] + (1.0f - beta2) * p->grad[i] * p->grad[i];
73
+
74
+ // Bias-corrected estimates
75
+ float m_hat = m[idx][i] / (1.0f - std::pow(beta1, t));
76
+ float v_hat = v[idx][i] / (1.0f - std::pow(beta2, t));
77
+
78
+ // Update parameter
79
+ p->data[i] -= lr * m_hat / (std::sqrt(v_hat) + epsilon);
80
+ }
81
+ }
82
+ }
83
+
84
+ void zero_grad(std::vector<Tensor*> params) {
85
+ for (Tensor* p : params) {
86
+ p->zero_grad();
87
+ }
88
+ }
89
+ };
90
+
91
+ } // namespace newnet