#pragma once
#include "../core/tensor.hpp"

#include <cassert>
#include <cmath>
#include <cstddef>
#include <vector>

namespace newnet {

// Stochastic Gradient Descent
// The simplest optimizer: param = param - learning_rate * gradient
class SGD {
public:
    float lr;
    
    SGD(float learning_rate) : lr(learning_rate) {}
    
    // Update all parameters using their accumulated gradients
    void step(std::vector<Tensor*> params) {
        for (Tensor* p : params) {
            assert(p->data.size() == p->grad.size());
            for (int i = 0; i < (int)p->data.size(); i++) {
                p->data[i] -= lr * p->grad[i];
            }
        }
    }
    
    // Zero all gradients — MUST call before each forward/backward pass
    // Otherwise gradients accumulate across batches (which is sometimes desired
    // for gradient accumulation, but usually not)
    void zero_grad(std::vector<Tensor*> params) {
        for (Tensor* p : params) {
            p->zero_grad();
        }
    }
};

// Adam optimizer (Kingma & Ba, 2014)
// Adaptive learning rate per parameter using first and second moment estimates
class Adam {
public:
    float lr;
    float beta1;    // exponential decay rate for first moment (default 0.9)
    float beta2;    // exponential decay rate for second moment (default 0.999)
    float epsilon;  // numerical stability (default 1e-8)
    int t;          // timestep counter
    
    // First moment (mean of gradients) and second moment (mean of squared gradients)
    // One vector per parameter tensor
    std::vector<std::vector<float>> m;  // first moment
    std::vector<std::vector<float>> v;  // second moment
    bool initialized;
    
    Adam(float learning_rate = 0.001f, float b1 = 0.9f, float b2 = 0.999f, float eps = 1e-8f)
        : lr(learning_rate), beta1(b1), beta2(b2), epsilon(eps), t(0), initialized(false) {}
    
    void step(std::vector<Tensor*> params) {
        // Lazy initialization of moment vectors
        if (!initialized) {
            for (auto* p : params) {
                m.push_back(std::vector<float>(p->data.size(), 0.0f));
                v.push_back(std::vector<float>(p->data.size(), 0.0f));
            }
            initialized = true;
        }
        
        t++;
        
        for (int idx = 0; idx < (int)params.size(); idx++) {
            Tensor* p = params[idx];
            for (int i = 0; i < (int)p->data.size(); i++) {
                // Update biased first moment estimate
                m[idx][i] = beta1 * m[idx][i] + (1.0f - beta1) * p->grad[i];
                // Update biased second moment estimate
                v[idx][i] = beta2 * v[idx][i] + (1.0f - beta2) * p->grad[i] * p->grad[i];
                
                // Bias-corrected estimates
                float m_hat = m[idx][i] / (1.0f - std::pow(beta1, t));
                float v_hat = v[idx][i] / (1.0f - std::pow(beta2, t));
                
                // Update parameter
                p->data[i] -= lr * m_hat / (std::sqrt(v_hat) + epsilon);
            }
        }
    }
    
    void zero_grad(std::vector<Tensor*> params) {
        for (Tensor* p : params) {
            p->zero_grad();
        }
    }
};

} // namespace newnet