| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| #pragma once |
|
|
| #include <vector> |
| #include <cmath> |
| #include <numeric> |
| #include <random> |
| #include <cassert> |
| #include <algorithm> |
| #include <Eigen/Dense> |
|
|
| namespace UGTC { |
|
|
| using Matrix = Eigen::MatrixXf; |
| using Vector = Eigen::VectorXf; |
|
|
| |
| |
| |
|
|
/// Hyperparameters consumed by Module. All fields have defaults, so a
/// default-constructed Config is directly usable.
struct Config {
    int   hidden_dim{64};       ///< Hidden width handed to every value network.
    int   M{3};                 ///< Number of members in the slow ensemble.
    float lambda_fast{0.80f};   ///< GAE lambda applied to the fast critic's values.
    float lambda_slow{0.99f};   ///< GAE lambda applied to the slow ensemble's mean values.
    float beta{5.0f};           ///< Slope on (normalized sigma - 1) inside the gate sigmoid.
    float ema_momentum{0.99f};  ///< Weight kept on the previous sigma running average.
    float eps{1e-8f};           ///< Added to the sigma running average before dividing by it.
};
|
|
| |
| |
| |
|
|
/// Logistic function 1 / (1 + e^(-x)), evaluated in single precision.
inline float sigmoid(float x) {
    const float decay = std::exp(-x);
    return 1.0f / (1.0f + decay);
}
|
|
/// Scalar hyperbolic-tangent activation; forwards to std::tanh.
inline float tanh_activation(float x) {
    const float activated = std::tanh(x);
    return activated;
}
|
|
| inline Vector tanh_vec(const Vector& x) { |
| return x.unaryExpr([](float v) { return std::tanh(v); }); |
| } |
|
|
| |
| |
| |
|
|
| struct Linear { |
| Matrix W; |
| Vector b; |
|
|
| Linear() = default; |
|
|
| Linear(int in_dim, int out_dim, std::mt19937& rng) { |
| W = Matrix::Random(out_dim, in_dim); |
| b = Vector::Zero(out_dim); |
| |
| float scale = std::sqrt(2.0f / in_dim); |
| W *= scale; |
| } |
|
|
| Vector forward(const Vector& x) const { |
| return W * x + b; |
| } |
| }; |
|
|
| |
| |
| |
| |
|
|
| struct ValueNetwork { |
| Linear fc1, fc2, fc3; |
|
|
| ValueNetwork() = default; |
|
|
| ValueNetwork(int obs_dim, int hidden_dim, std::mt19937& rng) |
| : fc1(obs_dim, hidden_dim, rng) |
| , fc2(hidden_dim, hidden_dim, rng) |
| , fc3(hidden_dim, 1, rng) |
| {} |
|
|
| float forward(const Vector& obs) const { |
| Vector h1 = tanh_vec(fc1.forward(obs)); |
| Vector h2 = tanh_vec(fc2.forward(h1)); |
| return fc3.forward(h2)(0); |
| } |
| }; |
|
|
| |
| |
| |
|
|
| struct EnsembleValueNetwork { |
| std::vector<ValueNetwork> members; |
| int M; |
|
|
| EnsembleValueNetwork() = default; |
|
|
| EnsembleValueNetwork(int obs_dim, int hidden_dim, int M, std::mt19937& rng) |
| : M(M) |
| { |
| members.reserve(M); |
| for (int i = 0; i < M; ++i) { |
| members.emplace_back(obs_dim, hidden_dim, rng); |
| } |
| } |
|
|
| |
| std::pair<float, float> forward(const Vector& obs) const { |
| std::vector<float> vals; |
| vals.reserve(M); |
| for (auto& m : members) vals.push_back(m.forward(obs)); |
|
|
| float mean = std::accumulate(vals.begin(), vals.end(), 0.0f) / M; |
| float var = 0.0f; |
| for (float v : vals) var += (v - mean) * (v - mean); |
| var /= (M > 1 ? M - 1 : 1); |
|
|
| return { mean, std::sqrt(var) }; |
| } |
| }; |
|
|
| |
| |
| |
|
|
/// Values produced by one gate evaluation in Module::computeGate.
/// Every field defaults to 0 so a default-constructed result never holds
/// indeterminate values; aggregate initialization `{gate, v_fast, v_slow, sigma}`
/// still works.
struct GateResult {
    float gate = 0.0f;    ///< Sigmoid gate value; weight given to the slow estimate.
    float v_fast = 0.0f;  ///< Output of the fast critic.
    float v_slow = 0.0f;  ///< Mean output of the slow ensemble.
    float sigma = 0.0f;   ///< Standard deviation of the slow ensemble's outputs.
};
|
|
| |
| |
| |
|
|
| class Module { |
| public: |
| Module(int obs_dim, const Config& cfg = Config{}) |
| : cfg_(cfg) |
| , sigma_ema_(1.0f) |
| { |
| std::mt19937 rng(42); |
| fast_critic_ = ValueNetwork(obs_dim, cfg.hidden_dim, rng); |
| slow_ensemble_ = EnsembleValueNetwork(obs_dim, cfg.hidden_dim, cfg.M, rng); |
| } |
|
|
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| GateResult computeGate(const Vector& obs, bool train = false) { |
| float v_fast = fast_critic_.forward(obs); |
| auto [v_slow, sigma] = slow_ensemble_.forward(obs); |
|
|
| if (train) { |
| sigma_ema_ = cfg_.ema_momentum * sigma_ema_ |
| + (1.0f - cfg_.ema_momentum) * sigma; |
| } |
|
|
| float normalized_sigma = sigma / (sigma_ema_ + cfg_.eps); |
| float gate = sigmoid(-cfg_.beta * (normalized_sigma - 1.0f)); |
|
|
| return { gate, v_fast, v_slow, sigma }; |
| } |
|
|
| |
|
|
| |
| |
| |
| float getValueUGTC(const Vector& obs, bool train = false) { |
| auto r = computeGate(obs, train); |
| return r.gate * r.v_slow + (1.0f - r.gate) * r.v_fast; |
| } |
|
|
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| static std::vector<float> computeGAE( |
| const std::vector<float>& rewards, |
| const std::vector<float>& values, |
| const std::vector<float>& next_vals, |
| const std::vector<float>& dones, |
| float gamma, |
| float lam |
| ) { |
| int T = static_cast<int>(rewards.size()); |
| std::vector<float> advantages(T, 0.0f); |
|
|
| float gae = 0.0f; |
| for (int t = T - 1; t >= 0; --t) { |
| float delta = rewards[t] + gamma * next_vals[t] * (1.0f - dones[t]) - values[t]; |
| gae = delta + gamma * lam * (1.0f - dones[t]) * gae; |
| advantages[t] = gae; |
| } |
| return advantages; |
| } |
|
|
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| std::vector<float> computeAdvantages( |
| const std::vector<Vector>& obs_seq, |
| const std::vector<Vector>& next_obs_seq, |
| const std::vector<float>& rewards, |
| const std::vector<float>& dones, |
| float gamma = 0.99f, |
| bool train = false |
| ) { |
| int T = static_cast<int>(obs_seq.size()); |
| assert(T == static_cast<int>(rewards.size())); |
|
|
| std::vector<float> gates(T), v_fast_arr(T), v_slow_arr(T); |
| std::vector<float> v_fast_next(T), v_slow_next(T); |
|
|
| for (int t = 0; t < T; ++t) { |
| auto r = computeGate(obs_seq[t], train); |
| auto r_next = computeGate(next_obs_seq[t], false); |
| gates[t] = r.gate; |
| v_fast_arr[t] = r.v_fast; |
| v_slow_arr[t] = r.v_slow; |
| v_fast_next[t] = r_next.v_fast; |
| v_slow_next[t] = r_next.v_slow; |
| } |
|
|
| auto adv_fast = computeGAE(rewards, v_fast_arr, v_fast_next, dones, gamma, cfg_.lambda_fast); |
| auto adv_slow = computeGAE(rewards, v_slow_arr, v_slow_next, dones, gamma, cfg_.lambda_slow); |
|
|
| std::vector<float> advantages(T); |
| for (int t = 0; t < T; ++t) { |
| advantages[t] = gates[t] * adv_slow[t] + (1.0f - gates[t]) * adv_fast[t]; |
| } |
| return advantages; |
| } |
|
|
| |
|
|
| float getSigmaEMA() const { return sigma_ema_; } |
| const Config& getConfig() const { return cfg_; } |
|
|
| private: |
| Config cfg_; |
| ValueNetwork fast_critic_; |
| EnsembleValueNetwork slow_ensemble_; |
| float sigma_ema_; |
| }; |
|
|
| } |
|
|