| | import torch |
| | from torch.optim import Optimizer |
| | import math |
| |
|
"""
EmoTion v3.8.6 (260220) Moment-Free Edition FFT適応統合版(CPU-GPUデータ転送対応)
shadow-system v3.1 -moment v3.1 emoPulse v3.8 FFT-Swap-Aware
これまでの emo系 のすべてを継承し、独自更新式の特徴を受け継ぐ完全オリジナル最適化器
The "geometric relationship" between "W"eight and "G"radient Method
幾何学的最適化アルゴリズム Approx W-Ref Geometry 近似アシスト更新に変更し負荷低減
過去の慣性と現在の勾配を動的にブレンドする、1次モーメント単一保持型の幾何学的最適化アルゴリズム
### FFT適応 cuDNN 等で厳格なデータ配置を求める仕様により中間テンソル(コピー)が生じる ###
"""
| |
|
| | class EmoTion(Optimizer): |
| | |
| | def __init__(self, params, |
| | lr=1.0, |
| | eps=1e-8, |
| | betas=(0.9, 0.995), |
| | weight_decay=0.01, |
| | use_shadow:bool=False, |
| | fftmode:bool=False): |
| | defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) |
| | super().__init__(params, defaults) |
| | self._init_lr = lr |
| | self.should_stop = False |
| | self.fftmode = fftmode |
| | self.use_shadow = use_shadow |
| | self.emoScope = lr |
| | self.dNR_hist = 1.0 |
| | self.noise_est = 1.0 |
| | self.d_est = 0.02 |
| |
|
| | if self.fftmode: |
| | self.base_scale, self.max_lim, self.min_lim = 1e-5, 3e-4, 1e-8 |
| | self.stop_scalar,self.stop_dNRsub = 5e-7, 5e-8 |
| | else: |
| | self.base_scale, self.max_lim, self.min_lim = 1e-4, 3e-3, 1e-6 |
| | self.stop_scalar,self.stop_dNRsub = 5e-6, 5e-7 |
| |
|
| | |
| | def _update_ema(self, state, loss_val): |
| | ema = state.setdefault('ema', {}) |
| | ema['short'] = 0.3 * loss_val + 0.7 * ema.get('short', loss_val) |
| | ema['medium'] = 0.05 * loss_val + 0.95 * ema.get('medium', loss_val) |
| | ema['long'] = 0.01 * loss_val + 0.99 * ema.get('long', loss_val) |
| | return ema |
| |
|
| | |
| | |
| | |
| | |
| | |
| | def _compute_scalar(self, ema): |
| | scale_base_l = max(ema['long'], 1e-5) |
| | scale_base_m = max(ema['medium'], 1e-5) |
| | diff_base = ema['long'] - ema['short'] |
| | diff_l = diff_base / scale_base_l |
| | diff_m = diff_base / scale_base_m |
| | |
| | if abs(diff_l) < 0.05: |
| | return math.tanh(diff_l) |
| | |
| | if abs(diff_m) * scale_base_m < abs(diff_l) * scale_base_l: |
| | return math.tanh(diff_m) |
| | else: |
| | return math.tanh(diff_l) |
| |
|
| | |
| | |
| | |
| | |
| | |
| | def _decide_ratio(self, scalar): |
| | if not self.use_shadow: |
| | return 0.0 |
| | if abs(scalar) > 0.625: |
| | return 1.0 - abs(scalar) |
| | else: |
| | return 0.0 |
| |
|
| | |
| | @torch.no_grad() |
| | def step(self, closure=None): |
| | loss = torch.enable_grad()(closure)() if closure is not None else None |
| | loss_val = loss.item() if loss is not None else 0.0 |
| |
|
| | |
| | ema = self._update_ema(self.state, loss_val) |
| | scalar = self._compute_scalar(ema) |
| | ratio = self._decide_ratio(scalar) |
| | trust = math.copysign((1.0 - abs(scalar)), scalar) |
| |
|
| | |
| | |
| | |
| | self.noise_est = 0.97 * self.noise_est + 0.03 * abs(scalar) |
| | self.d_est = 0.97 * self.d_est + 0.03 * abs(trust) |
| | noise = max(self.noise_est, 1e-10) |
| | d = self.d_est |
| | |
| | Noise_base = abs(scalar - trust) + 0.1 |
| | d_base = abs(noise - d) + 0.1 |
| | |
| | dNR_now_val = (d_base / Noise_base) ** 2 |
| | |
| | if dNR_now_val >= self.dNR_hist and trust >= 0.5: |
| | |
| | self.dNR_hist = min(dNR_now_val, self.dNR_hist * 1.50) |
| | elif -0.5 <= trust <= 0.5: |
| | |
| | self.dNR_hist = dNR_now_val * 0.80 |
| | |
| | emoPulse = float(max(min(self.dNR_hist * (self.emoScope * self.base_scale), |
| | self.max_lim), self.min_lim)) |
| | |
| |
|
| | |
| | |
| | |
| | with torch.no_grad(): |
| | |
| | |
| | params = self.param_groups[0]['params'] |
| | point_gl1 = sum(torch._foreach_norm(params, 1)) |
| | prev = getattr(self, "prev_gl1", None) |
| | |
| | if prev is not None: |
| | |
| | gratio = (abs(point_gl1 - prev) / (prev + 1e-8)).item() |
| | |
| | self.g_freshness = min(gratio / 0.05, 1.0) |
| | |
| | point_gl1 *= gratio |
| | else: |
| | |
| | self.g_freshness = 1.0 |
| | |
| | self.prev_gl1 = point_gl1 |
| | |
| |
|
| | for group in self.param_groups: |
| | beta1, beta2 = group['betas'] |
| | for p in group['params']: |
| | if p.grad is None: |
| | continue |
| |
|
| | grad = p.grad |
| | state = self.state[p] |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | if self.use_shadow : |
| | if 'shadow' not in state: |
| | state['shadow'] = p.clone() |
| | if ratio > 0: |
| | p.mul_(1-ratio).add_(state['shadow'], alpha=abs(trust)) |
| | else: |
| | leap_ratio = 0.1 * abs(trust) |
| | state['shadow'].lerp_(p, leap_ratio) |
| |
|
| | |
| | |
| | |
| | if 'exp_avg' not in state: |
| | state['exp_avg'] = torch.zeros_like(p) |
| |
|
| | exp_avg = state['exp_avg'] |
| |
|
| | |
| | |
| | exp_avg.mul_(beta1).add_(grad, alpha=(1.0 - beta1) * self.g_freshness) |
| |
|
| | |
| | if group['weight_decay'] != 0: |
| | p.mul_(1.0 - group['weight_decay'] * emoPulse) |
| |
|
| | |
| | if p.device != grad.device: |
| | |
| | update = exp_avg.to(p.device).sign() |
| | else: |
| | |
| | update = exp_avg.sign() |
| |
|
| | p.add_(update, alpha=-emoPulse) |
| | |
| |
|
| | |
| | for group in self.param_groups: |
| | group['lr'] = emoPulse |
| |
|
| | |
| | |
| | |
| | if abs(scalar) <= self.stop_scalar and abs(Noise_base - d_base) <= self.stop_dNRsub: |
| | if not self.should_stop: |
| | self.emoScope = 1.0 |
| | self.should_stop = True |
| | else: |
| | self.should_stop = False |
| |
|
| | return |
| |
|
"""
https://github.com/muooon/EmoSens
Pure W-Ref Geometry. Believing in a future for democratic AI learning.
Taking decisive steps forward, Weight-Reference Optimizer.
"""
| |
|