import torch
import torch.nn as nn
from torch.optim import Optimizer
import math
import os
# --- 1. RANGER OPTIMIZER (Full Implementation) ---
class Ranger(Optimizer):
    """RAdam (rectified Adam) combined with Lookahead slow weights."""

    def __init__(self, params, lr=1e-3, alpha=0.5, k=6, N_sma_threshhold=5,
                 betas=(0.95, 0.999), eps=1e-5, weight_decay=0):
        defaults = dict(lr=lr, alpha=alpha, k=k, step_counter=0, betas=betas,
                        N_sma_threshhold=N_sma_threshhold, eps=eps,
                        weight_decay=weight_decay)
        super().__init__(params, defaults)
        self.N_sma_threshhold = N_sma_threshhold
        self.alpha = alpha  # Lookahead interpolation factor
        self.k = k          # Lookahead sync period (slow weights update every k steps)
        # Cache of (step, N_sma, step_size), keyed by step % 10, shared across params
        self.radam_buffer = [[None, None, None] for _ in range(10)]

    def __setstate__(self, state):
        super().__setstate__(state)

    def step(self, closure=None):
        loss = None
        if closure is not None:
            loss = closure()
        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                if p.grad.is_sparse:
                    raise RuntimeError('Ranger does not support sparse gradients')
                grad = p.grad.data.float()
                p_data_fp32 = p.data.float()
                state = self.state[p]
                if len(state) == 0:
                    # Lazy per-parameter state initialization on the first step
                    state['step'] = 0
                    state['exp_avg'] = torch.zeros_like(p_data_fp32)
                    state['exp_avg_sq'] = torch.zeros_like(p_data_fp32)
                    # Lookahead slow weights start as a copy of the fast weights
                    state['slow_buffer'] = torch.empty_like(p.data)
                    state['slow_buffer'].copy_(p.data)
                else:
                    state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32)
                    state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32)

                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                beta1, beta2 = group['betas']
                # Update biased first- and second-moment estimates
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                state['step'] += 1

                # RAdam: compute the variance-rectification term; results are
                # cached per step since they are identical for all parameters
                buffered = self.radam_buffer[int(state['step'] % 10)]
                if state['step'] == buffered[0]:
                    N_sma, step_size = buffered[1], buffered[2]
                else:
                    buffered[0] = state['step']
                    beta2_t = beta2 ** state['step']
                    N_sma_max = 2 / (1 - beta2) - 1
                    N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t)
                    buffered[1] = N_sma
                    if N_sma >= self.N_sma_threshhold:
                        # Variance is tractable: use the rectified adaptive step
                        step_size = math.sqrt(
                            (1 - beta2_t)
                            * (N_sma - 4) / (N_sma_max - 4)
                            * (N_sma - 2) / N_sma
                            * N_sma_max / (N_sma_max - 2)
                        ) / (1 - beta1 ** state['step'])
                    else:
                        # Too few samples: fall back to a non-adaptive step
                        step_size = 1.0 / (1 - beta1 ** state['step'])
                    buffered[2] = step_size

                # Decoupled weight decay
                if group['weight_decay'] != 0:
                    p_data_fp32.add_(p_data_fp32, alpha=-group['weight_decay'] * group['lr'])

                if N_sma >= self.N_sma_threshhold:
                    denom = exp_avg_sq.sqrt().add_(group['eps'])
                    p_data_fp32.addcdiv_(exp_avg, denom, value=-step_size * group['lr'])
                else:
                    p_data_fp32.add_(exp_avg, alpha=-step_size * group['lr'])
                p.data.copy_(p_data_fp32)

                # Lookahead: every k steps, pull the slow weights toward the
                # fast weights and reset the fast weights to the slow copy
                if state['step'] % group['k'] == 0:
                    slow_p = state['slow_buffer']
                    slow_p.add_(p.data - slow_p, alpha=self.alpha)
                    p.data.copy_(slow_p)
        return loss
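
# Usage sketch (illustrative, not part of the original pipeline): Ranger drops
# into a standard training loop like any torch.optim optimizer. The two-layer
# model and random data below are placeholders for demonstration only.
def _ranger_demo():
    model = nn.Sequential(nn.Linear(16, 32), nn.ReLU(), nn.Linear(32, 1))
    opt = Ranger(model.parameters(), lr=1e-3, k=6, alpha=0.5)
    loss_fn = nn.MSELoss()
    x, y = torch.randn(8, 16), torch.randn(8, 1)
    for _ in range(12):  # two Lookahead sync periods at k=6
        opt.zero_grad()
        loss = loss_fn(model(x), y)
        loss.backward()
        opt.step()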

# --- 2. QUANTIZATION PIPELINE ---
def quantize_model(model):
    """
    Applies PyTorch dynamic INT8 quantization to the model's Linear, GRU,
    and LSTM modules. The model is moved to CPU and switched to eval mode,
    since dynamic quantization is a CPU inference-time transform.
    """
    model.cpu().eval()
    q_model = torch.quantization.quantize_dynamic(
        model,
        {torch.nn.Linear, torch.nn.GRU, torch.nn.LSTM},
        dtype=torch.qint8,
    )
    return q_model
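
# Illustrative sketch (assumed workflow, not from the original file): quantize
# a throwaway model and compare checkpoint sizes on disk; the INT8 version
# should be noticeably smaller since Linear weights are stored as int8.
def _quantization_demo():
    model = nn.Sequential(nn.Linear(128, 128), nn.ReLU(), nn.Linear(128, 10))
    q_model = quantize_model(model)
    torch.save(model.state_dict(), 'fp32.pt')
    torch.save(q_model.state_dict(), 'int8.pt')
    print('fp32:', os.path.getsize('fp32.pt'), 'bytes,',
          'int8:', os.path.getsize('int8.pt'), 'bytes')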

def save_model(model, path):
    torch.save(model.state_dict(), path)

def load_model(model_class, path, quantized=False):
    model = model_class()
    if quantized:
        # Re-apply quantization so the module structure matches the checkpoint
        model = quantize_model(model)
        # weights_only=False is needed because quantized state dicts contain
        # packed-parameter objects, not plain tensors
        state = torch.load(path, map_location='cpu', weights_only=False)
    else:
        state = torch.load(path, map_location='cpu')
    model.load_state_dict(state)
    return model
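
# Round-trip sketch (illustrative; _TinyNet is a hypothetical stand-in for
# the real model class, which load_model assumes is constructible with no
# arguments). Saves a quantized model, reloads it, and checks the outputs.
class _TinyNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(16, 4)

    def forward(self, x):
        return self.fc(x)

def _save_load_demo():
    model = quantize_model(_TinyNet())
    save_model(model, 'tiny_int8.pt')
    restored = load_model(_TinyNet, 'tiny_int8.pt', quantized=True)
    # Matching outputs confirm the quantized weights round-tripped correctly
    x = torch.randn(2, 16)
    assert torch.equal(model(x), restored(x))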