alexandretl committed on
Commit
b5b44c3
·
1 Parent(s): 70d8309
Files changed (1) hide show
  1. training_dragon.py +3 -2
training_dragon.py CHANGED
@@ -101,6 +101,7 @@ class NanoArgs:
101
  swa_window_size : int = 1024
102
  slw_warmup_iters: float = 0
103
  slw_start: int = 8 # window size at the start of training
 
104
  slw_increment: int = 64 # window size increment at each step
105
  softcap_attn: float = 0.0 # logit soft-capping for attn logits, as per Gemma2 (0.0 = no soft-capping)
106
  qk_norm: bool = True
@@ -1331,9 +1332,9 @@ for iter_ in range(start_iter, start_iter+args.total_iterations+1):
1331
  slw_warmup_iters = int(args.slw_warmup_iters * args.total_iterations)
1332
 
1333
  progress_ratio = iter_ / slw_warmup_iters
1334
- window = args.slw_start + progress_ratio * (args.sequence_length - args.slw_start)
1335
  window = args.slw_increment * math.ceil(window / args.slw_increment) # quantize
1336
- window = int(min(window, args.sequence_length)) # cap
1337
  raw_model.config.slw_wsize = window
1338
 
1339
  to_log['slw_window'] = window
 
101
  swa_window_size : int = 1024
102
  slw_warmup_iters: float = 0
103
  slw_start: int = 8 # window size at the start of training
104
+ slw_end: int = 8192
105
  slw_increment: int = 64 # window size increment at each step
106
  softcap_attn: float = 0.0 # logit soft-capping for attn logits, as per Gemma2 (0.0 = no soft-capping)
107
  qk_norm: bool = True
 
1332
  slw_warmup_iters = int(args.slw_warmup_iters * args.total_iterations)
1333
 
1334
  progress_ratio = iter_ / slw_warmup_iters
1335
+ window = args.slw_start + progress_ratio * (args.slw_end - args.slw_start)
1336
  window = args.slw_increment * math.ceil(window / args.slw_increment) # quantize
1337
+ window = int(min(window, args.slw_end)) # cap
1338
  raw_model.config.slw_wsize = window
1339
 
1340
  to_log['slw_window'] = window