Commit ·
b5b44c3
1
Parent(s): 70d8309
SLW end
Browse files- training_dragon.py +3 -2
training_dragon.py
CHANGED
|
@@ -101,6 +101,7 @@ class NanoArgs:
|
|
| 101 |
swa_window_size : int = 1024
|
| 102 |
slw_warmup_iters: float = 0
|
| 103 |
slw_start: int = 8 # window size at the start of training
|
|
|
|
| 104 |
slw_increment: int = 64 # window size increment at each step
|
| 105 |
softcap_attn: float = 0.0 # logit soft-capping for attn logits, as per Gemma2 (0.0 = no soft-capping)
|
| 106 |
qk_norm: bool = True
|
|
@@ -1331,9 +1332,9 @@ for iter_ in range(start_iter, start_iter+args.total_iterations+1):
|
|
| 1331 |
slw_warmup_iters = int(args.slw_warmup_iters * args.total_iterations)
|
| 1332 |
|
| 1333 |
progress_ratio = iter_ / slw_warmup_iters
|
| 1334 |
-
window = args.slw_start + progress_ratio * (args.swa_window_size - args.slw_start)
|
| 1335 |
window = args.slw_increment * math.ceil(window / args.slw_increment) # quantize
|
| 1336 |
-
window = int(min(window, args.swa_window_size)) # cap
|
| 1337 |
raw_model.config.slw_wsize = window
|
| 1338 |
|
| 1339 |
to_log['slw_window'] = window
|
|
|
|
| 101 |
swa_window_size : int = 1024
|
| 102 |
slw_warmup_iters: float = 0
|
| 103 |
slw_start: int = 8 # window size at the start of training
|
| 104 |
+
slw_end: int = 8192
|
| 105 |
slw_increment: int = 64 # window size increment at each step
|
| 106 |
softcap_attn: float = 0.0 # logit soft-capping for attn logits, as per Gemma2 (0.0 = no soft-capping)
|
| 107 |
qk_norm: bool = True
|
|
|
|
| 1332 |
slw_warmup_iters = int(args.slw_warmup_iters * args.total_iterations)
|
| 1333 |
|
| 1334 |
progress_ratio = iter_ / slw_warmup_iters
|
| 1335 |
+
window = args.slw_start + progress_ratio * (args.slw_end - args.slw_start)
|
| 1336 |
window = args.slw_increment * math.ceil(window / args.slw_increment) # quantize
|
| 1337 |
+
window = int(min(window, args.slw_end)) # cap
|
| 1338 |
raw_model.config.slw_wsize = window
|
| 1339 |
|
| 1340 |
to_log['slw_window'] = window
|