diff --git "a/checkpoints/ovod_20240626_001447/train.log" "b/checkpoints/ovod_20240626_001447/train.log" new file mode 100644--- /dev/null +++ "b/checkpoints/ovod_20240626_001447/train.log" @@ -0,0 +1,3043 @@ +[ INFO][26-Jun-24 00:14:47] Hydra output dir: /data/strahl/Code/ovod/outputs/ovod_20240626_001447 +[ INFO][26-Jun-24 00:14:47] Run configuration: +action: train +device: cuda +allow_tf32: true +cudnn_bench: true +determ: false +determ_seed: 1 +dry_run: false +wandb: true +wandb_project: ovod +wandb_entity: null +wandb_group: null +wandb_job_type: null +wandb_name: null +wandb_tags: null +embedder_spec: openclip:timm/ViT-SO400M-14-SigLIP +embedder_amp: true +embedder_amp_bf16: false +embedder_compile: false +embedder_optimum: false +batch_size_token: 2048 +batch_size_embed: 512 +batch_size_image: 256 +embedding_dataset: so400m_multiset3c2_cache_vt2.bin +embedding_datasets: [] +batch_size: 512 +dataset_workers: 8 +vocab_path: $SOURCE/data/object_nouns.json +vocab_thres: 0 +prompt_path: $SOURCE/data/prompts.json +prompt_collection: ImageNet1K | CIFAR +hypernym_collection: None +noun_cache: true +noun_recache: false +noun_cache_dir: $SOURCE/cache/noun_dataset +embedding_cache_dir: $SOURCE/cache/embedding_cache +strict_embedder: true +save_embedding_cache: '' +cls_dataset: ImageNet1K +cls_datasets: [] +cls_dataset_root: ~/Datasets +cls_split: valid +load_model: '' +load_models: [] +load_models_dirnum: 1 +model: PrefixedIterDecoder +with_start_token: false +with_end_token: true +compact_ids: true +fixed_token_length: false +auto_fixed_token_length: true +use_masks: true +use_weights: true +multi_target: true +multi_first: false +fixed_multi_length: false +amp: false +amp_bf16: true +vocab_quant: false +num_end_loss: 1 +label_smoothing: 0.0 +hidden_dim: 512 +feedfwd_scale: 1/4 +mlp_hidden_layer: none +mlp_hidden_bias: false +mlp_hidden_norm: false +mlp_hidden_activation: gelu +input_dropout: 0.1 +num_layers: 6 +num_heads: 8 +layer_dropout: 0.1 +layer_activation: gelu +layer_norm_first: true +layer_bias: false +logits_bias: false +init_bias_zero: true +init_mlp_mode: balanced +init_mlp_unit_norm: false +init_tfrm_mode: balanced +init_tfrm_unit_norm: false +init_tfrm_unit_postnorm: true +init_tfrm_proj_layers: true +init_zero_norm: false +init_rezero_mode: none +mlp_seq_len: 4 +weight_tying: true +strictly_causal: false +enable_nested: false +cross_encoder: true +num_encoder_layers: 6 +gencfg: '' +gencfgs: [] +gencfgs_grid: false +gencfg_method: +- greedy +- beam +gencfg_topk: +- 3 +- 5 +- 10 +gencfg_prior: +- none +- tgt0.25 +- tgt0.5 +- tok0.25 +- tok0.5 +gencfg_guide: +- plain +gencfg_tau: +- 0.5 +- 1 +- 2 +gencfg_alpha: +- -0.2 +- 0 +- 0.2 +- 0.5 +test_training: true +test_device: true +test_patch: true +test_consistent: false +test_print: 0 +clip_prompts: true +measure_gap: false +multi_target_freq: +- 1 +- 1 +captions_path: '' +template_multiplier: 10 +sample_multiplier: 20 +captions_print: 0 +class_names_variant: clip +save_targets: null +multi_mode: max +load_train_state: true +load_lr_state: true +chunk_scale: 50 +save_every_min: 12 +save_every_max: 48 +save_top1_min: 95.0 +save_top1_delta: 0.5 +max_epochs: 18 +max_chunks: 0 +accum_factor: 16 +optimizer: AdamW +init_lr: 0.0015 +final_lr: 0.0 +lr_scheduler: cosine +lr_warmup: 0 +beta1: 0.9 +beta2: 0.95 +weight_decay: 0.1 +weight_decay_1d: false +nesterov: true +compile: false +gradient_clip: 1.0 +loss_ewa_halflife: 4 +last_dropout_chunks: 0 +last_dropout_factor: 0.0 +mean_shift: false +mean_shift_path: $SOURCE/data/modality_gap_$EMBEDDER.json +noise_scheme: GaussElemUniformAngle +noise_vec_norm: 3.25 +noise_angle_min: 45 +noise_angle_max: 75 +noise_angle_std: 0.0 +noise_mix_ratio: 0.15 +fix_force_vtx: false +eval_train: false +eval_guided: false +eval_debug: false +eval_samples_max: 0 +eval_images: '' +eval_images_dir: $SOURCE/extras/eval_images +infer_log: true +infer_texts: [] +infer_images: [] +infer_image_dir: $SOURCE/extras/infer_images +infer_all_images_dir: '' +infer_ann_json: $IMAGEDIR/_class_annotations.json +infer_ann_json_update: false +infer_guided: false +infer_guide_dataset: '' +infer_guide_targets: [] +infer_debug: false +infer_pred_json: false +load_pred_jsons: [] +pred_image_dir: $SOURCE/extras/infer_images +pred_ann_json: $IMAGEDIR/_class_annotations.json +pfmt_type: model_topk_v1 +pfmt_topk: 3 +pfmt_model_spec: true +pfmt_sort: '' +fmt_type: all_v2 +fmt_models: '' +fmt_model_hosts: '' +fmt_hosts: '' +fmt_min_ago: '' +fmt_max_ago: '' +fmt_min_stamp: '' +fmt_max_stamp: '' +fmt_sort: '' +wiki_collect_dir: $SOURCE/extras/wiki_images +sample_input_dir: '' +sample_output_dir: $SOURCE/extras/sampled_images +sample_count: 100 +sample_special: [] +sample_special_mean: 0.05 +sample_special_factor: [] +[ INFO][26-Jun-24 00:14:47] TF32 tensor cores are enabled +[ INFO][26-Jun-24 00:14:47] Fast non-deterministic mode with cuDNN benchmark mode enabled +[ INFO][26-Jun-24 00:14:49] Wandb run: vital-vortex-4392 (https://wandb.ai/pallgeuer/ovod/runs/bon7vjyu) +[ INFO][26-Jun-24 00:14:49] Wandb run path: /data/strahl/Code/ovod/log/wandb/run-20240626_001448-bon7vjyu +[ INFO][26-Jun-24 00:14:49] Creating embedder of specification openclip:timm/ViT-SO400M-14-SigLIP... +[ INFO][26-Jun-24 00:14:49] Loading OpenCLIP configuration for 'timm/ViT-SO400M-14-SigLIP' +[ INFO][26-Jun-24 00:14:49] Loaded Hugging Face tokenizer: T5TokenizerFast +[ INFO][26-Jun-24 00:14:49] Loaded OpenCLIP tokenizer for 'timm/ViT-SO400M-14-SigLIP': HFTokenizer with cleaning +[ INFO][26-Jun-24 00:14:49] Text tokenizer has context length 16 and case-sensitive vocab size 32100 +[ INFO][26-Jun-24 00:14:49] Embedder is using CUDA device +[ INFO][26-Jun-24 00:14:49] Embedder has AMP enabled with dtype torch.float16 +[ INFO][26-Jun-24 00:14:49] Embedder has manual mixed precision disabled +[ INFO][26-Jun-24 00:14:49] Text tokenizer has dtype torch.int64, start None end 1 pad 1, and nominal batch size 2048 +[ INFO][26-Jun-24 00:14:49] Text embedding vector has dim 1152, dtype torch.float32, and nominal batch size 512 +[ INFO][26-Jun-24 00:14:49] Image component of embedder has nominal batch size 256 +[ INFO][26-Jun-24 00:14:49] Created embedder of class type OpenCLIPEmbedder +[ INFO][26-Jun-24 00:14:49] Loading embedding cache with targets... +[ INFO][26-Jun-24 00:14:49] Using embedding cache: /data/strahl/Code/ovod/cache/embedding_cache/so400m_multiset3c2_cache_vt2.bin +[ INFO][26-Jun-24 00:14:49] Loaded cache header information of version 1 +[ INFO][26-Jun-24 00:14:49] Loaded 11898 target nouns from cache +[ INFO][26-Jun-24 00:14:49] Cache size is 137788668378 bytes = 128.326GiB +[ INFO][26-Jun-24 00:14:49] Loaded embedding cache dataset of class type EmbeddingCache.Dataset +[ INFO][26-Jun-24 00:14:49] Generating target configuration for loaded target nouns and model of specification PrefixedIterDecoder... +[ INFO][26-Jun-24 00:14:50] Max target tokens without start token with end token is 16 for 'parathelypteris novae boracensis' +[ INFO][26-Jun-24 00:14:50] Compacting target tokenizations down to a vocab size of 5418 tokens +[ INFO][26-Jun-24 00:14:50] Using target tokenizations of variable length 16 with padding masks +[ INFO][26-Jun-24 00:14:50] Resolving data configuration for loaded embedding dataset and model of specification PrefixedIterDecoder... +[ INFO][26-Jun-24 00:14:50] Dataset is configured to use multiple targets per embedding, and to use target weights (normalized) +[ INFO][26-Jun-24 00:14:50] Dataset is multi-target with a dynamic M of up to 3, and the M-dim is after the batch dimension B +[ INFO][26-Jun-24 00:14:50] Creating embedding dataset loader in TRAIN mode with batch size 512 and 8 workers... +[ INFO][26-Jun-24 00:14:50] Dataset: 29746688 embeddings across 58099 items +[ INFO][26-Jun-24 00:14:50] Loader: 29746688/29746688 samples used in 58099+0 = 58099 batches of size 512+0 +[ INFO][26-Jun-24 00:14:50] Loader: 8 workers prefetching 2 unpinned CUDA batches each +[ INFO][26-Jun-24 00:14:50] Gradient accumulation factor 16 results in 3631+0 = 3631 meta-batches of size 8192+0 +[ INFO][26-Jun-24 00:14:50] Gradient accumulation is using 58096/58099 available batches and 29745152/29746688 available samples +[ INFO][26-Jun-24 00:14:50] Have 29746688 training samples available in the dataset +[ INFO][26-Jun-24 00:14:50] Training 512 samples per batch +[ INFO][26-Jun-24 00:14:50] Training 58096 batches = 29745152 samples per epoch (gradient accumulation factor 16 => 3631 optimizer updates) +[ INFO][26-Jun-24 00:14:50] Training 1162 batches = 594944 samples per chunk +[ INFO][26-Jun-24 00:14:50] Training nominally for 899 chunks (max epochs 18 specified) +[ INFO][26-Jun-24 00:14:50] Applying GaussElem noise of mean norm 3.25 to embedding vectors +[ INFO][26-Jun-24 00:14:50] Applying UniformAngle noise of angle range 45° to 75° to embedding vectors +[ INFO][26-Jun-24 00:14:50] Applying GaussElemUniformAngle noise with mix ratio 0.15 of UniformAngle +[ INFO][26-Jun-24 00:14:50] Loaded memory-mapped cache ready for use +[ INFO][26-Jun-24 00:14:50] Preloaded all target noun tokenizations from cache: 11898×16 of torch.int64/torch.bool +[ INFO][26-Jun-24 00:14:50] Preloaded all target noun IDs and weights from cache: 29746725×3 of torch.int32/torch.float32 +[ INFO][26-Jun-24 00:14:50] Training model from scratch +[ INFO][26-Jun-24 00:14:50] Creating model of class PrefixedIterDecoder... +[ INFO][26-Jun-24 00:14:51] Created model of class PrefixedIterDecoder +[ INFO][26-Jun-24 00:14:51] Model parameter counts by part: +[ INFO][26-Jun-24 00:14:51] Input MLP = 2359296 params +[ INFO][26-Jun-24 00:14:51] Token embed/logits = 2774016 params +[ INFO][26-Jun-24 00:14:51] Positional embed = 9728 params +[ INFO][26-Jun-24 00:14:51] Transformer = 7084544 params +[ INFO][26-Jun-24 00:14:51] Total = 12227584 params +[ INFO][26-Jun-24 00:14:51] Decoder AMP is disabled +[ INFO][26-Jun-24 00:14:51] Moving model to CUDA... +[ INFO][26-Jun-24 00:14:51] Applying weight decay to 27/40 trainable param tensors = 12220928/12227584 trainable parameters +[ INFO][26-Jun-24 00:14:51] Created optimizer: AdamW (Parameter Group 0, amsgrad: False, betas: (0.9, 0.95), capturable: False, differentiable: False, eps: 1e-08, foreach: None, fused: True, lr: 0.0015, maximize: False, weight_decay: 0.0 | Parameter Group 1, amsgrad: False, betas: (0.9, 0.95), capturable: False, differentiable: False, eps: 1e-08, foreach: None, fused: True, lr: 0.0015, maximize: False, weight_decay: 0.1) +[ INFO][26-Jun-24 00:14:51] Using scheduler: torch.optim.lr_scheduler.CosineAnnealingLR(chunks=899, baselr=1.50e-03, finallr=0.00e+00) +[ INFO][26-Jun-24 00:14:52] -------------------------------------------------------------------------------- +[ INFO][26-Jun-24 00:14:52] Epoch 1 = Batch 1 = Sample 1 +[ INFO][26-Jun-24 00:14:53] Chunk 1 = Batch 1 = Sample 1 +[ INFO][26-Jun-24 00:17:11] Total gradient norm stats for 72 steps: 0.23 <= 1.067 + 1.535z <= 10.93 (clipped to 1) +[ INFO][26-Jun-24 00:17:11] Trained chunk 1 in 138.6s at 4291noun/s: lr=1.50e-03, loss=5.53e+00, top1=32.87%/30.905% +[ INFO][26-Jun-24 00:17:11] Chunk 2 = Batch 1163 = Sample 594945 +[ INFO][26-Jun-24 00:19:30] Total gradient norm stats for 73 steps: 0.2459 <= 0.5252 + 0.2227z <= 1.165 (clipped to 1) +[ INFO][26-Jun-24 00:19:30] Trained chunk 2 in 139.3s at 4272noun/s: lr=1.50e-03, loss=4.84e+00, top1=45.24%/34.619% +[ INFO][26-Jun-24 00:19:30] Chunk 3 = Batch 2325 = Sample 1189889 +[ INFO][26-Jun-24 00:21:50] Total gradient norm stats for 72 steps: 0.292 <= 0.4377 + 0.1129z <= 0.7444 +[ INFO][26-Jun-24 00:21:50] Trained chunk 3 in 139.9s at 4252noun/s: lr=1.50e-03, loss=4.12e+00, top1=53.65%/40.794% +[ INFO][26-Jun-24 00:21:50] Chunk 4 = Batch 3487 = Sample 1784833 +[ INFO][26-Jun-24 00:24:10] Total gradient norm stats for 73 steps: 0.2238 <= 0.3431 + 0.0772z <= 0.5778 +[ INFO][26-Jun-24 00:24:10] Trained chunk 4 in 139.7s at 4260noun/s: lr=1.50e-03, loss=3.66e+00, top1=56.10%/45.203% +[ INFO][26-Jun-24 00:24:10] Chunk 5 = Batch 4649 = Sample 2379777 +[ INFO][26-Jun-24 00:26:30] Total gradient norm stats for 73 steps: 0.2163 <= 0.3185 + 0.09036z <= 0.7385 +[ INFO][26-Jun-24 00:26:30] Trained chunk 5 in 139.7s at 4260noun/s: lr=1.50e-03, loss=3.37e+00, top1=56.28%/48.093% +[ INFO][26-Jun-24 00:26:30] Chunk 6 = Batch 5811 = Sample 2974721 +[ INFO][26-Jun-24 00:28:50] Total gradient norm stats for 72 steps: 0.1916 <= 0.2983 + 0.08215z <= 0.524 +[ INFO][26-Jun-24 00:28:50] Trained chunk 6 in 139.8s at 4256noun/s: lr=1.50e-03, loss=3.16e+00, top1=56.69%/50.132% +[ INFO][26-Jun-24 00:28:50] Chunk 7 = Batch 6973 = Sample 3569665 +[ INFO][26-Jun-24 00:31:09] Total gradient norm stats for 73 steps: 0.1892 <= 0.2515 + 0.05542z <= 0.4808 +[ INFO][26-Jun-24 00:31:09] Trained chunk 7 in 139.3s at 4272noun/s: lr=1.50e-03, loss=3.01e+00, top1=56.71%/51.681% +[ INFO][26-Jun-24 00:31:09] Chunk 8 = Batch 8135 = Sample 4164609 +[ INFO][26-Jun-24 00:33:29] Total gradient norm stats for 73 steps: 0.1924 <= 0.2352 + 0.03349z <= 0.3536 +[ INFO][26-Jun-24 00:33:29] Trained chunk 8 in 140.1s at 4246noun/s: lr=1.50e-03, loss=2.89e+00, top1=57.62%/52.915% +[ INFO][26-Jun-24 00:33:29] Chunk 9 = Batch 9297 = Sample 4759553 +[ INFO][26-Jun-24 00:35:49] Total gradient norm stats for 72 steps: 0.1847 <= 0.2472 + 0.03683z <= 0.3477 +[ INFO][26-Jun-24 00:35:49] Trained chunk 9 in 140.0s at 4249noun/s: lr=1.50e-03, loss=2.79e+00, top1=57.86%/53.953% +[ INFO][26-Jun-24 00:35:49] Chunk 10 = Batch 10459 = Sample 5354497 +[ INFO][26-Jun-24 00:38:09] Total gradient norm stats for 73 steps: 0.1908 <= 0.2225 + 0.02726z <= 0.3175 +[ INFO][26-Jun-24 00:38:09] Trained chunk 10 in 139.9s at 4253noun/s: lr=1.50e-03, loss=2.70e+00, top1=59.96%/54.835% +[ INFO][26-Jun-24 00:38:09] Chunk 11 = Batch 11621 = Sample 5949441 +[ INFO][26-Jun-24 00:40:28] Total gradient norm stats for 72 steps: 0.1847 <= 0.2148 + 0.02216z <= 0.297 +[ INFO][26-Jun-24 00:40:28] Trained chunk 11 in 139.4s at 4267noun/s: lr=1.50e-03, loss=2.62e+00, top1=58.66%/55.596% +[ INFO][26-Jun-24 00:40:28] Chunk 12 = Batch 12783 = Sample 6544385 +[ INFO][26-Jun-24 00:42:48] Total gradient norm stats for 73 steps: 0.1827 <= 0.2113 + 0.01749z <= 0.2738 +[ INFO][26-Jun-24 00:42:48] Trained chunk 12 in 139.9s at 4252noun/s: lr=1.50e-03, loss=2.55e+00, top1=58.41%/56.284% +[ INFO][26-Jun-24 00:42:48] Chunk 13 = Batch 13945 = Sample 7139329 +[ INFO][26-Jun-24 00:45:08] Total gradient norm stats for 73 steps: 0.1823 <= 0.2098 + 0.0208z <= 0.2902 +[ INFO][26-Jun-24 00:45:08] Trained chunk 13 in 139.7s at 4257noun/s: lr=1.50e-03, loss=2.48e+00, top1=59.55%/56.883% +[ INFO][26-Jun-24 00:45:08] Chunk 14 = Batch 15107 = Sample 7734273 +[ INFO][26-Jun-24 00:47:28] Total gradient norm stats for 72 steps: 0.1784 <= 0.2062 + 0.02406z <= 0.3048 +[ INFO][26-Jun-24 00:47:28] Trained chunk 14 in 139.6s at 4263noun/s: lr=1.50e-03, loss=2.43e+00, top1=61.60%/57.439% +[ INFO][26-Jun-24 00:47:28] Chunk 15 = Batch 16269 = Sample 8329217 +[ INFO][26-Jun-24 00:49:47] Total gradient norm stats for 73 steps: 0.1772 <= 0.1912 + 0.009724z <= 0.2282 +[ INFO][26-Jun-24 00:49:47] Trained chunk 15 in 139.8s at 4255noun/s: lr=1.50e-03, loss=2.37e+00, top1=60.44%/57.936% +[ INFO][26-Jun-24 00:49:47] Chunk 16 = Batch 17431 = Sample 8924161 +[ INFO][26-Jun-24 00:52:07] Total gradient norm stats for 73 steps: 0.1755 <= 0.1961 + 0.01771z <= 0.2732 +[ INFO][26-Jun-24 00:52:07] Trained chunk 16 in 139.4s at 4268noun/s: lr=1.50e-03, loss=2.32e+00, top1=60.59%/58.393% +[ INFO][26-Jun-24 00:52:07] Chunk 17 = Batch 18593 = Sample 9519105 +[ INFO][26-Jun-24 00:54:27] Total gradient norm stats for 72 steps: 0.1751 <= 0.1973 + 0.01604z <= 0.2575 +[ INFO][26-Jun-24 00:54:27] Trained chunk 17 in 139.8s at 4256noun/s: lr=1.50e-03, loss=2.28e+00, top1=61.64%/58.811% +[ INFO][26-Jun-24 00:54:27] Chunk 18 = Batch 19755 = Sample 10114049 +[ INFO][26-Jun-24 00:56:46] Total gradient norm stats for 73 steps: 0.1732 <= 0.195 + 0.01746z <= 0.238 +[ INFO][26-Jun-24 00:56:46] Trained chunk 18 in 139.8s at 4257noun/s: lr=1.50e-03, loss=2.24e+00, top1=61.74%/59.201% +[ INFO][26-Jun-24 00:56:46] Chunk 19 = Batch 20917 = Sample 10708993 +[ INFO][26-Jun-24 00:59:06] Total gradient norm stats for 72 steps: 0.168 <= 0.184 + 0.009564z <= 0.2155 +[ INFO][26-Jun-24 00:59:06] Trained chunk 19 in 139.8s at 4255noun/s: lr=1.50e-03, loss=2.20e+00, top1=62.02%/59.567% +[ INFO][26-Jun-24 00:59:06] Chunk 20 = Batch 22079 = Sample 11303937 +[ INFO][26-Jun-24 01:01:26] Total gradient norm stats for 73 steps: 0.1671 <= 0.1823 + 0.01119z <= 0.2411 +[ INFO][26-Jun-24 01:01:26] Trained chunk 20 in 139.6s at 4263noun/s: lr=1.50e-03, loss=2.16e+00, top1=61.52%/59.908% +[ INFO][26-Jun-24 01:01:26] Chunk 21 = Batch 23241 = Sample 11898881 +[ INFO][26-Jun-24 01:03:45] Total gradient norm stats for 73 steps: 0.1646 <= 0.1766 + 0.009538z <= 0.2138 +[ INFO][26-Jun-24 01:03:45] Trained chunk 21 in 139.5s at 4264noun/s: lr=1.50e-03, loss=2.13e+00, top1=62.45%/60.228% +[ INFO][26-Jun-24 01:03:45] Chunk 22 = Batch 24403 = Sample 12493825 +[ INFO][26-Jun-24 01:06:05] Total gradient norm stats for 72 steps: 0.1649 <= 0.1765 + 0.008716z <= 0.2105 +[ INFO][26-Jun-24 01:06:05] Trained chunk 22 in 139.4s at 4268noun/s: lr=1.50e-03, loss=2.10e+00, top1=62.69%/60.529% +[ INFO][26-Jun-24 01:06:05] Chunk 23 = Batch 25565 = Sample 13088769 +[ INFO][26-Jun-24 01:08:25] Total gradient norm stats for 73 steps: 0.1624 <= 0.1783 + 0.01273z <= 0.2188 +[ INFO][26-Jun-24 01:08:25] Trained chunk 23 in 140.0s at 4251noun/s: lr=1.50e-03, loss=2.07e+00, top1=61.78%/60.815% +[ INFO][26-Jun-24 01:08:25] Chunk 24 = Batch 26727 = Sample 13683713 +[ INFO][26-Jun-24 01:10:44] Total gradient norm stats for 73 steps: 0.1604 <= 0.1763 + 0.0122z <= 0.2117 +[ INFO][26-Jun-24 01:10:44] Trained chunk 24 in 139.5s at 4265noun/s: lr=1.50e-03, loss=2.04e+00, top1=64.12%/61.082% +[ INFO][26-Jun-24 01:10:44] Chunk 25 = Batch 27889 = Sample 14278657 +[ INFO][26-Jun-24 01:13:04] Total gradient norm stats for 72 steps: 0.1572 <= 0.1696 + 0.009264z <= 0.1993 +[ INFO][26-Jun-24 01:13:04] Trained chunk 25 in 139.6s at 4261noun/s: lr=1.50e-03, loss=2.01e+00, top1=62.21%/61.340% +[ INFO][26-Jun-24 01:13:04] Chunk 26 = Batch 29051 = Sample 14873601 +[ INFO][26-Jun-24 01:15:23] Total gradient norm stats for 73 steps: 0.1579 <= 0.1702 + 0.01091z <= 0.2126 +[ INFO][26-Jun-24 01:15:23] Trained chunk 26 in 139.5s at 4265noun/s: lr=1.50e-03, loss=1.99e+00, top1=62.89%/61.578% +[ INFO][26-Jun-24 01:15:23] Chunk 27 = Batch 30213 = Sample 15468545 +[ INFO][26-Jun-24 01:17:43] Total gradient norm stats for 72 steps: 0.1554 <= 0.168 + 0.009642z <= 0.204 +[ INFO][26-Jun-24 01:17:43] Trained chunk 27 in 139.7s at 4260noun/s: lr=1.50e-03, loss=1.97e+00, top1=62.92%/61.797% +[ INFO][26-Jun-24 01:17:43] Chunk 28 = Batch 31375 = Sample 16063489 +[ INFO][26-Jun-24 01:20:03] Total gradient norm stats for 73 steps: 0.1535 <= 0.1654 + 0.007847z <= 0.1931 +[ INFO][26-Jun-24 01:20:03] Trained chunk 28 in 139.9s at 4251noun/s: lr=1.50e-03, loss=1.95e+00, top1=63.88%/62.005% +[ INFO][26-Jun-24 01:20:03] Chunk 29 = Batch 32537 = Sample 16658433 +[ INFO][26-Jun-24 01:22:23] Total gradient norm stats for 73 steps: 0.1528 <= 0.1622 + 0.006891z <= 0.1826 +[ INFO][26-Jun-24 01:22:23] Trained chunk 29 in 139.9s at 4254noun/s: lr=1.50e-03, loss=1.93e+00, top1=63.32%/62.206% +[ INFO][26-Jun-24 01:22:23] Chunk 30 = Batch 33699 = Sample 17253377 +[ INFO][26-Jun-24 01:24:43] Total gradient norm stats for 72 steps: 0.1508 <= 0.165 + 0.0106z <= 0.2129 +[ INFO][26-Jun-24 01:24:43] Trained chunk 30 in 139.8s at 4254noun/s: lr=1.50e-03, loss=1.91e+00, top1=63.03%/62.399% +[ INFO][26-Jun-24 01:24:43] Chunk 31 = Batch 34861 = Sample 17848321 +[ INFO][26-Jun-24 01:27:02] Total gradient norm stats for 73 steps: 0.1487 <= 0.1607 + 0.008827z <= 0.2009 +[ INFO][26-Jun-24 01:27:02] Trained chunk 31 in 139.4s at 4268noun/s: lr=1.50e-03, loss=1.89e+00, top1=63.31%/62.578% +[ INFO][26-Jun-24 01:27:02] Chunk 32 = Batch 36023 = Sample 18443265 +[ INFO][26-Jun-24 01:29:22] Total gradient norm stats for 73 steps: 0.1477 <= 0.159 + 0.008485z <= 0.1909 +[ INFO][26-Jun-24 01:29:22] Trained chunk 32 in 140.2s at 4242noun/s: lr=1.50e-03, loss=1.87e+00, top1=63.26%/62.753% +[ INFO][26-Jun-24 01:29:22] Chunk 33 = Batch 37185 = Sample 19038209 +[ INFO][26-Jun-24 01:31:42] Total gradient norm stats for 72 steps: 0.1473 <= 0.1579 + 0.007165z <= 0.1806 +[ INFO][26-Jun-24 01:31:42] Trained chunk 33 in 139.9s at 4254noun/s: lr=1.50e-03, loss=1.86e+00, top1=63.60%/62.919% +[ INFO][26-Jun-24 01:31:42] Chunk 34 = Batch 38347 = Sample 19633153 +[ INFO][26-Jun-24 01:34:02] Total gradient norm stats for 73 steps: 0.1461 <= 0.1579 + 0.00939z <= 0.1845 +[ INFO][26-Jun-24 01:34:02] Trained chunk 34 in 139.7s at 4258noun/s: lr=1.50e-03, loss=1.84e+00, top1=64.05%/63.072% +[ INFO][26-Jun-24 01:34:02] Chunk 35 = Batch 39509 = Sample 20228097 +[ INFO][26-Jun-24 01:36:22] Total gradient norm stats for 72 steps: 0.1446 <= 0.1539 + 0.006163z <= 0.1729 +[ INFO][26-Jun-24 01:36:22] Trained chunk 35 in 140.0s at 4248noun/s: lr=1.49e-03, loss=1.83e+00, top1=65.13%/63.226% +[ INFO][26-Jun-24 01:36:22] Chunk 36 = Batch 40671 = Sample 20823041 +[ INFO][26-Jun-24 01:38:42] Total gradient norm stats for 73 steps: 0.1433 <= 0.1541 + 0.009787z <= 0.1871 +[ INFO][26-Jun-24 01:38:42] Trained chunk 36 in 139.9s at 4252noun/s: lr=1.49e-03, loss=1.81e+00, top1=64.88%/63.375% +[ INFO][26-Jun-24 01:38:42] Chunk 37 = Batch 41833 = Sample 21417985 +[ INFO][26-Jun-24 01:41:02] Total gradient norm stats for 73 steps: 0.1423 <= 0.1546 + 0.008979z <= 0.1855 +[ INFO][26-Jun-24 01:41:02] Trained chunk 37 in 140.2s at 4244noun/s: lr=1.49e-03, loss=1.80e+00, top1=65.58%/63.517% +[ INFO][26-Jun-24 01:41:02] Chunk 38 = Batch 42995 = Sample 22012929 +[ INFO][26-Jun-24 01:43:22] Total gradient norm stats for 72 steps: 0.1412 <= 0.1537 + 0.008724z <= 0.1791 +[ INFO][26-Jun-24 01:43:22] Trained chunk 38 in 139.7s at 4259noun/s: lr=1.49e-03, loss=1.79e+00, top1=63.95%/63.653% +[ INFO][26-Jun-24 01:43:22] Chunk 39 = Batch 44157 = Sample 22607873 +[ INFO][26-Jun-24 01:45:42] Total gradient norm stats for 73 steps: 0.1393 <= 0.1512 + 0.006535z <= 0.1724 +[ INFO][26-Jun-24 01:45:42] Trained chunk 39 in 139.9s at 4253noun/s: lr=1.49e-03, loss=1.77e+00, top1=64.18%/63.779% +[ INFO][26-Jun-24 01:45:42] Chunk 40 = Batch 45319 = Sample 23202817 +[ INFO][26-Jun-24 01:48:01] Total gradient norm stats for 73 steps: 0.1413 <= 0.152 + 0.008037z <= 0.1827 +[ INFO][26-Jun-24 01:48:01] Trained chunk 40 in 139.9s at 4251noun/s: lr=1.49e-03, loss=1.76e+00, top1=63.57%/63.907% +[ INFO][26-Jun-24 01:48:01] Chunk 41 = Batch 46481 = Sample 23797761 +[ INFO][26-Jun-24 01:50:21] Total gradient norm stats for 72 steps: 0.1396 <= 0.1472 + 0.005755z <= 0.1673 +[ INFO][26-Jun-24 01:50:21] Trained chunk 41 in 139.9s at 4254noun/s: lr=1.49e-03, loss=1.75e+00, top1=64.49%/64.035% +[ INFO][26-Jun-24 01:50:21] Chunk 42 = Batch 47643 = Sample 24392705 +[ INFO][26-Jun-24 01:52:41] Total gradient norm stats for 73 steps: 0.1394 <= 0.1509 + 0.01055z <= 0.2038 +[ INFO][26-Jun-24 01:52:41] Trained chunk 42 in 140.1s at 4246noun/s: lr=1.49e-03, loss=1.74e+00, top1=66.08%/64.153% +[ INFO][26-Jun-24 01:52:41] Chunk 43 = Batch 48805 = Sample 24987649 +[ INFO][26-Jun-24 01:55:01] Total gradient norm stats for 72 steps: 0.1359 <= 0.1465 + 0.007929z <= 0.1761 +[ INFO][26-Jun-24 01:55:01] Trained chunk 43 in 139.8s at 4255noun/s: lr=1.49e-03, loss=1.73e+00, top1=64.49%/64.264% +[ INFO][26-Jun-24 01:55:01] Chunk 44 = Batch 49967 = Sample 25582593 +[ INFO][26-Jun-24 01:57:21] Total gradient norm stats for 73 steps: 0.1383 <= 0.1488 + 0.01136z <= 0.1868 +[ INFO][26-Jun-24 01:57:21] Trained chunk 44 in 140.1s at 4246noun/s: lr=1.49e-03, loss=1.72e+00, top1=65.43%/64.373% +[ INFO][26-Jun-24 01:57:21] Chunk 45 = Batch 51129 = Sample 26177537 +[ INFO][26-Jun-24 01:59:42] Total gradient norm stats for 73 steps: 0.1354 <= 0.1479 + 0.009892z <= 0.1918 +[ INFO][26-Jun-24 01:59:42] Trained chunk 45 in 140.5s at 4235noun/s: lr=1.49e-03, loss=1.71e+00, top1=65.28%/64.481% +[ INFO][26-Jun-24 01:59:42] Chunk 46 = Batch 52291 = Sample 26772481 +[ INFO][26-Jun-24 02:02:02] Total gradient norm stats for 72 steps: 0.136 <= 0.1483 + 0.01164z <= 0.2046 +[ INFO][26-Jun-24 02:02:02] Trained chunk 46 in 139.8s at 4254noun/s: lr=1.49e-03, loss=1.70e+00, top1=64.76%/64.575% +[ INFO][26-Jun-24 02:02:02] Chunk 47 = Batch 53453 = Sample 27367425 +[ INFO][26-Jun-24 02:04:22] Total gradient norm stats for 73 steps: 0.1353 <= 0.1446 + 0.00657z <= 0.1705 +[ INFO][26-Jun-24 02:04:22] Trained chunk 47 in 139.9s at 4252noun/s: lr=1.49e-03, loss=1.69e+00, top1=65.78%/64.677% +[ INFO][26-Jun-24 02:04:22] Chunk 48 = Batch 54615 = Sample 27962369 +[ INFO][26-Jun-24 02:06:42] Total gradient norm stats for 73 steps: 0.1351 <= 0.1444 + 0.007372z <= 0.1685 +[ INFO][26-Jun-24 02:06:42] Trained chunk 48 in 139.9s at 4252noun/s: lr=1.49e-03, loss=1.68e+00, top1=64.83%/64.771% +[ INFO][26-Jun-24 02:06:42] Saved checkpoint: /data/strahl/Code/ovod/outputs/ovod_20240626_001447/ovod_chunk0048_20240626_020642.train +[ INFO][26-Jun-24 02:06:42] Chunk 49 = Batch 55777 = Sample 28557313 +[ INFO][26-Jun-24 02:09:01] Total gradient norm stats for 72 steps: 0.1341 <= 0.1436 + 0.007929z <= 0.1716 +[ INFO][26-Jun-24 02:09:01] Trained chunk 49 in 139.6s at 4261noun/s: lr=1.49e-03, loss=1.68e+00, top1=65.94%/64.862% +[ INFO][26-Jun-24 02:09:01] Chunk 50 = Batch 56939 = Sample 29152257 +[ INFO][26-Jun-24 02:11:24] Epoch 1 finished in 6992.3s +[ INFO][26-Jun-24 02:11:24] -------------------------------------------------------------------------------- +[ INFO][26-Jun-24 02:11:24] Epoch 2 = Batch 58097 = Sample 29745153 +[ INFO][26-Jun-24 02:11:25] Total gradient norm stats for 73 steps: 0.134 <= 0.1416 + 0.007375z <= 0.1705 +[ INFO][26-Jun-24 02:11:25] Trained chunk 50 in 143.6s at 4143noun/s: lr=1.49e-03, loss=1.67e+00, top1=65.38%/64.951% +[ INFO][26-Jun-24 02:11:25] Chunk 51 = Batch 58101 = Sample 29747201 +[ INFO][26-Jun-24 02:13:45] Total gradient norm stats for 72 steps: 0.1328 <= 0.1428 + 0.008393z <= 0.1689 +[ INFO][26-Jun-24 02:13:45] Trained chunk 51 in 139.6s at 4263noun/s: lr=1.49e-03, loss=1.66e+00, top1=65.44%/65.029% +[ INFO][26-Jun-24 02:13:45] Chunk 52 = Batch 59263 = Sample 30342145 +[ INFO][26-Jun-24 02:16:04] Total gradient norm stats for 73 steps: 0.1321 <= 0.1421 + 0.006821z <= 0.1653 +[ INFO][26-Jun-24 02:16:04] Trained chunk 52 in 139.8s at 4256noun/s: lr=1.49e-03, loss=1.65e+00, top1=65.34%/65.116% +[ INFO][26-Jun-24 02:16:04] Chunk 53 = Batch 60425 = Sample 30937089 +[ INFO][26-Jun-24 02:18:24] Total gradient norm stats for 73 steps: 0.132 <= 0.1404 + 0.00647z <= 0.161 +[ INFO][26-Jun-24 02:18:24] Trained chunk 53 in 139.7s at 4258noun/s: lr=1.49e-03, loss=1.65e+00, top1=65.09%/65.195% +[ INFO][26-Jun-24 02:18:24] Chunk 54 = Batch 61587 = Sample 31532033 +[ INFO][26-Jun-24 02:20:44] Total gradient norm stats for 72 steps: 0.1309 <= 0.1442 + 0.01153z <= 0.19 +[ INFO][26-Jun-24 02:20:44] Trained chunk 54 in 139.4s at 4267noun/s: lr=1.49e-03, loss=1.64e+00, top1=66.49%/65.274% +[ INFO][26-Jun-24 02:20:44] Chunk 55 = Batch 62749 = Sample 32126977 +[ INFO][26-Jun-24 02:23:03] Total gradient norm stats for 73 steps: 0.1325 <= 0.1405 + 0.005659z <= 0.1628 +[ INFO][26-Jun-24 02:23:03] Trained chunk 55 in 139.4s at 4268noun/s: lr=1.49e-03, loss=1.63e+00, top1=65.31%/65.352% +[ INFO][26-Jun-24 02:23:03] Chunk 56 = Batch 63911 = Sample 32721921 +[ INFO][26-Jun-24 02:25:23] Total gradient norm stats for 73 steps: 0.1295 <= 0.138 + 0.005848z <= 0.159 +[ INFO][26-Jun-24 02:25:23] Trained chunk 56 in 139.7s at 4257noun/s: lr=1.49e-03, loss=1.63e+00, top1=67.02%/65.436% +[ INFO][26-Jun-24 02:25:23] Chunk 57 = Batch 65073 = Sample 33316865 +[ INFO][26-Jun-24 02:27:43] Total gradient norm stats for 72 steps: 0.1306 <= 0.137 + 0.004445z <= 0.1514 +[ INFO][26-Jun-24 02:27:43] Trained chunk 57 in 140.0s at 4249noun/s: lr=1.49e-03, loss=1.62e+00, top1=64.04%/65.503% +[ INFO][26-Jun-24 02:27:43] Chunk 58 = Batch 66235 = Sample 33911809 +[ INFO][26-Jun-24 02:30:03] Total gradient norm stats for 73 steps: 0.13 <= 0.1406 + 0.008595z <= 0.17 +[ INFO][26-Jun-24 02:30:03] Trained chunk 58 in 139.8s at 4255noun/s: lr=1.49e-03, loss=1.61e+00, top1=66.29%/65.583% +[ INFO][26-Jun-24 02:30:03] Chunk 59 = Batch 67397 = Sample 34506753 +[ INFO][26-Jun-24 02:32:22] Total gradient norm stats for 72 steps: 0.129 <= 0.1374 + 0.006556z <= 0.1664 +[ INFO][26-Jun-24 02:32:22] Trained chunk 59 in 139.6s at 4263noun/s: lr=1.48e-03, loss=1.61e+00, top1=65.29%/65.650% +[ INFO][26-Jun-24 02:32:22] Chunk 60 = Batch 68559 = Sample 35101697 +[ INFO][26-Jun-24 02:34:42] Total gradient norm stats for 73 steps: 0.1299 <= 0.1376 + 0.00581z <= 0.1614 +[ INFO][26-Jun-24 02:34:42] Trained chunk 60 in 139.9s at 4252noun/s: lr=1.48e-03, loss=1.60e+00, top1=65.67%/65.721% +[ INFO][26-Jun-24 02:34:42] Chunk 61 = Batch 69721 = Sample 35696641 +[ INFO][26-Jun-24 02:37:02] Total gradient norm stats for 73 steps: 0.1292 <= 0.1367 + 0.005543z <= 0.1584 +[ INFO][26-Jun-24 02:37:02] Trained chunk 61 in 139.9s at 4252noun/s: lr=1.48e-03, loss=1.60e+00, top1=66.30%/65.784% +[ INFO][26-Jun-24 02:37:02] Chunk 62 = Batch 70883 = Sample 36291585 +[ INFO][26-Jun-24 02:39:22] Total gradient norm stats for 72 steps: 0.1285 <= 0.1377 + 0.008249z <= 0.1697 +[ INFO][26-Jun-24 02:39:22] Trained chunk 62 in 139.6s at 4261noun/s: lr=1.48e-03, loss=1.59e+00, top1=66.39%/65.846% +[ INFO][26-Jun-24 02:39:22] Chunk 63 = Batch 72045 = Sample 36886529 +[ INFO][26-Jun-24 02:41:41] Total gradient norm stats for 73 steps: 0.127 <= 0.1355 + 0.005148z <= 0.1611 +[ INFO][26-Jun-24 02:41:41] Trained chunk 63 in 139.9s at 4254noun/s: lr=1.48e-03, loss=1.59e+00, top1=66.71%/65.913% +[ INFO][26-Jun-24 02:41:41] Chunk 64 = Batch 73207 = Sample 37481473 +[ INFO][26-Jun-24 02:44:01] Total gradient norm stats for 73 steps: 0.1268 <= 0.1355 + 0.006167z <= 0.1623 +[ INFO][26-Jun-24 02:44:01] Trained chunk 64 in 139.7s at 4259noun/s: lr=1.48e-03, loss=1.58e+00, top1=64.20%/65.978% +[ INFO][26-Jun-24 02:44:01] Chunk 65 = Batch 74369 = Sample 38076417 +[ INFO][26-Jun-24 02:46:21] Total gradient norm stats for 72 steps: 0.1273 <= 0.1344 + 0.005594z <= 0.1545 +[ INFO][26-Jun-24 02:46:21] Trained chunk 65 in 139.8s at 4256noun/s: lr=1.48e-03, loss=1.58e+00, top1=66.20%/66.038% +[ INFO][26-Jun-24 02:46:21] Chunk 66 = Batch 75531 = Sample 38671361 +[ INFO][26-Jun-24 02:48:41] Total gradient norm stats for 73 steps: 0.1262 <= 0.1373 + 0.007159z <= 0.1582 +[ INFO][26-Jun-24 02:48:41] Trained chunk 66 in 139.6s at 4262noun/s: lr=1.48e-03, loss=1.57e+00, top1=67.19%/66.102% +[ INFO][26-Jun-24 02:48:41] Chunk 67 = Batch 76693 = Sample 39266305 +[ INFO][26-Jun-24 02:51:00] Total gradient norm stats for 72 steps: 0.1277 <= 0.135 + 0.00628z <= 0.1604 +[ INFO][26-Jun-24 02:51:00] Trained chunk 67 in 139.7s at 4259noun/s: lr=1.48e-03, loss=1.56e+00, top1=66.41%/66.165% +[ INFO][26-Jun-24 02:51:00] Chunk 68 = Batch 77855 = Sample 39861249 +[ INFO][26-Jun-24 02:53:20] Total gradient norm stats for 73 steps: 0.126 <= 0.1357 + 0.006418z <= 0.1527 +[ INFO][26-Jun-24 02:53:20] Trained chunk 68 in 139.6s at 4262noun/s: lr=1.48e-03, loss=1.56e+00, top1=67.34%/66.225% +[ INFO][26-Jun-24 02:53:20] Chunk 69 = Batch 79017 = Sample 40456193 +[ INFO][26-Jun-24 02:55:40] Total gradient norm stats for 73 steps: 0.1254 <= 0.1369 + 0.0064z <= 0.1666 +[ INFO][26-Jun-24 02:55:40] Trained chunk 69 in 139.9s at 4252noun/s: lr=1.48e-03, loss=1.56e+00, top1=66.81%/66.287% +[ INFO][26-Jun-24 02:55:40] Chunk 70 = Batch 80179 = Sample 41051137 +[ INFO][26-Jun-24 02:58:00] Total gradient norm stats for 72 steps: 0.1264 <= 0.1354 + 0.007847z <= 0.1579 +[ INFO][26-Jun-24 02:58:00] Trained chunk 70 in 139.8s at 4255noun/s: lr=1.48e-03, loss=1.55e+00, top1=65.93%/66.339% +[ INFO][26-Jun-24 02:58:00] Chunk 71 = Batch 81341 = Sample 41646081 +[ INFO][26-Jun-24 03:00:19] Total gradient norm stats for 73 steps: 0.1263 <= 0.1346 + 0.006083z <= 0.15 +[ INFO][26-Jun-24 03:00:19] Trained chunk 71 in 139.8s at 4257noun/s: lr=1.48e-03, loss=1.55e+00, top1=66.03%/66.393% +[ INFO][26-Jun-24 03:00:19] Chunk 72 = Batch 82503 = Sample 42241025 +[ INFO][26-Jun-24 03:02:39] Total gradient norm stats for 73 steps: 0.1262 <= 0.1328 + 0.006164z <= 0.1566 +[ INFO][26-Jun-24 03:02:39] Trained chunk 72 in 139.5s at 4264noun/s: lr=1.48e-03, loss=1.54e+00, top1=66.82%/66.447% +[ INFO][26-Jun-24 03:02:39] Chunk 73 = Batch 83665 = Sample 42835969 +[ INFO][26-Jun-24 03:04:59] Total gradient norm stats for 72 steps: 0.1237 <= 0.1349 + 0.00799z <= 0.1595 +[ INFO][26-Jun-24 03:04:59] Trained chunk 73 in 139.8s at 4255noun/s: lr=1.48e-03, loss=1.54e+00, top1=65.08%/66.495% +[ INFO][26-Jun-24 03:04:59] Chunk 74 = Batch 84827 = Sample 43430913 +[ INFO][26-Jun-24 03:07:19] Total gradient norm stats for 73 steps: 0.1264 <= 0.1346 + 0.006852z <= 0.1574 +[ INFO][26-Jun-24 03:07:19] Trained chunk 74 in 139.9s at 4252noun/s: lr=1.48e-03, loss=1.53e+00, top1=65.80%/66.544% +[ INFO][26-Jun-24 03:07:19] Chunk 75 = Batch 85989 = Sample 44025857 +[ INFO][26-Jun-24 03:09:38] Total gradient norm stats for 72 steps: 0.125 <= 0.1318 + 0.003789z <= 0.1421 +[ INFO][26-Jun-24 03:09:38] Trained chunk 75 in 139.6s at 4263noun/s: lr=1.48e-03, loss=1.53e+00, top1=68.07%/66.600% +[ INFO][26-Jun-24 03:09:38] Chunk 76 = Batch 87151 = Sample 44620801 +[ INFO][26-Jun-24 03:11:58] Total gradient norm stats for 73 steps: 0.1236 <= 0.1337 + 0.007785z <= 0.1639 +[ INFO][26-Jun-24 03:11:58] Trained chunk 76 in 140.2s at 4243noun/s: lr=1.47e-03, loss=1.53e+00, top1=66.62%/66.647% +[ INFO][26-Jun-24 03:11:58] Chunk 77 = Batch 88313 = Sample 45215745 +[ INFO][26-Jun-24 03:14:18] Total gradient norm stats for 73 steps: 0.124 <= 0.1309 + 0.003883z <= 0.1449 +[ INFO][26-Jun-24 03:14:18] Trained chunk 77 in 139.9s at 4252noun/s: lr=1.47e-03, loss=1.52e+00, top1=65.96%/66.697% +[ INFO][26-Jun-24 03:14:18] Chunk 78 = Batch 89475 = Sample 45810689 +[ INFO][26-Jun-24 03:16:38] Total gradient norm stats for 72 steps: 0.1256 <= 0.1337 + 0.006949z <= 0.16 +[ INFO][26-Jun-24 03:16:38] Trained chunk 78 in 139.8s at 4255noun/s: lr=1.47e-03, loss=1.52e+00, top1=66.57%/66.747% +[ INFO][26-Jun-24 03:16:38] Chunk 79 = Batch 90637 = Sample 46405633 +[ INFO][26-Jun-24 03:18:58] Total gradient norm stats for 73 steps: 0.1243 <= 0.1318 + 0.004803z <= 0.147 +[ INFO][26-Jun-24 03:18:58] Trained chunk 79 in 139.5s at 4264noun/s: lr=1.47e-03, loss=1.52e+00, top1=66.47%/66.793% +[ INFO][26-Jun-24 03:18:58] Chunk 80 = Batch 91799 = Sample 47000577 +[ INFO][26-Jun-24 03:21:18] Total gradient norm stats for 73 steps: 0.1248 <= 0.1312 + 0.003568z <= 0.1407 +[ INFO][26-Jun-24 03:21:18] Trained chunk 80 in 140.3s at 4240noun/s: lr=1.47e-03, loss=1.51e+00, top1=67.83%/66.840% +[ INFO][26-Jun-24 03:21:18] Chunk 81 = Batch 92961 = Sample 47595521 +[ INFO][26-Jun-24 03:23:38] Total gradient norm stats for 72 steps: 0.1233 <= 0.1333 + 0.006621z <= 0.154 +[ INFO][26-Jun-24 03:23:38] Trained chunk 81 in 139.8s at 4256noun/s: lr=1.47e-03, loss=1.51e+00, top1=67.77%/66.889% +[ INFO][26-Jun-24 03:23:38] Chunk 82 = Batch 94123 = Sample 48190465 +[ INFO][26-Jun-24 03:25:58] Total gradient norm stats for 73 steps: 0.126 <= 0.1323 + 0.005302z <= 0.1492 +[ INFO][26-Jun-24 03:25:58] Trained chunk 82 in 140.1s at 4246noun/s: lr=1.47e-03, loss=1.50e+00, top1=66.68%/66.935% +[ INFO][26-Jun-24 03:25:58] Chunk 83 = Batch 95285 = Sample 48785409 +[ INFO][26-Jun-24 03:28:18] Total gradient norm stats for 72 steps: 0.1236 <= 0.1315 + 0.004616z <= 0.1449 +[ INFO][26-Jun-24 03:28:18] Trained chunk 83 in 139.8s at 4256noun/s: lr=1.47e-03, loss=1.50e+00, top1=67.99%/66.981% +[ INFO][26-Jun-24 03:28:18] Chunk 84 = Batch 96447 = Sample 49380353 +[ INFO][26-Jun-24 03:30:37] Total gradient norm stats for 73 steps: 0.1242 <= 0.1309 + 0.004986z <= 0.1487 +[ INFO][26-Jun-24 03:30:37] Trained chunk 84 in 139.8s at 4257noun/s: lr=1.47e-03, loss=1.50e+00, top1=66.53%/67.020% +[ INFO][26-Jun-24 03:30:37] Chunk 85 = Batch 97609 = Sample 49975297 +[ INFO][26-Jun-24 03:32:57] Total gradient norm stats for 73 steps: 0.1247 <= 0.1344 + 0.006754z <= 0.1692 +[ INFO][26-Jun-24 03:32:57] Trained chunk 85 in 140.0s at 4249noun/s: lr=1.47e-03, loss=1.49e+00, top1=66.53%/67.061% +[ INFO][26-Jun-24 03:32:57] Chunk 86 = Batch 98771 = Sample 50570241 +[ INFO][26-Jun-24 03:35:17] Total gradient norm stats for 72 steps: 0.1233 <= 0.1308 + 0.003633z <= 0.1412 +[ INFO][26-Jun-24 03:35:17] Trained chunk 86 in 139.8s at 4254noun/s: lr=1.47e-03, loss=1.49e+00, top1=67.34%/67.106% +[ INFO][26-Jun-24 03:35:17] Chunk 87 = Batch 99933 = Sample 51165185 +[ INFO][26-Jun-24 03:37:37] Total gradient norm stats for 73 steps: 0.125 <= 0.1333 + 0.005285z <= 0.152 +[ INFO][26-Jun-24 03:37:37] Trained chunk 87 in 139.9s at 4253noun/s: lr=1.47e-03, loss=1.49e+00, top1=67.06%/67.145% +[ INFO][26-Jun-24 03:37:37] Chunk 88 = Batch 101095 = Sample 51760129 +[ INFO][26-Jun-24 03:39:57] Total gradient norm stats for 73 steps: 0.1247 <= 0.1342 + 0.007703z <= 0.1697 +[ INFO][26-Jun-24 03:39:57] Trained chunk 88 in 140.2s at 4243noun/s: lr=1.47e-03, loss=1.48e+00, top1=68.22%/67.189% +[ INFO][26-Jun-24 03:39:57] Chunk 89 = Batch 102257 = Sample 52355073 +[ INFO][26-Jun-24 03:42:17] Total gradient norm stats for 72 steps: 0.124 <= 0.133 + 0.00556z <= 0.1509 +[ INFO][26-Jun-24 03:42:17] Trained chunk 89 in 139.8s at 4256noun/s: lr=1.46e-03, loss=1.48e+00, top1=67.78%/67.225% +[ INFO][26-Jun-24 03:42:17] Chunk 90 = Batch 103419 = Sample 52950017 +[ INFO][26-Jun-24 03:44:37] Total gradient norm stats for 73 steps: 0.1242 <= 0.1337 + 0.009042z <= 0.1789 +[ INFO][26-Jun-24 03:44:37] Trained chunk 90 in 140.0s at 4250noun/s: lr=1.46e-03, loss=1.48e+00, top1=65.67%/67.263% +[ INFO][26-Jun-24 03:44:37] Chunk 91 = Batch 104581 = Sample 53544961 +[ INFO][26-Jun-24 03:46:57] Total gradient norm stats for 72 steps: 0.1224 <= 0.131 + 0.004454z <= 0.1443 +[ INFO][26-Jun-24 03:46:57] Trained chunk 91 in 139.8s at 4257noun/s: lr=1.46e-03, loss=1.47e+00, top1=67.48%/67.300% +[ INFO][26-Jun-24 03:46:57] Chunk 92 = Batch 105743 = Sample 54139905 +[ INFO][26-Jun-24 03:49:17] Total gradient norm stats for 73 steps: 0.1265 <= 0.1334 + 0.004621z <= 0.146 +[ INFO][26-Jun-24 03:49:17] Trained chunk 92 in 140.0s at 4249noun/s: lr=1.46e-03, loss=1.47e+00, top1=68.04%/67.341% +[ INFO][26-Jun-24 03:49:17] Chunk 93 = Batch 106905 = Sample 54734849 +[ INFO][26-Jun-24 03:51:37] Total gradient norm stats for 73 steps: 0.1224 <= 0.1319 + 0.004372z <= 0.1419 +[ INFO][26-Jun-24 03:51:37] Trained chunk 93 in 139.9s at 4252noun/s: lr=1.46e-03, loss=1.47e+00, top1=66.39%/67.377% +[ INFO][26-Jun-24 03:51:37] Chunk 94 = Batch 108067 = Sample 55329793 +[ INFO][26-Jun-24 03:53:57] Total gradient norm stats for 72 steps: 0.1235 <= 0.1317 + 0.005213z <= 0.1531 +[ INFO][26-Jun-24 03:53:57] Trained chunk 94 in 139.8s at 4255noun/s: lr=1.46e-03, loss=1.47e+00, top1=68.01%/67.408% +[ INFO][26-Jun-24 03:53:57] Chunk 95 = Batch 109229 = Sample 55924737 +[ INFO][26-Jun-24 03:56:16] Total gradient norm stats for 73 steps: 0.1226 <= 0.1321 + 0.005945z <= 0.1505 +[ INFO][26-Jun-24 03:56:16] Trained chunk 95 in 139.6s at 4262noun/s: lr=1.46e-03, loss=1.46e+00, top1=66.90%/67.447% +[ INFO][26-Jun-24 03:56:16] Chunk 96 = Batch 110391 = Sample 56519681 +[ INFO][26-Jun-24 03:58:36] Total gradient norm stats for 73 steps: 0.1242 <= 0.1335 + 0.006455z <= 0.1591 +[ INFO][26-Jun-24 03:58:36] Trained chunk 96 in 139.5s at 4265noun/s: lr=1.46e-03, loss=1.46e+00, top1=67.97%/67.486% +[ INFO][26-Jun-24 03:58:36] Saved checkpoint: /data/strahl/Code/ovod/outputs/ovod_20240626_001447/ovod_chunk0096_20240626_035836.train +[ INFO][26-Jun-24 03:58:36] Chunk 97 = Batch 111553 = Sample 57114625 +[ INFO][26-Jun-24 04:00:56] Total gradient norm stats for 72 steps: 0.1259 <= 0.1347 + 0.008886z <= 0.1711 +[ INFO][26-Jun-24 04:00:56] Trained chunk 97 in 139.6s at 4262noun/s: lr=1.46e-03, loss=1.46e+00, top1=68.67%/67.519% +[ INFO][26-Jun-24 04:00:56] Chunk 98 = Batch 112715 = Sample 57709569 +[ INFO][26-Jun-24 04:03:15] Total gradient norm stats for 73 steps: 0.1258 <= 0.1332 + 0.00451z <= 0.1439 +[ INFO][26-Jun-24 04:03:15] Trained chunk 98 in 139.4s at 4268noun/s: lr=1.46e-03, loss=1.46e+00, top1=67.45%/67.559% +[ INFO][26-Jun-24 04:03:15] Chunk 99 = Batch 113877 = Sample 58304513 +[ INFO][26-Jun-24 04:05:35] Total gradient norm stats for 72 steps: 0.1236 <= 0.1336 + 0.005675z <= 0.153 +[ INFO][26-Jun-24 04:05:35] Trained chunk 99 in 139.7s at 4259noun/s: lr=1.46e-03, loss=1.45e+00, top1=67.86%/67.595% +[ INFO][26-Jun-24 04:05:35] Chunk 100 = Batch 115039 = Sample 58899457 +[ INFO][26-Jun-24 04:07:55] Epoch 2 finished in 6990.5s +[ INFO][26-Jun-24 04:07:55] -------------------------------------------------------------------------------- +[ INFO][26-Jun-24 04:07:55] Epoch 3 = Batch 116193 = Sample 59490305 +[ INFO][26-Jun-24 04:07:56] Total gradient norm stats for 73 steps: 0.1245 <= 0.1368 + 0.009672z <= 0.1716 +[ INFO][26-Jun-24 04:07:56] Trained chunk 100 in 141.2s at 4214noun/s: lr=1.46e-03, loss=1.45e+00, top1=66.40%/67.625% +[ INFO][26-Jun-24 04:07:56] Chunk 101 = Batch 116201 = Sample 59494401 +[ INFO][26-Jun-24 04:10:16] Total gradient norm stats for 73 steps: 0.123 <= 0.1315 + 0.004291z <= 0.1455 +[ INFO][26-Jun-24 04:10:16] Trained chunk 101 in 139.7s at 4260noun/s: lr=1.45e-03, loss=1.45e+00, top1=69.22%/67.664% +[ INFO][26-Jun-24 04:10:16] Chunk 102 = Batch 117363 = Sample 60089345 +[ INFO][26-Jun-24 04:12:35] Total gradient norm stats for 72 steps: 0.1249 <= 0.1363 + 0.01927z <= 0.2863 +[ INFO][26-Jun-24 04:12:35] Trained chunk 102 in 139.6s at 4262noun/s: lr=1.45e-03, loss=1.44e+00, top1=66.72%/67.697% +[ INFO][26-Jun-24 04:12:35] Chunk 103 = Batch 118525 = Sample 60684289 +[ INFO][26-Jun-24 04:14:55] Total gradient norm stats for 73 steps: 0.1261 <= 0.1367 + 0.00789z <= 0.1605 +[ INFO][26-Jun-24 04:14:55] Trained chunk 103 in 139.7s at 4259noun/s: lr=1.45e-03, loss=1.44e+00, top1=67.97%/67.724% +[ INFO][26-Jun-24 04:14:55] Chunk 104 = Batch 119687 = Sample 61279233 +[ INFO][26-Jun-24 04:17:15] Total gradient norm stats for 73 steps: 0.1232 <= 0.1384 + 0.009868z <= 0.1738 +[ INFO][26-Jun-24 04:17:15] Trained chunk 104 in 139.8s at 4255noun/s: lr=1.45e-03, loss=1.44e+00, top1=68.54%/67.752% +[ INFO][26-Jun-24 04:17:15] Chunk 105 = Batch 120849 = Sample 61874177 +[ INFO][26-Jun-24 04:19:35] Total gradient norm stats for 72 steps: 0.1245 <= 0.1352 + 0.007084z <= 0.1562 +[ INFO][26-Jun-24 04:19:35] Trained chunk 105 in 139.7s at 4259noun/s: lr=1.45e-03, loss=1.44e+00, top1=68.78%/67.785% +[ INFO][26-Jun-24 04:19:35] Chunk 106 = Batch 122011 = Sample 62469121 +[ INFO][26-Jun-24 04:21:54] Total gradient norm stats for 73 steps: 0.1255 <= 0.1352 + 0.007277z <= 0.1582 +[ INFO][26-Jun-24 04:21:54] Trained chunk 106 in 139.7s at 4259noun/s: lr=1.45e-03, loss=1.43e+00, top1=68.95%/67.816% +[ INFO][26-Jun-24 04:21:54] Chunk 107 = Batch 123173 = Sample 63064065 +[ INFO][26-Jun-24 04:24:14] Total gradient norm stats for 72 steps: 0.1236 <= 0.1338 + 0.005113z <= 0.1548 +[ INFO][26-Jun-24 04:24:14] Trained chunk 107 in 140.0s at 4250noun/s: lr=1.45e-03, loss=1.43e+00, top1=67.66%/67.846% +[ INFO][26-Jun-24 04:24:14] Chunk 108 = Batch 124335 = Sample 63659009 +[ INFO][26-Jun-24 04:26:34] Total gradient norm stats for 73 steps: 0.1271 <= 0.1356 + 0.006715z <= 0.1637 +[ INFO][26-Jun-24 04:26:34] Trained chunk 108 in 139.8s at 4257noun/s: lr=1.45e-03, loss=1.43e+00, top1=66.27%/67.875% +[ INFO][26-Jun-24 04:26:34] Chunk 109 = Batch 125497 = Sample 64253953 +[ INFO][26-Jun-24 04:28:54] Total gradient norm stats for 73 steps: 0.1255 <= 0.1338 + 0.005233z <= 0.1613 +[ INFO][26-Jun-24 04:28:54] Trained chunk 109 in 139.9s at 4252noun/s: lr=1.45e-03, loss=1.43e+00, top1=68.18%/67.904% +[ INFO][26-Jun-24 04:28:54] Chunk 110 = Batch 126659 = Sample 64848897 +[ INFO][26-Jun-24 04:31:14] Total gradient norm stats for 72 steps: 0.1247 <= 0.1332 + 0.005247z <= 0.149 +[ INFO][26-Jun-24 04:31:14] Trained chunk 110 in 139.7s at 4258noun/s: lr=1.45e-03, loss=1.43e+00, top1=69.20%/67.940% +[ INFO][26-Jun-24 04:31:14] Chunk 111 = Batch 127821 = Sample 65443841 +[ INFO][26-Jun-24 04:33:33] Total gradient norm stats for 73 steps: 0.1279 <= 0.1367 + 0.006045z <= 0.154 +[ INFO][26-Jun-24 04:33:33] Trained chunk 111 in 139.7s at 4257noun/s: lr=1.45e-03, loss=1.42e+00, top1=68.45%/67.972% +[ INFO][26-Jun-24 04:33:33] Chunk 112 = Batch 128983 = Sample 66038785 +[ INFO][26-Jun-24 04:35:53] Total gradient norm stats for 73 steps: 0.1264 <= 0.1354 + 0.005198z <= 0.1503 +[ INFO][26-Jun-24 04:35:53] Trained chunk 112 in 140.0s at 4250noun/s: lr=1.44e-03, loss=1.42e+00, top1=67.49%/68.004% +[ INFO][26-Jun-24 04:35:53] Chunk 113 = Batch 130145 = Sample 66633729 +[ INFO][26-Jun-24 04:38:13] Total gradient norm stats for 72 steps: 0.1274 <= 0.1369 + 0.006186z <= 0.1537 +[ INFO][26-Jun-24 04:38:13] Trained chunk 113 in 140.0s at 4251noun/s: lr=1.44e-03, loss=1.42e+00, top1=67.11%/68.028% +[ INFO][26-Jun-24 04:38:13] Chunk 114 = Batch 131307 = Sample 67228673 +[ INFO][26-Jun-24 04:40:34] Total gradient norm stats for 73 steps: 0.1259 <= 0.136 + 0.006057z <= 0.1524 +[ INFO][26-Jun-24 04:40:34] Trained chunk 114 in 140.1s at 4245noun/s: lr=1.44e-03, loss=1.42e+00, top1=67.82%/68.057% +[ INFO][26-Jun-24 04:40:34] Chunk 115 = Batch 132469 = Sample 67823617 +[ INFO][26-Jun-24 04:42:53] Total gradient norm stats for 72 steps: 0.1287 <= 0.1382 + 0.006189z <= 0.1562 +[ INFO][26-Jun-24 04:42:53] Trained chunk 115 in 139.4s at 4266noun/s: lr=1.44e-03, loss=1.41e+00, top1=68.31%/68.089% +[ INFO][26-Jun-24 04:42:53] Chunk 116 = Batch 133631 = Sample 68418561 +[ INFO][26-Jun-24 04:45:13] Total gradient norm stats for 73 steps: 0.1258 <= 0.1411 + 0.009395z <= 0.168 +[ INFO][26-Jun-24 04:45:13] Trained chunk 116 in 140.0s at 4249noun/s: lr=1.44e-03, loss=1.41e+00, top1=68.36%/68.119% +[ INFO][26-Jun-24 04:45:13] Chunk 117 = Batch 134793 = Sample 69013505 +[ INFO][26-Jun-24 04:47:33] Total gradient norm stats for 73 steps: 0.1283 <= 0.1362 + 0.006801z <= 0.1788 +[ INFO][26-Jun-24 04:47:33] Trained chunk 117 in 139.6s at 4262noun/s: lr=1.44e-03, loss=1.41e+00, top1=68.61%/68.151% +[ INFO][26-Jun-24 04:47:33] Chunk 118 = Batch 135955 = Sample 69608449 +[ INFO][26-Jun-24 04:49:52] Total gradient norm stats for 72 steps: 0.1275 <= 0.138 + 0.006187z <= 0.155 +[ INFO][26-Jun-24 04:49:52] Trained chunk 118 in 139.6s at 4262noun/s: lr=1.44e-03, loss=1.41e+00, top1=67.25%/68.181% +[ INFO][26-Jun-24 04:49:52] Chunk 119 = Batch 137117 = Sample 70203393 +[ INFO][26-Jun-24 04:52:12] Total gradient norm stats for 73 steps: 0.1253 <= 0.1377 + 0.006455z <= 0.1581 +[ INFO][26-Jun-24 04:52:12] Trained chunk 119 in 139.5s at 4265noun/s: lr=1.44e-03, loss=1.41e+00, top1=67.78%/68.200% +[ INFO][26-Jun-24 04:52:12] Chunk 120 = Batch 138279 = Sample 70798337 +[ INFO][26-Jun-24 04:54:31] Total gradient norm stats for 73 steps: 0.125 <= 0.1388 + 0.008655z <= 0.1755 +[ INFO][26-Jun-24 04:54:31] Trained chunk 120 in 139.7s at 4260noun/s: lr=1.44e-03, loss=1.40e+00, top1=67.88%/68.227% +[ INFO][26-Jun-24 04:54:31] Chunk 121 = Batch 139441 = Sample 71393281 +[ INFO][26-Jun-24 04:56:51] Total gradient norm stats for 72 steps: 0.1274 <= 0.137 + 0.004978z <= 0.1502 +[ INFO][26-Jun-24 04:56:51] Trained chunk 121 in 139.7s at 4259noun/s: lr=1.44e-03, loss=1.40e+00, top1=68.61%/68.250% +[ INFO][26-Jun-24 04:56:51] Chunk 122 = Batch 140603 = Sample 71988225 +[ INFO][26-Jun-24 04:59:11] Total gradient norm stats for 73 steps: 0.1285 <= 0.1392 + 0.006332z <= 0.1649 +[ INFO][26-Jun-24 04:59:11] Trained chunk 122 in 139.9s at 4254noun/s: lr=1.43e-03, loss=1.40e+00, top1=68.42%/68.280% +[ INFO][26-Jun-24 04:59:11] Chunk 123 = Batch 141765 = Sample 72583169 +[ INFO][26-Jun-24 05:01:31] Total gradient norm stats for 72 steps: 0.1302 <= 0.141 + 0.006388z <= 0.1602 +[ INFO][26-Jun-24 05:01:31] Trained chunk 123 in 139.8s at 4256noun/s: lr=1.43e-03, loss=1.40e+00, top1=68.91%/68.297% +[ INFO][26-Jun-24 05:01:31] Chunk 124 = Batch 142927 = Sample 73178113 +[ INFO][26-Jun-24 05:03:50] Total gradient norm stats for 73 steps: 0.1299 <= 0.1424 + 0.00933z <= 0.1754 +[ INFO][26-Jun-24 05:03:50] Trained chunk 124 in 139.6s at 4261noun/s: lr=1.43e-03, loss=1.40e+00, top1=68.81%/68.322% +[ INFO][26-Jun-24 05:03:50] Chunk 125 = Batch 144089 = Sample 73773057 +[ INFO][26-Jun-24 05:06:10] Total gradient norm stats for 73 steps: 0.1302 <= 0.139 + 0.005307z <= 0.1535 +[ INFO][26-Jun-24 05:06:10] Trained chunk 125 in 140.1s at 4248noun/s: lr=1.43e-03, loss=1.40e+00, top1=69.05%/68.350% +[ INFO][26-Jun-24 05:06:10] Chunk 126 = Batch 145251 = Sample 74368001 +[ INFO][26-Jun-24 05:08:30] Total gradient norm stats for 72 steps: 0.1291 <= 0.1419 + 0.008172z <= 0.1701 +[ INFO][26-Jun-24 05:08:30] Trained chunk 126 in 139.6s at 4262noun/s: lr=1.43e-03, loss=1.39e+00, top1=67.71%/68.378% +[ INFO][26-Jun-24 05:08:30] Chunk 127 = Batch 146413 = Sample 74962945 +[ INFO][26-Jun-24 05:10:50] Total gradient norm stats for 73 steps: 0.1299 <= 0.1419 + 0.006228z <= 0.1571 +[ INFO][26-Jun-24 05:10:50] Trained chunk 127 in 139.6s at 4261noun/s: lr=1.43e-03, loss=1.39e+00, top1=68.32%/68.400% +[ INFO][26-Jun-24 05:10:50] Chunk 128 = Batch 147575 = Sample 75557889 +[ INFO][26-Jun-24 05:13:09] Total gradient norm stats for 73 steps: 0.1292 <= 0.1469 + 0.01012z <= 0.1741 +[ INFO][26-Jun-24 05:13:09] Trained chunk 128 in 139.6s at 4261noun/s: lr=1.43e-03, loss=1.39e+00, top1=68.60%/68.421% +[ INFO][26-Jun-24 05:13:09] Chunk 129 = Batch 148737 = Sample 76152833 +[ INFO][26-Jun-24 05:15:29] Total gradient norm stats for 72 steps: 0.13 <= 0.1453 + 0.0113z <= 0.1768 +[ INFO][26-Jun-24 05:15:29] Trained chunk 129 in 139.8s at 4257noun/s: lr=1.43e-03, loss=1.39e+00, top1=68.89%/68.442% +[ INFO][26-Jun-24 05:15:29] Chunk 130 = Batch 149899 = Sample 76747777 +[ INFO][26-Jun-24 05:17:49] Total gradient norm stats for 73 steps: 0.1292 <= 0.1402 + 0.006391z <= 0.1661 +[ INFO][26-Jun-24 05:17:49] Trained chunk 130 in 140.0s at 4250noun/s: lr=1.43e-03, loss=1.39e+00, top1=68.26%/68.466% +[ INFO][26-Jun-24 05:17:49] Chunk 131 = Batch 151061 = Sample 77342721 +[ INFO][26-Jun-24 05:20:09] Total gradient norm stats for 72 steps: 0.1321 <= 0.1454 + 0.008576z <= 0.1702 +[ INFO][26-Jun-24 05:20:09] Trained chunk 131 in 139.7s at 4258noun/s: lr=1.42e-03, loss=1.39e+00, top1=69.54%/68.493% +[ INFO][26-Jun-24 05:20:09] Chunk 132 = Batch 152223 = Sample 77937665 +[ INFO][26-Jun-24 05:22:28] Total gradient norm stats for 73 steps: 0.1342 <= 0.1459 + 0.00812z <= 0.1775 +[ INFO][26-Jun-24 05:22:28] Trained chunk 132 in 139.5s at 4265noun/s: lr=1.42e-03, loss=1.38e+00, top1=68.01%/68.518% +[ INFO][26-Jun-24 05:22:28] Chunk 133 = Batch 153385 = Sample 78532609 +[ INFO][26-Jun-24 05:24:48] Total gradient norm stats for 73 steps: 0.1297 <= 0.1434 + 0.007066z <= 0.1739 +[ INFO][26-Jun-24 05:24:48] Trained chunk 133 in 139.6s at 4263noun/s: lr=1.42e-03, loss=1.38e+00, top1=69.20%/68.539% +[ INFO][26-Jun-24 05:24:48] Chunk 134 = Batch 154547 = Sample 79127553 +[ INFO][26-Jun-24 05:27:08] Total gradient norm stats for 72 steps: 0.1329 <= 0.1509 + 0.01251z <= 0.1952 +[ INFO][26-Jun-24 05:27:08] Trained chunk 134 in 140.1s at 4248noun/s: lr=1.42e-03, loss=1.38e+00, top1=69.59%/68.559% +[ INFO][26-Jun-24 05:27:08] Chunk 135 = Batch 155709 = Sample 79722497 +[ INFO][26-Jun-24 05:29:28] Total gradient norm stats for 73 steps: 0.1315 <= 0.1466 + 0.009935z <= 0.1738 +[ INFO][26-Jun-24 05:29:28] Trained chunk 135 in 139.8s at 4257noun/s: lr=1.42e-03, loss=1.38e+00, top1=69.66%/68.585% +[ INFO][26-Jun-24 05:29:28] Chunk 136 = Batch 156871 = Sample 80317441 +[ INFO][26-Jun-24 05:31:48] Total gradient norm stats for 73 steps: 0.1321 <= 0.143 + 0.005924z <= 0.1578 +[ INFO][26-Jun-24 05:31:48] Trained chunk 136 in 139.9s at 4251noun/s: lr=1.42e-03, loss=1.38e+00, top1=68.72%/68.605% +[ INFO][26-Jun-24 05:31:48] Chunk 137 = Batch 158033 = Sample 80912385 +[ INFO][26-Jun-24 05:34:07] Total gradient norm stats for 72 steps: 0.1336 <= 0.1472 + 0.008616z <= 0.1698 +[ INFO][26-Jun-24 05:34:07] Trained chunk 137 in 139.7s at 4258noun/s: lr=1.42e-03, loss=1.38e+00, top1=69.44%/68.623% +[ INFO][26-Jun-24 05:34:07] Chunk 138 = Batch 159195 = Sample 81507329 +[ INFO][26-Jun-24 05:36:27] Total gradient norm stats for 73 steps: 0.1363 <= 0.1496 + 0.01006z <= 0.1867 +[ INFO][26-Jun-24 05:36:27] Trained chunk 138 in 139.7s at 4258noun/s: lr=1.42e-03, loss=1.38e+00, top1=68.50%/68.638% +[ INFO][26-Jun-24 05:36:27] Chunk 139 = Batch 160357 = Sample 82102273 +[ INFO][26-Jun-24 05:38:47] Total gradient norm stats for 72 steps: 0.1358 <= 0.1505 + 0.009534z <= 0.1767 +[ INFO][26-Jun-24 05:38:47] Trained chunk 139 in 139.6s at 4261noun/s: lr=1.41e-03, loss=1.37e+00, top1=69.21%/68.656% +[ INFO][26-Jun-24 05:38:47] Chunk 140 = Batch 161519 = Sample 82697217 +[ INFO][26-Jun-24 05:41:06] Total gradient norm stats for 73 steps: 0.1341 <= 0.1537 + 0.01264z <= 0.1899 +[ INFO][26-Jun-24 05:41:06] Trained chunk 140 in 139.4s at 4266noun/s: lr=1.41e-03, loss=1.37e+00, top1=68.81%/68.676% +[ INFO][26-Jun-24 05:41:06] Chunk 141 = Batch 162681 = Sample 83292161 +[ INFO][26-Jun-24 05:43:26] Total gradient norm stats for 73 steps: 0.1323 <= 0.1487 + 0.01007z <= 0.1742 +[ INFO][26-Jun-24 05:43:26] Trained chunk 141 in 139.7s at 4259noun/s: lr=1.41e-03, loss=1.37e+00, top1=70.31%/68.690% +[ INFO][26-Jun-24 05:43:26] Chunk 142 = Batch 163843 = Sample 83887105 +[ INFO][26-Jun-24 05:45:46] Total gradient norm stats for 72 steps: 0.1344 <= 0.1504 + 0.008527z <= 0.1776 +[ INFO][26-Jun-24 05:45:46] Trained chunk 142 in 139.9s at 4253noun/s: lr=1.41e-03, loss=1.37e+00, top1=68.57%/68.702% +[ INFO][26-Jun-24 05:45:46] Chunk 143 = Batch 165005 = Sample 84482049 +[ INFO][26-Jun-24 05:48:05] Total gradient norm stats for 73 steps: 0.1348 <= 0.1484 + 0.00958z <= 0.1871 +[ INFO][26-Jun-24 05:48:05] Trained chunk 143 in 139.7s at 4259noun/s: lr=1.41e-03, loss=1.37e+00, top1=69.96%/68.725% +[ INFO][26-Jun-24 05:48:05] Chunk 144 = Batch 166167 = Sample 85076993 +[ INFO][26-Jun-24 05:50:25] Total gradient norm stats for 73 steps: 0.1363 <= 0.1598 + 0.01707z <= 0.2252 +[ INFO][26-Jun-24 05:50:25] Trained chunk 144 in 139.8s at 4255noun/s: lr=1.41e-03, loss=1.37e+00, top1=69.46%/68.744% +[ INFO][26-Jun-24 05:50:26] Saved checkpoint: /data/strahl/Code/ovod/outputs/ovod_20240626_001447/ovod_chunk0144_20240626_055025.train +[ INFO][26-Jun-24 05:50:26] Chunk 145 = Batch 167329 = Sample 85671937 +[ INFO][26-Jun-24 05:52:45] Total gradient norm stats for 72 steps: 0.1381 <= 0.1532 + 0.01321z <= 0.2004 +[ INFO][26-Jun-24 05:52:45] Trained chunk 145 in 139.8s at 4255noun/s: lr=1.41e-03, loss=1.37e+00, top1=68.46%/68.766% +[ INFO][26-Jun-24 05:52:45] Chunk 146 = Batch 168491 = Sample 86266881 +[ INFO][26-Jun-24 05:55:05] Total gradient norm stats for 73 steps: 0.1394 <= 0.1575 + 0.01418z <= 0.2051 +[ INFO][26-Jun-24 05:55:05] Trained chunk 146 in 139.9s at 4251noun/s: lr=1.41e-03, loss=1.37e+00, top1=70.24%/68.783% +[ INFO][26-Jun-24 05:55:05] Chunk 147 = Batch 169653 = Sample 86861825 +[ INFO][26-Jun-24 05:57:25] Total gradient norm stats for 72 steps: 0.1371 <= 0.1532 + 0.011z <= 0.1915 +[ INFO][26-Jun-24 05:57:25] Trained chunk 147 in 139.6s at 4262noun/s: lr=1.40e-03, loss=1.36e+00, top1=68.46%/68.802% +[ INFO][26-Jun-24 05:57:25] Chunk 148 = Batch 170815 = Sample 87456769 +[ INFO][26-Jun-24 05:59:45] Total gradient norm stats for 73 steps: 0.1366 <= 0.1536 + 0.01047z <= 0.1961 +[ INFO][26-Jun-24 05:59:45] Trained chunk 148 in 139.7s at 4259noun/s: lr=1.40e-03, loss=1.36e+00, top1=68.54%/68.817% +[ INFO][26-Jun-24 05:59:45] Chunk 149 = Batch 171977 = Sample 88051713 +[ INFO][26-Jun-24 06:02:05] Total gradient norm stats for 73 steps: 0.1389 <= 0.1559 + 0.01264z <= 0.2016 +[ INFO][26-Jun-24 06:02:05] Trained chunk 149 in 140.1s at 4248noun/s: lr=1.40e-03, loss=1.36e+00, top1=68.63%/68.831% +[ INFO][26-Jun-24 06:02:05] Chunk 150 = Batch 173139 = Sample 88646657 +[ INFO][26-Jun-24 06:04:24] Epoch 3 finished in 6989.3s +[ INFO][26-Jun-24 06:04:24] -------------------------------------------------------------------------------- +[ INFO][26-Jun-24 06:04:24] Epoch 4 = Batch 174289 = Sample 89235457 +[ INFO][26-Jun-24 06:04:26] Total gradient norm stats for 72 steps: 0.1401 <= 0.1648 + 0.01477z <= 0.203 +[ INFO][26-Jun-24 06:04:26] Trained chunk 150 in 141.2s at 4213noun/s: lr=1.40e-03, loss=1.36e+00, top1=69.06%/68.845% +[ INFO][26-Jun-24 06:04:26] Chunk 151 = Batch 174301 = Sample 89241601 +[ INFO][26-Jun-24 06:06:46] Total gradient norm stats for 73 steps: 0.141 <= 0.1538 + 0.007467z <= 0.1778 +[ INFO][26-Jun-24 06:06:46] Trained chunk 151 in 140.3s at 4242noun/s: lr=1.40e-03, loss=1.36e+00, top1=69.16%/68.865% +[ INFO][26-Jun-24 06:06:46] Chunk 152 = Batch 175463 = Sample 89836545 +[ INFO][26-Jun-24 06:09:06] Total gradient norm stats for 73 steps: 0.1362 <= 0.1554 + 0.01236z <= 0.2185 +[ INFO][26-Jun-24 06:09:06] Trained chunk 152 in 139.9s at 4252noun/s: lr=1.40e-03, loss=1.36e+00, top1=69.13%/68.884% +[ INFO][26-Jun-24 06:09:06] Chunk 153 = Batch 176625 = Sample 90431489 +[ INFO][26-Jun-24 06:11:26] Total gradient norm stats for 72 steps: 0.1392 <= 0.1587 + 0.01429z <= 0.2033 +[ INFO][26-Jun-24 06:11:26] Trained chunk 153 in 139.7s at 4260noun/s: lr=1.40e-03, loss=1.36e+00, top1=68.85%/68.900% +[ INFO][26-Jun-24 06:11:26] Chunk 154 = Batch 177787 = Sample 91026433 +[ INFO][26-Jun-24 06:13:45] Total gradient norm stats for 73 steps: 0.1376 <= 0.1626 + 0.01757z <= 0.2198 +[ INFO][26-Jun-24 06:13:45] Trained chunk 154 in 139.4s at 4267noun/s: lr=1.40e-03, loss=1.36e+00, top1=69.40%/68.918% +[ INFO][26-Jun-24 06:13:45] Chunk 155 = Batch 178949 = Sample 91621377 +[ INFO][26-Jun-24 06:16:05] Total gradient norm stats for 72 steps: 0.1408 <= 0.1549 + 0.01069z <= 0.2061 +[ INFO][26-Jun-24 06:16:05] Trained chunk 155 in 139.8s at 4256noun/s: lr=1.39e-03, loss=1.35e+00, top1=69.34%/68.931% +[ INFO][26-Jun-24 06:16:05] Chunk 156 = Batch 180111 = Sample 92216321 +[ INFO][26-Jun-24 06:18:24] Total gradient norm stats for 73 steps: 0.1389 <= 0.1542 + 0.00864z <= 0.1732 +[ INFO][26-Jun-24 06:18:24] Trained chunk 156 in 139.4s at 4268noun/s: lr=1.39e-03, loss=1.35e+00, top1=68.34%/68.945% +[ INFO][26-Jun-24 06:18:24] Chunk 157 = Batch 181273 = Sample 92811265 +[ INFO][26-Jun-24 06:20:44] Total gradient norm stats for 73 steps: 0.1409 <= 0.1601 + 0.01388z <= 0.2267 +[ INFO][26-Jun-24 06:20:44] Trained chunk 157 in 139.9s at 4254noun/s: lr=1.39e-03, loss=1.35e+00, top1=68.37%/68.961% +[ INFO][26-Jun-24 06:20:44] Chunk 158 = Batch 182435 = Sample 93406209 +[ INFO][26-Jun-24 06:23:04] Total gradient norm stats for 72 steps: 0.1414 <= 0.168 + 0.01836z <= 0.2316 +[ INFO][26-Jun-24 06:23:04] Trained chunk 158 in 140.2s at 4245noun/s: lr=1.39e-03, loss=1.35e+00, top1=70.50%/68.979% +[ INFO][26-Jun-24 06:23:04] Chunk 159 = Batch 183597 = Sample 94001153 +[ INFO][26-Jun-24 06:25:24] Total gradient norm stats for 73 steps: 0.1447 <= 0.1641 + 0.01386z <= 0.2153 +[ INFO][26-Jun-24 06:25:24] Trained chunk 159 in 139.7s at 4260noun/s: lr=1.39e-03, loss=1.35e+00, top1=69.30%/68.998% +[ INFO][26-Jun-24 06:25:24] Chunk 160 = Batch 184759 = Sample 94596097 +[ INFO][26-Jun-24 06:27:44] Total gradient norm stats for 73 steps: 0.1392 <= 0.1614 + 0.02166z <= 0.2808 +[ INFO][26-Jun-24 06:27:44] Trained chunk 160 in 140.1s at 4247noun/s: lr=1.39e-03, loss=1.35e+00, top1=68.54%/69.011% +[ INFO][26-Jun-24 06:27:44] Chunk 161 = Batch 185921 = Sample 95191041 +[ INFO][26-Jun-24 06:30:04] Total gradient norm stats for 72 steps: 0.1387 <= 0.1633 + 0.0157z <= 0.218 +[ INFO][26-Jun-24 06:30:04] Trained chunk 161 in 139.9s at 4253noun/s: lr=1.39e-03, loss=1.35e+00, top1=69.78%/69.023% +[ INFO][26-Jun-24 06:30:04] Chunk 162 = Batch 187083 = Sample 95785985 +[ INFO][26-Jun-24 06:32:24] Total gradient norm stats for 73 steps: 0.1455 <= 0.1696 + 0.01707z <= 0.2174 +[ INFO][26-Jun-24 06:32:24] Trained chunk 162 in 139.7s at 4260noun/s: lr=1.38e-03, loss=1.35e+00, top1=70.66%/69.034% +[ INFO][26-Jun-24 06:32:24] Chunk 163 = Batch 188245 = Sample 96380929 +[ INFO][26-Jun-24 06:34:44] Total gradient norm stats for 72 steps: 0.1418 <= 0.1656 + 0.01809z <= 0.2225 +[ INFO][26-Jun-24 06:34:44] Trained chunk 163 in 139.8s at 4256noun/s: lr=1.38e-03, loss=1.35e+00, top1=69.08%/69.041% +[ INFO][26-Jun-24 06:34:44] Chunk 164 = Batch 189407 = Sample 96975873 +[ INFO][26-Jun-24 06:37:03] Total gradient norm stats for 73 steps: 0.1433 <= 0.1638 + 0.01414z <= 0.211 +[ INFO][26-Jun-24 06:37:03] Trained chunk 164 in 139.8s at 4257noun/s: lr=1.38e-03, loss=1.35e+00, top1=69.67%/69.055% +[ INFO][26-Jun-24 06:37:03] Chunk 165 = Batch 190569 = Sample 97570817 +[ INFO][26-Jun-24 06:39:23] Total gradient norm stats for 73 steps: 0.1448 <= 0.1726 + 0.01755z <= 0.23 +[ INFO][26-Jun-24 06:39:23] Trained chunk 165 in 140.0s at 4250noun/s: lr=1.38e-03, loss=1.35e+00, top1=69.27%/69.067% +[ INFO][26-Jun-24 06:39:23] Chunk 166 = Batch 191731 = Sample 98165761 +[ INFO][26-Jun-24 06:41:43] Total gradient norm stats for 72 steps: 0.145 <= 0.1678 + 0.01598z <= 0.2132 +[ INFO][26-Jun-24 06:41:43] Trained chunk 166 in 139.6s at 4261noun/s: lr=1.38e-03, loss=1.35e+00, top1=69.06%/69.073% +[ INFO][26-Jun-24 06:41:43] Chunk 167 = Batch 192893 = Sample 98760705 +[ INFO][26-Jun-24 06:44:03] Total gradient norm stats for 73 steps: 0.1485 <= 0.1673 + 0.01215z <= 0.1968 +[ INFO][26-Jun-24 06:44:03] Trained chunk 167 in 139.6s at 4261noun/s: lr=1.38e-03, loss=1.34e+00, top1=68.46%/69.088% +[ INFO][26-Jun-24 06:44:03] Chunk 168 = Batch 194055 = Sample 99355649 +[ INFO][26-Jun-24 06:46:23] Total gradient norm stats for 73 steps: 0.1473 <= 0.1727 + 0.01904z <= 0.229 +[ INFO][26-Jun-24 06:46:23] Trained chunk 168 in 140.0s at 4250noun/s: lr=1.38e-03, loss=1.34e+00, top1=69.53%/69.105% +[ INFO][26-Jun-24 06:46:23] Chunk 169 = Batch 195217 = Sample 99950593 +[ INFO][26-Jun-24 06:48:42] Total gradient norm stats for 72 steps: 0.1461 <= 0.1693 + 0.0154z <= 0.2088 +[ INFO][26-Jun-24 06:48:42] Trained chunk 169 in 139.5s at 4264noun/s: lr=1.37e-03, loss=1.34e+00, top1=69.51%/69.118% +[ INFO][26-Jun-24 06:48:42] Chunk 170 = Batch 196379 = Sample 100545537 +[ INFO][26-Jun-24 06:51:02] Total gradient norm stats for 73 steps: 0.1534 <= 0.1736 + 0.01845z <= 0.2716 +[ INFO][26-Jun-24 06:51:02] Trained chunk 170 in 139.8s at 4254noun/s: lr=1.37e-03, loss=1.34e+00, top1=69.11%/69.131% +[ INFO][26-Jun-24 06:51:02] Chunk 171 = Batch 197541 = Sample 101140481 +[ INFO][26-Jun-24 06:53:22] Total gradient norm stats for 72 steps: 0.1463 <= 0.1708 + 0.02287z <= 0.2796 +[ INFO][26-Jun-24 06:53:22] Trained chunk 171 in 139.9s at 4254noun/s: lr=1.37e-03, loss=1.34e+00, top1=69.94%/69.142% +[ INFO][26-Jun-24 06:53:22] Chunk 172 = Batch 198703 = Sample 101735425 +[ INFO][26-Jun-24 06:55:42] Total gradient norm stats for 73 steps: 0.1494 <= 0.1712 + 0.01619z <= 0.2315 +[ INFO][26-Jun-24 06:55:42] Trained chunk 172 in 139.8s at 4257noun/s: lr=1.37e-03, loss=1.34e+00, top1=68.80%/69.159% +[ INFO][26-Jun-24 06:55:42] Chunk 173 = Batch 199865 = Sample 102330369 +[ INFO][26-Jun-24 06:58:01] Total gradient norm stats for 73 steps: 0.1434 <= 0.1805 + 0.02404z <= 0.2777 +[ INFO][26-Jun-24 06:58:01] Trained chunk 173 in 139.7s at 4259noun/s: lr=1.37e-03, loss=1.34e+00, top1=70.05%/69.171% +[ INFO][26-Jun-24 06:58:01] Chunk 174 = Batch 201027 = Sample 102925313 +[ INFO][26-Jun-24 07:00:21] Total gradient norm stats for 72 steps: 0.1521 <= 0.1841 + 0.02398z <= 0.2941 +[ INFO][26-Jun-24 07:00:21] Trained chunk 174 in 139.7s at 4259noun/s: lr=1.37e-03, loss=1.34e+00, top1=70.37%/69.184% +[ INFO][26-Jun-24 07:00:21] Chunk 175 = Batch 202189 = Sample 103520257 +[ INFO][26-Jun-24 07:02:41] Total gradient norm stats for 73 steps: 0.1445 <= 0.1704 + 0.01875z <= 0.2688 +[ INFO][26-Jun-24 07:02:41] Trained chunk 175 in 139.6s at 4262noun/s: lr=1.37e-03, loss=1.34e+00, top1=69.75%/69.188% +[ INFO][26-Jun-24 07:02:41] Chunk 176 = Batch 203351 = Sample 104115201 +[ INFO][26-Jun-24 07:05:00] Total gradient norm stats for 73 steps: 0.1483 <= 0.1774 + 0.03z <= 0.3425 +[ INFO][26-Jun-24 07:05:00] Trained chunk 176 in 139.5s at 4264noun/s: lr=1.36e-03, loss=1.34e+00, top1=69.13%/69.197% +[ INFO][26-Jun-24 07:05:00] Chunk 177 = Batch 204513 = Sample 104710145 +[ INFO][26-Jun-24 07:07:20] Total gradient norm stats for 72 steps: 0.1533 <= 0.1674 + 0.01175z <= 0.2152 +[ INFO][26-Jun-24 07:07:20] Trained chunk 177 in 139.5s at 4264noun/s: lr=1.36e-03, loss=1.34e+00, top1=69.24%/69.214% +[ INFO][26-Jun-24 07:07:20] Chunk 178 = Batch 205675 = Sample 105305089 +[ INFO][26-Jun-24 07:09:40] Total gradient norm stats for 73 steps: 0.147 <= 0.1734 + 0.01583z <= 0.2281 +[ INFO][26-Jun-24 07:09:40] Trained chunk 178 in 139.9s at 4253noun/s: lr=1.36e-03, loss=1.34e+00, top1=68.47%/69.224% +[ INFO][26-Jun-24 07:09:40] Chunk 179 = Batch 206837 = Sample 105900033 +[ INFO][26-Jun-24 07:11:59] Total gradient norm stats for 72 steps: 0.1478 <= 0.1759 + 0.01682z <= 0.2183 +[ INFO][26-Jun-24 07:11:59] Trained chunk 179 in 139.7s at 4258noun/s: lr=1.36e-03, loss=1.34e+00, top1=69.34%/69.233% +[ INFO][26-Jun-24 07:11:59] Chunk 180 = Batch 207999 = Sample 106494977 +[ INFO][26-Jun-24 07:14:19] Total gradient norm stats for 73 steps: 0.1532 <= 0.1792 + 0.01788z <= 0.2431 +[ INFO][26-Jun-24 07:14:19] Trained chunk 180 in 139.7s at 4259noun/s: lr=1.36e-03, loss=1.34e+00, top1=70.01%/69.238% +[ INFO][26-Jun-24 07:14:19] Chunk 181 = Batch 209161 = Sample 107089921 +[ INFO][26-Jun-24 07:16:39] Total gradient norm stats for 73 steps: 0.1529 <= 0.1755 + 0.01634z <= 0.2341 +[ INFO][26-Jun-24 07:16:39] Trained chunk 181 in 139.8s at 4256noun/s: lr=1.36e-03, loss=1.33e+00, top1=69.03%/69.251% +[ INFO][26-Jun-24 07:16:39] Chunk 182 = Batch 210323 = Sample 107684865 +[ INFO][26-Jun-24 07:18:59] Total gradient norm stats for 72 steps: 0.148 <= 0.1721 + 0.01374z <= 0.216 +[ INFO][26-Jun-24 07:18:59] Trained chunk 182 in 139.8s at 4256noun/s: lr=1.35e-03, loss=1.33e+00, top1=69.57%/69.266% +[ INFO][26-Jun-24 07:18:59] Chunk 183 = Batch 211485 = Sample 108279809 +[ INFO][26-Jun-24 07:21:18] Total gradient norm stats for 73 steps: 0.1484 <= 0.174 + 0.01469z <= 0.2388 +[ INFO][26-Jun-24 07:21:18] Trained chunk 183 in 139.8s at 4256noun/s: lr=1.35e-03, loss=1.33e+00, top1=69.33%/69.278% +[ INFO][26-Jun-24 07:21:18] Chunk 184 = Batch 212647 = Sample 108874753 +[ INFO][26-Jun-24 07:23:38] Total gradient norm stats for 73 steps: 0.1522 <= 0.1879 + 0.02786z <= 0.2581 +[ INFO][26-Jun-24 07:23:38] Trained chunk 184 in 139.9s at 4254noun/s: lr=1.35e-03, loss=1.33e+00, top1=68.99%/69.282% +[ INFO][26-Jun-24 07:23:38] Chunk 185 = Batch 213809 = Sample 109469697 +[ INFO][26-Jun-24 07:25:58] Total gradient norm stats for 72 steps: 0.1537 <= 0.1754 + 0.01996z <= 0.2622 +[ INFO][26-Jun-24 07:25:58] Trained chunk 185 in 139.4s at 4268noun/s: lr=1.35e-03, loss=1.33e+00, top1=71.06%/69.296% +[ INFO][26-Jun-24 07:25:58] Chunk 186 = Batch 214971 = Sample 110064641 +[ INFO][26-Jun-24 07:28:17] Total gradient norm stats for 73 steps: 0.1548 <= 0.1823 + 0.02479z <= 0.3263 +[ INFO][26-Jun-24 07:28:17] Trained chunk 186 in 139.7s at 4259noun/s: lr=1.35e-03, loss=1.33e+00, top1=68.08%/69.302% +[ INFO][26-Jun-24 07:28:17] Chunk 187 = Batch 216133 = Sample 110659585 +[ INFO][26-Jun-24 07:30:37] Total gradient norm stats for 72 steps: 0.1525 <= 0.1754 + 0.01508z <= 0.2321 +[ INFO][26-Jun-24 07:30:37] Trained chunk 187 in 139.7s at 4258noun/s: lr=1.35e-03, loss=1.33e+00, top1=69.11%/69.315% +[ INFO][26-Jun-24 07:30:37] Chunk 188 = Batch 217295 = Sample 111254529 +[ INFO][26-Jun-24 07:32:56] Total gradient norm stats for 73 steps: 0.1512 <= 0.1928 + 0.03141z <= 0.3294 +[ INFO][26-Jun-24 07:32:56] Trained chunk 188 in 139.3s at 4270noun/s: lr=1.35e-03, loss=1.33e+00, top1=69.19%/69.328% +[ INFO][26-Jun-24 07:32:56] Chunk 189 = Batch 218457 = Sample 111849473 +[ INFO][26-Jun-24 07:35:16] Total gradient norm stats for 73 steps: 0.1531 <= 0.1832 + 0.02267z <= 0.2592 +[ INFO][26-Jun-24 07:35:16] Trained chunk 189 in 139.7s at 4258noun/s: lr=1.34e-03, loss=1.33e+00, top1=68.89%/69.336% +[ INFO][26-Jun-24 07:35:16] Chunk 190 = Batch 219619 = Sample 112444417 +[ INFO][26-Jun-24 07:37:36] Total gradient norm stats for 72 steps: 0.154 <= 0.1803 + 0.01703z <= 0.2465 +[ INFO][26-Jun-24 07:37:36] Trained chunk 190 in 139.5s at 4264noun/s: lr=1.34e-03, loss=1.33e+00, top1=68.89%/69.346% +[ INFO][26-Jun-24 07:37:36] Chunk 191 = Batch 220781 = Sample 113039361 +[ INFO][26-Jun-24 07:39:55] Total gradient norm stats for 73 steps: 0.1566 <= 0.1844 + 0.02051z <= 0.2477 +[ INFO][26-Jun-24 07:39:55] Trained chunk 191 in 139.5s at 4265noun/s: lr=1.34e-03, loss=1.33e+00, top1=68.78%/69.356% +[ INFO][26-Jun-24 07:39:55] Chunk 192 = Batch 221943 = Sample 113634305 +[ INFO][26-Jun-24 07:42:15] Total gradient norm stats for 73 steps: 0.1567 <= 0.1968 + 0.02752z <= 0.2785 +[ INFO][26-Jun-24 07:42:15] Trained chunk 192 in 140.0s at 4249noun/s: lr=1.34e-03, loss=1.33e+00, top1=68.91%/69.368% +[ INFO][26-Jun-24 07:42:15] Saved checkpoint: /data/strahl/Code/ovod/outputs/ovod_20240626_001447/ovod_chunk0192_20240626_074215.train +[ INFO][26-Jun-24 07:42:15] Chunk 193 = Batch 223105 = Sample 114229249 +[ INFO][26-Jun-24 07:44:35] Total gradient norm stats for 72 steps: 0.1581 <= 0.1835 + 0.02058z <= 0.2725 +[ INFO][26-Jun-24 07:44:35] Trained chunk 193 in 139.6s at 4261noun/s: lr=1.34e-03, loss=1.33e+00, top1=69.16%/69.375% +[ INFO][26-Jun-24 07:44:35] Chunk 194 = Batch 224267 = Sample 114824193 +[ INFO][26-Jun-24 07:46:55] Total gradient norm stats for 73 steps: 0.159 <= 0.1888 + 0.02283z <= 0.2676 +[ INFO][26-Jun-24 07:46:55] Trained chunk 194 in 139.7s at 4259noun/s: lr=1.34e-03, loss=1.33e+00, top1=69.31%/69.390% +[ INFO][26-Jun-24 07:46:55] Chunk 195 = Batch 225429 = Sample 115419137 +[ INFO][26-Jun-24 07:49:14] Total gradient norm stats for 72 steps: 0.15 <= 0.1902 + 0.02486z <= 0.267 +[ INFO][26-Jun-24 07:49:14] Trained chunk 195 in 139.5s at 4265noun/s: lr=1.33e-03, loss=1.33e+00, top1=69.72%/69.399% +[ INFO][26-Jun-24 07:49:14] Chunk 196 = Batch 226591 = Sample 116014081 +[ INFO][26-Jun-24 07:51:33] Total gradient norm stats for 73 steps: 0.1612 <= 0.1943 + 0.02427z <= 0.2687 +[ INFO][26-Jun-24 07:51:33] Trained chunk 196 in 139.3s at 4271noun/s: lr=1.33e-03, loss=1.33e+00, top1=69.15%/69.404% +[ INFO][26-Jun-24 07:51:33] Chunk 197 = Batch 227753 = Sample 116609025 +[ INFO][26-Jun-24 07:53:54] Total gradient norm stats for 73 steps: 0.1592 <= 0.1926 + 0.02888z <= 0.3479 +[ INFO][26-Jun-24 07:53:54] Trained chunk 197 in 140.2s at 4244noun/s: lr=1.33e-03, loss=1.32e+00, top1=69.97%/69.413% +[ INFO][26-Jun-24 07:53:54] Chunk 198 = Batch 228915 = Sample 117203969 +[ INFO][26-Jun-24 07:56:13] Total gradient norm stats for 72 steps: 0.1598 <= 0.187 + 0.0246z <= 0.2791 +[ INFO][26-Jun-24 07:56:13] Trained chunk 198 in 139.6s at 4261noun/s: lr=1.33e-03, loss=1.32e+00, top1=70.43%/69.412% +[ INFO][26-Jun-24 07:56:13] Chunk 199 = Batch 230077 = Sample 117798913 +[ INFO][26-Jun-24 07:58:33] Total gradient norm stats for 73 steps: 0.157 <= 0.1918 + 0.0238z <= 0.2645 +[ INFO][26-Jun-24 07:58:33] Trained chunk 199 in 139.6s at 4261noun/s: lr=1.33e-03, loss=1.32e+00, top1=69.91%/69.422% +[ INFO][26-Jun-24 07:58:33] Chunk 200 = Batch 231239 = Sample 118393857 +[ INFO][26-Jun-24 08:00:52] Epoch 4 finished in 6987.6s +[ INFO][26-Jun-24 08:00:52] -------------------------------------------------------------------------------- +[ INFO][26-Jun-24 08:00:52] Epoch 5 = Batch 232385 = Sample 118980609 +[ INFO][26-Jun-24 08:00:54] Total gradient norm stats for 73 steps: 0.1578 <= 0.1939 + 0.02441z <= 0.2824 +[ INFO][26-Jun-24 08:00:54] Trained chunk 200 in 141.0s at 4220noun/s: lr=1.33e-03, loss=1.32e+00, top1=69.41%/69.428% +[ INFO][26-Jun-24 08:00:54] Chunk 201 = Batch 232401 = Sample 118988801 +[ INFO][26-Jun-24 08:03:14] Total gradient norm stats for 72 steps: 0.1605 <= 0.1975 + 0.04001z <= 0.4532 +[ INFO][26-Jun-24 08:03:14] Trained chunk 201 in 140.1s at 4246noun/s: lr=1.32e-03, loss=1.32e+00, top1=69.00%/69.439% +[ INFO][26-Jun-24 08:03:14] Chunk 202 = Batch 233563 = Sample 119583745 +[ INFO][26-Jun-24 08:05:33] Total gradient norm stats for 73 steps: 0.1588 <= 0.1884 + 0.01921z <= 0.243 +[ INFO][26-Jun-24 08:05:33] Trained chunk 202 in 139.5s at 4265noun/s: lr=1.32e-03, loss=1.32e+00, top1=69.03%/69.449% +[ INFO][26-Jun-24 08:05:33] Chunk 203 = Batch 234725 = Sample 120178689 +[ INFO][26-Jun-24 08:07:53] Total gradient norm stats for 72 steps: 0.1544 <= 0.1837 + 0.02023z <= 0.244 +[ INFO][26-Jun-24 08:07:53] Trained chunk 203 in 139.9s at 4252noun/s: lr=1.32e-03, loss=1.32e+00, top1=68.84%/69.460% +[ INFO][26-Jun-24 08:07:53] Chunk 204 = Batch 235887 = Sample 120773633 +[ INFO][26-Jun-24 08:10:13] Total gradient norm stats for 73 steps: 0.1567 <= 0.1979 + 0.03341z <= 0.3257 +[ INFO][26-Jun-24 08:10:13] Trained chunk 204 in 139.8s at 4255noun/s: lr=1.32e-03, loss=1.32e+00, top1=70.76%/69.471% +[ INFO][26-Jun-24 08:10:13] Chunk 205 = Batch 237049 = Sample 121368577 +[ INFO][26-Jun-24 08:12:33] Total gradient norm stats for 73 steps: 0.1557 <= 0.1898 + 0.02457z <= 0.2655 +[ INFO][26-Jun-24 08:12:33] Trained chunk 205 in 139.8s at 4257noun/s: lr=1.32e-03, loss=1.32e+00, top1=68.43%/69.478% +[ INFO][26-Jun-24 08:12:33] Chunk 206 = Batch 238211 = Sample 121963521 +[ INFO][26-Jun-24 08:14:53] Total gradient norm stats for 72 steps: 0.16 <= 0.1978 + 0.02271z <= 0.2688 +[ INFO][26-Jun-24 08:14:53] Trained chunk 206 in 139.8s at 4255noun/s: lr=1.32e-03, loss=1.32e+00, top1=68.40%/69.480% +[ INFO][26-Jun-24 08:14:53] Chunk 207 = Batch 239373 = Sample 122558465 +[ INFO][26-Jun-24 08:17:13] Total gradient norm stats for 73 steps: 0.1635 <= 0.1954 + 0.02386z <= 0.271 +[ INFO][26-Jun-24 08:17:13] Trained chunk 207 in 139.7s at 4259noun/s: lr=1.31e-03, loss=1.32e+00, top1=69.15%/69.487% +[ INFO][26-Jun-24 08:17:13] Chunk 208 = Batch 240535 = Sample 123153409 +[ INFO][26-Jun-24 08:19:32] Total gradient norm stats for 73 steps: 0.1629 <= 0.1948 + 0.02602z <= 0.323 +[ INFO][26-Jun-24 08:19:32] Trained chunk 208 in 139.8s at 4255noun/s: lr=1.31e-03, loss=1.32e+00, top1=69.54%/69.491% +[ INFO][26-Jun-24 08:19:32] Chunk 209 = Batch 241697 = Sample 123748353 +[ INFO][26-Jun-24 08:21:52] Total gradient norm stats for 72 steps: 0.1614 <= 0.199 + 0.03054z <= 0.3221 +[ INFO][26-Jun-24 08:21:52] Trained chunk 209 in 139.6s at 4262noun/s: lr=1.31e-03, loss=1.32e+00, top1=68.43%/69.502% +[ INFO][26-Jun-24 08:21:52] Chunk 210 = Batch 242859 = Sample 124343297 +[ INFO][26-Jun-24 08:24:12] Total gradient norm stats for 73 steps: 0.1607 <= 0.1953 + 0.02077z <= 0.2506 +[ INFO][26-Jun-24 08:24:12] Trained chunk 210 in 140.0s at 4251noun/s: lr=1.31e-03, loss=1.32e+00, top1=69.53%/69.512% +[ INFO][26-Jun-24 08:24:12] Chunk 211 = Batch 244021 = Sample 124938241 +[ INFO][26-Jun-24 08:26:32] Total gradient norm stats for 72 steps: 0.1608 <= 0.1946 + 0.02981z <= 0.2987 +[ INFO][26-Jun-24 08:26:32] Trained chunk 211 in 139.9s at 4253noun/s: lr=1.31e-03, loss=1.32e+00, top1=69.79%/69.518% +[ INFO][26-Jun-24 08:26:32] Chunk 212 = Batch 245183 = Sample 125533185 +[ INFO][26-Jun-24 08:28:52] Total gradient norm stats for 73 steps: 0.1592 <= 0.1942 + 0.02518z <= 0.3052 +[ INFO][26-Jun-24 08:28:52] Trained chunk 212 in 139.7s at 4258noun/s: lr=1.31e-03, loss=1.32e+00, top1=69.42%/69.524% +[ INFO][26-Jun-24 08:28:52] Chunk 213 = Batch 246345 = Sample 126128129 +[ INFO][26-Jun-24 08:31:11] Total gradient norm stats for 73 steps: 0.1599 <= 0.1946 + 0.0241z <= 0.2682 +[ INFO][26-Jun-24 08:31:11] Trained chunk 213 in 139.8s at 4256noun/s: lr=1.30e-03, loss=1.32e+00, top1=68.73%/69.534% +[ INFO][26-Jun-24 08:31:11] Chunk 214 = Batch 247507 = Sample 126723073 +[ INFO][26-Jun-24 08:33:31] Total gradient norm stats for 72 steps: 0.1595 <= 0.19 + 0.02387z <= 0.2847 +[ INFO][26-Jun-24 08:33:31] Trained chunk 214 in 139.9s at 4251noun/s: lr=1.30e-03, loss=1.32e+00, top1=69.14%/69.547% +[ INFO][26-Jun-24 08:33:31] Chunk 215 = Batch 248669 = Sample 127318017 +[ INFO][26-Jun-24 08:35:51] Total gradient norm stats for 73 steps: 0.1575 <= 0.1974 + 0.02937z <= 0.342 +[ INFO][26-Jun-24 08:35:51] Trained chunk 215 in 139.8s at 4257noun/s: lr=1.30e-03, loss=1.32e+00, top1=70.52%/69.557% +[ INFO][26-Jun-24 08:35:51] Chunk 216 = Batch 249831 = Sample 127912961 +[ INFO][26-Jun-24 08:38:11] Total gradient norm stats for 73 steps: 0.1641 <= 0.2023 + 0.0292z <= 0.2963 +[ INFO][26-Jun-24 08:38:11] Trained chunk 216 in 140.0s at 4249noun/s: lr=1.30e-03, loss=1.32e+00, top1=69.66%/69.571% +[ INFO][26-Jun-24 08:38:11] Chunk 217 = Batch 250993 = Sample 128507905 +[ INFO][26-Jun-24 08:40:31] Total gradient norm stats for 72 steps: 0.1635 <= 0.199 + 0.02452z <= 0.2617 +[ INFO][26-Jun-24 08:40:31] Trained chunk 217 in 139.7s at 4259noun/s: lr=1.30e-03, loss=1.32e+00, top1=69.80%/69.575% +[ INFO][26-Jun-24 08:40:31] Chunk 218 = Batch 252155 = Sample 129102849 +[ INFO][26-Jun-24 08:42:51] Total gradient norm stats for 73 steps: 0.1627 <= 0.1971 + 0.02799z <= 0.3186 +[ INFO][26-Jun-24 08:42:51] Trained chunk 218 in 139.9s at 4251noun/s: lr=1.29e-03, loss=1.32e+00, top1=70.13%/69.579% +[ INFO][26-Jun-24 08:42:51] Chunk 219 = Batch 253317 = Sample 129697793 +[ INFO][26-Jun-24 08:45:10] Total gradient norm stats for 72 steps: 0.1666 <= 0.2034 + 0.02891z <= 0.3113 +[ INFO][26-Jun-24 08:45:10] Trained chunk 219 in 139.7s at 4260noun/s: lr=1.29e-03, loss=1.32e+00, top1=70.08%/69.581% +[ INFO][26-Jun-24 08:45:10] Chunk 220 = Batch 254479 = Sample 130292737 +[ INFO][26-Jun-24 08:47:30] Total gradient norm stats for 73 steps: 0.1629 <= 0.1984 + 0.02306z <= 0.2629 +[ INFO][26-Jun-24 08:47:30] Trained chunk 220 in 139.5s at 4263noun/s: lr=1.29e-03, loss=1.32e+00, top1=69.63%/69.586% +[ INFO][26-Jun-24 08:47:30] Chunk 221 = Batch 255641 = Sample 130887681 +[ INFO][26-Jun-24 08:49:50] Total gradient norm stats for 73 steps: 0.1664 <= 0.2012 + 0.0222z <= 0.2647 +[ INFO][26-Jun-24 08:49:50] Trained chunk 221 in 139.9s at 4253noun/s: lr=1.29e-03, loss=1.31e+00, top1=68.58%/69.590% +[ INFO][26-Jun-24 08:49:50] Chunk 222 = Batch 256803 = Sample 131482625 +[ INFO][26-Jun-24 08:52:11] Total gradient norm stats for 72 steps: 0.1685 <= 0.2093 + 0.03515z <= 0.3759 +[ INFO][26-Jun-24 08:52:11] Trained chunk 222 in 140.8s at 4224noun/s: lr=1.29e-03, loss=1.31e+00, top1=69.17%/69.593% +[ INFO][26-Jun-24 08:52:11] Chunk 223 = Batch 257965 = Sample 132077569 +[ INFO][26-Jun-24 08:54:31] Total gradient norm stats for 73 steps: 0.1683 <= 0.2055 + 0.02779z <= 0.2962 +[ INFO][26-Jun-24 08:54:31] Trained chunk 223 in 139.9s at 4252noun/s: lr=1.29e-03, loss=1.31e+00, top1=69.33%/69.597% +[ INFO][26-Jun-24 08:54:31] Chunk 224 = Batch 259127 = Sample 132672513 +[ INFO][26-Jun-24 08:56:51] Total gradient norm stats for 73 steps: 0.1637 <= 0.2003 + 0.02546z <= 0.3029 +[ INFO][26-Jun-24 08:56:51] Trained chunk 224 in 139.9s at 4254noun/s: lr=1.28e-03, loss=1.31e+00, top1=70.49%/69.605% +[ INFO][26-Jun-24 08:56:51] Chunk 225 = Batch 260289 = Sample 133267457 +[ INFO][26-Jun-24 08:59:11] Total gradient norm stats for 72 steps: 0.1677 <= 0.2083 + 0.03417z <= 0.3226 +[ INFO][26-Jun-24 08:59:11] Trained chunk 225 in 140.1s at 4247noun/s: lr=1.28e-03, loss=1.31e+00, top1=69.21%/69.609% +[ INFO][26-Jun-24 08:59:11] Chunk 226 = Batch 261451 = Sample 133862401 +[ INFO][26-Jun-24 09:01:30] Total gradient norm stats for 73 steps: 0.1713 <= 0.2071 + 0.02685z <= 0.2946 +[ INFO][26-Jun-24 09:01:30] Trained chunk 226 in 139.6s at 4262noun/s: lr=1.28e-03, loss=1.31e+00, top1=71.00%/69.621% +[ INFO][26-Jun-24 09:01:30] Chunk 227 = Batch 262613 = Sample 134457345 +[ INFO][26-Jun-24 09:03:50] Total gradient norm stats for 72 steps: 0.1679 <= 0.2119 + 0.03396z <= 0.3315 +[ INFO][26-Jun-24 09:03:50] Trained chunk 227 in 139.4s at 4267noun/s: lr=1.28e-03, loss=1.31e+00, top1=70.90%/69.629% +[ INFO][26-Jun-24 09:03:50] Chunk 228 = Batch 263775 = Sample 135052289 +[ INFO][26-Jun-24 09:06:09] Total gradient norm stats for 73 steps: 0.164 <= 0.1989 + 0.02426z <= 0.2744 +[ INFO][26-Jun-24 09:06:09] Trained chunk 228 in 139.5s at 4263noun/s: lr=1.28e-03, loss=1.31e+00, top1=70.65%/69.641% +[ INFO][26-Jun-24 09:06:09] Chunk 229 = Batch 264937 = Sample 135647233 +[ INFO][26-Jun-24 09:08:29] Total gradient norm stats for 73 steps: 0.1659 <= 0.1973 + 0.0186z <= 0.2573 +[ INFO][26-Jun-24 09:08:29] Trained chunk 229 in 139.7s at 4259noun/s: lr=1.27e-03, loss=1.31e+00, top1=69.79%/69.650% +[ INFO][26-Jun-24 09:08:29] Chunk 230 = Batch 266099 = Sample 136242177 +[ INFO][26-Jun-24 09:10:49] Total gradient norm stats for 72 steps: 0.1654 <= 0.2098 + 0.03336z <= 0.3115 +[ INFO][26-Jun-24 09:10:49] Trained chunk 230 in 139.8s at 4256noun/s: lr=1.27e-03, loss=1.31e+00, top1=69.58%/69.654% +[ INFO][26-Jun-24 09:10:49] Chunk 231 = Batch 267261 = Sample 136837121 +[ INFO][26-Jun-24 09:13:08] Total gradient norm stats for 73 steps: 0.1696 <= 0.2229 + 0.0791z <= 0.8259 +[ INFO][26-Jun-24 09:13:08] Trained chunk 231 in 139.6s at 4263noun/s: lr=1.27e-03, loss=1.31e+00, top1=69.99%/69.659% +[ INFO][26-Jun-24 09:13:08] Chunk 232 = Batch 268423 = Sample 137432065 +[ INFO][26-Jun-24 09:15:28] Total gradient norm stats for 73 steps: 0.1751 <= 0.2117 + 0.02466z <= 0.2648 +[ INFO][26-Jun-24 09:15:28] Trained chunk 232 in 139.8s at 4255noun/s: lr=1.27e-03, loss=1.31e+00, top1=70.32%/69.666% +[ INFO][26-Jun-24 09:15:28] Chunk 233 = Batch 269585 = Sample 138027009 +[ INFO][26-Jun-24 09:17:48] Total gradient norm stats for 72 steps: 0.1708 <= 0.2169 + 0.04133z <= 0.4072 +[ INFO][26-Jun-24 09:17:48] Trained chunk 233 in 140.3s at 4241noun/s: lr=1.27e-03, loss=1.31e+00, top1=69.59%/69.670% +[ INFO][26-Jun-24 09:17:48] Chunk 234 = Batch 270747 = Sample 138621953 +[ INFO][26-Jun-24 09:20:08] Total gradient norm stats for 73 steps: 0.1737 <= 0.2136 + 0.02853z <= 0.2893 +[ INFO][26-Jun-24 09:20:08] Trained chunk 234 in 139.7s at 4258noun/s: lr=1.26e-03, loss=1.31e+00, top1=70.26%/69.675% +[ INFO][26-Jun-24 09:20:08] Chunk 235 = Batch 271909 = Sample 139216897 +[ INFO][26-Jun-24 09:22:27] Total gradient norm stats for 72 steps: 0.1666 <= 0.2121 + 0.03104z <= 0.3067 +[ INFO][26-Jun-24 09:22:27] Trained chunk 235 in 139.2s at 4275noun/s: lr=1.26e-03, loss=1.31e+00, top1=69.57%/69.680% +[ INFO][26-Jun-24 09:22:27] Chunk 236 = Batch 273071 = Sample 139811841 +[ INFO][26-Jun-24 09:24:47] Total gradient norm stats for 73 steps: 0.168 <= 0.217 + 0.03346z <= 0.3221 +[ INFO][26-Jun-24 09:24:47] Trained chunk 236 in 140.0s at 4251noun/s: lr=1.26e-03, loss=1.31e+00, top1=70.46%/69.691% +[ INFO][26-Jun-24 09:24:47] Chunk 237 = Batch 274233 = Sample 140406785 +[ INFO][26-Jun-24 09:27:07] Total gradient norm stats for 73 steps: 0.1758 <= 0.2139 + 0.03271z <= 0.3282 +[ INFO][26-Jun-24 09:27:07] Trained chunk 237 in 139.6s at 4261noun/s: lr=1.26e-03, loss=1.31e+00, top1=70.17%/69.691% +[ INFO][26-Jun-24 09:27:07] Chunk 238 = Batch 275395 = Sample 141001729 +[ INFO][26-Jun-24 09:29:27] Total gradient norm stats for 72 steps: 0.1718 <= 0.2071 + 0.0264z <= 0.312 +[ INFO][26-Jun-24 09:29:27] Trained chunk 238 in 139.9s at 4251noun/s: lr=1.26e-03, loss=1.31e+00, top1=69.72%/69.699% +[ INFO][26-Jun-24 09:29:27] Chunk 239 = Batch 276557 = Sample 141596673 +[ INFO][26-Jun-24 09:31:47] Total gradient norm stats for 73 steps: 0.1679 <= 0.211 + 0.03167z <= 0.3198 +[ INFO][26-Jun-24 09:31:47] Trained chunk 239 in 139.9s at 4252noun/s: lr=1.26e-03, loss=1.31e+00, top1=69.34%/69.699% +[ INFO][26-Jun-24 09:31:47] Chunk 240 = Batch 277719 = Sample 142191617 +[ INFO][26-Jun-24 09:34:06] Total gradient norm stats for 73 steps: 0.1724 <= 0.2052 + 0.02256z <= 0.2825 +[ INFO][26-Jun-24 09:34:06] Trained chunk 240 in 139.5s at 4264noun/s: lr=1.25e-03, loss=1.31e+00, top1=69.27%/69.703% +[ INFO][26-Jun-24 09:34:06] Saved checkpoint: /data/strahl/Code/ovod/outputs/ovod_20240626_001447/ovod_chunk0240_20240626_093406.train +[ INFO][26-Jun-24 09:34:06] Chunk 241 = Batch 278881 = Sample 142786561 +[ INFO][26-Jun-24 09:36:26] Total gradient norm stats for 72 steps: 0.1715 <= 0.2063 + 0.02685z <= 0.279 +[ INFO][26-Jun-24 09:36:26] Trained chunk 241 in 139.8s at 4255noun/s: lr=1.25e-03, loss=1.31e+00, top1=70.76%/69.710% +[ INFO][26-Jun-24 09:36:26] Chunk 242 = Batch 280043 = Sample 143381505 +[ INFO][26-Jun-24 09:38:47] Total gradient norm stats for 73 steps: 0.1732 <= 0.2091 + 0.03555z <= 0.3494 +[ INFO][26-Jun-24 09:38:47] Trained chunk 242 in 140.2s at 4242noun/s: lr=1.25e-03, loss=1.31e+00, top1=69.78%/69.715% +[ INFO][26-Jun-24 09:38:47] Chunk 243 = Batch 281205 = Sample 143976449 +[ INFO][26-Jun-24 09:41:07] Total gradient norm stats for 72 steps: 0.1679 <= 0.2163 + 0.02526z <= 0.2836 +[ INFO][26-Jun-24 09:41:07] Trained chunk 243 in 140.1s at 4248noun/s: lr=1.25e-03, loss=1.31e+00, top1=69.20%/69.724% +[ INFO][26-Jun-24 09:41:07] Chunk 244 = Batch 282367 = Sample 144571393 +[ INFO][26-Jun-24 09:43:26] Total gradient norm stats for 73 steps: 0.1699 <= 0.2022 + 0.02283z <= 0.3096 +[ INFO][26-Jun-24 09:43:26] Trained chunk 244 in 139.8s at 4255noun/s: lr=1.25e-03, loss=1.31e+00, top1=70.35%/69.733% +[ INFO][26-Jun-24 09:43:26] Chunk 245 = Batch 283529 = Sample 145166337 +[ INFO][26-Jun-24 09:45:46] Total gradient norm stats for 73 steps: 0.1728 <= 0.2151 + 0.03174z <= 0.3049 +[ INFO][26-Jun-24 09:45:46] Trained chunk 245 in 139.6s at 4263noun/s: lr=1.24e-03, loss=1.31e+00, top1=70.98%/69.730% +[ INFO][26-Jun-24 09:45:46] Chunk 246 = Batch 284691 = Sample 145761281 +[ INFO][26-Jun-24 09:48:06] Total gradient norm stats for 72 steps: 0.1677 <= 0.2123 + 0.02815z <= 0.2922 +[ INFO][26-Jun-24 09:48:06] Trained chunk 246 in 139.9s at 4253noun/s: lr=1.24e-03, loss=1.31e+00, top1=70.20%/69.740% +[ INFO][26-Jun-24 09:48:06] Chunk 247 = Batch 285853 = Sample 146356225 +[ INFO][26-Jun-24 09:50:26] Total gradient norm stats for 73 steps: 0.1708 <= 0.2205 + 0.03803z <= 0.3751 +[ INFO][26-Jun-24 09:50:26] Trained chunk 247 in 139.6s at 4261noun/s: lr=1.24e-03, loss=1.31e+00, top1=68.67%/69.751% +[ INFO][26-Jun-24 09:50:26] Chunk 248 = Batch 287015 = Sample 146951169 +[ INFO][26-Jun-24 09:52:45] Total gradient norm stats for 73 steps: 0.1696 <= 0.2078 + 0.02632z <= 0.3079 +[ INFO][26-Jun-24 09:52:45] Trained chunk 248 in 139.4s at 4266noun/s: lr=1.24e-03, loss=1.31e+00, top1=69.73%/69.764% +[ INFO][26-Jun-24 09:52:45] Chunk 249 = Batch 288177 = Sample 147546113 +[ INFO][26-Jun-24 09:55:05] Total gradient norm stats for 72 steps: 0.1712 <= 0.2095 + 0.0261z <= 0.2964 +[ INFO][26-Jun-24 09:55:05] Trained chunk 249 in 140.0s at 4248noun/s: lr=1.24e-03, loss=1.31e+00, top1=70.12%/69.775% +[ INFO][26-Jun-24 09:55:05] Chunk 250 = Batch 289339 = Sample 148141057 +[ INFO][26-Jun-24 09:57:23] Epoch 5 finished in 6991.8s +[ INFO][26-Jun-24 09:57:23] -------------------------------------------------------------------------------- +[ INFO][26-Jun-24 09:57:23] Epoch 6 = Batch 290481 = Sample 148725761 +[ INFO][26-Jun-24 09:57:26] Total gradient norm stats for 73 steps: 0.1743 <= 0.2192 + 0.0317z <= 0.3181 +[ INFO][26-Jun-24 09:57:26] Trained chunk 250 in 141.2s at 4215noun/s: lr=1.23e-03, loss=1.31e+00, top1=68.95%/69.779% +[ INFO][26-Jun-24 09:57:26] Chunk 251 = Batch 290501 = Sample 148736001 +[ INFO][26-Jun-24 09:59:46] Total gradient norm stats for 72 steps: 0.1772 <= 0.2207 + 0.04362z <= 0.4413 +[ INFO][26-Jun-24 09:59:46] Trained chunk 251 in 139.5s at 4263noun/s: lr=1.23e-03, loss=1.31e+00, top1=69.33%/69.783% +[ INFO][26-Jun-24 09:59:46] Chunk 252 = Batch 291663 = Sample 149330945 +[ INFO][26-Jun-24 10:02:06] Total gradient norm stats for 73 steps: 0.1704 <= 0.2142 + 0.03174z <= 0.3472 +[ INFO][26-Jun-24 10:02:06] Trained chunk 252 in 140.1s at 4248noun/s: lr=1.23e-03, loss=1.30e+00, top1=69.18%/69.787% +[ INFO][26-Jun-24 10:02:06] Chunk 253 = Batch 292825 = Sample 149925889 +[ INFO][26-Jun-24 10:04:25] Total gradient norm stats for 73 steps: 0.1709 <= 0.2148 + 0.03516z <= 0.3368 +[ INFO][26-Jun-24 10:04:25] Trained chunk 253 in 139.6s at 4260noun/s: lr=1.23e-03, loss=1.30e+00, top1=68.38%/69.787% +[ INFO][26-Jun-24 10:04:25] Chunk 254 = Batch 293987 = Sample 150520833 +[ INFO][26-Jun-24 10:06:46] Total gradient norm stats for 72 steps: 0.1775 <= 0.2169 + 0.03006z <= 0.3183 +[ INFO][26-Jun-24 10:06:46] Trained chunk 254 in 140.1s at 4247noun/s: lr=1.23e-03, loss=1.30e+00, top1=69.82%/69.796% +[ INFO][26-Jun-24 10:06:46] Chunk 255 = Batch 295149 = Sample 151115777 +[ INFO][26-Jun-24 10:09:06] Total gradient norm stats for 73 steps: 0.1741 <= 0.2157 + 0.02426z <= 0.2979 +[ INFO][26-Jun-24 10:09:06] Trained chunk 255 in 140.0s at 4249noun/s: lr=1.22e-03, loss=1.30e+00, top1=70.87%/69.806% +[ INFO][26-Jun-24 10:09:06] Chunk 256 = Batch 296311 = Sample 151710721 +[ INFO][26-Jun-24 10:11:26] Total gradient norm stats for 73 steps: 0.1745 <= 0.2101 + 0.02776z <= 0.3094 +[ INFO][26-Jun-24 10:11:26] Trained chunk 256 in 140.2s at 4244noun/s: lr=1.22e-03, loss=1.30e+00, top1=68.06%/69.809% +[ INFO][26-Jun-24 10:11:26] Chunk 257 = Batch 297473 = Sample 152305665 +[ INFO][26-Jun-24 10:13:46] Total gradient norm stats for 72 steps: 0.1771 <= 0.2067 + 0.02518z <= 0.2877 +[ INFO][26-Jun-24 10:13:46] Trained chunk 257 in 139.9s at 4253noun/s: lr=1.22e-03, loss=1.30e+00, top1=70.40%/69.814% +[ INFO][26-Jun-24 10:13:46] Chunk 258 = Batch 298635 = Sample 152900609 +[ INFO][26-Jun-24 10:16:06] Total gradient norm stats for 73 steps: 0.1814 <= 0.2179 + 0.04134z <= 0.4306 +[ INFO][26-Jun-24 10:16:06] Trained chunk 258 in 139.9s at 4252noun/s: lr=1.22e-03, loss=1.30e+00, top1=70.23%/69.821% +[ INFO][26-Jun-24 10:16:06] Chunk 259 = Batch 299797 = Sample 153495553 +[ INFO][26-Jun-24 10:18:26] Total gradient norm stats for 72 steps: 0.1796 <= 0.2104 + 0.02499z <= 0.2898 +[ INFO][26-Jun-24 10:18:26] Trained chunk 259 in 140.1s at 4246noun/s: lr=1.22e-03, loss=1.30e+00, top1=69.69%/69.822% +[ INFO][26-Jun-24 10:18:26] Chunk 260 = Batch 300959 = Sample 154090497 +[ INFO][26-Jun-24 10:20:45] Total gradient norm stats for 73 steps: 0.1787 <= 0.2063 + 0.02366z <= 0.3246 +[ INFO][26-Jun-24 10:20:45] Trained chunk 260 in 139.6s at 4261noun/s: lr=1.21e-03, loss=1.30e+00, top1=68.72%/69.827% +[ INFO][26-Jun-24 10:20:45] Chunk 261 = Batch 302121 = Sample 154685441 +[ INFO][26-Jun-24 10:23:05] Total gradient norm stats for 73 steps: 0.174 <= 0.2046 + 0.02115z <= 0.312 +[ INFO][26-Jun-24 10:23:05] Trained chunk 261 in 139.8s at 4254noun/s: lr=1.21e-03, loss=1.30e+00, top1=70.13%/69.830% +[ INFO][26-Jun-24 10:23:05] Chunk 262 = Batch 303283 = Sample 155280385 +[ INFO][26-Jun-24 10:25:25] Total gradient norm stats for 72 steps: 0.1761 <= 0.2096 + 0.02098z <= 0.2989 +[ INFO][26-Jun-24 10:25:25] Trained chunk 262 in 139.5s at 4265noun/s: lr=1.21e-03, loss=1.30e+00, top1=69.20%/69.840% +[ INFO][26-Jun-24 10:25:25] Chunk 263 = Batch 304445 = Sample 155875329 +[ INFO][26-Jun-24 10:27:44] Total gradient norm stats for 73 steps: 0.1817 <= 0.2115 + 0.02225z <= 0.2896 +[ INFO][26-Jun-24 10:27:44] Trained chunk 263 in 139.6s at 4262noun/s: lr=1.21e-03, loss=1.30e+00, top1=70.93%/69.842% +[ INFO][26-Jun-24 10:27:44] Chunk 264 = Batch 305607 = Sample 156470273 +[ INFO][26-Jun-24 10:30:05] Total gradient norm stats for 73 steps: 0.1776 <= 0.2119 + 0.03431z <= 0.4125 +[ INFO][26-Jun-24 10:30:05] Trained chunk 264 in 140.3s at 4241noun/s: lr=1.20e-03, loss=1.30e+00, top1=70.76%/69.847% +[ INFO][26-Jun-24 10:30:05] Chunk 265 = Batch 306769 = Sample 157065217 +[ INFO][26-Jun-24 10:32:24] Total gradient norm stats for 72 steps: 0.1812 <= 0.2217 + 0.03141z <= 0.326 +[ INFO][26-Jun-24 10:32:24] Trained chunk 265 in 139.7s at 4259noun/s: lr=1.20e-03, loss=1.30e+00, top1=70.72%/69.852% +[ INFO][26-Jun-24 10:32:24] Chunk 266 = Batch 307931 = Sample 157660161 +[ INFO][26-Jun-24 10:34:44] Total gradient norm stats for 73 steps: 0.18 <= 0.2187 + 0.03773z <= 0.3291 +[ INFO][26-Jun-24 10:34:44] Trained chunk 266 in 139.5s at 4265noun/s: lr=1.20e-03, loss=1.30e+00, top1=69.59%/69.853% +[ INFO][26-Jun-24 10:34:44] Chunk 267 = Batch 309093 = Sample 158255105 +[ INFO][26-Jun-24 10:37:04] Total gradient norm stats for 72 steps: 0.1778 <= 0.2176 + 0.0306z <= 0.329 +[ INFO][26-Jun-24 10:37:04] Trained chunk 267 in 139.8s at 4255noun/s: lr=1.20e-03, loss=1.30e+00, top1=70.14%/69.860% +[ INFO][26-Jun-24 10:37:04] Chunk 268 = Batch 310255 = Sample 158850049 +[ INFO][26-Jun-24 10:39:24] Total gradient norm stats for 73 steps: 0.1724 <= 0.2096 + 0.0265z <= 0.3162 +[ INFO][26-Jun-24 10:39:24] Trained chunk 268 in 140.2s at 4242noun/s: lr=1.20e-03, loss=1.30e+00, top1=69.87%/69.862% +[ INFO][26-Jun-24 10:39:24] Chunk 269 = Batch 311417 = Sample 159444993 +[ INFO][26-Jun-24 10:41:43] Total gradient norm stats for 73 steps: 0.1788 <= 0.2205 + 0.03383z <= 0.3287 +[ INFO][26-Jun-24 10:41:43] Trained chunk 269 in 139.7s at 4258noun/s: lr=1.19e-03, loss=1.30e+00, top1=68.65%/69.870% +[ INFO][26-Jun-24 10:41:43] Chunk 270 = Batch 312579 = Sample 160039937 +[ INFO][26-Jun-24 10:44:04] Total gradient norm stats for 72 steps: 0.1753 <= 0.2445 + 0.2932z <= 2.689 (clipped to 1) +[ INFO][26-Jun-24 10:44:04] Trained chunk 270 in 140.0s at 4249noun/s: lr=1.19e-03, loss=1.30e+00, top1=69.89%/69.876% +[ INFO][26-Jun-24 10:44:04] Chunk 271 = Batch 313741 = Sample 160634881 +[ INFO][26-Jun-24 10:46:23] Total gradient norm stats for 73 steps: 0.1757 <= 0.2228 + 0.0519z <= 0.5211 +[ INFO][26-Jun-24 10:46:23] Trained chunk 271 in 139.8s at 4256noun/s: lr=1.19e-03, loss=1.30e+00, top1=70.08%/69.874% +[ INFO][26-Jun-24 10:46:23] Chunk 272 = Batch 314903 = Sample 161229825 +[ INFO][26-Jun-24 10:48:43] Total gradient norm stats for 73 steps: 0.1812 <= 0.2183 + 0.02932z <= 0.3194 +[ INFO][26-Jun-24 10:48:43] Trained chunk 272 in 140.1s at 4247noun/s: lr=1.19e-03, loss=1.30e+00, top1=69.41%/69.881% +[ INFO][26-Jun-24 10:48:43] Chunk 273 = Batch 316065 = Sample 161824769 +[ INFO][26-Jun-24 10:51:03] Total gradient norm stats for 72 steps: 0.1708 <= 0.2146 + 0.02575z <= 0.321 +[ INFO][26-Jun-24 10:51:03] Trained chunk 273 in 139.7s at 4259noun/s: lr=1.19e-03, loss=1.30e+00, top1=69.92%/69.889% +[ INFO][26-Jun-24 10:51:03] Chunk 274 = Batch 317227 = Sample 162419713 +[ INFO][26-Jun-24 10:53:23] Total gradient norm stats for 73 steps: 0.1818 <= 0.2158 + 0.02406z <= 0.2879 +[ INFO][26-Jun-24 10:53:23] Trained chunk 274 in 140.0s at 4249noun/s: lr=1.18e-03, loss=1.30e+00, top1=70.26%/69.896% +[ INFO][26-Jun-24 10:53:23] Chunk 275 = Batch 318389 = Sample 163014657 +[ INFO][26-Jun-24 10:55:43] Total gradient norm stats for 72 steps: 0.1845 <= 0.2194 + 0.04146z <= 0.4918 +[ INFO][26-Jun-24 10:55:43] Trained chunk 275 in 139.9s at 4252noun/s: lr=1.18e-03, loss=1.30e+00, top1=70.51%/69.901% +[ INFO][26-Jun-24 10:55:43] Chunk 276 = Batch 319551 = Sample 163609601 +[ INFO][26-Jun-24 10:58:02] Total gradient norm stats for 73 steps: 0.1781 <= 0.2289 + 0.0634z <= 0.5863 +[ INFO][26-Jun-24 10:58:02] Trained chunk 276 in 139.4s at 4269noun/s: lr=1.18e-03, loss=1.30e+00, top1=69.49%/69.904% +[ INFO][26-Jun-24 10:58:02] Chunk 277 = Batch 320713 = Sample 164204545 +[ INFO][26-Jun-24 11:00:22] Total gradient norm stats for 73 steps: 0.1875 <= 0.2186 + 0.02263z <= 0.288 +[ INFO][26-Jun-24 11:00:22] Trained chunk 277 in 139.6s at 4261noun/s: lr=1.18e-03, loss=1.30e+00, top1=68.92%/69.913% +[ INFO][26-Jun-24 11:00:22] Chunk 278 = Batch 321875 = Sample 164799489 +[ INFO][26-Jun-24 11:02:42] Total gradient norm stats for 72 steps: 0.1809 <= 0.2176 + 0.02347z <= 0.3161 +[ INFO][26-Jun-24 11:02:42] Trained chunk 278 in 139.9s at 4251noun/s: lr=1.18e-03, loss=1.30e+00, top1=70.45%/69.925% +[ INFO][26-Jun-24 11:02:42] Chunk 279 = Batch 323037 = Sample 165394433 +[ INFO][26-Jun-24 11:05:01] Total gradient norm stats for 73 steps: 0.1816 <= 0.2199 + 0.0324z <= 0.3357 +[ INFO][26-Jun-24 11:05:01] Trained chunk 279 in 139.3s at 4270noun/s: lr=1.17e-03, loss=1.30e+00, top1=69.80%/69.921% +[ INFO][26-Jun-24 11:05:01] Chunk 280 = Batch 324199 = Sample 165989377 +[ INFO][26-Jun-24 11:07:21] Total gradient norm stats for 73 steps: 0.1817 <= 0.2057 + 0.01751z <= 0.2544 +[ INFO][26-Jun-24 11:07:21] Trained chunk 280 in 139.8s at 4255noun/s: lr=1.17e-03, loss=1.30e+00, top1=70.25%/69.927% +[ INFO][26-Jun-24 11:07:21] Chunk 281 = Batch 325361 = Sample 166584321 +[ INFO][26-Jun-24 11:09:41] Total gradient norm stats for 72 steps: 0.181 <= 0.2145 + 0.02427z <= 0.2817 +[ INFO][26-Jun-24 11:09:41] Trained chunk 281 in 139.7s at 4258noun/s: lr=1.17e-03, loss=1.30e+00, top1=70.30%/69.930% +[ INFO][26-Jun-24 11:09:41] Chunk 282 = Batch 326523 = Sample 167179265 +[ INFO][26-Jun-24 11:12:01] Total gradient norm stats for 73 steps: 0.18 <= 0.2079 + 0.01808z <= 0.282 +[ INFO][26-Jun-24 11:12:01] Trained chunk 282 in 139.8s at 4257noun/s: lr=1.17e-03, loss=1.30e+00, top1=69.75%/69.938% +[ INFO][26-Jun-24 11:12:01] Chunk 283 = Batch 327685 = Sample 167774209 +[ INFO][26-Jun-24 11:14:20] Total gradient norm stats for 72 steps: 0.1817 <= 0.2158 + 0.02624z <= 0.302 +[ INFO][26-Jun-24 11:14:20] Trained chunk 283 in 139.8s at 4257noun/s: lr=1.16e-03, loss=1.30e+00, top1=70.08%/69.939% +[ INFO][26-Jun-24 11:14:20] Chunk 284 = Batch 328847 = Sample 168369153 +[ INFO][26-Jun-24 11:16:40] Total gradient norm stats for 73 steps: 0.1797 <= 0.2105 + 0.02345z <= 0.2908 +[ INFO][26-Jun-24 11:16:40] Trained chunk 284 in 139.6s at 4261noun/s: lr=1.16e-03, loss=1.30e+00, top1=69.92%/69.948% +[ INFO][26-Jun-24 11:16:40] Chunk 285 = Batch 330009 = Sample 168964097 +[ INFO][26-Jun-24 11:19:00] Total gradient norm stats for 73 steps: 0.1777 <= 0.2208 + 0.04658z <= 0.555 +[ INFO][26-Jun-24 11:19:00] Trained chunk 285 in 139.8s at 4257noun/s: lr=1.16e-03, loss=1.30e+00, top1=70.79%/69.949% +[ INFO][26-Jun-24 11:19:00] Chunk 286 = Batch 331171 = Sample 169559041 +[ INFO][26-Jun-24 11:21:19] Total gradient norm stats for 72 steps: 0.179 <= 0.2158 + 0.02221z <= 0.2787 +[ INFO][26-Jun-24 11:21:19] Trained chunk 286 in 139.6s at 4262noun/s: lr=1.16e-03, loss=1.30e+00, top1=69.13%/69.961% +[ INFO][26-Jun-24 11:21:19] Chunk 287 = Batch 332333 = Sample 170153985 +[ INFO][26-Jun-24 11:23:39] Total gradient norm stats for 73 steps: 0.1804 <= 0.2151 + 0.02484z <= 0.2954 +[ INFO][26-Jun-24 11:23:39] Trained chunk 287 in 139.5s at 4266noun/s: lr=1.16e-03, loss=1.30e+00, top1=71.00%/69.965% +[ INFO][26-Jun-24 11:23:39] Chunk 288 = Batch 333495 = Sample 170748929 +[ INFO][26-Jun-24 11:25:58] Total gradient norm stats for 73 steps: 0.1827 <= 0.2096 + 0.01874z <= 0.2634 +[ INFO][26-Jun-24 11:25:58] Trained chunk 288 in 139.1s at 4278noun/s: lr=1.15e-03, loss=1.30e+00, top1=69.53%/69.964% +[ INFO][26-Jun-24 11:25:58] Saved checkpoint: /data/strahl/Code/ovod/outputs/ovod_20240626_001447/ovod_chunk0288_20240626_112558.train +[ INFO][26-Jun-24 11:25:58] Chunk 289 = Batch 334657 = Sample 171343873 +[ INFO][26-Jun-24 11:28:18] Total gradient norm stats for 72 steps: 0.179 <= 0.208 + 0.01855z <= 0.2801 +[ INFO][26-Jun-24 11:28:18] Trained chunk 289 in 139.7s at 4260noun/s: lr=1.15e-03, loss=1.30e+00, top1=68.99%/69.969% +[ INFO][26-Jun-24 11:28:18] Chunk 290 = Batch 335819 = Sample 171938817 +[ INFO][26-Jun-24 11:30:37] Total gradient norm stats for 73 steps: 0.1888 <= 0.2218 + 0.03337z <= 0.3833 +[ INFO][26-Jun-24 11:30:37] Trained chunk 290 in 139.5s at 4266noun/s: lr=1.15e-03, loss=1.29e+00, top1=69.75%/69.974% +[ INFO][26-Jun-24 11:30:37] Chunk 291 = Batch 336981 = Sample 172533761 +[ INFO][26-Jun-24 11:32:57] Total gradient norm stats for 72 steps: 0.1925 <= 0.2218 + 0.01869z <= 0.2785 +[ INFO][26-Jun-24 11:32:57] Trained chunk 291 in 139.3s at 4271noun/s: lr=1.15e-03, loss=1.30e+00, top1=70.01%/69.978% +[ INFO][26-Jun-24 11:32:57] Chunk 292 = Batch 338143 = Sample 173128705 +[ INFO][26-Jun-24 11:35:16] Total gradient norm stats for 73 steps: 0.1839 <= 0.213 + 0.0228z <= 0.2841 +[ INFO][26-Jun-24 11:35:16] Trained chunk 292 in 139.3s at 4271noun/s: lr=1.14e-03, loss=1.29e+00, top1=70.42%/69.992% +[ INFO][26-Jun-24 11:35:16] Chunk 293 = Batch 339305 = Sample 173723649 +[ INFO][26-Jun-24 11:37:35] Total gradient norm stats for 73 steps: 0.1877 <= 0.2207 + 0.03435z <= 0.3601 +[ INFO][26-Jun-24 11:37:35] Trained chunk 293 in 139.6s at 4262noun/s: lr=1.14e-03, loss=1.29e+00, top1=70.69%/69.993% +[ INFO][26-Jun-24 11:37:35] Chunk 294 = Batch 340467 = Sample 174318593 +[ INFO][26-Jun-24 11:39:55] Total gradient norm stats for 72 steps: 0.1806 <= 0.2161 + 0.05922z <= 0.6966 +[ INFO][26-Jun-24 11:39:55] Trained chunk 294 in 139.5s at 4265noun/s: lr=1.14e-03, loss=1.29e+00, top1=69.31%/69.997% +[ INFO][26-Jun-24 11:39:55] Chunk 295 = Batch 341629 = Sample 174913537 +[ INFO][26-Jun-24 11:42:14] Total gradient norm stats for 73 steps: 0.1823 <= 0.2193 + 0.02477z <= 0.2839 +[ INFO][26-Jun-24 11:42:14] Trained chunk 295 in 139.4s at 4269noun/s: lr=1.14e-03, loss=1.29e+00, top1=69.52%/70.002% +[ INFO][26-Jun-24 11:42:14] Chunk 296 = Batch 342791 = Sample 175508481 +[ INFO][26-Jun-24 11:44:34] Total gradient norm stats for 73 steps: 0.1857 <= 0.2115 + 0.01826z <= 0.2625 +[ INFO][26-Jun-24 11:44:34] Trained chunk 296 in 139.9s at 4253noun/s: lr=1.14e-03, loss=1.29e+00, top1=69.83%/70.005% +[ INFO][26-Jun-24 11:44:34] Chunk 297 = Batch 343953 = Sample 176103425 +[ INFO][26-Jun-24 11:46:54] Total gradient norm stats for 72 steps: 0.1829 <= 0.2185 + 0.0297z <= 0.3777 +[ INFO][26-Jun-24 11:46:54] Trained chunk 297 in 139.9s at 4252noun/s: lr=1.13e-03, loss=1.29e+00, top1=70.09%/70.008% +[ INFO][26-Jun-24 11:46:54] Chunk 298 = Batch 345115 = Sample 176698369 +[ INFO][26-Jun-24 11:49:14] Total gradient norm stats for 73 steps: 0.1815 <= 0.2128 + 0.02079z <= 0.2841 +[ INFO][26-Jun-24 11:49:14] Trained chunk 298 in 139.8s at 4257noun/s: lr=1.13e-03, loss=1.29e+00, top1=70.42%/70.013% +[ INFO][26-Jun-24 11:49:14] Chunk 299 = Batch 346277 = Sample 177293313 +[ INFO][26-Jun-24 11:51:34] Total gradient norm stats for 72 steps: 0.1816 <= 0.213 + 0.02328z <= 0.2764 +[ INFO][26-Jun-24 11:51:34] Trained chunk 299 in 139.7s at 4259noun/s: lr=1.13e-03, loss=1.29e+00, top1=69.67%/70.023% +[ INFO][26-Jun-24 11:51:34] Chunk 300 = Batch 347439 = Sample 177888257 +[ INFO][26-Jun-24 11:53:52] Epoch 6 finished in 6988.4s +[ INFO][26-Jun-24 11:53:52] -------------------------------------------------------------------------------- +[ INFO][26-Jun-24 11:53:52] Epoch 7 = Batch 348577 = Sample 178470913 +[ INFO][26-Jun-24 11:53:55] Total gradient norm stats for 73 steps: 0.1862 <= 0.2152 + 0.02324z <= 0.289 +[ INFO][26-Jun-24 11:53:55] Trained chunk 300 in 141.4s at 4207noun/s: lr=1.13e-03, loss=1.29e+00, top1=71.19%/70.034% +[ INFO][26-Jun-24 11:53:55] Chunk 301 = Batch 348601 = Sample 178483201 +[ INFO][26-Jun-24 11:56:15] Total gradient norm stats for 73 steps: 0.1871 <= 0.2428 + 0.1322z <= 1.107 (clipped to 1) +[ INFO][26-Jun-24 11:56:15] Trained chunk 301 in 140.0s at 4249noun/s: lr=1.12e-03, loss=1.29e+00, top1=70.46%/70.039% +[ INFO][26-Jun-24 11:56:15] Chunk 302 = Batch 349763 = Sample 179078145 +[ INFO][26-Jun-24 11:58:35] Total gradient norm stats for 72 steps: 0.1814 <= 0.2159 + 0.02857z <= 0.3189 +[ INFO][26-Jun-24 11:58:35] Trained chunk 302 in 140.0s at 4249noun/s: lr=1.12e-03, loss=1.29e+00, top1=69.18%/70.043% +[ INFO][26-Jun-24 11:58:35] Chunk 303 = Batch 350925 = Sample 179673089 +[ INFO][26-Jun-24 12:00:55] Total gradient norm stats for 73 steps: 0.1787 <= 0.2138 + 0.01865z <= 0.2578 +[ INFO][26-Jun-24 12:00:55] Trained chunk 303 in 140.1s at 4245noun/s: lr=1.12e-03, loss=1.29e+00, top1=69.46%/70.053% +[ INFO][26-Jun-24 12:00:55] Chunk 304 = Batch 352087 = Sample 180268033 +[ INFO][26-Jun-24 12:03:15] Total gradient norm stats for 73 steps: 0.1826 <= 0.2203 + 0.06971z <= 0.7928 +[ INFO][26-Jun-24 12:03:15] Trained chunk 304 in 139.9s at 4254noun/s: lr=1.12e-03, loss=1.29e+00, top1=69.31%/70.061% +[ INFO][26-Jun-24 12:03:15] Chunk 305 = Batch 353249 = Sample 180862977 +[ INFO][26-Jun-24 12:05:35] Total gradient norm stats for 72 steps: 0.1871 <= 0.2167 + 0.0175z <= 0.2681 +[ INFO][26-Jun-24 12:05:35] Trained chunk 305 in 140.2s at 4245noun/s: lr=1.12e-03, loss=1.29e+00, top1=69.86%/70.062% +[ INFO][26-Jun-24 12:05:35] Chunk 306 = Batch 354411 = Sample 181457921 +[ INFO][26-Jun-24 12:08:37] Total gradient norm stats for 73 steps: 0.1877 <= 0.2126 + 0.02222z <= 0.3061 +[ INFO][26-Jun-24 12:08:37] Trained chunk 306 in 182.1s at 3267noun/s: lr=1.11e-03, loss=1.29e+00, top1=70.02%/70.068% +[ INFO][26-Jun-24 12:08:37] Chunk 307 = Batch 355573 = Sample 182052865 +[ INFO][26-Jun-24 12:13:16] Total gradient norm stats for 72 steps: 0.1833 <= 0.2169 + 0.02502z <= 0.3406 +[ INFO][26-Jun-24 12:13:16] Trained chunk 307 in 278.8s at 2134noun/s: lr=1.11e-03, loss=1.29e+00, top1=70.53%/70.070% +[ INFO][26-Jun-24 12:13:16] Chunk 308 = Batch 356735 = Sample 182647809 +[ INFO][26-Jun-24 12:17:55] Total gradient norm stats for 73 steps: 0.1822 <= 0.2213 + 0.03014z <= 0.3472 +[ INFO][26-Jun-24 12:17:55] Trained chunk 308 in 278.4s at 2137noun/s: lr=1.11e-03, loss=1.29e+00, top1=69.78%/70.074% +[ INFO][26-Jun-24 12:17:55] Chunk 309 = Batch 357897 = Sample 183242753 +[ INFO][26-Jun-24 12:22:28] Total gradient norm stats for 73 steps: 0.1854 <= 0.2223 + 0.02327z <= 0.2926 +[ INFO][26-Jun-24 12:22:28] Trained chunk 309 in 273.2s at 2178noun/s: lr=1.11e-03, loss=1.29e+00, top1=69.68%/70.078% +[ INFO][26-Jun-24 12:22:28] Chunk 310 = Batch 359059 = Sample 183837697 +[ INFO][26-Jun-24 12:27:02] Total gradient norm stats for 72 steps: 0.1818 <= 0.2126 + 0.02063z <= 0.3076 +[ INFO][26-Jun-24 12:27:02] Trained chunk 310 in 273.8s at 2173noun/s: lr=1.10e-03, loss=1.29e+00, top1=70.84%/70.082% +[ INFO][26-Jun-24 12:27:02] Chunk 311 = Batch 360221 = Sample 184432641 +[ INFO][26-Jun-24 12:31:40] Total gradient norm stats for 73 steps: 0.182 <= 0.2154 + 0.02717z <= 0.4006 +[ INFO][26-Jun-24 12:31:40] Trained chunk 311 in 278.2s at 2138noun/s: lr=1.10e-03, loss=1.29e+00, top1=69.29%/70.088% +[ INFO][26-Jun-24 12:31:40] Chunk 312 = Batch 361383 = Sample 185027585 +[ INFO][26-Jun-24 12:36:14] Total gradient norm stats for 73 steps: 0.1876 <= 0.2295 + 0.03555z <= 0.3335 +[ INFO][26-Jun-24 12:36:14] Trained chunk 312 in 274.5s at 2168noun/s: lr=1.10e-03, loss=1.29e+00, top1=70.98%/70.089% +[ INFO][26-Jun-24 12:36:14] Chunk 313 = Batch 362545 = Sample 185622529 +[ INFO][26-Jun-24 12:40:52] Total gradient norm stats for 72 steps: 0.1901 <= 0.2137 + 0.01671z <= 0.2646 +[ INFO][26-Jun-24 12:40:52] Trained chunk 313 in 277.5s at 2144noun/s: lr=1.10e-03, loss=1.29e+00, top1=69.40%/70.095% +[ INFO][26-Jun-24 12:40:52] Chunk 314 = Batch 363707 = Sample 186217473 +[ INFO][26-Jun-24 12:45:24] Total gradient norm stats for 73 steps: 0.1904 <= 0.2219 + 0.03237z <= 0.4147 +[ INFO][26-Jun-24 12:45:24] Trained chunk 314 in 272.1s at 2186noun/s: lr=1.09e-03, loss=1.29e+00, top1=69.04%/70.097% +[ INFO][26-Jun-24 12:45:24] Chunk 315 = Batch 364869 = Sample 186812417 +[ INFO][26-Jun-24 12:49:59] Total gradient norm stats for 72 steps: 0.1909 <= 0.2215 + 0.02702z <= 0.3252 +[ INFO][26-Jun-24 12:49:59] Trained chunk 315 in 275.1s at 2163noun/s: lr=1.09e-03, loss=1.29e+00, top1=70.98%/70.099% +[ INFO][26-Jun-24 12:49:59] Chunk 316 = Batch 366031 = Sample 187407361 +[ INFO][26-Jun-24 12:52:26] Total gradient norm stats for 73 steps: 0.182 <= 0.2132 + 0.01833z <= 0.2931 +[ INFO][26-Jun-24 12:52:26] Trained chunk 316 in 147.2s at 4041noun/s: lr=1.09e-03, loss=1.29e+00, top1=69.98%/70.097% +[ INFO][26-Jun-24 12:52:26] Chunk 317 = Batch 367193 = Sample 188002305 +[ INFO][26-Jun-24 12:54:46] Total gradient norm stats for 73 steps: 0.1918 <= 0.2227 + 0.02998z <= 0.4202 +[ INFO][26-Jun-24 12:54:46] Trained chunk 317 in 139.6s at 4262noun/s: lr=1.09e-03, loss=1.29e+00, top1=69.37%/70.104% +[ INFO][26-Jun-24 12:54:46] Chunk 318 = Batch 368355 = Sample 188597249 +[ INFO][26-Jun-24 12:57:06] Total gradient norm stats for 72 steps: 0.1889 <= 0.2371 + 0.03511z <= 0.356 +[ INFO][26-Jun-24 12:57:06] Trained chunk 318 in 139.6s at 4262noun/s: lr=1.08e-03, loss=1.29e+00, top1=70.57%/70.111% +[ INFO][26-Jun-24 12:57:06] Chunk 319 = Batch 369517 = Sample 189192193 +[ INFO][26-Jun-24 12:59:25] Total gradient norm stats for 73 steps: 0.1902 <= 0.221 + 0.02803z <= 0.3488 +[ INFO][26-Jun-24 12:59:25] Trained chunk 319 in 139.4s at 4268noun/s: lr=1.08e-03, loss=1.29e+00, top1=69.83%/70.112% +[ INFO][26-Jun-24 12:59:25] Chunk 320 = Batch 370679 = Sample 189787137 +[ INFO][26-Jun-24 13:01:45] Total gradient norm stats for 73 steps: 0.1896 <= 0.2183 + 0.0218z <= 0.3227 +[ INFO][26-Jun-24 13:01:45] Trained chunk 320 in 140.0s at 4249noun/s: lr=1.08e-03, loss=1.29e+00, top1=70.04%/70.118% +[ INFO][26-Jun-24 13:01:45] Chunk 321 = Batch 371841 = Sample 190382081 +[ INFO][26-Jun-24 13:04:05] Total gradient norm stats for 72 steps: 0.1925 <= 0.2225 + 0.04327z <= 0.5351 +[ INFO][26-Jun-24 13:04:05] Trained chunk 321 in 139.6s at 4261noun/s: lr=1.08e-03, loss=1.29e+00, top1=70.58%/70.123% +[ INFO][26-Jun-24 13:04:05] Chunk 322 = Batch 373003 = Sample 190977025 +[ INFO][26-Jun-24 13:06:24] Total gradient norm stats for 73 steps: 0.1887 <= 0.2253 + 0.0267z <= 0.3334 +[ INFO][26-Jun-24 13:06:24] Trained chunk 322 in 139.6s at 4262noun/s: lr=1.08e-03, loss=1.29e+00, top1=68.94%/70.129% +[ INFO][26-Jun-24 13:06:24] Chunk 323 = Batch 374165 = Sample 191571969 +[ INFO][26-Jun-24 13:08:44] Total gradient norm stats for 72 steps: 0.1939 <= 0.2255 + 0.02563z <= 0.3124 +[ INFO][26-Jun-24 13:08:44] Trained chunk 323 in 139.9s at 4254noun/s: lr=1.07e-03, loss=1.29e+00, top1=69.66%/70.138% +[ INFO][26-Jun-24 13:08:44] Chunk 324 = Batch 375327 = Sample 192166913 +[ INFO][26-Jun-24 13:11:04] Total gradient norm stats for 73 steps: 0.188 <= 0.2194 + 0.0197z <= 0.2775 +[ INFO][26-Jun-24 13:11:04] Trained chunk 324 in 140.1s at 4247noun/s: lr=1.07e-03, loss=1.29e+00, top1=70.05%/70.142% +[ INFO][26-Jun-24 13:11:04] Chunk 325 = Batch 376489 = Sample 192761857 +[ INFO][26-Jun-24 13:13:24] Total gradient norm stats for 73 steps: 0.1891 <= 0.2216 + 0.02097z <= 0.2894 +[ INFO][26-Jun-24 13:13:24] Trained chunk 325 in 139.7s at 4258noun/s: lr=1.07e-03, loss=1.29e+00, top1=70.93%/70.145% +[ INFO][26-Jun-24 13:13:24] Chunk 326 = Batch 377651 = Sample 193356801 +[ INFO][26-Jun-24 13:15:43] Total gradient norm stats for 72 steps: 0.1898 <= 0.2184 + 0.03174z <= 0.4379 +[ INFO][26-Jun-24 13:15:43] Trained chunk 326 in 139.6s at 4262noun/s: lr=1.07e-03, loss=1.29e+00, top1=70.14%/70.141% +[ INFO][26-Jun-24 13:15:43] Chunk 327 = Batch 378813 = Sample 193951745 +[ INFO][26-Jun-24 13:18:03] Total gradient norm stats for 73 steps: 0.1958 <= 0.2221 + 0.02094z <= 0.2787 +[ INFO][26-Jun-24 13:18:03] Trained chunk 327 in 139.5s at 4264noun/s: lr=1.06e-03, loss=1.29e+00, top1=71.26%/70.145% +[ INFO][26-Jun-24 13:18:03] Chunk 328 = Batch 379975 = Sample 194546689 +[ INFO][26-Jun-24 13:20:22] Total gradient norm stats for 73 steps: 0.1886 <= 0.2138 + 0.01931z <= 0.3008 +[ INFO][26-Jun-24 13:20:22] Trained chunk 328 in 139.4s at 4269noun/s: lr=1.06e-03, loss=1.29e+00, top1=69.59%/70.151% +[ INFO][26-Jun-24 13:20:22] Chunk 329 = Batch 381137 = Sample 195141633 +[ INFO][26-Jun-24 13:22:42] Total gradient norm stats for 72 steps: 0.1883 <= 0.2184 + 0.03091z <= 0.4309 +[ INFO][26-Jun-24 13:22:42] Trained chunk 329 in 139.6s at 4263noun/s: lr=1.06e-03, loss=1.29e+00, top1=70.40%/70.156% +[ INFO][26-Jun-24 13:22:42] Chunk 330 = Batch 382299 = Sample 195736577 +[ INFO][26-Jun-24 13:25:01] Total gradient norm stats for 73 steps: 0.1932 <= 0.2233 + 0.02323z <= 0.294 +[ INFO][26-Jun-24 13:25:01] Trained chunk 330 in 139.6s at 4263noun/s: lr=1.06e-03, loss=1.29e+00, top1=69.83%/70.160% +[ INFO][26-Jun-24 13:25:01] Chunk 331 = Batch 383461 = Sample 196331521 +[ INFO][26-Jun-24 13:27:21] Total gradient norm stats for 72 steps: 0.1905 <= 0.2325 + 0.09406z <= 0.9981 +[ INFO][26-Jun-24 13:27:21] Trained chunk 331 in 139.6s at 4262noun/s: lr=1.05e-03, loss=1.29e+00, top1=69.56%/70.163% +[ INFO][26-Jun-24 13:27:21] Chunk 332 = Batch 384623 = Sample 196926465 +[ INFO][26-Jun-24 13:29:41] Total gradient norm stats for 73 steps: 0.1886 <= 0.2146 + 0.01383z <= 0.2498 +[ INFO][26-Jun-24 13:29:41] Trained chunk 332 in 139.7s at 4258noun/s: lr=1.05e-03, loss=1.29e+00, top1=70.30%/70.170% +[ INFO][26-Jun-24 13:29:41] Chunk 333 = Batch 385785 = Sample 197521409 +[ INFO][26-Jun-24 13:32:00] Total gradient norm stats for 73 steps: 0.1873 <= 0.2193 + 0.02034z <= 0.3325 +[ INFO][26-Jun-24 13:32:00] Trained chunk 333 in 139.5s at 4264noun/s: lr=1.05e-03, loss=1.29e+00, top1=70.28%/70.177% +[ INFO][26-Jun-24 13:32:00] Chunk 334 = Batch 386947 = Sample 198116353 +[ INFO][26-Jun-24 13:34:20] Total gradient norm stats for 72 steps: 0.1908 <= 0.2219 + 0.0239z <= 0.2908 +[ INFO][26-Jun-24 13:34:20] Trained chunk 334 in 139.6s at 4261noun/s: lr=1.05e-03, loss=1.28e+00, top1=69.84%/70.182% +[ INFO][26-Jun-24 13:34:20] Chunk 335 = Batch 388109 = Sample 198711297 +[ INFO][26-Jun-24 13:36:40] Total gradient norm stats for 73 steps: 0.1939 <= 0.2157 + 0.01547z <= 0.2601 +[ INFO][26-Jun-24 13:36:40] Trained chunk 335 in 139.9s at 4253noun/s: lr=1.04e-03, loss=1.28e+00, top1=71.19%/70.186% +[ INFO][26-Jun-24 13:36:40] Chunk 336 = Batch 389271 = Sample 199306241 +[ INFO][26-Jun-24 13:38:59] Total gradient norm stats for 73 steps: 0.1969 <= 0.2228 + 0.02316z <= 0.3153 +[ INFO][26-Jun-24 13:38:59] Trained chunk 336 in 139.4s at 4267noun/s: lr=1.04e-03, loss=1.28e+00, top1=70.49%/70.193% +[ INFO][26-Jun-24 13:39:00] Saved checkpoint: /data/strahl/Code/ovod/outputs/ovod_20240626_001447/ovod_chunk0336_20240626_133859.train +[ INFO][26-Jun-24 13:39:00] Chunk 337 = Batch 390433 = Sample 199901185 +[ INFO][26-Jun-24 13:41:19] Total gradient norm stats for 72 steps: 0.197 <= 0.2308 + 0.05122z <= 0.6269 +[ INFO][26-Jun-24 13:41:19] Trained chunk 337 in 139.4s at 4269noun/s: lr=1.04e-03, loss=1.28e+00, top1=71.15%/70.195% +[ INFO][26-Jun-24 13:41:19] Chunk 338 = Batch 391595 = Sample 200496129 +[ INFO][26-Jun-24 13:43:39] Total gradient norm stats for 73 steps: 0.1894 <= 0.2242 + 0.03827z <= 0.4888 +[ INFO][26-Jun-24 13:43:39] Trained chunk 338 in 139.8s at 4255noun/s: lr=1.04e-03, loss=1.28e+00, top1=71.07%/70.201% +[ INFO][26-Jun-24 13:43:39] Chunk 339 = Batch 392757 = Sample 201091073 +[ INFO][26-Jun-24 13:45:59] Total gradient norm stats for 72 steps: 0.1943 <= 0.2293 + 0.03217z <= 0.3801 +[ INFO][26-Jun-24 13:45:59] Trained chunk 339 in 140.0s at 4249noun/s: lr=1.03e-03, loss=1.28e+00, top1=70.48%/70.209% +[ INFO][26-Jun-24 13:45:59] Chunk 340 = Batch 393919 = Sample 201686017 +[ INFO][26-Jun-24 13:48:18] Total gradient norm stats for 73 steps: 0.1946 <= 0.237 + 0.04065z <= 0.4273 +[ INFO][26-Jun-24 13:48:18] Trained chunk 340 in 139.7s at 4259noun/s: lr=1.03e-03, loss=1.28e+00, top1=70.30%/70.207% +[ INFO][26-Jun-24 13:48:18] Chunk 341 = Batch 395081 = Sample 202280961 +[ INFO][26-Jun-24 13:50:38] Total gradient norm stats for 73 steps: 0.1912 <= 0.2211 + 0.02497z <= 0.3463 +[ INFO][26-Jun-24 13:50:38] Trained chunk 341 in 139.9s at 4251noun/s: lr=1.03e-03, loss=1.28e+00, top1=70.24%/70.212% +[ INFO][26-Jun-24 13:50:38] Chunk 342 = Batch 396243 = Sample 202875905 +[ INFO][26-Jun-24 13:52:58] Total gradient norm stats for 72 steps: 0.1923 <= 0.2166 + 0.01403z <= 0.2548 +[ INFO][26-Jun-24 13:52:58] Trained chunk 342 in 139.5s at 4265noun/s: lr=1.03e-03, loss=1.28e+00, top1=68.80%/70.214% +[ INFO][26-Jun-24 13:52:58] Chunk 343 = Batch 397405 = Sample 203470849 +[ INFO][26-Jun-24 13:55:17] Total gradient norm stats for 73 steps: 0.1937 <= 0.2317 + 0.04287z <= 0.5409 +[ INFO][26-Jun-24 13:55:17] Trained chunk 343 in 139.3s at 4272noun/s: lr=1.03e-03, loss=1.28e+00, top1=71.83%/70.220% +[ INFO][26-Jun-24 13:55:17] Chunk 344 = Batch 398567 = Sample 204065793 +[ INFO][26-Jun-24 13:57:37] Total gradient norm stats for 73 steps: 0.1934 <= 0.2193 + 0.0361z <= 0.4991 +[ INFO][26-Jun-24 13:57:37] Trained chunk 344 in 139.8s at 4256noun/s: lr=1.02e-03, loss=1.28e+00, top1=71.59%/70.231% +[ INFO][26-Jun-24 13:57:37] Chunk 345 = Batch 399729 = Sample 204660737 +[ INFO][26-Jun-24 13:59:56] Total gradient norm stats for 72 steps: 0.1969 <= 0.2389 + 0.04408z <= 0.3875 +[ INFO][26-Jun-24 13:59:56] Trained chunk 345 in 139.6s at 4262noun/s: lr=1.02e-03, loss=1.28e+00, top1=68.79%/70.228% +[ INFO][26-Jun-24 13:59:56] Chunk 346 = Batch 400891 = Sample 205255681 +[ INFO][26-Jun-24 14:02:16] Total gradient norm stats for 73 steps: 0.1897 <= 0.2201 + 0.01654z <= 0.2644 +[ INFO][26-Jun-24 14:02:16] Trained chunk 346 in 139.6s at 4262noun/s: lr=1.02e-03, loss=1.28e+00, top1=69.63%/70.234% +[ INFO][26-Jun-24 14:02:16] Chunk 347 = Batch 402053 = Sample 205850625 +[ INFO][26-Jun-24 14:04:36] Total gradient norm stats for 72 steps: 0.1929 <= 0.2184 + 0.0179z <= 0.2721 +[ INFO][26-Jun-24 14:04:36] Trained chunk 347 in 139.5s at 4265noun/s: lr=1.02e-03, loss=1.28e+00, top1=69.87%/70.237% +[ INFO][26-Jun-24 14:04:36] Chunk 348 = Batch 403215 = Sample 206445569 +[ INFO][26-Jun-24 14:06:55] Total gradient norm stats for 73 steps: 0.1972 <= 0.233 + 0.0409z <= 0.3992 +[ INFO][26-Jun-24 14:06:55] Trained chunk 348 in 139.8s at 4256noun/s: lr=1.01e-03, loss=1.28e+00, top1=70.00%/70.238% +[ INFO][26-Jun-24 14:06:55] Chunk 349 = Batch 404377 = Sample 207040513 +[ INFO][26-Jun-24 14:09:15] Total gradient norm stats for 73 steps: 0.1877 <= 0.2271 + 0.02967z <= 0.3606 +[ INFO][26-Jun-24 14:09:15] Trained chunk 349 in 139.4s at 4269noun/s: lr=1.01e-03, loss=1.28e+00, top1=70.70%/70.242% +[ INFO][26-Jun-24 14:09:15] Chunk 350 = Batch 405539 = Sample 207635457 +[ INFO][26-Jun-24 14:11:32] Epoch 7 finished in 8259.9s +[ INFO][26-Jun-24 14:11:32] -------------------------------------------------------------------------------- +[ INFO][26-Jun-24 14:11:32] Epoch 8 = Batch 406673 = Sample 208216065 +[ INFO][26-Jun-24 14:11:35] Total gradient norm stats for 72 steps: 0.1922 <= 0.2139 + 0.01365z <= 0.2614 +[ INFO][26-Jun-24 14:11:35] Trained chunk 350 in 140.6s at 4231noun/s: lr=1.01e-03, loss=1.28e+00, top1=70.11%/70.242% +[ INFO][26-Jun-24 14:11:35] Chunk 351 = Batch 406701 = Sample 208230401 +[ INFO][26-Jun-24 14:13:55] Total gradient norm stats for 73 steps: 0.1914 <= 0.225 + 0.02073z <= 0.2887 +[ INFO][26-Jun-24 14:13:55] Trained chunk 351 in 139.8s at 4256noun/s: lr=1.01e-03, loss=1.28e+00, top1=69.98%/70.250% +[ INFO][26-Jun-24 14:13:55] Chunk 352 = Batch 407863 = Sample 208825345 +[ INFO][26-Jun-24 14:16:15] Total gradient norm stats for 73 steps: 0.1952 <= 0.2237 + 0.0219z <= 0.303 +[ INFO][26-Jun-24 14:16:15] Trained chunk 352 in 139.5s at 4265noun/s: lr=1.00e-03, loss=1.28e+00, top1=70.90%/70.263% +[ INFO][26-Jun-24 14:16:15] Chunk 353 = Batch 409025 = Sample 209420289 +[ INFO][26-Jun-24 14:18:34] Total gradient norm stats for 72 steps: 0.1989 <= 0.2254 + 0.02199z <= 0.2847 +[ INFO][26-Jun-24 14:18:34] Trained chunk 353 in 139.6s at 4263noun/s: lr=1.00e-03, loss=1.28e+00, top1=70.12%/70.271% +[ INFO][26-Jun-24 14:18:34] Chunk 354 = Batch 410187 = Sample 210015233 +[ INFO][26-Jun-24 14:20:54] Total gradient norm stats for 73 steps: 0.196 <= 0.2278 + 0.02496z <= 0.3089 +[ INFO][26-Jun-24 14:20:54] Trained chunk 354 in 139.4s at 4267noun/s: lr=9.98e-04, loss=1.28e+00, top1=70.61%/70.279% +[ INFO][26-Jun-24 14:20:54] Chunk 355 = Batch 411349 = Sample 210610177 +[ INFO][26-Jun-24 14:23:13] Total gradient norm stats for 72 steps: 0.1877 <= 0.2203 + 0.01845z <= 0.2881 +[ INFO][26-Jun-24 14:23:13] Trained chunk 355 in 139.0s at 4279noun/s: lr=9.96e-04, loss=1.28e+00, top1=70.29%/70.281% +[ INFO][26-Jun-24 14:23:13] Chunk 356 = Batch 412511 = Sample 211205121 +[ INFO][26-Jun-24 14:25:32] Total gradient norm stats for 73 steps: 0.1966 <= 0.2202 + 0.01596z <= 0.2702 +[ INFO][26-Jun-24 14:25:32] Trained chunk 356 in 139.7s at 4257noun/s: lr=9.93e-04, loss=1.28e+00, top1=70.18%/70.281% +[ INFO][26-Jun-24 14:25:32] Chunk 357 = Batch 413673 = Sample 211800065 +[ INFO][26-Jun-24 14:27:52] Total gradient norm stats for 73 steps: 0.1963 <= 0.2302 + 0.02752z <= 0.3306 +[ INFO][26-Jun-24 14:27:52] Trained chunk 357 in 139.7s at 4258noun/s: lr=9.91e-04, loss=1.28e+00, top1=69.36%/70.284% +[ INFO][26-Jun-24 14:27:52] Chunk 358 = Batch 414835 = Sample 212395009 +[ INFO][26-Jun-24 14:30:12] Total gradient norm stats for 72 steps: 0.1955 <= 0.2365 + 0.03576z <= 0.3549 +[ INFO][26-Jun-24 14:30:12] Trained chunk 358 in 139.5s at 4264noun/s: lr=9.88e-04, loss=1.28e+00, top1=70.70%/70.284% +[ INFO][26-Jun-24 14:30:12] Chunk 359 = Batch 415997 = Sample 212989953 +[ INFO][26-Jun-24 14:32:32] Total gradient norm stats for 73 steps: 0.1953 <= 0.2312 + 0.03425z <= 0.453 +[ INFO][26-Jun-24 14:32:32] Trained chunk 359 in 139.9s at 4252noun/s: lr=9.86e-04, loss=1.28e+00, top1=70.75%/70.291% +[ INFO][26-Jun-24 14:32:32] Chunk 360 = Batch 417159 = Sample 213584897 +[ INFO][26-Jun-24 14:34:51] Total gradient norm stats for 73 steps: 0.195 <= 0.2275 + 0.024z <= 0.3173 +[ INFO][26-Jun-24 14:34:51] Trained chunk 360 in 139.3s at 4271noun/s: lr=9.83e-04, loss=1.28e+00, top1=70.85%/70.296% +[ INFO][26-Jun-24 14:34:51] Chunk 361 = Batch 418321 = Sample 214179841 +[ INFO][26-Jun-24 14:37:11] Total gradient norm stats for 72 steps: 0.1969 <= 0.2202 + 0.01805z <= 0.2807 +[ INFO][26-Jun-24 14:37:11] Trained chunk 361 in 139.6s at 4261noun/s: lr=9.81e-04, loss=1.28e+00, top1=69.72%/70.298% +[ INFO][26-Jun-24 14:37:11] Chunk 362 = Batch 419483 = Sample 214774785 +[ INFO][26-Jun-24 14:39:31] Total gradient norm stats for 73 steps: 0.1993 <= 0.2285 + 0.02226z <= 0.3069 +[ INFO][26-Jun-24 14:39:31] Trained chunk 362 in 140.0s at 4250noun/s: lr=9.78e-04, loss=1.28e+00, top1=68.48%/70.302% +[ INFO][26-Jun-24 14:39:31] Chunk 363 = Batch 420645 = Sample 215369729 +[ INFO][26-Jun-24 14:41:50] Total gradient norm stats for 72 steps: 0.1976 <= 0.2224 + 0.02138z <= 0.3236 +[ INFO][26-Jun-24 14:41:50] Trained chunk 363 in 139.9s at 4254noun/s: lr=9.76e-04, loss=1.28e+00, top1=69.34%/70.306% +[ INFO][26-Jun-24 14:41:50] Chunk 364 = Batch 421807 = Sample 215964673 +[ INFO][26-Jun-24 14:44:10] Total gradient norm stats for 73 steps: 0.1965 <= 0.2236 + 0.01891z <= 0.2765 +[ INFO][26-Jun-24 14:44:10] Trained chunk 364 in 139.9s at 4254noun/s: lr=9.73e-04, loss=1.28e+00, top1=69.12%/70.315% +[ INFO][26-Jun-24 14:44:10] Chunk 365 = Batch 422969 = Sample 216559617 +[ INFO][26-Jun-24 14:46:31] Total gradient norm stats for 73 steps: 0.1944 <= 0.2211 + 0.01691z <= 0.2881 +[ INFO][26-Jun-24 14:46:31] Trained chunk 365 in 140.4s at 4239noun/s: lr=9.71e-04, loss=1.28e+00, top1=69.43%/70.319% +[ INFO][26-Jun-24 14:46:31] Chunk 366 = Batch 424131 = Sample 217154561 +[ INFO][26-Jun-24 14:48:50] Total gradient norm stats for 72 steps: 0.1979 <= 0.2237 + 0.02002z <= 0.3 +[ INFO][26-Jun-24 14:48:50] Trained chunk 366 in 139.5s at 4266noun/s: lr=9.68e-04, loss=1.28e+00, top1=71.03%/70.325% +[ INFO][26-Jun-24 14:48:50] Chunk 367 = Batch 425293 = Sample 217749505 +[ INFO][26-Jun-24 14:51:10] Total gradient norm stats for 73 steps: 0.1977 <= 0.2265 + 0.02211z <= 0.3038 +[ INFO][26-Jun-24 14:51:10] Trained chunk 367 in 139.8s at 4255noun/s: lr=9.66e-04, loss=1.28e+00, top1=70.36%/70.331% +[ INFO][26-Jun-24 14:51:10] Chunk 368 = Batch 426455 = Sample 218344449 +[ INFO][26-Jun-24 14:53:29] Total gradient norm stats for 73 steps: 0.2005 <= 0.2886 + 0.1587z <= 1.171 (clipped to 1) +[ INFO][26-Jun-24 14:53:29] Trained chunk 368 in 139.6s at 4261noun/s: lr=9.63e-04, loss=1.28e+00, top1=70.08%/70.331% +[ INFO][26-Jun-24 14:53:29] Chunk 369 = Batch 427617 = Sample 218939393 +[ INFO][26-Jun-24 14:55:49] Total gradient norm stats for 72 steps: 0.2021 <= 0.2518 + 0.04903z <= 0.4583 +[ INFO][26-Jun-24 14:55:49] Trained chunk 369 in 139.5s at 4265noun/s: lr=9.61e-04, loss=1.28e+00, top1=69.66%/70.329% +[ INFO][26-Jun-24 14:55:49] Chunk 370 = Batch 428779 = Sample 219534337 +[ INFO][26-Jun-24 14:58:09] Total gradient norm stats for 73 steps: 0.1979 <= 0.2247 + 0.02154z <= 0.3111 +[ INFO][26-Jun-24 14:58:09] Trained chunk 370 in 139.6s at 4263noun/s: lr=9.58e-04, loss=1.28e+00, top1=70.04%/70.328% +[ INFO][26-Jun-24 14:58:09] Chunk 371 = Batch 429941 = Sample 220129281 +[ INFO][26-Jun-24 15:00:28] Total gradient norm stats for 72 steps: 0.1952 <= 0.2271 + 0.02696z <= 0.3449 +[ INFO][26-Jun-24 15:00:28] Trained chunk 371 in 139.8s at 4256noun/s: lr=9.56e-04, loss=1.28e+00, top1=69.83%/70.331% +[ INFO][26-Jun-24 15:00:28] Chunk 372 = Batch 431103 = Sample 220724225 +[ INFO][26-Jun-24 15:02:48] Total gradient norm stats for 73 steps: 0.1981 <= 0.2361 + 0.02733z <= 0.3055 +[ INFO][26-Jun-24 15:02:48] Trained chunk 372 in 139.2s at 4273noun/s: lr=9.53e-04, loss=1.28e+00, top1=70.01%/70.336% +[ INFO][26-Jun-24 15:02:48] Chunk 373 = Batch 432265 = Sample 221319169 +[ INFO][26-Jun-24 15:05:07] Total gradient norm stats for 73 steps: 0.1989 <= 0.2256 + 0.01852z <= 0.2783 +[ INFO][26-Jun-24 15:05:07] Trained chunk 373 in 139.9s at 4253noun/s: lr=9.51e-04, loss=1.28e+00, top1=69.44%/70.339% +[ INFO][26-Jun-24 15:05:07] Chunk 374 = Batch 433427 = Sample 221914113 +[ INFO][26-Jun-24 15:07:27] Total gradient norm stats for 72 steps: 0.1975 <= 0.2222 + 0.02217z <= 0.3552 +[ INFO][26-Jun-24 15:07:27] Trained chunk 374 in 140.0s at 4249noun/s: lr=9.48e-04, loss=1.28e+00, top1=69.80%/70.341% +[ INFO][26-Jun-24 15:07:27] Chunk 375 = Batch 434589 = Sample 222509057 +[ INFO][26-Jun-24 15:09:47] Total gradient norm stats for 73 steps: 0.2003 <= 0.2298 + 0.02437z <= 0.3115 +[ INFO][26-Jun-24 15:09:47] Trained chunk 375 in 140.0s at 4250noun/s: lr=9.46e-04, loss=1.28e+00, top1=71.00%/70.345% +[ INFO][26-Jun-24 15:09:47] Chunk 376 = Batch 435751 = Sample 223104001 +[ INFO][26-Jun-24 15:12:07] Total gradient norm stats for 73 steps: 0.1978 <= 0.2234 + 0.01855z <= 0.2855 +[ INFO][26-Jun-24 15:12:07] Trained chunk 376 in 139.7s at 4259noun/s: lr=9.43e-04, loss=1.28e+00, top1=69.82%/70.353% +[ INFO][26-Jun-24 15:12:07] Chunk 377 = Batch 436913 = Sample 223698945 +[ INFO][26-Jun-24 15:14:27] Total gradient norm stats for 72 steps: 0.2027 <= 0.227 + 0.01784z <= 0.2741 +[ INFO][26-Jun-24 15:14:27] Trained chunk 377 in 139.5s at 4266noun/s: lr=9.41e-04, loss=1.28e+00, top1=70.00%/70.357% +[ INFO][26-Jun-24 15:14:27] Chunk 378 = Batch 438075 = Sample 224293889 +[ INFO][26-Jun-24 15:16:46] Total gradient norm stats for 73 steps: 0.2012 <= 0.2249 + 0.01739z <= 0.2921 +[ INFO][26-Jun-24 15:16:46] Trained chunk 378 in 139.3s at 4270noun/s: lr=9.38e-04, loss=1.28e+00, top1=69.92%/70.361% +[ INFO][26-Jun-24 15:16:46] Chunk 379 = Batch 439237 = Sample 224888833 +[ INFO][26-Jun-24 15:19:06] Total gradient norm stats for 72 steps: 0.2002 <= 0.2322 + 0.02624z <= 0.3162 +[ INFO][26-Jun-24 15:19:06] Trained chunk 379 in 139.9s at 4252noun/s: lr=9.35e-04, loss=1.28e+00, top1=70.18%/70.370% +[ INFO][26-Jun-24 15:19:06] Chunk 380 = Batch 440399 = Sample 225483777 +[ INFO][26-Jun-24 15:21:26] Total gradient norm stats for 73 steps: 0.1995 <= 0.2444 + 0.03872z <= 0.3574 +[ INFO][26-Jun-24 15:21:26] Trained chunk 380 in 139.8s at 4257noun/s: lr=9.33e-04, loss=1.28e+00, top1=70.91%/70.376% +[ INFO][26-Jun-24 15:21:26] Chunk 381 = Batch 441561 = Sample 226078721 +[ INFO][26-Jun-24 15:23:45] Total gradient norm stats for 73 steps: 0.2029 <= 0.2266 + 0.01754z <= 0.2773 +[ INFO][26-Jun-24 15:23:45] Trained chunk 381 in 139.6s at 4263noun/s: lr=9.30e-04, loss=1.27e+00, top1=69.16%/70.386% +[ INFO][26-Jun-24 15:23:45] Chunk 382 = Batch 442723 = Sample 226673665 +[ INFO][26-Jun-24 15:26:05] Total gradient norm stats for 72 steps: 0.1973 <= 0.2226 + 0.0392z <= 0.4997 +[ INFO][26-Jun-24 15:26:05] Trained chunk 382 in 139.6s at 4261noun/s: lr=9.28e-04, loss=1.27e+00, top1=69.49%/70.391% +[ INFO][26-Jun-24 15:26:05] Chunk 383 = Batch 443885 = Sample 227268609 +[ INFO][26-Jun-24 15:28:25] Total gradient norm stats for 73 steps: 0.1965 <= 0.2326 + 0.03329z <= 0.3987 +[ INFO][26-Jun-24 15:28:25] Trained chunk 383 in 139.7s at 4258noun/s: lr=9.25e-04, loss=1.27e+00, top1=70.32%/70.393% +[ INFO][26-Jun-24 15:28:25] Chunk 384 = Batch 445047 = Sample 227863553 +[ INFO][26-Jun-24 15:30:44] Total gradient norm stats for 73 steps: 0.1986 <= 0.218 + 0.01369z <= 0.2683 +[ INFO][26-Jun-24 15:30:44] Trained chunk 384 in 139.5s at 4263noun/s: lr=9.23e-04, loss=1.27e+00, top1=70.12%/70.402% +[ INFO][26-Jun-24 15:30:44] Saved checkpoint: /data/strahl/Code/ovod/outputs/ovod_20240626_001447/ovod_chunk0384_20240626_153044.train +[ INFO][26-Jun-24 15:30:44] Chunk 385 = Batch 446209 = Sample 228458497 +[ INFO][26-Jun-24 15:33:04] Total gradient norm stats for 72 steps: 0.1963 <= 0.2196 + 0.01469z <= 0.2697 +[ INFO][26-Jun-24 15:33:04] Trained chunk 385 in 139.6s at 4262noun/s: lr=9.20e-04, loss=1.27e+00, top1=69.87%/70.405% +[ INFO][26-Jun-24 15:33:04] Chunk 386 = Batch 447371 = Sample 229053441 +[ INFO][26-Jun-24 15:35:24] Total gradient norm stats for 73 steps: 0.1968 <= 0.2224 + 0.01939z <= 0.3171 +[ INFO][26-Jun-24 15:35:24] Trained chunk 386 in 139.7s at 4258noun/s: lr=9.18e-04, loss=1.27e+00, top1=69.16%/70.404% +[ INFO][26-Jun-24 15:35:24] Chunk 387 = Batch 448533 = Sample 229648385 +[ INFO][26-Jun-24 15:37:43] Total gradient norm stats for 72 steps: 0.2022 <= 0.2453 + 0.138z <= 1.383 (clipped to 1) +[ INFO][26-Jun-24 15:37:43] Trained chunk 387 in 139.6s at 4263noun/s: lr=9.15e-04, loss=1.27e+00, top1=69.96%/70.407% +[ INFO][26-Jun-24 15:37:43] Chunk 388 = Batch 449695 = Sample 230243329 +[ INFO][26-Jun-24 15:40:03] Total gradient norm stats for 73 steps: 0.2052 <= 0.2292 + 0.01874z <= 0.2882 +[ INFO][26-Jun-24 15:40:03] Trained chunk 388 in 139.4s at 4267noun/s: lr=9.13e-04, loss=1.27e+00, top1=70.69%/70.413% +[ INFO][26-Jun-24 15:40:03] Chunk 389 = Batch 450857 = Sample 230838273 +[ INFO][26-Jun-24 15:42:22] Total gradient norm stats for 73 steps: 0.2004 <= 0.224 + 0.01321z <= 0.2607 +[ INFO][26-Jun-24 15:42:22] Trained chunk 389 in 139.6s at 4261noun/s: lr=9.10e-04, loss=1.27e+00, top1=69.56%/70.416% +[ INFO][26-Jun-24 15:42:22] Chunk 390 = Batch 452019 = Sample 231433217 +[ INFO][26-Jun-24 15:44:42] Total gradient norm stats for 72 steps: 0.1964 <= 0.231 + 0.01802z <= 0.2831 +[ INFO][26-Jun-24 15:44:42] Trained chunk 390 in 139.3s at 4271noun/s: lr=9.07e-04, loss=1.27e+00, top1=70.04%/70.418% +[ INFO][26-Jun-24 15:44:42] Chunk 391 = Batch 453181 = Sample 232028161 +[ INFO][26-Jun-24 15:47:02] Total gradient norm stats for 73 steps: 0.2082 <= 0.2317 + 0.01808z <= 0.2951 +[ INFO][26-Jun-24 15:47:02] Trained chunk 391 in 139.9s at 4253noun/s: lr=9.05e-04, loss=1.27e+00, top1=70.60%/70.419% +[ INFO][26-Jun-24 15:47:02] Chunk 392 = Batch 454343 = Sample 232623105 +[ INFO][26-Jun-24 15:49:21] Total gradient norm stats for 73 steps: 0.1999 <= 0.2326 + 0.03968z <= 0.4951 +[ INFO][26-Jun-24 15:49:21] Trained chunk 392 in 139.9s at 4254noun/s: lr=9.02e-04, loss=1.27e+00, top1=68.66%/70.425% +[ INFO][26-Jun-24 15:49:21] Chunk 393 = Batch 455505 = Sample 233218049 +[ INFO][26-Jun-24 15:51:41] Total gradient norm stats for 72 steps: 0.2024 <= 0.2377 + 0.02772z <= 0.3165 +[ INFO][26-Jun-24 15:51:41] Trained chunk 393 in 139.5s at 4263noun/s: lr=9.00e-04, loss=1.27e+00, top1=70.70%/70.434% +[ INFO][26-Jun-24 15:51:41] Chunk 394 = Batch 456667 = Sample 233812993 +[ INFO][26-Jun-24 15:54:00] Total gradient norm stats for 73 steps: 0.1993 <= 0.2234 + 0.01731z <= 0.2856 +[ INFO][26-Jun-24 15:54:00] Trained chunk 394 in 139.4s at 4268noun/s: lr=8.97e-04, loss=1.27e+00, top1=70.73%/70.433% +[ INFO][26-Jun-24 15:54:00] Chunk 395 = Batch 457829 = Sample 234407937 +[ INFO][26-Jun-24 15:56:20] Total gradient norm stats for 72 steps: 0.2001 <= 0.2225 + 0.01314z <= 0.2706 +[ INFO][26-Jun-24 15:56:20] Trained chunk 395 in 139.6s at 4263noun/s: lr=8.95e-04, loss=1.27e+00, top1=69.35%/70.435% +[ INFO][26-Jun-24 15:56:20] Chunk 396 = Batch 458991 = Sample 235002881 +[ INFO][26-Jun-24 15:58:39] Total gradient norm stats for 73 steps: 0.2018 <= 0.2423 + 0.1027z <= 1.092 (clipped to 1) +[ INFO][26-Jun-24 15:58:39] Trained chunk 396 in 139.4s at 4267noun/s: lr=8.92e-04, loss=1.27e+00, top1=70.59%/70.443% +[ INFO][26-Jun-24 15:58:39] Chunk 397 = Batch 460153 = Sample 235597825 +[ INFO][26-Jun-24 16:00:59] Total gradient norm stats for 73 steps: 0.1974 <= 0.2276 + 0.01796z <= 0.2947 +[ INFO][26-Jun-24 16:00:59] Trained chunk 397 in 139.7s at 4259noun/s: lr=8.89e-04, loss=1.27e+00, top1=70.84%/70.446% +[ INFO][26-Jun-24 16:00:59] Chunk 398 = Batch 461315 = Sample 236192769 +[ INFO][26-Jun-24 16:03:19] Total gradient norm stats for 72 steps: 0.2 <= 0.2289 + 0.01961z <= 0.2833 +[ INFO][26-Jun-24 16:03:19] Trained chunk 398 in 139.6s at 4262noun/s: lr=8.87e-04, loss=1.27e+00, top1=70.20%/70.460% +[ INFO][26-Jun-24 16:03:19] Chunk 399 = Batch 462477 = Sample 236787713 +[ INFO][26-Jun-24 16:05:38] Total gradient norm stats for 73 steps: 0.2037 <= 0.2273 + 0.01808z <= 0.2949 +[ INFO][26-Jun-24 16:05:38] Trained chunk 399 in 139.8s at 4256noun/s: lr=8.84e-04, loss=1.27e+00, top1=70.73%/70.466% +[ INFO][26-Jun-24 16:05:38] Chunk 400 = Batch 463639 = Sample 237382657 +[ INFO][26-Jun-24 16:07:55] Epoch 8 finished in 6983.7s +[ INFO][26-Jun-24 16:07:55] -------------------------------------------------------------------------------- +[ INFO][26-Jun-24 16:07:55] Epoch 9 = Batch 464769 = Sample 237961217 +[ INFO][26-Jun-24 16:08:00] Total gradient norm stats for 73 steps: 0.202 <= 0.2276 + 0.01669z <= 0.2731 +[ INFO][26-Jun-24 16:08:00] Trained chunk 400 in 141.1s at 4217noun/s: lr=8.82e-04, loss=1.27e+00, top1=71.59%/70.472% +[ INFO][26-Jun-24 16:08:00] Chunk 401 = Batch 464801 = Sample 237977601 +[ INFO][26-Jun-24 16:10:20] Total gradient norm stats for 72 steps: 0.2001 <= 0.225 + 0.01899z <= 0.3288 +[ INFO][26-Jun-24 16:10:20] Trained chunk 401 in 140.1s at 4248noun/s: lr=8.79e-04, loss=1.27e+00, top1=69.49%/70.479% +[ INFO][26-Jun-24 16:10:20] Chunk 402 = Batch 465963 = Sample 238572545 +[ INFO][26-Jun-24 16:12:40] Total gradient norm stats for 73 steps: 0.1991 <= 0.2319 + 0.02737z <= 0.411 +[ INFO][26-Jun-24 16:12:40] Trained chunk 402 in 140.1s at 4247noun/s: lr=8.77e-04, loss=1.27e+00, top1=70.87%/70.481% +[ INFO][26-Jun-24 16:12:40] Chunk 403 = Batch 467125 = Sample 239167489 +[ INFO][26-Jun-24 16:15:00] Total gradient norm stats for 72 steps: 0.2048 <= 0.2276 + 0.01719z <= 0.2729 +[ INFO][26-Jun-24 16:15:00] Trained chunk 403 in 140.3s at 4241noun/s: lr=8.74e-04, loss=1.27e+00, top1=70.48%/70.486% +[ INFO][26-Jun-24 16:15:00] Chunk 404 = Batch 468287 = Sample 239762433 +[ INFO][26-Jun-24 16:17:20] Total gradient norm stats for 73 steps: 0.2031 <= 0.2329 + 0.02211z <= 0.3263 +[ INFO][26-Jun-24 16:17:20] Trained chunk 404 in 139.9s at 4253noun/s: lr=8.71e-04, loss=1.27e+00, top1=71.79%/70.493% +[ INFO][26-Jun-24 16:17:20] Chunk 405 = Batch 469449 = Sample 240357377 +[ INFO][26-Jun-24 16:19:40] Total gradient norm stats for 73 steps: 0.2037 <= 0.2274 + 0.01996z <= 0.3206 +[ INFO][26-Jun-24 16:19:40] Trained chunk 405 in 140.6s at 4232noun/s: lr=8.69e-04, loss=1.27e+00, top1=69.77%/70.494% +[ INFO][26-Jun-24 16:19:40] Chunk 406 = Batch 470611 = Sample 240952321 +[ INFO][26-Jun-24 16:22:01] Total gradient norm stats for 72 steps: 0.2013 <= 0.2399 + 0.03156z <= 0.3649 +[ INFO][26-Jun-24 16:22:01] Trained chunk 406 in 140.1s at 4246noun/s: lr=8.66e-04, loss=1.27e+00, top1=70.56%/70.494% +[ INFO][26-Jun-24 16:22:01] Chunk 407 = Batch 471773 = Sample 241547265 +[ INFO][26-Jun-24 16:24:21] Total gradient norm stats for 73 steps: 0.2032 <= 0.2279 + 0.01849z <= 0.2914 +[ INFO][26-Jun-24 16:24:21] Trained chunk 407 in 140.1s at 4245noun/s: lr=8.64e-04, loss=1.27e+00, top1=68.67%/70.499% +[ INFO][26-Jun-24 16:24:21] Chunk 408 = Batch 472935 = Sample 242142209 +[ INFO][26-Jun-24 16:26:40] Total gradient norm stats for 73 steps: 0.2012 <= 0.3585 + 1.139z <= 9.956 (clipped to 1) +[ INFO][26-Jun-24 16:26:40] Trained chunk 408 in 139.7s at 4258noun/s: lr=8.61e-04, loss=1.27e+00, top1=69.57%/70.503% +[ INFO][26-Jun-24 16:26:40] Chunk 409 = Batch 474097 = Sample 242737153 +[ INFO][26-Jun-24 16:29:00] Total gradient norm stats for 72 steps: 0.2031 <= 0.2297 + 0.01927z <= 0.304 +[ INFO][26-Jun-24 16:29:00] Trained chunk 409 in 139.7s at 4259noun/s: lr=8.58e-04, loss=1.27e+00, top1=69.43%/70.506% +[ INFO][26-Jun-24 16:29:00] Chunk 410 = Batch 475259 = Sample 243332097 +[ INFO][26-Jun-24 16:31:20] Total gradient norm stats for 73 steps: 0.2039 <= 0.2342 + 0.01788z <= 0.2889 +[ INFO][26-Jun-24 16:31:20] Trained chunk 410 in 139.5s at 4265noun/s: lr=8.56e-04, loss=1.27e+00, top1=70.43%/70.507% +[ INFO][26-Jun-24 16:31:20] Chunk 411 = Batch 476421 = Sample 243927041 +[ INFO][26-Jun-24 16:33:39] Total gradient norm stats for 72 steps: 0.2071 <= 0.2519 + 0.1335z <= 1.354 (clipped to 1) +[ INFO][26-Jun-24 16:33:39] Trained chunk 411 in 139.6s at 4261noun/s: lr=8.53e-04, loss=1.27e+00, top1=71.22%/70.512% +[ INFO][26-Jun-24 16:33:39] Chunk 412 = Batch 477583 = Sample 244521985 +[ INFO][26-Jun-24 16:35:59] Total gradient norm stats for 73 steps: 0.2039 <= 0.2258 + 0.02018z <= 0.3249 +[ INFO][26-Jun-24 16:35:59] Trained chunk 412 in 140.0s at 4248noun/s: lr=8.51e-04, loss=1.27e+00, top1=71.52%/70.522% +[ INFO][26-Jun-24 16:35:59] Chunk 413 = Batch 478745 = Sample 245116929 +[ INFO][26-Jun-24 16:38:19] Total gradient norm stats for 73 steps: 0.2086 <= 0.2355 + 0.03218z <= 0.4342 +[ INFO][26-Jun-24 16:38:19] Trained chunk 413 in 139.7s at 4258noun/s: lr=8.48e-04, loss=1.27e+00, top1=70.25%/70.526% +[ INFO][26-Jun-24 16:38:19] Chunk 414 = Batch 479907 = Sample 245711873 +[ INFO][26-Jun-24 16:40:39] Total gradient norm stats for 72 steps: 0.2039 <= 0.23 + 0.01853z <= 0.2989 +[ INFO][26-Jun-24 16:40:39] Trained chunk 414 in 139.4s at 4267noun/s: lr=8.45e-04, loss=1.27e+00, top1=70.62%/70.533% +[ INFO][26-Jun-24 16:40:39] Chunk 415 = Batch 481069 = Sample 246306817 +[ INFO][26-Jun-24 16:42:58] Total gradient norm stats for 73 steps: 0.2075 <= 0.2342 + 0.02153z <= 0.3242 +[ INFO][26-Jun-24 16:42:58] Trained chunk 415 in 139.8s at 4257noun/s: lr=8.43e-04, loss=1.27e+00, top1=69.78%/70.540% +[ INFO][26-Jun-24 16:42:58] Chunk 416 = Batch 482231 = Sample 246901761 +[ INFO][26-Jun-24 16:45:18] Total gradient norm stats for 73 steps: 0.206 <= 0.2294 + 0.01795z <= 0.2806 +[ INFO][26-Jun-24 16:45:18] Trained chunk 416 in 139.4s at 4269noun/s: lr=8.40e-04, loss=1.27e+00, top1=71.15%/70.545% +[ INFO][26-Jun-24 16:45:18] Chunk 417 = Batch 483393 = Sample 247496705 +[ INFO][26-Jun-24 16:47:37] Total gradient norm stats for 72 steps: 0.2028 <= 0.2316 + 0.01939z <= 0.301 +[ INFO][26-Jun-24 16:47:37] Trained chunk 417 in 139.3s at 4270noun/s: lr=8.38e-04, loss=1.27e+00, top1=70.85%/70.551% +[ INFO][26-Jun-24 16:47:37] Chunk 418 = Batch 484555 = Sample 248091649 +[ INFO][26-Jun-24 16:49:56] Total gradient norm stats for 73 steps: 0.2041 <= 0.2304 + 0.0227z <= 0.3318 +[ INFO][26-Jun-24 16:49:56] Trained chunk 418 in 139.4s at 4267noun/s: lr=8.35e-04, loss=1.27e+00, top1=71.90%/70.551% +[ INFO][26-Jun-24 16:49:56] Chunk 419 = Batch 485717 = Sample 248686593 +[ INFO][26-Jun-24 16:52:16] Total gradient norm stats for 72 steps: 0.2084 <= 0.2368 + 0.02689z <= 0.3717 +[ INFO][26-Jun-24 16:52:16] Trained chunk 419 in 139.5s at 4265noun/s: lr=8.32e-04, loss=1.27e+00, top1=71.58%/70.555% +[ INFO][26-Jun-24 16:52:16] Chunk 420 = Batch 486879 = Sample 249281537 +[ INFO][26-Jun-24 16:54:36] Total gradient norm stats for 73 steps: 0.2068 <= 0.233 + 0.02z <= 0.3176 +[ INFO][26-Jun-24 16:54:36] Trained chunk 420 in 140.2s at 4245noun/s: lr=8.30e-04, loss=1.27e+00, top1=70.72%/70.555% +[ INFO][26-Jun-24 16:54:36] Chunk 421 = Batch 488041 = Sample 249876481 +[ INFO][26-Jun-24 16:56:56] Total gradient norm stats for 73 steps: 0.207 <= 0.2318 + 0.01768z <= 0.2935 +[ INFO][26-Jun-24 16:56:56] Trained chunk 421 in 139.8s at 4256noun/s: lr=8.27e-04, loss=1.27e+00, top1=70.56%/70.556% +[ INFO][26-Jun-24 16:56:56] Chunk 422 = Batch 489203 = Sample 250471425 +[ INFO][26-Jun-24 16:59:16] Total gradient norm stats for 72 steps: 0.2063 <= 0.2285 + 0.01277z <= 0.2586 +[ INFO][26-Jun-24 16:59:16] Trained chunk 422 in 139.7s at 4259noun/s: lr=8.25e-04, loss=1.27e+00, top1=69.96%/70.554% +[ INFO][26-Jun-24 16:59:16] Chunk 423 = Batch 490365 = Sample 251066369 +[ INFO][26-Jun-24 17:01:35] Total gradient norm stats for 73 steps: 0.2043 <= 0.2284 + 0.01539z <= 0.2753 +[ INFO][26-Jun-24 17:01:35] Trained chunk 423 in 139.6s at 4261noun/s: lr=8.22e-04, loss=1.26e+00, top1=69.48%/70.563% +[ INFO][26-Jun-24 17:01:35] Chunk 424 = Batch 491527 = Sample 251661313 +[ INFO][26-Jun-24 17:03:55] Total gradient norm stats for 73 steps: 0.2038 <= 0.2439 + 0.05738z <= 0.6728 +[ INFO][26-Jun-24 17:03:55] Trained chunk 424 in 139.9s at 4253noun/s: lr=8.19e-04, loss=1.26e+00, top1=69.78%/70.569% +[ INFO][26-Jun-24 17:03:55] Chunk 425 = Batch 492689 = Sample 252256257 +[ INFO][26-Jun-24 17:06:15] Total gradient norm stats for 72 steps: 0.2068 <= 0.2408 + 0.0658z <= 0.7596 +[ INFO][26-Jun-24 17:06:15] Trained chunk 425 in 139.6s at 4262noun/s: lr=8.17e-04, loss=1.26e+00, top1=70.99%/70.574% +[ INFO][26-Jun-24 17:06:15] Chunk 426 = Batch 493851 = Sample 252851201 +[ INFO][26-Jun-24 17:08:34] Total gradient norm stats for 73 steps: 0.21 <= 0.2458 + 0.04786z <= 0.6051 +[ INFO][26-Jun-24 17:08:34] Trained chunk 426 in 139.4s at 4268noun/s: lr=8.14e-04, loss=1.26e+00, top1=71.92%/70.578% +[ INFO][26-Jun-24 17:08:34] Chunk 427 = Batch 495013 = Sample 253446145 +[ INFO][26-Jun-24 17:10:54] Total gradient norm stats for 72 steps: 0.2113 <= 0.2407 + 0.02754z <= 0.3316 +[ INFO][26-Jun-24 17:10:54] Trained chunk 427 in 139.7s at 4260noun/s: lr=8.12e-04, loss=1.26e+00, top1=69.60%/70.580% +[ INFO][26-Jun-24 17:10:54] Chunk 428 = Batch 496175 = Sample 254041089 +[ INFO][26-Jun-24 17:13:13] Total gradient norm stats for 73 steps: 0.2091 <= 0.3069 + 0.4079z <= 2.779 (clipped to 1) +[ INFO][26-Jun-24 17:13:13] Trained chunk 428 in 139.7s at 4258noun/s: lr=8.09e-04, loss=1.26e+00, top1=70.57%/70.581% +[ INFO][26-Jun-24 17:13:13] Chunk 429 = Batch 497337 = Sample 254636033 +[ INFO][26-Jun-24 17:15:34] Total gradient norm stats for 73 steps: 0.2071 <= 0.3457 + 0.4669z <= 3.949 (clipped to 1) +[ INFO][26-Jun-24 17:15:34] Trained chunk 429 in 140.1s at 4247noun/s: lr=8.06e-04, loss=1.26e+00, top1=70.15%/70.571% +[ INFO][26-Jun-24 17:15:34] Chunk 430 = Batch 498499 = Sample 255230977 +[ INFO][26-Jun-24 17:17:53] Total gradient norm stats for 72 steps: 0.2064 <= 0.2528 + 0.1309z <= 1.329 (clipped to 1) +[ INFO][26-Jun-24 17:17:53] Trained chunk 430 in 139.6s at 4261noun/s: lr=8.04e-04, loss=1.26e+00, top1=69.91%/70.573% +[ INFO][26-Jun-24 17:17:53] Chunk 431 = Batch 499661 = Sample 255825921 +[ INFO][26-Jun-24 17:20:13] Total gradient norm stats for 73 steps: 0.2039 <= 0.2333 + 0.02095z <= 0.3064 +[ INFO][26-Jun-24 17:20:13] Trained chunk 431 in 139.7s at 4258noun/s: lr=8.01e-04, loss=1.26e+00, top1=69.76%/70.580% +[ INFO][26-Jun-24 17:20:13] Chunk 432 = Batch 500823 = Sample 256420865 +[ INFO][26-Jun-24 17:22:33] Total gradient norm stats for 73 steps: 0.2112 <= 0.2592 + 0.09312z <= 0.7971 +[ INFO][26-Jun-24 17:22:33] Trained chunk 432 in 139.8s at 4255noun/s: lr=7.98e-04, loss=1.26e+00, top1=68.46%/70.586% +[ INFO][26-Jun-24 17:22:33] Saved checkpoint: /data/strahl/Code/ovod/outputs/ovod_20240626_001447/ovod_chunk0432_20240626_172233.train +[ INFO][26-Jun-24 17:22:33] Chunk 433 = Batch 501985 = Sample 257015809 +[ INFO][26-Jun-24 17:24:52] Total gradient norm stats for 72 steps: 0.2048 <= 0.2379 + 0.04516z <= 0.5838 +[ INFO][26-Jun-24 17:24:52] Trained chunk 433 in 139.3s at 4271noun/s: lr=7.96e-04, loss=1.26e+00, top1=70.12%/70.597% +[ INFO][26-Jun-24 17:24:52] Chunk 434 = Batch 503147 = Sample 257610753 +[ INFO][26-Jun-24 17:27:12] Total gradient norm stats for 73 steps: 0.2097 <= 0.2331 + 0.01621z <= 0.2923 +[ INFO][26-Jun-24 17:27:12] Trained chunk 434 in 139.2s at 4274noun/s: lr=7.93e-04, loss=1.26e+00, top1=70.51%/70.606% +[ INFO][26-Jun-24 17:27:12] Chunk 435 = Batch 504309 = Sample 258205697 +[ INFO][26-Jun-24 17:29:31] Total gradient norm stats for 72 steps: 0.2026 <= 0.2351 + 0.01791z <= 0.2911 +[ INFO][26-Jun-24 17:29:31] Trained chunk 435 in 139.8s at 4256noun/s: lr=7.91e-04, loss=1.26e+00, top1=70.68%/70.606% +[ INFO][26-Jun-24 17:29:31] Chunk 436 = Batch 505471 = Sample 258800641 +[ INFO][26-Jun-24 17:31:51] Total gradient norm stats for 73 steps: 0.209 <= 0.2387 + 0.02212z <= 0.3111 +[ INFO][26-Jun-24 17:31:51] Trained chunk 436 in 139.9s at 4253noun/s: lr=7.88e-04, loss=1.26e+00, top1=71.41%/70.612% +[ INFO][26-Jun-24 17:31:51] Chunk 437 = Batch 506633 = Sample 259395585 +[ INFO][26-Jun-24 17:34:11] Total gradient norm stats for 73 steps: 0.2089 <= 0.229 + 0.01364z <= 0.2693 +[ INFO][26-Jun-24 17:34:11] Trained chunk 437 in 139.4s at 4269noun/s: lr=7.85e-04, loss=1.26e+00, top1=71.83%/70.619% +[ INFO][26-Jun-24 17:34:11] Chunk 438 = Batch 507795 = Sample 259990529 +[ INFO][26-Jun-24 17:36:30] Total gradient norm stats for 72 steps: 0.2074 <= 0.2498 + 0.04381z <= 0.535 +[ INFO][26-Jun-24 17:36:30] Trained chunk 438 in 139.6s at 4261noun/s: lr=7.83e-04, loss=1.26e+00, top1=70.69%/70.621% +[ INFO][26-Jun-24 17:36:30] Chunk 439 = Batch 508957 = Sample 260585473 +[ INFO][26-Jun-24 17:38:49] Total gradient norm stats for 73 steps: 0.2093 <= 0.2341 + 0.02209z <= 0.3554 +[ INFO][26-Jun-24 17:38:49] Trained chunk 439 in 139.0s at 4280noun/s: lr=7.80e-04, loss=1.26e+00, top1=71.97%/70.630% +[ INFO][26-Jun-24 17:38:49] Chunk 440 = Batch 510119 = Sample 261180417 +[ INFO][26-Jun-24 17:41:08] Total gradient norm stats for 73 steps: 0.2047 <= 0.2355 + 0.02416z <= 0.3166 +[ INFO][26-Jun-24 17:41:08] Trained chunk 440 in 139.1s at 4276noun/s: lr=7.78e-04, loss=1.26e+00, top1=71.43%/70.632% +[ INFO][26-Jun-24 17:41:08] Chunk 441 = Batch 511281 = Sample 261775361 +[ INFO][26-Jun-24 17:43:28] Total gradient norm stats for 72 steps: 0.2115 <= 0.242 + 0.08798z <= 0.9551 +[ INFO][26-Jun-24 17:43:28] Trained chunk 441 in 139.4s at 4269noun/s: lr=7.75e-04, loss=1.26e+00, top1=69.52%/70.631% +[ INFO][26-Jun-24 17:43:28] Chunk 442 = Batch 512443 = Sample 262370305 +[ INFO][26-Jun-24 17:45:47] Total gradient norm stats for 73 steps: 0.2105 <= 0.2301 + 0.01674z <= 0.3124 +[ INFO][26-Jun-24 17:45:47] Trained chunk 442 in 139.5s at 4265noun/s: lr=7.72e-04, loss=1.26e+00, top1=72.17%/70.639% +[ INFO][26-Jun-24 17:45:47] Chunk 443 = Batch 513605 = Sample 262965249 +[ INFO][26-Jun-24 17:48:06] Total gradient norm stats for 72 steps: 0.2111 <= 0.238 + 0.0271z <= 0.3527 +[ INFO][26-Jun-24 17:48:06] Trained chunk 443 in 139.1s at 4277noun/s: lr=7.70e-04, loss=1.26e+00, top1=70.36%/70.644% +[ INFO][26-Jun-24 17:48:06] Chunk 444 = Batch 514767 = Sample 263560193 +[ INFO][26-Jun-24 17:50:26] Total gradient norm stats for 73 steps: 0.2105 <= 0.2434 + 0.03086z <= 0.3565 +[ INFO][26-Jun-24 17:50:26] Trained chunk 444 in 139.5s at 4264noun/s: lr=7.67e-04, loss=1.26e+00, top1=70.85%/70.657% +[ INFO][26-Jun-24 17:50:26] Chunk 445 = Batch 515929 = Sample 264155137 +[ INFO][26-Jun-24 17:52:46] Total gradient norm stats for 73 steps: 0.2119 <= 0.2472 + 0.02856z <= 0.37 +[ INFO][26-Jun-24 17:52:46] Trained chunk 445 in 139.7s at 4258noun/s: lr=7.64e-04, loss=1.26e+00, top1=70.44%/70.663% +[ INFO][26-Jun-24 17:52:46] Chunk 446 = Batch 517091 = Sample 264750081 +[ INFO][26-Jun-24 17:55:05] Total gradient norm stats for 72 steps: 0.2098 <= 0.2385 + 0.03218z <= 0.4597 +[ INFO][26-Jun-24 17:55:05] Trained chunk 446 in 139.7s at 4259noun/s: lr=7.62e-04, loss=1.26e+00, top1=70.18%/70.668% +[ INFO][26-Jun-24 17:55:05] Chunk 447 = Batch 518253 = Sample 265345025 +[ INFO][26-Jun-24 17:57:25] Total gradient norm stats for 73 steps: 0.2123 <= 0.2708 + 0.1539z <= 1.358 (clipped to 1) +[ INFO][26-Jun-24 17:57:25] Trained chunk 447 in 139.6s at 4262noun/s: lr=7.59e-04, loss=1.26e+00, top1=71.38%/70.676% +[ INFO][26-Jun-24 17:57:25] Chunk 448 = Batch 519415 = Sample 265939969 +[ INFO][26-Jun-24 17:59:44] Total gradient norm stats for 73 steps: 0.212 <= 0.2431 + 0.02745z <= 0.3634 +[ INFO][26-Jun-24 17:59:44] Trained chunk 448 in 139.4s at 4268noun/s: lr=7.57e-04, loss=1.26e+00, top1=69.20%/70.674% +[ INFO][26-Jun-24 17:59:44] Chunk 449 = Batch 520577 = Sample 266534913 +[ INFO][26-Jun-24 18:02:04] Total gradient norm stats for 72 steps: 0.211 <= 0.2311 + 0.01128z <= 0.2633 +[ INFO][26-Jun-24 18:02:04] Trained chunk 449 in 139.4s at 4269noun/s: lr=7.54e-04, loss=1.26e+00, top1=71.22%/70.683% +[ INFO][26-Jun-24 18:02:04] Chunk 450 = Batch 521739 = Sample 267129857 +[ INFO][26-Jun-24 18:04:20] Epoch 9 finished in 6984.3s +[ INFO][26-Jun-24 18:04:20] -------------------------------------------------------------------------------- +[ INFO][26-Jun-24 18:04:20] Epoch 10 = Batch 522865 = Sample 267706369 +[ INFO][26-Jun-24 18:04:24] Total gradient norm stats for 73 steps: 0.2096 <= 0.2337 + 0.01757z <= 0.2886 +[ INFO][26-Jun-24 18:04:24] Trained chunk 450 in 140.7s at 4229noun/s: lr=7.51e-04, loss=1.26e+00, top1=71.22%/70.687% +[ INFO][26-Jun-24 18:04:24] Chunk 451 = Batch 522901 = Sample 267724801 +[ INFO][26-Jun-24 18:06:44] Total gradient norm stats for 72 steps: 0.2136 <= 0.2353 + 0.01894z <= 0.3012 +[ INFO][26-Jun-24 18:06:44] Trained chunk 451 in 139.4s at 4269noun/s: lr=7.49e-04, loss=1.26e+00, top1=69.84%/70.692% +[ INFO][26-Jun-24 18:06:44] Chunk 452 = Batch 524063 = Sample 268319745 +[ INFO][26-Jun-24 18:09:03] Total gradient norm stats for 73 steps: 0.2127 <= 0.252 + 0.02501z <= 0.3377 +[ INFO][26-Jun-24 18:09:03] Trained chunk 452 in 139.6s at 4262noun/s: lr=7.46e-04, loss=1.26e+00, top1=70.98%/70.696% +[ INFO][26-Jun-24 18:09:03] Chunk 453 = Batch 525225 = Sample 268914689 +[ INFO][26-Jun-24 18:11:23] Total gradient norm stats for 73 steps: 0.2132 <= 0.2374 + 0.02053z <= 0.3212 +[ INFO][26-Jun-24 18:11:23] Trained chunk 453 in 139.8s at 4255noun/s: lr=7.43e-04, loss=1.26e+00, top1=71.35%/70.700% +[ INFO][26-Jun-24 18:11:23] Chunk 454 = Batch 526387 = Sample 269509633 +[ INFO][26-Jun-24 18:13:42] Total gradient norm stats for 72 steps: 0.211 <= 0.2321 + 0.01569z <= 0.2927 +[ INFO][26-Jun-24 18:13:42] Trained chunk 454 in 139.0s at 4281noun/s: lr=7.41e-04, loss=1.26e+00, top1=70.19%/70.705% +[ INFO][26-Jun-24 18:13:42] Chunk 455 = Batch 527549 = Sample 270104577 +[ INFO][26-Jun-24 18:16:02] Total gradient norm stats for 73 steps: 0.2115 <= 0.241 + 0.01735z <= 0.2819 +[ INFO][26-Jun-24 18:16:02] Trained chunk 455 in 139.7s at 4258noun/s: lr=7.38e-04, loss=1.26e+00, top1=70.89%/70.710% +[ INFO][26-Jun-24 18:16:02] Chunk 456 = Batch 528711 = Sample 270699521 +[ INFO][26-Jun-24 18:18:21] Total gradient norm stats for 73 steps: 0.2101 <= 0.2419 + 0.04626z <= 0.6063 +[ INFO][26-Jun-24 18:18:21] Trained chunk 456 in 139.1s at 4276noun/s: lr=7.36e-04, loss=1.26e+00, top1=70.83%/70.714% +[ INFO][26-Jun-24 18:18:21] Chunk 457 = Batch 529873 = Sample 271294465 +[ INFO][26-Jun-24 18:20:40] Total gradient norm stats for 72 steps: 0.2115 <= 0.2351 + 0.01666z <= 0.2892 +[ INFO][26-Jun-24 18:20:40] Trained chunk 457 in 139.2s at 4274noun/s: lr=7.33e-04, loss=1.26e+00, top1=71.78%/70.723% +[ INFO][26-Jun-24 18:20:40] Chunk 458 = Batch 531035 = Sample 271889409 +[ INFO][26-Jun-24 18:22:59] Total gradient norm stats for 73 steps: 0.2068 <= 0.2394 + 0.02181z <= 0.3023 +[ INFO][26-Jun-24 18:22:59] Trained chunk 458 in 139.2s at 4274noun/s: lr=7.30e-04, loss=1.26e+00, top1=70.86%/70.726% +[ INFO][26-Jun-24 18:22:59] Chunk 459 = Batch 532197 = Sample 272484353 +[ INFO][26-Jun-24 18:25:18] Total gradient norm stats for 72 steps: 0.2112 <= 0.2404 + 0.02906z <= 0.4338 +[ INFO][26-Jun-24 18:25:18] Trained chunk 459 in 138.8s at 4285noun/s: lr=7.28e-04, loss=1.26e+00, top1=69.64%/70.730% +[ INFO][26-Jun-24 18:25:18] Chunk 460 = Batch 533359 = Sample 273079297 +[ INFO][26-Jun-24 18:27:38] Total gradient norm stats for 73 steps: 0.2146 <= 0.2435 + 0.07038z <= 0.8165 +[ INFO][26-Jun-24 18:27:38] Trained chunk 460 in 139.4s at 4266noun/s: lr=7.25e-04, loss=1.25e+00, top1=69.40%/70.739% +[ INFO][26-Jun-24 18:27:38] Chunk 461 = Batch 534521 = Sample 273674241 +[ INFO][26-Jun-24 18:29:57] Total gradient norm stats for 73 steps: 0.2141 <= 0.2377 + 0.02245z <= 0.3566 +[ INFO][26-Jun-24 18:29:57] Trained chunk 461 in 139.3s at 4271noun/s: lr=7.22e-04, loss=1.25e+00, top1=71.31%/70.742% +[ INFO][26-Jun-24 18:29:57] Chunk 462 = Batch 535683 = Sample 274269185 +[ INFO][26-Jun-24 18:32:16] Total gradient norm stats for 72 steps: 0.2112 <= 0.2453 + 0.02428z <= 0.325 +[ INFO][26-Jun-24 18:32:16] Trained chunk 462 in 139.2s at 4275noun/s: lr=7.20e-04, loss=1.25e+00, top1=70.96%/70.743% +[ INFO][26-Jun-24 18:32:16] Chunk 463 = Batch 536845 = Sample 274864129 +[ INFO][26-Jun-24 18:34:35] Total gradient norm stats for 73 steps: 0.2159 <= 0.2503 + 0.05394z <= 0.6425 +[ INFO][26-Jun-24 18:34:35] Trained chunk 463 in 139.2s at 4273noun/s: lr=7.17e-04, loss=1.26e+00, top1=70.97%/70.735% +[ INFO][26-Jun-24 18:34:35] Chunk 464 = Batch 538007 = Sample 275459073 +[ INFO][26-Jun-24 18:36:55] Total gradient norm stats for 73 steps: 0.2092 <= 0.2432 + 0.04985z <= 0.6403 +[ INFO][26-Jun-24 18:36:55] Trained chunk 464 in 139.3s at 4271noun/s: lr=7.15e-04, loss=1.25e+00, top1=71.37%/70.742% +[ INFO][26-Jun-24 18:36:55] Chunk 465 = Batch 539169 = Sample 276054017 +[ INFO][26-Jun-24 18:39:14] Total gradient norm stats for 72 steps: 0.2139 <= 0.2362 + 0.01685z <= 0.296 +[ INFO][26-Jun-24 18:39:14] Trained chunk 465 in 139.3s at 4271noun/s: lr=7.12e-04, loss=1.25e+00, top1=70.57%/70.744% +[ INFO][26-Jun-24 18:39:14] Chunk 466 = Batch 540331 = Sample 276648961 +[ INFO][26-Jun-24 18:41:33] Total gradient norm stats for 73 steps: 0.2124 <= 0.235 + 0.01484z <= 0.2806 +[ INFO][26-Jun-24 18:41:33] Trained chunk 466 in 139.3s at 4271noun/s: lr=7.09e-04, loss=1.25e+00, top1=71.57%/70.750% +[ INFO][26-Jun-24 18:41:33] Chunk 467 = Batch 541493 = Sample 277243905 +[ INFO][26-Jun-24 18:43:52] Total gradient norm stats for 72 steps: 0.212 <= 0.233 + 0.02366z <= 0.4018 +[ INFO][26-Jun-24 18:43:52] Trained chunk 467 in 139.1s at 4278noun/s: lr=7.07e-04, loss=1.25e+00, top1=70.34%/70.752% +[ INFO][26-Jun-24 18:43:52] Chunk 468 = Batch 542655 = Sample 277838849 +[ INFO][26-Jun-24 18:46:12] Total gradient norm stats for 73 steps: 0.218 <= 0.2459 + 0.0268z <= 0.3185 +[ INFO][26-Jun-24 18:46:12] Trained chunk 468 in 139.2s at 4276noun/s: lr=7.04e-04, loss=1.25e+00, top1=70.66%/70.754% +[ INFO][26-Jun-24 18:46:12] Chunk 469 = Batch 543817 = Sample 278433793 +[ INFO][26-Jun-24 18:48:31] Total gradient norm stats for 73 steps: 0.2127 <= 0.2337 + 0.01298z <= 0.2789 +[ INFO][26-Jun-24 18:48:31] Trained chunk 469 in 139.0s at 4279noun/s: lr=7.02e-04, loss=1.25e+00, top1=70.85%/70.762% +[ INFO][26-Jun-24 18:48:31] Chunk 470 = Batch 544979 = Sample 279028737 +[ INFO][26-Jun-24 18:50:50] Total gradient norm stats for 72 steps: 0.2169 <= 0.2371 + 0.01637z <= 0.2884 +[ INFO][26-Jun-24 18:50:50] Trained chunk 470 in 138.9s at 4283noun/s: lr=6.99e-04, loss=1.25e+00, top1=70.26%/70.772% +[ INFO][26-Jun-24 18:50:50] Chunk 471 = Batch 546141 = Sample 279623681 +[ INFO][26-Jun-24 18:53:09] Total gradient norm stats for 73 steps: 0.2173 <= 0.2434 + 0.0243z <= 0.3163 +[ INFO][26-Jun-24 18:53:09] Trained chunk 471 in 139.1s at 4277noun/s: lr=6.96e-04, loss=1.25e+00, top1=70.04%/70.773% +[ INFO][26-Jun-24 18:53:09] Chunk 472 = Batch 547303 = Sample 280218625 +[ INFO][26-Jun-24 18:55:28] Total gradient norm stats for 73 steps: 0.2136 <= 0.2522 + 0.03582z <= 0.3969 +[ INFO][26-Jun-24 18:55:28] Trained chunk 472 in 139.5s at 4263noun/s: lr=6.94e-04, loss=1.25e+00, top1=69.69%/70.776% +[ INFO][26-Jun-24 18:55:28] Chunk 473 = Batch 548465 = Sample 280813569 +[ INFO][26-Jun-24 18:57:48] Total gradient norm stats for 72 steps: 0.2111 <= 0.243 + 0.02565z <= 0.329 +[ INFO][26-Jun-24 18:57:48] Trained chunk 473 in 139.5s at 4266noun/s: lr=6.91e-04, loss=1.25e+00, top1=70.49%/70.781% +[ INFO][26-Jun-24 18:57:48] Chunk 474 = Batch 549627 = Sample 281408513 +[ INFO][26-Jun-24 19:00:07] Total gradient norm stats for 73 steps: 0.2157 <= 0.247 + 0.0906z <= 0.9997 +[ INFO][26-Jun-24 19:00:07] Trained chunk 474 in 139.5s at 4264noun/s: lr=6.88e-04, loss=1.25e+00, top1=70.77%/70.786% +[ INFO][26-Jun-24 19:00:07] Chunk 475 = Batch 550789 = Sample 282003457 +[ INFO][26-Jun-24 19:02:26] Total gradient norm stats for 72 steps: 0.2155 <= 0.259 + 0.1918z <= 1.857 (clipped to 1) +[ INFO][26-Jun-24 19:02:26] Trained chunk 475 in 139.0s at 4282noun/s: lr=6.86e-04, loss=1.25e+00, top1=70.55%/70.783% +[ INFO][26-Jun-24 19:02:26] Chunk 476 = Batch 551951 = Sample 282598401 +[ INFO][26-Jun-24 19:04:45] Total gradient norm stats for 73 steps: 0.2161 <= 0.236 + 0.01055z <= 0.2722 +[ INFO][26-Jun-24 19:04:45] Trained chunk 476 in 138.8s at 4286noun/s: lr=6.83e-04, loss=1.25e+00, top1=71.18%/70.789% +[ INFO][26-Jun-24 19:04:45] Chunk 477 = Batch 553113 = Sample 283193345 +[ INFO][26-Jun-24 19:07:04] Total gradient norm stats for 73 steps: 0.2152 <= 0.2416 + 0.02182z <= 0.3093 +[ INFO][26-Jun-24 19:07:04] Trained chunk 477 in 139.2s at 4274noun/s: lr=6.81e-04, loss=1.25e+00, top1=70.82%/70.797% +[ INFO][26-Jun-24 19:07:04] Chunk 478 = Batch 554275 = Sample 283788289 +[ INFO][26-Jun-24 19:09:23] Total gradient norm stats for 72 steps: 0.2133 <= 0.2371 + 0.01806z <= 0.3145 +[ INFO][26-Jun-24 19:09:23] Trained chunk 478 in 139.2s at 4275noun/s: lr=6.78e-04, loss=1.25e+00, top1=70.54%/70.801% +[ INFO][26-Jun-24 19:09:23] Chunk 479 = Batch 555437 = Sample 284383233 +[ INFO][26-Jun-24 19:11:42] Total gradient norm stats for 73 steps: 0.2102 <= 0.2383 + 0.01436z <= 0.2812 +[ INFO][26-Jun-24 19:11:42] Trained chunk 479 in 139.0s at 4279noun/s: lr=6.75e-04, loss=1.25e+00, top1=70.46%/70.807% +[ INFO][26-Jun-24 19:11:42] Chunk 480 = Batch 556599 = Sample 284978177 +[ INFO][26-Jun-24 19:14:02] Total gradient norm stats for 73 steps: 0.2166 <= 0.2398 + 0.01516z <= 0.2898 +[ INFO][26-Jun-24 19:14:02] Trained chunk 480 in 139.3s at 4271noun/s: lr=6.73e-04, loss=1.25e+00, top1=70.02%/70.811% +[ INFO][26-Jun-24 19:14:02] Saved checkpoint: /data/strahl/Code/ovod/outputs/ovod_20240626_001447/ovod_chunk0480_20240626_191402.train +[ INFO][26-Jun-24 19:14:02] Chunk 481 = Batch 557761 = Sample 285573121 +[ INFO][26-Jun-24 19:16:21] Total gradient norm stats for 72 steps: 0.2144 <= 0.2349 + 0.01119z <= 0.2687 +[ INFO][26-Jun-24 19:16:21] Trained chunk 481 in 138.9s at 4282noun/s: lr=6.70e-04, loss=1.25e+00, top1=71.26%/70.816% +[ INFO][26-Jun-24 19:16:21] Chunk 482 = Batch 558923 = Sample 286168065 +[ INFO][26-Jun-24 19:18:41] Total gradient norm stats for 73 steps: 0.2186 <= 0.2379 + 0.01562z <= 0.292 +[ INFO][26-Jun-24 19:18:41] Trained chunk 482 in 139.7s at 4260noun/s: lr=6.68e-04, loss=1.25e+00, top1=71.41%/70.823% +[ INFO][26-Jun-24 19:18:41] Chunk 483 = Batch 560085 = Sample 286763009 +[ INFO][26-Jun-24 19:21:00] Total gradient norm stats for 72 steps: 0.2144 <= 0.2336 + 0.01421z <= 0.3043 +[ INFO][26-Jun-24 19:21:00] Trained chunk 483 in 139.3s at 4269noun/s: lr=6.65e-04, loss=1.25e+00, top1=72.54%/70.826% +[ INFO][26-Jun-24 19:21:00] Chunk 484 = Batch 561247 = Sample 287357953 +[ INFO][26-Jun-24 19:23:19] Total gradient norm stats for 73 steps: 0.2181 <= 0.2468 + 0.02956z <= 0.3671 +[ INFO][26-Jun-24 19:23:19] Trained chunk 484 in 139.5s at 4265noun/s: lr=6.62e-04, loss=1.25e+00, top1=70.72%/70.832% +[ INFO][26-Jun-24 19:23:19] Chunk 485 = Batch 562409 = Sample 287952897 +[ INFO][26-Jun-24 19:25:39] Total gradient norm stats for 73 steps: 0.2171 <= 0.2344 + 0.01496z <= 0.3 +[ INFO][26-Jun-24 19:25:39] Trained chunk 485 in 139.2s at 4275noun/s: lr=6.60e-04, loss=1.25e+00, top1=71.03%/70.839% +[ INFO][26-Jun-24 19:25:39] Chunk 486 = Batch 563571 = Sample 288547841 +[ INFO][26-Jun-24 19:27:58] Total gradient norm stats for 72 steps: 0.2147 <= 0.2392 + 0.01848z <= 0.323 +[ INFO][26-Jun-24 19:27:58] Trained chunk 486 in 139.3s at 4272noun/s: lr=6.57e-04, loss=1.25e+00, top1=70.89%/70.840% +[ INFO][26-Jun-24 19:27:58] Chunk 487 = Batch 564733 = Sample 289142785 +[ INFO][26-Jun-24 19:30:17] Total gradient norm stats for 73 steps: 0.2133 <= 0.2434 + 0.01642z <= 0.2916 +[ INFO][26-Jun-24 19:30:17] Trained chunk 487 in 139.1s at 4277noun/s: lr=6.55e-04, loss=1.25e+00, top1=70.99%/70.844% +[ INFO][26-Jun-24 19:30:17] Chunk 488 = Batch 565895 = Sample 289737729 +[ INFO][26-Jun-24 19:32:36] Total gradient norm stats for 73 steps: 0.213 <= 0.2393 + 0.01462z <= 0.2873 +[ INFO][26-Jun-24 19:32:36] Trained chunk 488 in 139.0s at 4279noun/s: lr=6.52e-04, loss=1.25e+00, top1=70.91%/70.844% +[ INFO][26-Jun-24 19:32:36] Chunk 489 = Batch 567057 = Sample 290332673 +[ INFO][26-Jun-24 19:34:55] Total gradient norm stats for 72 steps: 0.2181 <= 0.237 + 0.01205z <= 0.2681 +[ INFO][26-Jun-24 19:34:55] Trained chunk 489 in 138.9s at 4284noun/s: lr=6.49e-04, loss=1.25e+00, top1=70.47%/70.849% +[ INFO][26-Jun-24 19:34:55] Chunk 490 = Batch 568219 = Sample 290927617 +[ INFO][26-Jun-24 19:37:14] Total gradient norm stats for 73 steps: 0.2169 <= 0.2391 + 0.01604z <= 0.3162 +[ INFO][26-Jun-24 19:37:14] Trained chunk 490 in 139.2s at 4273noun/s: lr=6.47e-04, loss=1.25e+00, top1=71.47%/70.853% +[ INFO][26-Jun-24 19:37:14] Chunk 491 = Batch 569381 = Sample 291522561 +[ INFO][26-Jun-24 19:39:33] Total gradient norm stats for 72 steps: 0.2194 <= 0.2486 + 0.06885z <= 0.7958 +[ INFO][26-Jun-24 19:39:33] Trained chunk 491 in 139.2s at 4275noun/s: lr=6.44e-04, loss=1.25e+00, top1=70.54%/70.855% +[ INFO][26-Jun-24 19:39:33] Chunk 492 = Batch 570543 = Sample 292117505 +[ INFO][26-Jun-24 19:41:53] Total gradient norm stats for 73 steps: 0.221 <= 0.2577 + 0.09382z <= 1.028 (clipped to 1) +[ INFO][26-Jun-24 19:41:53] Trained chunk 492 in 139.3s at 4271noun/s: lr=6.42e-04, loss=1.25e+00, top1=70.51%/70.863% +[ INFO][26-Jun-24 19:41:53] Chunk 493 = Batch 571705 = Sample 292712449 +[ INFO][26-Jun-24 19:44:11] Total gradient norm stats for 73 steps: 0.217 <= 0.2331 + 0.01196z <= 0.2713 +[ INFO][26-Jun-24 19:44:11] Trained chunk 493 in 138.9s at 4283noun/s: lr=6.39e-04, loss=1.25e+00, top1=70.67%/70.861% +[ INFO][26-Jun-24 19:44:11] Chunk 494 = Batch 572867 = Sample 293307393 +[ INFO][26-Jun-24 19:46:31] Total gradient norm stats for 72 steps: 0.2195 <= 0.2548 + 0.09072z <= 0.9939 +[ INFO][26-Jun-24 19:46:31] Trained chunk 494 in 139.3s at 4270noun/s: lr=6.36e-04, loss=1.25e+00, top1=69.89%/70.864% +[ INFO][26-Jun-24 19:46:31] Chunk 495 = Batch 574029 = Sample 293902337 +[ INFO][26-Jun-24 19:48:50] Total gradient norm stats for 73 steps: 0.2215 <= 0.2427 + 0.01905z <= 0.3177 +[ INFO][26-Jun-24 19:48:50] Trained chunk 495 in 138.9s at 4282noun/s: lr=6.34e-04, loss=1.25e+00, top1=71.16%/70.872% +[ INFO][26-Jun-24 19:48:50] Chunk 496 = Batch 575191 = Sample 294497281 +[ INFO][26-Jun-24 19:51:09] Total gradient norm stats for 73 steps: 0.2185 <= 0.2595 + 0.1247z <= 1.235 (clipped to 1) +[ INFO][26-Jun-24 19:51:09] Trained chunk 496 in 139.5s at 4266noun/s: lr=6.31e-04, loss=1.25e+00, top1=72.05%/70.876% +[ INFO][26-Jun-24 19:51:09] Chunk 497 = Batch 576353 = Sample 295092225 +[ INFO][26-Jun-24 19:53:29] Total gradient norm stats for 72 steps: 0.2165 <= 0.2416 + 0.01773z <= 0.3113 +[ INFO][26-Jun-24 19:53:29] Trained chunk 497 in 139.6s at 4261noun/s: lr=6.29e-04, loss=1.25e+00, top1=70.35%/70.878% +[ INFO][26-Jun-24 19:53:29] Chunk 498 = Batch 577515 = Sample 295687169 +[ INFO][26-Jun-24 19:55:48] Total gradient norm stats for 73 steps: 0.217 <= 0.2441 + 0.02073z <= 0.3373 +[ INFO][26-Jun-24 19:55:48] Trained chunk 498 in 139.3s at 4271noun/s: lr=6.26e-04, loss=1.25e+00, top1=70.73%/70.889% +[ INFO][26-Jun-24 19:55:48] Chunk 499 = Batch 578677 = Sample 296282113 +[ INFO][26-Jun-24 19:58:07] Total gradient norm stats for 72 steps: 0.2129 <= 0.2505 + 0.02935z <= 0.3742 +[ INFO][26-Jun-24 19:58:07] Trained chunk 499 in 139.1s at 4276noun/s: lr=6.23e-04, loss=1.25e+00, top1=70.50%/70.896% +[ INFO][26-Jun-24 19:58:07] Chunk 500 = Batch 579839 = Sample 296877057 +[ INFO][26-Jun-24 20:00:23] Epoch 10 finished in 6962.9s +[ INFO][26-Jun-24 20:00:23] -------------------------------------------------------------------------------- +[ INFO][26-Jun-24 20:00:23] Epoch 11 = Batch 580961 = Sample 297451521 +[ INFO][26-Jun-24 20:00:28] Total gradient norm stats for 73 steps: 0.2229 <= 0.25 + 0.02162z <= 0.3211 +[ INFO][26-Jun-24 20:00:28] Trained chunk 500 in 140.4s at 4237noun/s: lr=6.21e-04, loss=1.25e+00, top1=72.20%/70.895% +[ INFO][26-Jun-24 20:00:28] Chunk 501 = Batch 581001 = Sample 297472001 +[ INFO][26-Jun-24 20:02:47] Total gradient norm stats for 73 steps: 0.2221 <= 0.2553 + 0.05518z <= 0.6901 +[ INFO][26-Jun-24 20:02:47] Trained chunk 501 in 139.5s at 4264noun/s: lr=6.18e-04, loss=1.25e+00, top1=71.93%/70.899% +[ INFO][26-Jun-24 20:02:47] Chunk 502 = Batch 582163 = Sample 298066945 +[ INFO][26-Jun-24 20:05:06] Total gradient norm stats for 72 steps: 0.2195 <= 0.238 + 0.01346z <= 0.2968 +[ INFO][26-Jun-24 20:05:06] Trained chunk 502 in 139.3s at 4272noun/s: lr=6.16e-04, loss=1.25e+00, top1=70.65%/70.905% +[ INFO][26-Jun-24 20:05:06] Chunk 503 = Batch 583325 = Sample 298661889 +[ INFO][26-Jun-24 20:07:26] Total gradient norm stats for 73 steps: 0.2222 <= 0.2496 + 0.02175z <= 0.3171 +[ INFO][26-Jun-24 20:07:26] Trained chunk 503 in 139.4s at 4268noun/s: lr=6.13e-04, loss=1.25e+00, top1=70.42%/70.907% +[ INFO][26-Jun-24 20:07:26] Chunk 504 = Batch 584487 = Sample 299256833 +[ INFO][26-Jun-24 20:09:45] Total gradient norm stats for 73 steps: 0.2193 <= 0.2419 + 0.01396z <= 0.2822 +[ INFO][26-Jun-24 20:09:45] Trained chunk 504 in 139.3s at 4269noun/s: lr=6.11e-04, loss=1.25e+00, top1=71.25%/70.916% +[ INFO][26-Jun-24 20:09:45] Chunk 505 = Batch 585649 = Sample 299851777 +[ INFO][26-Jun-24 20:12:04] Total gradient norm stats for 72 steps: 0.2188 <= 0.2484 + 0.0396z <= 0.4944 +[ INFO][26-Jun-24 20:12:04] Trained chunk 505 in 139.2s at 4274noun/s: lr=6.08e-04, loss=1.25e+00, top1=70.52%/70.920% +[ INFO][26-Jun-24 20:12:04] Chunk 506 = Batch 586811 = Sample 300446721 +[ INFO][26-Jun-24 20:14:24] Total gradient norm stats for 73 steps: 0.2215 <= 0.2546 + 0.1121z <= 1.185 (clipped to 1) +[ INFO][26-Jun-24 20:14:24] Trained chunk 506 in 139.4s at 4268noun/s: lr=6.05e-04, loss=1.25e+00, top1=70.38%/70.923% +[ INFO][26-Jun-24 20:14:24] Chunk 507 = Batch 587973 = Sample 301041665 +[ INFO][26-Jun-24 20:16:44] Total gradient norm stats for 72 steps: 0.2198 <= 0.2819 + 0.2857z <= 2.665 (clipped to 1) +[ INFO][26-Jun-24 20:16:44] Trained chunk 507 in 139.7s at 4260noun/s: lr=6.03e-04, loss=1.25e+00, top1=70.42%/70.929% +[ INFO][26-Jun-24 20:16:44] Chunk 508 = Batch 589135 = Sample 301636609 +[ INFO][26-Jun-24 20:19:03] Total gradient norm stats for 73 steps: 0.2175 <= 0.2691 + 0.2356z <= 2.246 (clipped to 1) +[ INFO][26-Jun-24 20:19:03] Trained chunk 508 in 139.3s at 4270noun/s: lr=6.00e-04, loss=1.24e+00, top1=70.61%/70.936% +[ INFO][26-Jun-24 20:19:03] Chunk 509 = Batch 590297 = Sample 302231553 +[ INFO][26-Jun-24 20:21:22] Total gradient norm stats for 73 steps: 0.2248 <= 0.2508 + 0.0314z <= 0.4606 +[ INFO][26-Jun-24 20:21:22] Trained chunk 509 in 139.6s at 4262noun/s: lr=5.98e-04, loss=1.24e+00, top1=71.16%/70.940% +[ INFO][26-Jun-24 20:21:22] Chunk 510 = Batch 591459 = Sample 302826497 +[ INFO][26-Jun-24 20:23:42] Total gradient norm stats for 72 steps: 0.222 <= 0.2457 + 0.01957z <= 0.325 +[ INFO][26-Jun-24 20:23:42] Trained chunk 510 in 139.5s at 4266noun/s: lr=5.95e-04, loss=1.24e+00, top1=71.41%/70.941% +[ INFO][26-Jun-24 20:23:42] Chunk 511 = Batch 592621 = Sample 303421441 +[ INFO][26-Jun-24 20:26:01] Total gradient norm stats for 73 steps: 0.2143 <= 0.2465 + 0.02196z <= 0.322 +[ INFO][26-Jun-24 20:26:01] Trained chunk 511 in 139.0s at 4281noun/s: lr=5.93e-04, loss=1.24e+00, top1=70.91%/70.947% +[ INFO][26-Jun-24 20:26:01] Chunk 512 = Batch 593783 = Sample 304016385 +[ INFO][26-Jun-24 20:28:20] Total gradient norm stats for 73 steps: 0.2214 <= 0.2456 + 0.01679z <= 0.3024 +[ INFO][26-Jun-24 20:28:20] Trained chunk 512 in 139.3s at 4271noun/s: lr=5.90e-04, loss=1.24e+00, top1=69.76%/70.950% +[ INFO][26-Jun-24 20:28:20] Chunk 513 = Batch 594945 = Sample 304611329 +[ INFO][26-Jun-24 20:30:39] Total gradient norm stats for 72 steps: 0.2213 <= 0.2398 + 0.0137z <= 0.2855 +[ INFO][26-Jun-24 20:30:39] Trained chunk 513 in 139.1s at 4276noun/s: lr=5.87e-04, loss=1.24e+00, top1=70.89%/70.954% +[ INFO][26-Jun-24 20:30:39] Chunk 514 = Batch 596107 = Sample 305206273 +[ INFO][26-Jun-24 20:32:59] Total gradient norm stats for 73 steps: 0.2198 <= 0.2479 + 0.01738z <= 0.3067 +[ INFO][26-Jun-24 20:32:59] Trained chunk 514 in 139.2s at 4274noun/s: lr=5.85e-04, loss=1.24e+00, top1=70.67%/70.959% +[ INFO][26-Jun-24 20:32:59] Chunk 515 = Batch 597269 = Sample 305801217 +[ INFO][26-Jun-24 20:35:17] Total gradient norm stats for 72 steps: 0.2177 <= 0.2554 + 0.02578z <= 0.3796 +[ INFO][26-Jun-24 20:35:17] Trained chunk 515 in 138.7s at 4289noun/s: lr=5.82e-04, loss=1.24e+00, top1=70.99%/70.964% +[ INFO][26-Jun-24 20:35:17] Chunk 516 = Batch 598431 = Sample 306396161 +[ INFO][26-Jun-24 20:37:37] Total gradient norm stats for 73 steps: 0.2218 <= 0.2445 + 0.01797z <= 0.296 +[ INFO][26-Jun-24 20:37:37] Trained chunk 516 in 139.4s at 4268noun/s: lr=5.80e-04, loss=1.24e+00, top1=71.85%/70.971% +[ INFO][26-Jun-24 20:37:37] Chunk 517 = Batch 599593 = Sample 306991105 +[ INFO][26-Jun-24 20:39:56] Total gradient norm stats for 73 steps: 0.2198 <= 0.2446 + 0.01721z <= 0.3217 +[ INFO][26-Jun-24 20:39:56] Trained chunk 517 in 139.6s at 4261noun/s: lr=5.77e-04, loss=1.24e+00, top1=70.92%/70.974% +[ INFO][26-Jun-24 20:39:56] Chunk 518 = Batch 600755 = Sample 307586049 +[ INFO][26-Jun-24 20:42:15] Total gradient norm stats for 72 steps: 0.2197 <= 0.243 + 0.01322z <= 0.2817 +[ INFO][26-Jun-24 20:42:15] Trained chunk 518 in 139.1s at 4276noun/s: lr=5.75e-04, loss=1.24e+00, top1=70.37%/70.978% +[ INFO][26-Jun-24 20:42:15] Chunk 519 = Batch 601917 = Sample 308180993 +[ INFO][26-Jun-24 20:44:35] Total gradient norm stats for 73 steps: 0.2248 <= 0.2524 + 0.02574z <= 0.3536 +[ INFO][26-Jun-24 20:44:35] Trained chunk 519 in 139.4s at 4268noun/s: lr=5.72e-04, loss=1.24e+00, top1=71.38%/70.984% +[ INFO][26-Jun-24 20:44:35] Chunk 520 = Batch 603079 = Sample 308775937 +[ INFO][26-Jun-24 20:46:54] Total gradient norm stats for 73 steps: 0.2234 <= 0.2529 + 0.04849z <= 0.6379 +[ INFO][26-Jun-24 20:46:54] Trained chunk 520 in 138.9s at 4283noun/s: lr=5.70e-04, loss=1.24e+00, top1=70.17%/70.988% +[ INFO][26-Jun-24 20:46:54] Chunk 521 = Batch 604241 = Sample 309370881 +[ INFO][26-Jun-24 20:49:13] Total gradient norm stats for 72 steps: 0.2239 <= 0.2425 + 0.01603z <= 0.3021 +[ INFO][26-Jun-24 20:49:13] Trained chunk 521 in 139.1s at 4276noun/s: lr=5.67e-04, loss=1.24e+00, top1=71.12%/70.999% +[ INFO][26-Jun-24 20:49:13] Chunk 522 = Batch 605403 = Sample 309965825 +[ INFO][26-Jun-24 20:51:32] Total gradient norm stats for 73 steps: 0.2232 <= 0.245 + 0.01506z <= 0.2974 +[ INFO][26-Jun-24 20:51:32] Trained chunk 522 in 139.1s at 4276noun/s: lr=5.65e-04, loss=1.24e+00, top1=70.35%/71.001% +[ INFO][26-Jun-24 20:51:32] Chunk 523 = Batch 606565 = Sample 310560769 +[ INFO][26-Jun-24 20:53:51] Total gradient norm stats for 72 steps: 0.2247 <= 0.2424 + 0.0163z <= 0.2907 +[ INFO][26-Jun-24 20:53:51] Trained chunk 523 in 139.2s at 4275noun/s: lr=5.62e-04, loss=1.24e+00, top1=70.75%/70.998% +[ INFO][26-Jun-24 20:53:51] Chunk 524 = Batch 607727 = Sample 311155713 +[ INFO][26-Jun-24 20:56:10] Total gradient norm stats for 73 steps: 0.2224 <= 0.2579 + 0.05324z <= 0.6646 +[ INFO][26-Jun-24 20:56:10] Trained chunk 524 in 139.0s at 4279noun/s: lr=5.59e-04, loss=1.24e+00, top1=71.98%/71.001% +[ INFO][26-Jun-24 20:56:10] Chunk 525 = Batch 608889 = Sample 311750657 +[ INFO][26-Jun-24 20:58:29] Total gradient norm stats for 73 steps: 0.2221 <= 0.3176 + 0.6235z <= 5.57 (clipped to 1) +[ INFO][26-Jun-24 20:58:29] Trained chunk 525 in 139.1s at 4278noun/s: lr=5.57e-04, loss=1.24e+00, top1=71.00%/71.003% +[ INFO][26-Jun-24 20:58:29] Chunk 526 = Batch 610051 = Sample 312345601 +[ INFO][26-Jun-24 21:00:48] Total gradient norm stats for 72 steps: 0.221 <= 0.2447 + 0.01502z <= 0.291 +[ INFO][26-Jun-24 21:00:48] Trained chunk 526 in 139.1s at 4277noun/s: lr=5.54e-04, loss=1.24e+00, top1=70.15%/71.011% +[ INFO][26-Jun-24 21:00:48] Chunk 527 = Batch 611213 = Sample 312940545 +[ INFO][26-Jun-24 21:03:07] Total gradient norm stats for 73 steps: 0.2218 <= 0.2437 + 0.01664z <= 0.3218 +[ INFO][26-Jun-24 21:03:07] Trained chunk 527 in 138.9s at 4283noun/s: lr=5.52e-04, loss=1.24e+00, top1=71.63%/71.018% +[ INFO][26-Jun-24 21:03:07] Chunk 528 = Batch 612375 = Sample 313535489 +[ INFO][26-Jun-24 21:05:27] Total gradient norm stats for 73 steps: 0.2245 <= 0.2496 + 0.01959z <= 0.331 +[ INFO][26-Jun-24 21:05:27] Trained chunk 528 in 139.5s at 4265noun/s: lr=5.49e-04, loss=1.24e+00, top1=71.92%/71.025% +[ INFO][26-Jun-24 21:05:27] Saved checkpoint: /data/strahl/Code/ovod/outputs/ovod_20240626_001447/ovod_chunk0528_20240626_210527.train +[ INFO][26-Jun-24 21:05:27] Chunk 529 = Batch 613537 = Sample 314130433 +[ INFO][26-Jun-24 21:07:46] Total gradient norm stats for 72 steps: 0.2259 <= 0.2384 + 0.01219z <= 0.2985 +[ INFO][26-Jun-24 21:07:46] Trained chunk 529 in 139.3s at 4271noun/s: lr=5.47e-04, loss=1.24e+00, top1=70.77%/71.026% +[ INFO][26-Jun-24 21:07:46] Chunk 530 = Batch 614699 = Sample 314725377 +[ INFO][26-Jun-24 21:10:06] Total gradient norm stats for 73 steps: 0.2258 <= 0.2452 + 0.01291z <= 0.2854 +[ INFO][26-Jun-24 21:10:06] Trained chunk 530 in 139.3s at 4272noun/s: lr=5.44e-04, loss=1.24e+00, top1=68.79%/71.028% +[ INFO][26-Jun-24 21:10:06] Chunk 531 = Batch 615861 = Sample 315320321 +[ INFO][26-Jun-24 21:12:25] Total gradient norm stats for 72 steps: 0.2241 <= 0.2453 + 0.02845z <= 0.4603 +[ INFO][26-Jun-24 21:12:25] Trained chunk 531 in 139.0s at 4280noun/s: lr=5.42e-04, loss=1.24e+00, top1=71.55%/71.035% +[ INFO][26-Jun-24 21:12:25] Chunk 532 = Batch 617023 = Sample 315915265 +[ INFO][26-Jun-24 21:14:44] Total gradient norm stats for 73 steps: 0.2235 <= 0.2489 + 0.01868z <= 0.2982 +[ INFO][26-Jun-24 21:14:44] Trained chunk 532 in 139.0s at 4280noun/s: lr=5.39e-04, loss=1.24e+00, top1=70.96%/71.038% +[ INFO][26-Jun-24 21:14:44] Chunk 533 = Batch 618185 = Sample 316510209 +[ INFO][26-Jun-24 21:17:03] Total gradient norm stats for 73 steps: 0.2214 <= 0.254 + 0.02251z <= 0.3255 +[ INFO][26-Jun-24 21:17:03] Trained chunk 533 in 139.4s at 4269noun/s: lr=5.37e-04, loss=1.24e+00, top1=70.04%/71.040% +[ INFO][26-Jun-24 21:17:03] Chunk 534 = Batch 619347 = Sample 317105153 +[ INFO][26-Jun-24 21:19:22] Total gradient norm stats for 72 steps: 0.2248 <= 0.2558 + 0.03947z <= 0.4916 +[ INFO][26-Jun-24 21:19:22] Trained chunk 534 in 139.3s at 4271noun/s: lr=5.34e-04, loss=1.24e+00, top1=71.60%/71.045% +[ INFO][26-Jun-24 21:19:22] Chunk 535 = Batch 620509 = Sample 317700097 +[ INFO][26-Jun-24 21:21:42] Total gradient norm stats for 73 steps: 0.2263 <= 0.247 + 0.01398z <= 0.2893 +[ INFO][26-Jun-24 21:21:42] Trained chunk 535 in 139.2s at 4274noun/s: lr=5.32e-04, loss=1.24e+00, top1=71.01%/71.046% +[ INFO][26-Jun-24 21:21:42] Chunk 536 = Batch 621671 = Sample 318295041 +[ INFO][26-Jun-24 21:24:01] Total gradient norm stats for 73 steps: 0.2236 <= 0.2472 + 0.0303z <= 0.4754 +[ INFO][26-Jun-24 21:24:01] Trained chunk 536 in 139.5s at 4264noun/s: lr=5.29e-04, loss=1.24e+00, top1=71.72%/71.052% +[ INFO][26-Jun-24 21:24:01] Chunk 537 = Batch 622833 = Sample 318889985 +[ INFO][26-Jun-24 21:26:20] Total gradient norm stats for 72 steps: 0.2225 <= 0.2482 + 0.01868z <= 0.3142 +[ INFO][26-Jun-24 21:26:20] Trained chunk 537 in 139.2s at 4274noun/s: lr=5.27e-04, loss=1.24e+00, top1=71.09%/71.061% +[ INFO][26-Jun-24 21:26:20] Chunk 538 = Batch 623995 = Sample 319484929 +[ INFO][26-Jun-24 21:28:39] Total gradient norm stats for 73 steps: 0.2268 <= 0.2471 + 0.0172z <= 0.3123 +[ INFO][26-Jun-24 21:28:39] Trained chunk 538 in 139.1s at 4278noun/s: lr=5.24e-04, loss=1.24e+00, top1=70.11%/71.061% +[ INFO][26-Jun-24 21:28:39] Chunk 539 = Batch 625157 = Sample 320079873 +[ INFO][26-Jun-24 21:30:59] Total gradient norm stats for 72 steps: 0.2246 <= 0.2506 + 0.01863z <= 0.3402 +[ INFO][26-Jun-24 21:30:59] Trained chunk 539 in 139.3s at 4272noun/s: lr=5.22e-04, loss=1.24e+00, top1=71.30%/71.069% +[ INFO][26-Jun-24 21:30:59] Chunk 540 = Batch 626319 = Sample 320674817 +[ INFO][26-Jun-24 21:33:18] Total gradient norm stats for 73 steps: 0.2316 <= 0.2516 + 0.01541z <= 0.2938 +[ INFO][26-Jun-24 21:33:18] Trained chunk 540 in 139.3s at 4271noun/s: lr=5.19e-04, loss=1.24e+00, top1=71.23%/71.070% +[ INFO][26-Jun-24 21:33:18] Chunk 541 = Batch 627481 = Sample 321269761 +[ INFO][26-Jun-24 21:35:37] Total gradient norm stats for 73 steps: 0.227 <= 0.2494 + 0.01302z <= 0.2916 +[ INFO][26-Jun-24 21:35:37] Trained chunk 541 in 139.0s at 4279noun/s: lr=5.17e-04, loss=1.24e+00, top1=71.73%/71.078% +[ INFO][26-Jun-24 21:35:37] Chunk 542 = Batch 628643 = Sample 321864705 +[ INFO][26-Jun-24 21:37:56] Total gradient norm stats for 72 steps: 0.2257 <= 0.248 + 0.01515z <= 0.2895 +[ INFO][26-Jun-24 21:37:56] Trained chunk 542 in 139.1s at 4276noun/s: lr=5.14e-04, loss=1.24e+00, top1=70.31%/71.082% +[ INFO][26-Jun-24 21:37:56] Chunk 543 = Batch 629805 = Sample 322459649 +[ INFO][26-Jun-24 21:40:15] Total gradient norm stats for 73 steps: 0.2252 <= 0.2491 + 0.02371z <= 0.3765 +[ INFO][26-Jun-24 21:40:15] Trained chunk 543 in 139.4s at 4268noun/s: lr=5.12e-04, loss=1.24e+00, top1=71.80%/71.084% +[ INFO][26-Jun-24 21:40:15] Chunk 544 = Batch 630967 = Sample 323054593 +[ INFO][26-Jun-24 21:42:35] Total gradient norm stats for 73 steps: 0.2263 <= 0.2445 + 0.01123z <= 0.2749 +[ INFO][26-Jun-24 21:42:35] Trained chunk 544 in 139.1s at 4277noun/s: lr=5.09e-04, loss=1.24e+00, top1=70.98%/71.087% +[ INFO][26-Jun-24 21:42:35] Chunk 545 = Batch 632129 = Sample 323649537 +[ INFO][26-Jun-24 21:44:54] Total gradient norm stats for 72 steps: 0.2269 <= 0.2492 + 0.0145z <= 0.2919 +[ INFO][26-Jun-24 21:44:54] Trained chunk 545 in 139.2s at 4273noun/s: lr=5.07e-04, loss=1.24e+00, top1=71.25%/71.091% +[ INFO][26-Jun-24 21:44:54] Chunk 546 = Batch 633291 = Sample 324244481 +[ INFO][26-Jun-24 21:47:13] Total gradient norm stats for 73 steps: 0.2267 <= 0.2443 + 0.0141z <= 0.2838 +[ INFO][26-Jun-24 21:47:13] Trained chunk 546 in 139.3s at 4272noun/s: lr=5.04e-04, loss=1.23e+00, top1=71.71%/71.103% +[ INFO][26-Jun-24 21:47:13] Chunk 547 = Batch 634453 = Sample 324839425 +[ INFO][26-Jun-24 21:49:33] Total gradient norm stats for 72 steps: 0.2298 <= 0.2551 + 0.01814z <= 0.3222 +[ INFO][26-Jun-24 21:49:33] Trained chunk 547 in 139.7s at 4258noun/s: lr=5.02e-04, loss=1.23e+00, top1=70.94%/71.111% +[ INFO][26-Jun-24 21:49:33] Chunk 548 = Batch 635615 = Sample 325434369 +[ INFO][26-Jun-24 21:51:52] Total gradient norm stats for 73 steps: 0.2257 <= 0.2522 + 0.01854z <= 0.3026 +[ INFO][26-Jun-24 21:51:52] Trained chunk 548 in 139.1s at 4277noun/s: lr=4.99e-04, loss=1.23e+00, top1=70.99%/71.117% +[ INFO][26-Jun-24 21:51:52] Chunk 549 = Batch 636777 = Sample 326029313 +[ INFO][26-Jun-24 21:54:11] Total gradient norm stats for 73 steps: 0.2242 <= 0.2495 + 0.01723z <= 0.3269 +[ INFO][26-Jun-24 21:54:11] Trained chunk 549 in 139.1s at 4276noun/s: lr=4.97e-04, loss=1.23e+00, top1=70.96%/71.119% +[ INFO][26-Jun-24 21:54:11] Chunk 550 = Batch 637939 = Sample 326624257 +[ INFO][26-Jun-24 21:56:26] Epoch 11 finished in 6963.3s +[ INFO][26-Jun-24 21:56:26] -------------------------------------------------------------------------------- +[ INFO][26-Jun-24 21:56:26] Epoch 12 = Batch 639057 = Sample 327196673 +[ INFO][26-Jun-24 21:56:32] Total gradient norm stats for 72 steps: 0.2302 <= 0.2512 + 0.01899z <= 0.3359 +[ INFO][26-Jun-24 21:56:32] Trained chunk 550 in 140.5s at 4236noun/s: lr=4.94e-04, loss=1.23e+00, top1=71.50%/71.124% +[ INFO][26-Jun-24 21:56:32] Chunk 551 = Batch 639101 = Sample 327219201 +[ INFO][26-Jun-24 21:58:51] Total gradient norm stats for 73 steps: 0.2238 <= 0.2499 + 0.01497z <= 0.3065 +[ INFO][26-Jun-24 21:58:51] Trained chunk 551 in 139.2s at 4274noun/s: lr=4.92e-04, loss=1.23e+00, top1=71.21%/71.126% +[ INFO][26-Jun-24 21:58:51] Chunk 552 = Batch 640263 = Sample 327814145 +[ INFO][26-Jun-24 22:01:10] Total gradient norm stats for 73 steps: 0.2277 <= 0.2536 + 0.01708z <= 0.3136 +[ INFO][26-Jun-24 22:01:10] Trained chunk 552 in 139.0s at 4279noun/s: lr=4.90e-04, loss=1.23e+00, top1=70.27%/71.134% +[ INFO][26-Jun-24 22:01:10] Chunk 553 = Batch 641425 = Sample 328409089 +[ INFO][26-Jun-24 22:03:29] Total gradient norm stats for 72 steps: 0.2317 <= 0.2557 + 0.01944z <= 0.3137 +[ INFO][26-Jun-24 22:03:29] Trained chunk 553 in 139.1s at 4276noun/s: lr=4.87e-04, loss=1.23e+00, top1=71.23%/71.138% +[ INFO][26-Jun-24 22:03:29] Chunk 554 = Batch 642587 = Sample 329004033 +[ INFO][26-Jun-24 22:05:48] Total gradient norm stats for 73 steps: 0.2249 <= 0.251 + 0.01896z <= 0.3222 +[ INFO][26-Jun-24 22:05:48] Trained chunk 554 in 139.2s at 4273noun/s: lr=4.85e-04, loss=1.23e+00, top1=70.57%/71.148% +[ INFO][26-Jun-24 22:05:48] Chunk 555 = Batch 643749 = Sample 329598977 +[ INFO][26-Jun-24 22:08:08] Total gradient norm stats for 72 steps: 0.2282 <= 0.2454 + 0.01215z <= 0.2873 +[ INFO][26-Jun-24 22:08:08] Trained chunk 555 in 139.6s at 4260noun/s: lr=4.82e-04, loss=1.23e+00, top1=70.74%/71.154% +[ INFO][26-Jun-24 22:08:08] Chunk 556 = Batch 644911 = Sample 330193921 +[ INFO][26-Jun-24 22:10:27] Total gradient norm stats for 73 steps: 0.229 <= 0.2483 + 0.01296z <= 0.2845 +[ INFO][26-Jun-24 22:10:27] Trained chunk 556 in 139.1s at 4278noun/s: lr=4.80e-04, loss=1.23e+00, top1=71.27%/71.160% +[ INFO][26-Jun-24 22:10:27] Chunk 557 = Batch 646073 = Sample 330788865 +[ INFO][26-Jun-24 22:12:46] Total gradient norm stats for 73 steps: 0.2307 <= 0.2478 + 0.01184z <= 0.2892 +[ INFO][26-Jun-24 22:12:46] Trained chunk 557 in 139.4s at 4269noun/s: lr=4.77e-04, loss=1.23e+00, top1=71.22%/71.170% +[ INFO][26-Jun-24 22:12:46] Chunk 558 = Batch 647235 = Sample 331383809 +[ INFO][26-Jun-24 22:15:06] Total gradient norm stats for 72 steps: 0.2299 <= 0.2425 + 0.008414z <= 0.269 +[ INFO][26-Jun-24 22:15:06] Trained chunk 558 in 139.3s at 4270noun/s: lr=4.75e-04, loss=1.23e+00, top1=70.04%/71.174% +[ INFO][26-Jun-24 22:15:06] Chunk 559 = Batch 648397 = Sample 331978753 +[ INFO][26-Jun-24 22:17:25] Total gradient norm stats for 73 steps: 0.232 <= 0.2505 + 0.01478z <= 0.3068 +[ INFO][26-Jun-24 22:17:25] Trained chunk 559 in 139.8s at 4256noun/s: lr=4.72e-04, loss=1.23e+00, top1=71.14%/71.177% +[ INFO][26-Jun-24 22:17:25] Chunk 560 = Batch 649559 = Sample 332573697 +[ INFO][26-Jun-24 22:19:45] Total gradient norm stats for 73 steps: 0.2303 <= 0.2458 + 0.01152z <= 0.2932 +[ INFO][26-Jun-24 22:19:45] Trained chunk 560 in 139.6s at 4260noun/s: lr=4.70e-04, loss=1.23e+00, top1=71.55%/71.180% +[ INFO][26-Jun-24 22:19:45] Chunk 561 = Batch 650721 = Sample 333168641 +[ INFO][26-Jun-24 22:22:05] Total gradient norm stats for 72 steps: 0.2294 <= 0.2435 + 0.01067z <= 0.2804 +[ INFO][26-Jun-24 22:22:05] Trained chunk 561 in 139.5s at 4266noun/s: lr=4.68e-04, loss=1.23e+00, top1=70.58%/71.185% +[ INFO][26-Jun-24 22:22:05] Chunk 562 = Batch 651883 = Sample 333763585 +[ INFO][26-Jun-24 22:24:24] Total gradient norm stats for 73 steps: 0.2299 <= 0.2495 + 0.01414z <= 0.3134 +[ INFO][26-Jun-24 22:24:24] Trained chunk 562 in 139.1s at 4276noun/s: lr=4.65e-04, loss=1.23e+00, top1=71.17%/71.179% +[ INFO][26-Jun-24 22:24:24] Chunk 563 = Batch 653045 = Sample 334358529 +[ INFO][26-Jun-24 22:26:43] Total gradient norm stats for 72 steps: 0.2325 <= 0.2494 + 0.01542z <= 0.3345 +[ INFO][26-Jun-24 22:26:43] Trained chunk 563 in 139.5s at 4264noun/s: lr=4.63e-04, loss=1.23e+00, top1=71.27%/71.182% +[ INFO][26-Jun-24 22:26:43] Chunk 564 = Batch 654207 = Sample 334953473 +[ INFO][26-Jun-24 22:29:02] Total gradient norm stats for 73 steps: 0.2307 <= 0.2467 + 0.01176z <= 0.2843 +[ INFO][26-Jun-24 22:29:02] Trained chunk 564 in 139.2s at 4274noun/s: lr=4.60e-04, loss=1.23e+00, top1=72.91%/71.190% +[ INFO][26-Jun-24 22:29:02] Chunk 565 = Batch 655369 = Sample 335548417 +[ INFO][26-Jun-24 22:31:22] Total gradient norm stats for 73 steps: 0.23 <= 0.249 + 0.01145z <= 0.2897 +[ INFO][26-Jun-24 22:31:22] Trained chunk 565 in 139.3s at 4272noun/s: lr=4.58e-04, loss=1.23e+00, top1=72.48%/71.196% +[ INFO][26-Jun-24 22:31:22] Chunk 566 = Batch 656531 = Sample 336143361 +[ INFO][26-Jun-24 22:33:41] Total gradient norm stats for 72 steps: 0.2303 <= 0.247 + 0.01015z <= 0.2833 +[ INFO][26-Jun-24 22:33:41] Trained chunk 566 in 139.1s at 4277noun/s: lr=4.55e-04, loss=1.23e+00, top1=71.45%/71.197% +[ INFO][26-Jun-24 22:33:41] Chunk 567 = Batch 657693 = Sample 336738305 +[ INFO][26-Jun-24 22:36:00] Total gradient norm stats for 73 steps: 0.2292 <= 0.2493 + 0.01423z <= 0.2952 +[ INFO][26-Jun-24 22:36:00] Trained chunk 567 in 138.8s at 4285noun/s: lr=4.53e-04, loss=1.23e+00, top1=71.11%/71.197% +[ INFO][26-Jun-24 22:36:00] Chunk 568 = Batch 658855 = Sample 337333249 +[ INFO][26-Jun-24 22:38:19] Total gradient norm stats for 73 steps: 0.2333 <= 0.2546 + 0.01705z <= 0.3054 +[ INFO][26-Jun-24 22:38:19] Trained chunk 568 in 139.0s at 4279noun/s: lr=4.51e-04, loss=1.23e+00, top1=70.09%/71.202% +[ INFO][26-Jun-24 22:38:19] Chunk 569 = Batch 660017 = Sample 337928193 +[ INFO][26-Jun-24 22:40:38] Total gradient norm stats for 72 steps: 0.2301 <= 0.248 + 0.01639z <= 0.3239 +[ INFO][26-Jun-24 22:40:38] Trained chunk 569 in 139.2s at 4275noun/s: lr=4.48e-04, loss=1.23e+00, top1=71.55%/71.202% +[ INFO][26-Jun-24 22:40:38] Chunk 570 = Batch 661179 = Sample 338523137 +[ INFO][26-Jun-24 22:42:57] Total gradient norm stats for 73 steps: 0.228 <= 0.2477 + 0.02077z <= 0.3792 +[ INFO][26-Jun-24 22:42:57] Trained chunk 570 in 139.2s at 4275noun/s: lr=4.46e-04, loss=1.23e+00, top1=70.88%/71.202% +[ INFO][26-Jun-24 22:42:57] Chunk 571 = Batch 662341 = Sample 339118081 +[ INFO][26-Jun-24 22:45:16] Total gradient norm stats for 72 steps: 0.2285 <= 0.2494 + 0.01426z <= 0.3083 +[ INFO][26-Jun-24 22:45:16] Trained chunk 571 in 139.1s at 4278noun/s: lr=4.43e-04, loss=1.23e+00, top1=70.31%/71.211% +[ INFO][26-Jun-24 22:45:16] Chunk 572 = Batch 663503 = Sample 339713025 +[ INFO][26-Jun-24 22:47:36] Total gradient norm stats for 73 steps: 0.2343 <= 0.252 + 0.01272z <= 0.2946 +[ INFO][26-Jun-24 22:47:36] Trained chunk 572 in 139.6s at 4263noun/s: lr=4.41e-04, loss=1.23e+00, top1=71.69%/71.218% +[ INFO][26-Jun-24 22:47:36] Chunk 573 = Batch 664665 = Sample 340307969 +[ INFO][26-Jun-24 22:49:55] Total gradient norm stats for 73 steps: 0.2323 <= 0.2494 + 0.01069z <= 0.2797 +[ INFO][26-Jun-24 22:49:55] Trained chunk 573 in 139.2s at 4275noun/s: lr=4.39e-04, loss=1.23e+00, top1=71.91%/71.227% +[ INFO][26-Jun-24 22:49:55] Chunk 574 = Batch 665827 = Sample 340902913 +[ INFO][26-Jun-24 22:52:14] Total gradient norm stats for 72 steps: 0.2289 <= 0.2474 + 0.01035z <= 0.2741 +[ INFO][26-Jun-24 22:52:14] Trained chunk 574 in 139.4s at 4269noun/s: lr=4.36e-04, loss=1.23e+00, top1=71.10%/71.234% +[ INFO][26-Jun-24 22:52:14] Chunk 575 = Batch 666989 = Sample 341497857 +[ INFO][26-Jun-24 22:54:33] Total gradient norm stats for 73 steps: 0.2329 <= 0.2532 + 0.0158z <= 0.3172 +[ INFO][26-Jun-24 22:54:33] Trained chunk 575 in 139.2s at 4275noun/s: lr=4.34e-04, loss=1.23e+00, top1=70.66%/71.235% +[ INFO][26-Jun-24 22:54:33] Chunk 576 = Batch 668151 = Sample 342092801 +[ INFO][26-Jun-24 22:56:52] Total gradient norm stats for 73 steps: 0.2338 <= 0.2507 + 0.01339z <= 0.3006 +[ INFO][26-Jun-24 22:56:52] Trained chunk 576 in 139.0s at 4280noun/s: lr=4.32e-04, loss=1.23e+00, top1=71.98%/71.235% +[ INFO][26-Jun-24 22:56:53] Saved checkpoint: /data/strahl/Code/ovod/outputs/ovod_20240626_001447/ovod_chunk0576_20240626_225652.train +[ INFO][26-Jun-24 22:56:53] Chunk 577 = Batch 669313 = Sample 342687745 +[ INFO][26-Jun-24 22:59:12] Total gradient norm stats for 72 steps: 0.2356 <= 0.2579 + 0.01974z <= 0.3496 +[ INFO][26-Jun-24 22:59:12] Trained chunk 577 in 139.3s at 4271noun/s: lr=4.29e-04, loss=1.23e+00, top1=71.49%/71.234% +[ INFO][26-Jun-24 22:59:12] Chunk 578 = Batch 670475 = Sample 343282689 +[ INFO][26-Jun-24 23:01:31] Total gradient norm stats for 73 steps: 0.235 <= 0.2642 + 0.09276z <= 1.025 (clipped to 1) +[ INFO][26-Jun-24 23:01:31] Trained chunk 578 in 139.6s at 4263noun/s: lr=4.27e-04, loss=1.23e+00, top1=71.52%/71.238% +[ INFO][26-Jun-24 23:01:31] Chunk 579 = Batch 671637 = Sample 343877633 +[ INFO][26-Jun-24 23:03:51] Total gradient norm stats for 72 steps: 0.232 <= 0.255 + 0.02905z <= 0.4622 +[ INFO][26-Jun-24 23:03:51] Trained chunk 579 in 139.3s at 4272noun/s: lr=4.24e-04, loss=1.23e+00, top1=70.57%/71.245% +[ INFO][26-Jun-24 23:03:51] Chunk 580 = Batch 672799 = Sample 344472577 +[ INFO][26-Jun-24 23:06:10] Total gradient norm stats for 73 steps: 0.231 <= 0.252 + 0.01761z <= 0.3257 +[ INFO][26-Jun-24 23:06:10] Trained chunk 580 in 139.4s at 4268noun/s: lr=4.22e-04, loss=1.23e+00, top1=72.56%/71.253% +[ INFO][26-Jun-24 23:06:10] Chunk 581 = Batch 673961 = Sample 345067521 +[ INFO][26-Jun-24 23:08:29] Total gradient norm stats for 73 steps: 0.2283 <= 0.2574 + 0.03921z <= 0.4804 +[ INFO][26-Jun-24 23:08:29] Trained chunk 581 in 139.2s at 4273noun/s: lr=4.20e-04, loss=1.23e+00, top1=70.60%/71.259% +[ INFO][26-Jun-24 23:08:29] Chunk 582 = Batch 675123 = Sample 345662465 +[ INFO][26-Jun-24 23:10:49] Total gradient norm stats for 72 steps: 0.2339 <= 0.258 + 0.04279z <= 0.6028 +[ INFO][26-Jun-24 23:10:49] Trained chunk 582 in 139.2s at 4274noun/s: lr=4.17e-04, loss=1.23e+00, top1=72.23%/71.267% +[ INFO][26-Jun-24 23:10:49] Chunk 583 = Batch 676285 = Sample 346257409 +[ INFO][26-Jun-24 23:13:08] Total gradient norm stats for 73 steps: 0.2316 <= 0.2548 + 0.0173z <= 0.3123 +[ INFO][26-Jun-24 23:13:08] Trained chunk 583 in 139.2s at 4274noun/s: lr=4.15e-04, loss=1.23e+00, top1=71.69%/71.275% +[ INFO][26-Jun-24 23:13:08] Chunk 584 = Batch 677447 = Sample 346852353 +[ INFO][26-Jun-24 23:15:27] Total gradient norm stats for 73 steps: 0.2335 <= 0.2496 + 0.01108z <= 0.2931 +[ INFO][26-Jun-24 23:15:27] Trained chunk 584 in 138.9s at 4285noun/s: lr=4.13e-04, loss=1.23e+00, top1=70.53%/71.281% +[ INFO][26-Jun-24 23:15:27] Chunk 585 = Batch 678609 = Sample 347447297 +[ INFO][26-Jun-24 23:17:46] Total gradient norm stats for 72 steps: 0.2335 <= 0.2504 + 0.008935z <= 0.2755 +[ INFO][26-Jun-24 23:17:46] Trained chunk 585 in 139.4s at 4267noun/s: lr=4.10e-04, loss=1.23e+00, top1=71.58%/71.283% +[ INFO][26-Jun-24 23:17:46] Chunk 586 = Batch 679771 = Sample 348042241 +[ INFO][26-Jun-24 23:20:06] Total gradient norm stats for 73 steps: 0.2379 <= 0.2632 + 0.02466z <= 0.3588 +[ INFO][26-Jun-24 23:20:06] Trained chunk 586 in 139.6s at 4262noun/s: lr=4.08e-04, loss=1.22e+00, top1=71.29%/71.289% +[ INFO][26-Jun-24 23:20:06] Chunk 587 = Batch 680933 = Sample 348637185 +[ INFO][26-Jun-24 23:22:25] Total gradient norm stats for 72 steps: 0.2339 <= 0.2548 + 0.03467z <= 0.5321 +[ INFO][26-Jun-24 23:22:25] Trained chunk 587 in 139.1s at 4277noun/s: lr=4.06e-04, loss=1.22e+00, top1=71.83%/71.294% +[ INFO][26-Jun-24 23:22:25] Chunk 588 = Batch 682095 = Sample 349232129 +[ INFO][26-Jun-24 23:24:44] Total gradient norm stats for 73 steps: 0.233 <= 0.2536 + 0.01637z <= 0.3331 +[ INFO][26-Jun-24 23:24:44] Trained chunk 588 in 139.3s at 4270noun/s: lr=4.03e-04, loss=1.22e+00, top1=72.03%/71.297% +[ INFO][26-Jun-24 23:24:44] Chunk 589 = Batch 683257 = Sample 349827073 +[ INFO][26-Jun-24 23:27:03] Total gradient norm stats for 73 steps: 0.2323 <= 0.2535 + 0.01453z <= 0.3076 +[ INFO][26-Jun-24 23:27:03] Trained chunk 589 in 139.0s at 4281noun/s: lr=4.01e-04, loss=1.22e+00, top1=71.25%/71.301% +[ INFO][26-Jun-24 23:27:03] Chunk 590 = Batch 684419 = Sample 350422017 +[ INFO][26-Jun-24 23:29:23] Total gradient norm stats for 72 steps: 0.2331 <= 0.2518 + 0.01223z <= 0.2907 +[ INFO][26-Jun-24 23:29:23] Trained chunk 590 in 139.4s at 4267noun/s: lr=3.99e-04, loss=1.22e+00, top1=71.95%/71.303% +[ INFO][26-Jun-24 23:29:23] Chunk 591 = Batch 685581 = Sample 351016961 +[ INFO][26-Jun-24 23:31:42] Total gradient norm stats for 73 steps: 0.2358 <= 0.2519 + 0.01081z <= 0.2796 +[ INFO][26-Jun-24 23:31:42] Trained chunk 591 in 139.0s at 4279noun/s: lr=3.96e-04, loss=1.22e+00, top1=70.17%/71.309% +[ INFO][26-Jun-24 23:31:42] Chunk 592 = Batch 686743 = Sample 351611905 +[ INFO][26-Jun-24 23:34:01] Total gradient norm stats for 73 steps: 0.2345 <= 0.2579 + 0.01836z <= 0.318 +[ INFO][26-Jun-24 23:34:01] Trained chunk 592 in 139.0s at 4282noun/s: lr=3.94e-04, loss=1.22e+00, top1=70.48%/71.312% +[ INFO][26-Jun-24 23:34:01] Chunk 593 = Batch 687905 = Sample 352206849 +[ INFO][26-Jun-24 23:36:19] Total gradient norm stats for 72 steps: 0.2336 <= 0.2489 + 0.00961z <= 0.2786 +[ INFO][26-Jun-24 23:36:19] Trained chunk 593 in 138.6s at 4292noun/s: lr=3.92e-04, loss=1.22e+00, top1=70.66%/71.311% +[ INFO][26-Jun-24 23:36:19] Chunk 594 = Batch 689067 = Sample 352801793 +[ INFO][26-Jun-24 23:38:38] Total gradient norm stats for 73 steps: 0.231 <= 0.253 + 0.0133z <= 0.293 +[ INFO][26-Jun-24 23:38:38] Trained chunk 594 in 139.2s at 4274noun/s: lr=3.89e-04, loss=1.22e+00, top1=69.53%/71.317% +[ INFO][26-Jun-24 23:38:38] Chunk 595 = Batch 690229 = Sample 353396737 +[ INFO][26-Jun-24 23:40:58] Total gradient norm stats for 72 steps: 0.2326 <= 0.2574 + 0.01702z <= 0.3422 +[ INFO][26-Jun-24 23:40:58] Trained chunk 595 in 139.2s at 4273noun/s: lr=3.87e-04, loss=1.22e+00, top1=71.23%/71.320% +[ INFO][26-Jun-24 23:40:58] Chunk 596 = Batch 691391 = Sample 353991681 +[ INFO][26-Jun-24 23:43:17] Total gradient norm stats for 73 steps: 0.2388 <= 0.2533 + 0.01405z <= 0.3093 +[ INFO][26-Jun-24 23:43:17] Trained chunk 596 in 139.3s at 4272noun/s: lr=3.85e-04, loss=1.22e+00, top1=69.65%/71.322% +[ INFO][26-Jun-24 23:43:17] Chunk 597 = Batch 692553 = Sample 354586625 +[ INFO][26-Jun-24 23:45:36] Total gradient norm stats for 73 steps: 0.2334 <= 0.2496 + 0.009101z <= 0.2825 +[ INFO][26-Jun-24 23:45:36] Trained chunk 597 in 139.4s at 4268noun/s: lr=3.83e-04, loss=1.22e+00, top1=71.18%/71.330% +[ INFO][26-Jun-24 23:45:36] Chunk 598 = Batch 693715 = Sample 355181569 +[ INFO][26-Jun-24 23:47:55] Total gradient norm stats for 72 steps: 0.2345 <= 0.2496 + 0.008897z <= 0.2704 +[ INFO][26-Jun-24 23:47:55] Trained chunk 598 in 139.2s at 4275noun/s: lr=3.80e-04, loss=1.22e+00, top1=71.58%/71.334% +[ INFO][26-Jun-24 23:47:55] Chunk 599 = Batch 694877 = Sample 355776513 +[ INFO][26-Jun-24 23:50:15] Total gradient norm stats for 73 steps: 0.2378 <= 0.2571 + 0.01848z <= 0.3695 +[ INFO][26-Jun-24 23:50:15] Trained chunk 599 in 139.3s at 4272noun/s: lr=3.78e-04, loss=1.22e+00, top1=71.15%/71.335% +[ INFO][26-Jun-24 23:50:15] Chunk 600 = Batch 696039 = Sample 356371457 +[ INFO][26-Jun-24 23:52:29] Epoch 12 finished in 6963.2s +[ INFO][26-Jun-24 23:52:29] -------------------------------------------------------------------------------- +[ INFO][26-Jun-24 23:52:29] Epoch 13 = Batch 697153 = Sample 356941825 +[ INFO][26-Jun-24 23:52:35] Total gradient norm stats for 73 steps: 0.2351 <= 0.2528 + 0.0127z <= 0.2873 +[ INFO][26-Jun-24 23:52:35] Trained chunk 600 in 140.4s at 4236noun/s: lr=3.76e-04, loss=1.22e+00, top1=71.59%/71.338% +[ INFO][26-Jun-24 23:52:35] Chunk 601 = Batch 697201 = Sample 356966401 +[ INFO][26-Jun-24 23:54:55] Total gradient norm stats for 72 steps: 0.2387 <= 0.2608 + 0.08453z <= 0.9639 +[ INFO][26-Jun-24 23:54:55] Trained chunk 601 in 139.4s at 4267noun/s: lr=3.73e-04, loss=1.22e+00, top1=71.22%/71.347% +[ INFO][26-Jun-24 23:54:55] Chunk 602 = Batch 698363 = Sample 357561345 +[ INFO][26-Jun-24 23:57:14] Total gradient norm stats for 73 steps: 0.237 <= 0.2535 + 0.01277z <= 0.2959 +[ INFO][26-Jun-24 23:57:14] Trained chunk 602 in 139.4s at 4267noun/s: lr=3.71e-04, loss=1.22e+00, top1=72.14%/71.353% +[ INFO][26-Jun-24 23:57:14] Chunk 603 = Batch 699525 = Sample 358156289 +[ INFO][26-Jun-24 23:59:33] Total gradient norm stats for 72 steps: 0.2337 <= 0.2534 + 0.02652z <= 0.4594 +[ INFO][26-Jun-24 23:59:33] Trained chunk 603 in 139.1s at 4277noun/s: lr=3.69e-04, loss=1.22e+00, top1=70.37%/71.359% +[ INFO][26-Jun-24 23:59:33] Chunk 604 = Batch 700687 = Sample 358751233 +[ INFO][27-Jun-24 00:01:52] Total gradient norm stats for 73 steps: 0.238 <= 0.2554 + 0.01426z <= 0.3228 +[ INFO][27-Jun-24 00:01:52] Trained chunk 604 in 139.0s at 4281noun/s: lr=3.67e-04, loss=1.22e+00, top1=71.71%/71.362% +[ INFO][27-Jun-24 00:01:52] Chunk 605 = Batch 701849 = Sample 359346177 +[ INFO][27-Jun-24 00:04:12] Total gradient norm stats for 73 steps: 0.2336 <= 0.2569 + 0.01507z <= 0.3113 +[ INFO][27-Jun-24 00:04:12] Trained chunk 605 in 139.4s at 4267noun/s: lr=3.64e-04, loss=1.22e+00, top1=71.55%/71.364% +[ INFO][27-Jun-24 00:04:12] Chunk 606 = Batch 703011 = Sample 359941121 +[ INFO][27-Jun-24 00:06:31] Total gradient norm stats for 72 steps: 0.2383 <= 0.2645 + 0.01691z <= 0.3008 +[ INFO][27-Jun-24 00:06:31] Trained chunk 606 in 139.5s at 4264noun/s: lr=3.62e-04, loss=1.22e+00, top1=70.87%/71.366% +[ INFO][27-Jun-24 00:06:31] Chunk 607 = Batch 704173 = Sample 360536065 +[ INFO][27-Jun-24 00:08:50] Total gradient norm stats for 73 steps: 0.2385 <= 0.2774 + 0.1579z <= 1.598 (clipped to 1) +[ INFO][27-Jun-24 00:08:50] Trained chunk 607 in 139.2s at 4273noun/s: lr=3.60e-04, loss=1.22e+00, top1=71.76%/71.374% +[ INFO][27-Jun-24 00:08:50] Chunk 608 = Batch 705335 = Sample 361131009 +[ INFO][27-Jun-24 00:11:10] Total gradient norm stats for 73 steps: 0.2356 <= 0.2581 + 0.01577z <= 0.3214 +[ INFO][27-Jun-24 00:11:10] Trained chunk 608 in 139.3s at 4271noun/s: lr=3.58e-04, loss=1.22e+00, top1=71.48%/71.388% +[ INFO][27-Jun-24 00:11:10] Chunk 609 = Batch 706497 = Sample 361725953 +[ INFO][27-Jun-24 00:13:29] Total gradient norm stats for 72 steps: 0.2361 <= 0.2556 + 0.01336z <= 0.3077 +[ INFO][27-Jun-24 00:13:29] Trained chunk 609 in 139.4s at 4267noun/s: lr=3.56e-04, loss=1.22e+00, top1=72.12%/71.392% +[ INFO][27-Jun-24 00:13:29] Chunk 610 = Batch 707659 = Sample 362320897 +[ INFO][27-Jun-24 00:15:48] Total gradient norm stats for 73 steps: 0.238 <= 0.2623 + 0.05326z <= 0.6998 +[ INFO][27-Jun-24 00:15:48] Trained chunk 610 in 138.9s at 4283noun/s: lr=3.53e-04, loss=1.22e+00, top1=72.05%/71.399% +[ INFO][27-Jun-24 00:15:48] Chunk 611 = Batch 708821 = Sample 362915841 +[ INFO][27-Jun-24 00:18:07] Total gradient norm stats for 72 steps: 0.2378 <= 0.3498 + 0.3843z <= 3.412 (clipped to 1) +[ INFO][27-Jun-24 00:18:07] Trained chunk 611 in 139.1s at 4278noun/s: lr=3.51e-04, loss=1.22e+00, top1=71.16%/71.398% +[ INFO][27-Jun-24 00:18:07] Chunk 612 = Batch 709983 = Sample 363510785 +[ INFO][27-Jun-24 00:20:26] Total gradient norm stats for 73 steps: 0.2446 <= 0.5109 + 0.5105z <= 2.759 (clipped to 1) +[ INFO][27-Jun-24 00:20:26] Trained chunk 612 in 139.2s at 4273noun/s: lr=3.49e-04, loss=1.22e+00, top1=71.02%/71.391% +[ INFO][27-Jun-24 00:20:26] Chunk 613 = Batch 711145 = Sample 364105729 +[ INFO][27-Jun-24 00:22:45] Total gradient norm stats for 73 steps: 0.2374 <= 0.3616 + 0.4494z <= 4.01 (clipped to 1) +[ INFO][27-Jun-24 00:22:45] Trained chunk 613 in 139.1s at 4276noun/s: lr=3.47e-04, loss=1.22e+00, top1=70.93%/71.392% +[ INFO][27-Jun-24 00:22:45] Chunk 614 = Batch 712307 = Sample 364700673 +[ INFO][27-Jun-24 00:25:05] Total gradient norm stats for 72 steps: 0.241 <= 0.3211 + 0.1979z <= 1.603 (clipped to 1) +[ INFO][27-Jun-24 00:25:05] Trained chunk 614 in 139.3s at 4271noun/s: lr=3.44e-04, loss=1.22e+00, top1=71.09%/71.400% +[ INFO][27-Jun-24 00:25:05] Chunk 615 = Batch 713469 = Sample 365295617 +[ INFO][27-Jun-24 00:27:24] Total gradient norm stats for 73 steps: 0.2427 <= 0.3447 + 0.2291z <= 1.499 (clipped to 1) +[ INFO][27-Jun-24 00:27:24] Trained chunk 615 in 139.0s at 4279noun/s: lr=3.42e-04, loss=1.22e+00, top1=71.65%/71.402% +[ INFO][27-Jun-24 00:27:24] Chunk 616 = Batch 714631 = Sample 365890561 +[ INFO][27-Jun-24 00:29:43] Total gradient norm stats for 73 steps: 0.2381 <= 0.456 + 1.073z <= 8.792 (clipped to 1) +[ INFO][27-Jun-24 00:29:43] Trained chunk 616 in 139.3s at 4271noun/s: lr=3.40e-04, loss=1.22e+00, top1=72.24%/71.405% +[ INFO][27-Jun-24 00:29:43] Chunk 617 = Batch 715793 = Sample 366485505 +[ INFO][27-Jun-24 00:32:02] Total gradient norm stats for 72 steps: 0.2419 <= 0.2717 + 0.08497z <= 0.9582 +[ INFO][27-Jun-24 00:32:02] Trained chunk 617 in 139.2s at 4273noun/s: lr=3.38e-04, loss=1.22e+00, top1=71.37%/71.411% +[ INFO][27-Jun-24 00:32:02] Chunk 618 = Batch 716955 = Sample 367080449 +[ INFO][27-Jun-24 00:34:22] Total gradient norm stats for 73 steps: 0.2418 <= 0.3154 + 0.2086z <= 1.431 (clipped to 1) +[ INFO][27-Jun-24 00:34:22] Trained chunk 618 in 139.3s at 4271noun/s: lr=3.36e-04, loss=1.22e+00, top1=72.42%/71.420% +[ INFO][27-Jun-24 00:34:22] Chunk 619 = Batch 718117 = Sample 367675393 +[ INFO][27-Jun-24 00:36:41] Total gradient norm stats for 72 steps: 0.2391 <= 0.357 + 0.8094z <= 7.126 (clipped to 1) +[ INFO][27-Jun-24 00:36:41] Trained chunk 619 in 139.3s at 4271noun/s: lr=3.33e-04, loss=1.22e+00, top1=72.30%/71.426% +[ INFO][27-Jun-24 00:36:41] Chunk 620 = Batch 719279 = Sample 368270337 +[ INFO][27-Jun-24 00:39:00] Total gradient norm stats for 73 steps: 0.2394 <= 1.075 + 6.44z <= 55.31 (clipped to 1) +[ INFO][27-Jun-24 00:39:00] Trained chunk 620 in 139.2s at 4273noun/s: lr=3.31e-04, loss=1.22e+00, top1=70.96%/71.416% +[ INFO][27-Jun-24 00:39:00] Chunk 621 = Batch 720441 = Sample 368865281 +[ INFO][27-Jun-24 00:41:19] Total gradient norm stats for 73 steps: 0.2395 <= 0.3098 + 0.1967z <= 1.64 (clipped to 1) +[ INFO][27-Jun-24 00:41:19] Trained chunk 621 in 139.3s at 4270noun/s: lr=3.29e-04, loss=1.22e+00, top1=71.50%/71.422% +[ INFO][27-Jun-24 00:41:19] Chunk 622 = Batch 721603 = Sample 369460225 +[ INFO][27-Jun-24 00:43:39] Total gradient norm stats for 72 steps: 0.2393 <= 0.3449 + 0.4648z <= 3.724 (clipped to 1) +[ INFO][27-Jun-24 00:43:39] Trained chunk 622 in 139.2s at 4274noun/s: lr=3.27e-04, loss=1.22e+00, top1=71.49%/71.425% +[ INFO][27-Jun-24 00:43:39] Chunk 623 = Batch 722765 = Sample 370055169 +[ INFO][27-Jun-24 00:45:58] Total gradient norm stats for 73 steps: 0.2398 <= 0.2845 + 0.1834z <= 1.824 (clipped to 1) +[ INFO][27-Jun-24 00:45:58] Trained chunk 623 in 139.5s at 4264noun/s: lr=3.25e-04, loss=1.22e+00, top1=71.64%/71.426% +[ INFO][27-Jun-24 00:45:58] Chunk 624 = Batch 723927 = Sample 370650113 +[ INFO][27-Jun-24 00:48:18] Total gradient norm stats for 73 steps: 0.2406 <= 0.3338 + 0.2968z <= 2.64 (clipped to 1) +[ INFO][27-Jun-24 00:48:18] Trained chunk 624 in 139.3s at 4270noun/s: lr=3.23e-04, loss=1.22e+00, top1=71.73%/71.429% +[ INFO][27-Jun-24 00:48:18] Saved checkpoint: /data/strahl/Code/ovod/outputs/ovod_20240626_001447/ovod_chunk0624_20240627_004818.train +[ INFO][27-Jun-24 00:48:18] Chunk 625 = Batch 725089 = Sample 371245057 +[ INFO][27-Jun-24 00:50:37] Total gradient norm stats for 72 steps: 0.2431 <= 0.2684 + 0.06868z <= 0.8271 +[ INFO][27-Jun-24 00:50:37] Trained chunk 625 in 139.2s at 4274noun/s: lr=3.20e-04, loss=1.22e+00, top1=70.93%/71.434% +[ INFO][27-Jun-24 00:50:37] Chunk 626 = Batch 726251 = Sample 371840001 +[ INFO][27-Jun-24 00:52:56] Total gradient norm stats for 73 steps: 0.2462 <= 0.3467 + 0.2226z <= 1.419 (clipped to 1) +[ INFO][27-Jun-24 00:52:56] Trained chunk 626 in 139.1s at 4277noun/s: lr=3.18e-04, loss=1.22e+00, top1=71.34%/71.430% +[ INFO][27-Jun-24 00:52:56] Chunk 627 = Batch 727413 = Sample 372434945 +[ INFO][27-Jun-24 00:55:15] Total gradient norm stats for 72 steps: 0.2402 <= 0.4026 + 0.378z <= 2.463 (clipped to 1) +[ INFO][27-Jun-24 00:55:15] Trained chunk 627 in 139.3s at 4271noun/s: lr=3.16e-04, loss=1.22e+00, top1=69.95%/71.440% +[ INFO][27-Jun-24 00:55:15] Chunk 628 = Batch 728575 = Sample 373029889 +[ INFO][27-Jun-24 00:57:35] Total gradient norm stats for 73 steps: 0.2428 <= 0.261 + 0.01364z <= 0.3167 +[ INFO][27-Jun-24 00:57:35] Trained chunk 628 in 139.3s at 4272noun/s: lr=3.14e-04, loss=1.22e+00, top1=70.38%/71.450% +[ INFO][27-Jun-24 00:57:35] Chunk 629 = Batch 729737 = Sample 373624833 +[ INFO][27-Jun-24 00:59:54] Total gradient norm stats for 73 steps: 0.2418 <= 0.2737 + 0.09904z <= 1.017 (clipped to 1) +[ INFO][27-Jun-24 00:59:54] Trained chunk 629 in 139.2s at 4275noun/s: lr=3.12e-04, loss=1.22e+00, top1=71.33%/71.457% +[ INFO][27-Jun-24 00:59:54] Chunk 630 = Batch 730899 = Sample 374219777 +[ INFO][27-Jun-24 01:02:13] Total gradient norm stats for 72 steps: 0.2404 <= 0.2594 + 0.01176z <= 0.2979 +[ INFO][27-Jun-24 01:02:13] Trained chunk 630 in 139.4s at 4268noun/s: lr=3.10e-04, loss=1.22e+00, top1=71.90%/71.462% +[ INFO][27-Jun-24 01:02:13] Chunk 631 = Batch 732061 = Sample 374814721 +[ INFO][27-Jun-24 01:04:32] Total gradient norm stats for 73 steps: 0.2387 <= 0.293 + 0.299z <= 2.81 (clipped to 1) +[ INFO][27-Jun-24 01:04:32] Trained chunk 631 in 139.1s at 4278noun/s: lr=3.08e-04, loss=1.22e+00, top1=70.67%/71.469% +[ INFO][27-Jun-24 01:04:32] Chunk 632 = Batch 733223 = Sample 375409665 +[ INFO][27-Jun-24 01:06:52] Total gradient norm stats for 73 steps: 0.2431 <= 0.2571 + 0.009319z <= 0.3027 +[ INFO][27-Jun-24 01:06:52] Trained chunk 632 in 139.2s at 4274noun/s: lr=3.06e-04, loss=1.21e+00, top1=70.56%/71.473% +[ INFO][27-Jun-24 01:06:52] Chunk 633 = Batch 734385 = Sample 376004609 +[ INFO][27-Jun-24 01:09:11] Total gradient norm stats for 72 steps: 0.2436 <= 0.2684 + 0.04301z <= 0.5928 +[ INFO][27-Jun-24 01:09:11] Trained chunk 633 in 139.2s at 4275noun/s: lr=3.03e-04, loss=1.21e+00, top1=72.24%/71.476% +[ INFO][27-Jun-24 01:09:11] Chunk 634 = Batch 735547 = Sample 376599553 +[ INFO][27-Jun-24 01:11:30] Total gradient norm stats for 73 steps: 0.2411 <= 0.2634 + 0.02923z <= 0.4858 +[ INFO][27-Jun-24 01:11:30] Trained chunk 634 in 139.2s at 4275noun/s: lr=3.01e-04, loss=1.21e+00, top1=70.81%/71.479% +[ INFO][27-Jun-24 01:11:30] Chunk 635 = Batch 736709 = Sample 377194497 +[ INFO][27-Jun-24 01:13:50] Total gradient norm stats for 72 steps: 0.2436 <= 0.2672 + 0.01872z <= 0.3283 +[ INFO][27-Jun-24 01:13:50] Trained chunk 635 in 139.6s at 4260noun/s: lr=2.99e-04, loss=1.21e+00, top1=71.43%/71.485% +[ INFO][27-Jun-24 01:13:50] Chunk 636 = Batch 737871 = Sample 377789441 +[ INFO][27-Jun-24 01:16:09] Total gradient norm stats for 73 steps: 0.243 <= 0.2815 + 0.1408z <= 1.372 (clipped to 1) +[ INFO][27-Jun-24 01:16:09] Trained chunk 636 in 139.3s at 4271noun/s: lr=2.97e-04, loss=1.21e+00, top1=71.94%/71.493% +[ INFO][27-Jun-24 01:16:09] Chunk 637 = Batch 739033 = Sample 378384385 +[ INFO][27-Jun-24 01:18:28] Total gradient norm stats for 73 steps: 0.2438 <= 0.2622 + 0.01344z <= 0.3049 +[ INFO][27-Jun-24 01:18:28] Trained chunk 637 in 139.2s at 4276noun/s: lr=2.95e-04, loss=1.21e+00, top1=72.51%/71.496% +[ INFO][27-Jun-24 01:18:28] Chunk 638 = Batch 740195 = Sample 378979329 +[ INFO][27-Jun-24 01:20:47] Total gradient norm stats for 72 steps: 0.2391 <= 0.2593 + 0.01156z <= 0.3165 +[ INFO][27-Jun-24 01:20:47] Trained chunk 638 in 139.1s at 4277noun/s: lr=2.93e-04, loss=1.21e+00, top1=72.26%/71.502% +[ INFO][27-Jun-24 01:20:47] Chunk 639 = Batch 741357 = Sample 379574273 +[ INFO][27-Jun-24 01:23:06] Total gradient norm stats for 73 steps: 0.2433 <= 0.2562 + 0.008333z <= 0.2866 +[ INFO][27-Jun-24 01:23:06] Trained chunk 639 in 139.4s at 4267noun/s: lr=2.91e-04, loss=1.21e+00, top1=71.58%/71.500% +[ INFO][27-Jun-24 01:23:06] Chunk 640 = Batch 742519 = Sample 380169217 +[ INFO][27-Jun-24 01:25:25] Total gradient norm stats for 73 steps: 0.2427 <= 0.2615 + 0.01126z <= 0.2976 +[ INFO][27-Jun-24 01:25:25] Trained chunk 640 in 138.8s at 4285noun/s: lr=2.89e-04, loss=1.21e+00, top1=72.50%/71.501% +[ INFO][27-Jun-24 01:25:25] Chunk 641 = Batch 743681 = Sample 380764161 +[ INFO][27-Jun-24 01:27:45] Total gradient norm stats for 72 steps: 0.2449 <= 0.2625 + 0.01471z <= 0.3198 +[ INFO][27-Jun-24 01:27:45] Trained chunk 641 in 139.3s at 4270noun/s: lr=2.87e-04, loss=1.21e+00, top1=71.06%/71.504% +[ INFO][27-Jun-24 01:27:45] Chunk 642 = Batch 744843 = Sample 381359105 +[ INFO][27-Jun-24 01:30:04] Total gradient norm stats for 73 steps: 0.2422 <= 0.2613 + 0.01255z <= 0.297 +[ INFO][27-Jun-24 01:30:04] Trained chunk 642 in 139.4s at 4268noun/s: lr=2.85e-04, loss=1.21e+00, top1=71.26%/71.517% +[ INFO][27-Jun-24 01:30:04] Chunk 643 = Batch 746005 = Sample 381954049 +[ INFO][27-Jun-24 01:32:23] Total gradient norm stats for 72 steps: 0.2455 <= 0.2572 + 0.007795z <= 0.2885 +[ INFO][27-Jun-24 01:32:23] Trained chunk 643 in 139.0s at 4281noun/s: lr=2.83e-04, loss=1.21e+00, top1=71.35%/71.521% +[ INFO][27-Jun-24 01:32:23] Chunk 644 = Batch 747167 = Sample 382548993 +[ INFO][27-Jun-24 01:34:42] Total gradient norm stats for 73 steps: 0.2439 <= 0.2593 + 0.008945z <= 0.2904 +[ INFO][27-Jun-24 01:34:42] Trained chunk 644 in 138.7s at 4289noun/s: lr=2.81e-04, loss=1.21e+00, top1=71.72%/71.527% +[ INFO][27-Jun-24 01:34:42] Chunk 645 = Batch 748329 = Sample 383143937 +[ INFO][27-Jun-24 01:37:01] Total gradient norm stats for 73 steps: 0.2472 <= 0.2606 + 0.01047z <= 0.2968 +[ INFO][27-Jun-24 01:37:01] Trained chunk 645 in 139.2s at 4273noun/s: lr=2.79e-04, loss=1.21e+00, top1=70.37%/71.532% +[ INFO][27-Jun-24 01:37:01] Chunk 646 = Batch 749491 = Sample 383738881 +[ INFO][27-Jun-24 01:39:20] Total gradient norm stats for 72 steps: 0.2458 <= 0.2598 + 0.01296z <= 0.32 +[ INFO][27-Jun-24 01:39:20] Trained chunk 646 in 139.5s at 4265noun/s: lr=2.77e-04, loss=1.21e+00, top1=70.90%/71.539% +[ INFO][27-Jun-24 01:39:20] Chunk 647 = Batch 750653 = Sample 384333825 +[ INFO][27-Jun-24 01:41:40] Total gradient norm stats for 73 steps: 0.2465 <= 0.2597 + 0.01052z <= 0.3037 +[ INFO][27-Jun-24 01:41:40] Trained chunk 647 in 139.5s at 4266noun/s: lr=2.75e-04, loss=1.21e+00, top1=70.99%/71.545% +[ INFO][27-Jun-24 01:41:40] Chunk 648 = Batch 751815 = Sample 384928769 +[ INFO][27-Jun-24 01:43:59] Total gradient norm stats for 73 steps: 0.2444 <= 0.2603 + 0.01126z <= 0.3002 +[ INFO][27-Jun-24 01:43:59] Trained chunk 648 in 138.9s at 4282noun/s: lr=2.72e-04, loss=1.21e+00, top1=71.91%/71.548% +[ INFO][27-Jun-24 01:43:59] Chunk 649 = Batch 752977 = Sample 385523713 +[ INFO][27-Jun-24 01:46:18] Total gradient norm stats for 72 steps: 0.2457 <= 0.2627 + 0.01233z <= 0.3105 +[ INFO][27-Jun-24 01:46:18] Trained chunk 649 in 139.2s at 4273noun/s: lr=2.70e-04, loss=1.21e+00, top1=71.37%/71.551% +[ INFO][27-Jun-24 01:46:18] Chunk 650 = Batch 754139 = Sample 386118657 +[ INFO][27-Jun-24 01:48:32] Epoch 13 finished in 6962.9s +[ INFO][27-Jun-24 01:48:32] -------------------------------------------------------------------------------- +[ INFO][27-Jun-24 01:48:32] Epoch 14 = Batch 755249 = Sample 386686977 +[ INFO][27-Jun-24 01:48:39] Total gradient norm stats for 73 steps: 0.2456 <= 0.2633 + 0.01814z <= 0.3388 +[ INFO][27-Jun-24 01:48:39] Trained chunk 650 in 140.4s at 4238noun/s: lr=2.68e-04, loss=1.21e+00, top1=70.80%/71.553% +[ INFO][27-Jun-24 01:48:39] Chunk 651 = Batch 755301 = Sample 386713601 +[ INFO][27-Jun-24 01:50:58] Total gradient norm stats for 72 steps: 0.2453 <= 0.2609 + 0.0117z <= 0.3142 +[ INFO][27-Jun-24 01:50:58] Trained chunk 651 in 139.3s at 4270noun/s: lr=2.66e-04, loss=1.21e+00, top1=71.26%/71.558% +[ INFO][27-Jun-24 01:50:58] Chunk 652 = Batch 756463 = Sample 387308545 +[ INFO][27-Jun-24 01:53:17] Total gradient norm stats for 73 steps: 0.2463 <= 0.2617 + 0.01306z <= 0.3115 +[ INFO][27-Jun-24 01:53:17] Trained chunk 652 in 139.1s at 4276noun/s: lr=2.64e-04, loss=1.21e+00, top1=72.51%/71.563% +[ INFO][27-Jun-24 01:53:17] Chunk 653 = Batch 757625 = Sample 387903489 +[ INFO][27-Jun-24 01:55:36] Total gradient norm stats for 73 steps: 0.2465 <= 0.2608 + 0.009813z <= 0.303 +[ INFO][27-Jun-24 01:55:36] Trained chunk 653 in 138.7s at 4289noun/s: lr=2.62e-04, loss=1.21e+00, top1=71.10%/71.565% +[ INFO][27-Jun-24 01:55:36] Chunk 654 = Batch 758787 = Sample 388498433 +[ INFO][27-Jun-24 01:57:55] Total gradient norm stats for 72 steps: 0.2463 <= 0.2645 + 0.01325z <= 0.316 +[ INFO][27-Jun-24 01:57:55] Trained chunk 654 in 139.4s at 4268noun/s: lr=2.60e-04, loss=1.21e+00, top1=72.77%/71.578% +[ INFO][27-Jun-24 01:57:55] Chunk 655 = Batch 759949 = Sample 389093377 +[ INFO][27-Jun-24 02:00:14] Total gradient norm stats for 73 steps: 0.2474 <= 0.2618 + 0.01005z <= 0.292 +[ INFO][27-Jun-24 02:00:14] Trained chunk 655 in 139.1s at 4278noun/s: lr=2.58e-04, loss=1.21e+00, top1=71.09%/71.577% +[ INFO][27-Jun-24 02:00:14] Chunk 656 = Batch 761111 = Sample 389688321 +[ INFO][27-Jun-24 02:02:33] Total gradient norm stats for 73 steps: 0.2461 <= 0.2593 + 0.006613z <= 0.28 +[ INFO][27-Jun-24 02:02:33] Trained chunk 656 in 139.0s at 4280noun/s: lr=2.57e-04, loss=1.21e+00, top1=72.64%/71.585% +[ INFO][27-Jun-24 02:02:33] Chunk 657 = Batch 762273 = Sample 390283265 +[ INFO][27-Jun-24 02:04:53] Total gradient norm stats for 72 steps: 0.2439 <= 0.259 + 0.009529z <= 0.2935 +[ INFO][27-Jun-24 02:04:53] Trained chunk 657 in 139.6s at 4260noun/s: lr=2.55e-04, loss=1.21e+00, top1=70.70%/71.593% +[ INFO][27-Jun-24 02:04:53] Chunk 658 = Batch 763435 = Sample 390878209 +[ INFO][27-Jun-24 02:07:12] Total gradient norm stats for 73 steps: 0.2433 <= 0.2602 + 0.01156z <= 0.3211 +[ INFO][27-Jun-24 02:07:12] Trained chunk 658 in 139.2s at 4275noun/s: lr=2.53e-04, loss=1.21e+00, top1=70.74%/71.599% +[ INFO][27-Jun-24 02:07:12] Chunk 659 = Batch 764597 = Sample 391473153 +[ INFO][27-Jun-24 02:09:32] Total gradient norm stats for 72 steps: 0.246 <= 0.2593 + 0.01016z <= 0.3002 +[ INFO][27-Jun-24 02:09:32] Trained chunk 659 in 139.5s at 4265noun/s: lr=2.51e-04, loss=1.21e+00, top1=71.82%/71.600% +[ INFO][27-Jun-24 02:09:32] Chunk 660 = Batch 765759 = Sample 392068097 +[ INFO][27-Jun-24 02:11:51] Total gradient norm stats for 73 steps: 0.2485 <= 0.2833 + 0.1354z <= 1.304 (clipped to 1) +[ INFO][27-Jun-24 02:11:51] Trained chunk 660 in 139.3s at 4271noun/s: lr=2.49e-04, loss=1.21e+00, top1=71.79%/71.606% +[ INFO][27-Jun-24 02:11:51] Chunk 661 = Batch 766921 = Sample 392663041 +[ INFO][27-Jun-24 02:14:10] Total gradient norm stats for 73 steps: 0.243 <= 0.2603 + 0.01006z <= 0.3039 +[ INFO][27-Jun-24 02:14:10] Trained chunk 661 in 139.1s at 4277noun/s: lr=2.47e-04, loss=1.21e+00, top1=71.85%/71.606% +[ INFO][27-Jun-24 02:14:10] Chunk 662 = Batch 768083 = Sample 393257985 +[ INFO][27-Jun-24 02:16:29] Total gradient norm stats for 72 steps: 0.2474 <= 0.2664 + 0.01801z <= 0.3347 +[ INFO][27-Jun-24 02:16:29] Trained chunk 662 in 139.2s at 4274noun/s: lr=2.45e-04, loss=1.21e+00, top1=71.77%/71.608% +[ INFO][27-Jun-24 02:16:29] Chunk 663 = Batch 769245 = Sample 393852929 +[ INFO][27-Jun-24 02:18:48] Total gradient norm stats for 73 steps: 0.2448 <= 0.2601 + 0.009362z <= 0.2891 +[ INFO][27-Jun-24 02:18:48] Trained chunk 663 in 139.2s at 4273noun/s: lr=2.43e-04, loss=1.21e+00, top1=72.91%/71.614% +[ INFO][27-Jun-24 02:18:48] Chunk 664 = Batch 770407 = Sample 394447873 +[ INFO][27-Jun-24 02:21:08] Total gradient norm stats for 73 steps: 0.2464 <= 0.2613 + 0.00988z <= 0.2979 +[ INFO][27-Jun-24 02:21:08] Trained chunk 664 in 139.1s at 4276noun/s: lr=2.41e-04, loss=1.21e+00, top1=72.38%/71.624% +[ INFO][27-Jun-24 02:21:08] Chunk 665 = Batch 771569 = Sample 395042817 +[ INFO][27-Jun-24 02:23:27] Total gradient norm stats for 72 steps: 0.2479 <= 0.2666 + 0.01225z <= 0.3015 +[ INFO][27-Jun-24 02:23:27] Trained chunk 665 in 139.5s at 4263noun/s: lr=2.39e-04, loss=1.20e+00, top1=71.38%/71.640% +[ INFO][27-Jun-24 02:23:27] Chunk 666 = Batch 772731 = Sample 395637761 +[ INFO][27-Jun-24 02:25:46] Total gradient norm stats for 73 steps: 0.2474 <= 0.2612 + 0.00973z <= 0.3 +[ INFO][27-Jun-24 02:25:46] Trained chunk 666 in 139.2s at 4275noun/s: lr=2.37e-04, loss=1.20e+00, top1=71.33%/71.642% +[ INFO][27-Jun-24 02:25:46] Chunk 667 = Batch 773893 = Sample 396232705 +[ INFO][27-Jun-24 02:28:05] Total gradient norm stats for 72 steps: 0.2487 <= 0.2662 + 0.01423z <= 0.3127 +[ INFO][27-Jun-24 02:28:05] Trained chunk 667 in 139.2s at 4273noun/s: lr=2.35e-04, loss=1.20e+00, top1=70.68%/71.648% +[ INFO][27-Jun-24 02:28:05] Chunk 668 = Batch 775055 = Sample 396827649 +[ INFO][27-Jun-24 02:30:24] Total gradient norm stats for 73 steps: 0.2487 <= 0.2699 + 0.02687z <= 0.4595 +[ INFO][27-Jun-24 02:30:24] Trained chunk 668 in 139.0s at 4280noun/s: lr=2.33e-04, loss=1.20e+00, top1=70.75%/71.650% +[ INFO][27-Jun-24 02:30:24] Chunk 669 = Batch 776217 = Sample 397422593 +[ INFO][27-Jun-24 02:32:44] Total gradient norm stats for 73 steps: 0.2473 <= 0.2606 + 0.01301z <= 0.3454 +[ INFO][27-Jun-24 02:32:44] Trained chunk 669 in 139.3s at 4271noun/s: lr=2.31e-04, loss=1.20e+00, top1=70.54%/71.650% +[ INFO][27-Jun-24 02:32:44] Chunk 670 = Batch 777379 = Sample 398017537 +[ INFO][27-Jun-24 02:35:03] Total gradient norm stats for 72 steps: 0.2455 <= 0.2598 + 0.008805z <= 0.2945 +[ INFO][27-Jun-24 02:35:03] Trained chunk 670 in 139.2s at 4274noun/s: lr=2.29e-04, loss=1.20e+00, top1=71.97%/71.657% +[ INFO][27-Jun-24 02:35:03] Chunk 671 = Batch 778541 = Sample 398612481 +[ INFO][27-Jun-24 02:37:22] Total gradient norm stats for 73 steps: 0.2462 <= 0.2677 + 0.01687z <= 0.3373 +[ INFO][27-Jun-24 02:37:22] Trained chunk 671 in 139.5s at 4265noun/s: lr=2.28e-04, loss=1.20e+00, top1=72.00%/71.663% +[ INFO][27-Jun-24 02:37:22] Chunk 672 = Batch 779703 = Sample 399207425 +[ INFO][27-Jun-24 02:39:41] Total gradient norm stats for 73 steps: 0.2477 <= 0.2651 + 0.03844z <= 0.5842 +[ INFO][27-Jun-24 02:39:41] Trained chunk 672 in 138.9s at 4283noun/s: lr=2.26e-04, loss=1.20e+00, top1=71.67%/71.664% +[ INFO][27-Jun-24 02:39:42] Saved checkpoint: /data/strahl/Code/ovod/outputs/ovod_20240626_001447/ovod_chunk0672_20240627_023941.train +[ INFO][27-Jun-24 02:39:42] Chunk 673 = Batch 780865 = Sample 399802369 +[ INFO][27-Jun-24 02:42:01] Total gradient norm stats for 72 steps: 0.2475 <= 0.2595 + 0.006817z <= 0.2789 +[ INFO][27-Jun-24 02:42:01] Trained chunk 673 in 139.2s at 4273noun/s: lr=2.24e-04, loss=1.20e+00, top1=71.29%/71.669% +[ INFO][27-Jun-24 02:42:01] Chunk 674 = Batch 782027 = Sample 400397313 +[ INFO][27-Jun-24 02:44:20] Total gradient norm stats for 73 steps: 0.2457 <= 0.2659 + 0.01122z <= 0.3018 +[ INFO][27-Jun-24 02:44:20] Trained chunk 674 in 139.1s at 4278noun/s: lr=2.22e-04, loss=1.20e+00, top1=71.43%/71.670% +[ INFO][27-Jun-24 02:44:20] Chunk 675 = Batch 783189 = Sample 400992257 +[ INFO][27-Jun-24 02:46:39] Total gradient norm stats for 72 steps: 0.2449 <= 0.2689 + 0.01664z <= 0.3338 +[ INFO][27-Jun-24 02:46:39] Trained chunk 675 in 139.1s at 4278noun/s: lr=2.20e-04, loss=1.20e+00, top1=72.43%/71.677% +[ INFO][27-Jun-24 02:46:39] Chunk 676 = Batch 784351 = Sample 401587201 +[ INFO][27-Jun-24 02:48:58] Total gradient norm stats for 73 steps: 0.2467 <= 0.2619 + 0.01072z <= 0.2936 +[ INFO][27-Jun-24 02:48:58] Trained chunk 676 in 139.3s at 4270noun/s: lr=2.18e-04, loss=1.20e+00, top1=70.79%/71.680% +[ INFO][27-Jun-24 02:48:58] Chunk 677 = Batch 785513 = Sample 402182145 +[ INFO][27-Jun-24 02:51:18] Total gradient norm stats for 73 steps: 0.2454 <= 0.2663 + 0.02339z <= 0.4362 +[ INFO][27-Jun-24 02:51:18] Trained chunk 677 in 139.2s at 4274noun/s: lr=2.16e-04, loss=1.20e+00, top1=71.34%/71.687% +[ INFO][27-Jun-24 02:51:18] Chunk 678 = Batch 786675 = Sample 402777089 +[ INFO][27-Jun-24 02:53:37] Total gradient norm stats for 72 steps: 0.2499 <= 0.2617 + 0.007945z <= 0.2944 +[ INFO][27-Jun-24 02:53:37] Trained chunk 678 in 139.2s at 4273noun/s: lr=2.15e-04, loss=1.20e+00, top1=71.20%/71.688% +[ INFO][27-Jun-24 02:53:37] Chunk 679 = Batch 787837 = Sample 403372033 +[ INFO][27-Jun-24 02:55:56] Total gradient norm stats for 73 steps: 0.2492 <= 0.2604 + 0.006699z <= 0.2848 +[ INFO][27-Jun-24 02:55:56] Trained chunk 679 in 139.4s at 4269noun/s: lr=2.13e-04, loss=1.20e+00, top1=72.35%/71.689% +[ INFO][27-Jun-24 02:55:56] Chunk 680 = Batch 788999 = Sample 403966977 +[ INFO][27-Jun-24 02:58:16] Total gradient norm stats for 73 steps: 0.2482 <= 0.2626 + 0.008061z <= 0.2944 +[ INFO][27-Jun-24 02:58:16] Trained chunk 680 in 139.4s at 4268noun/s: lr=2.11e-04, loss=1.20e+00, top1=72.55%/71.692% +[ INFO][27-Jun-24 02:58:16] Chunk 681 = Batch 790161 = Sample 404561921 +[ INFO][27-Jun-24 03:00:35] Total gradient norm stats for 72 steps: 0.2481 <= 0.2623 + 0.009706z <= 0.3158 +[ INFO][27-Jun-24 03:00:35] Trained chunk 681 in 139.1s at 4278noun/s: lr=2.09e-04, loss=1.20e+00, top1=71.31%/71.699% +[ INFO][27-Jun-24 03:00:35] Chunk 682 = Batch 791323 = Sample 405156865 +[ INFO][27-Jun-24 03:02:53] Total gradient norm stats for 73 steps: 0.2499 <= 0.2642 + 0.01681z <= 0.389 +[ INFO][27-Jun-24 03:02:53] Trained chunk 682 in 138.6s at 4291noun/s: lr=2.07e-04, loss=1.20e+00, top1=71.42%/71.704% +[ INFO][27-Jun-24 03:02:53] Chunk 683 = Batch 792485 = Sample 405751809 +[ INFO][27-Jun-24 03:05:12] Total gradient norm stats for 72 steps: 0.2519 <= 0.2643 + 0.008946z <= 0.3059 +[ INFO][27-Jun-24 03:05:12] Trained chunk 683 in 139.0s at 4281noun/s: lr=2.06e-04, loss=1.20e+00, top1=72.19%/71.706% +[ INFO][27-Jun-24 03:05:12] Chunk 684 = Batch 793647 = Sample 406346753 +[ INFO][27-Jun-24 03:07:31] Total gradient norm stats for 73 steps: 0.2472 <= 0.2639 + 0.01518z <= 0.3632 +[ INFO][27-Jun-24 03:07:31] Trained chunk 684 in 139.0s at 4279noun/s: lr=2.04e-04, loss=1.20e+00, top1=71.53%/71.710% +[ INFO][27-Jun-24 03:07:31] Chunk 685 = Batch 794809 = Sample 406941697 +[ INFO][27-Jun-24 03:09:50] Total gradient norm stats for 73 steps: 0.2511 <= 0.2624 + 0.01183z <= 0.3414 +[ INFO][27-Jun-24 03:09:50] Trained chunk 685 in 139.0s at 4279noun/s: lr=2.02e-04, loss=1.20e+00, top1=72.49%/71.715% +[ INFO][27-Jun-24 03:09:50] Chunk 686 = Batch 795971 = Sample 407536641 +[ INFO][27-Jun-24 03:12:10] Total gradient norm stats for 72 steps: 0.25 <= 0.2622 + 0.00921z <= 0.3056 +[ INFO][27-Jun-24 03:12:10] Trained chunk 686 in 139.2s at 4275noun/s: lr=2.00e-04, loss=1.20e+00, top1=71.66%/71.719% +[ INFO][27-Jun-24 03:12:10] Chunk 687 = Batch 797133 = Sample 408131585 +[ INFO][27-Jun-24 03:14:29] Total gradient norm stats for 73 steps: 0.2511 <= 0.2703 + 0.07029z <= 0.8594 +[ INFO][27-Jun-24 03:14:29] Trained chunk 687 in 139.5s at 4264noun/s: lr=1.98e-04, loss=1.20e+00, top1=71.67%/71.725% +[ INFO][27-Jun-24 03:14:29] Chunk 688 = Batch 798295 = Sample 408726529 +[ INFO][27-Jun-24 03:16:48] Total gradient norm stats for 73 steps: 0.2513 <= 0.2616 + 0.006481z <= 0.2882 +[ INFO][27-Jun-24 03:16:48] Trained chunk 688 in 139.3s at 4270noun/s: lr=1.97e-04, loss=1.20e+00, top1=70.13%/71.733% +[ INFO][27-Jun-24 03:16:48] Chunk 689 = Batch 799457 = Sample 409321473 +[ INFO][27-Jun-24 03:19:08] Total gradient norm stats for 72 steps: 0.2508 <= 0.2648 + 0.01098z <= 0.3156 +[ INFO][27-Jun-24 03:19:08] Trained chunk 689 in 139.3s at 4271noun/s: lr=1.95e-04, loss=1.20e+00, top1=71.38%/71.735% +[ INFO][27-Jun-24 03:19:08] Chunk 690 = Batch 800619 = Sample 409916417 +[ INFO][27-Jun-24 03:21:27] Total gradient norm stats for 73 steps: 0.2495 <= 0.2661 + 0.01163z <= 0.323 +[ INFO][27-Jun-24 03:21:27] Trained chunk 690 in 139.1s at 4278noun/s: lr=1.93e-04, loss=1.20e+00, top1=71.00%/71.742% +[ INFO][27-Jun-24 03:21:27] Chunk 691 = Batch 801781 = Sample 410511361 +[ INFO][27-Jun-24 03:23:46] Total gradient norm stats for 72 steps: 0.2519 <= 0.2653 + 0.01056z <= 0.3033 +[ INFO][27-Jun-24 03:23:46] Trained chunk 691 in 139.6s at 4261noun/s: lr=1.91e-04, loss=1.20e+00, top1=71.73%/71.743% +[ INFO][27-Jun-24 03:23:46] Chunk 692 = Batch 802943 = Sample 411106305 +[ INFO][27-Jun-24 03:26:06] Total gradient norm stats for 73 steps: 0.2517 <= 0.2669 + 0.01151z <= 0.326 +[ INFO][27-Jun-24 03:26:06] Trained chunk 692 in 139.3s at 4270noun/s: lr=1.90e-04, loss=1.20e+00, top1=71.70%/71.744% +[ INFO][27-Jun-24 03:26:06] Chunk 693 = Batch 804105 = Sample 411701249 +[ INFO][27-Jun-24 03:28:25] Total gradient norm stats for 73 steps: 0.2495 <= 0.2635 + 0.008745z <= 0.3092 +[ INFO][27-Jun-24 03:28:25] Trained chunk 693 in 139.2s at 4273noun/s: lr=1.88e-04, loss=1.20e+00, top1=72.27%/71.747% +[ INFO][27-Jun-24 03:28:25] Chunk 694 = Batch 805267 = Sample 412296193 +[ INFO][27-Jun-24 03:30:44] Total gradient norm stats for 72 steps: 0.2491 <= 0.2637 + 0.007479z <= 0.2838 +[ INFO][27-Jun-24 03:30:44] Trained chunk 694 in 138.9s at 4284noun/s: lr=1.86e-04, loss=1.20e+00, top1=70.57%/71.754% +[ INFO][27-Jun-24 03:30:44] Chunk 695 = Batch 806429 = Sample 412891137 +[ INFO][27-Jun-24 03:33:03] Total gradient norm stats for 73 steps: 0.249 <= 0.2628 + 0.01668z <= 0.3934 +[ INFO][27-Jun-24 03:33:03] Trained chunk 695 in 139.2s at 4273noun/s: lr=1.84e-04, loss=1.20e+00, top1=71.10%/71.754% +[ INFO][27-Jun-24 03:33:03] Chunk 696 = Batch 807591 = Sample 413486081 +[ INFO][27-Jun-24 03:35:22] Total gradient norm stats for 73 steps: 0.2513 <= 0.2698 + 0.01268z <= 0.3081 +[ INFO][27-Jun-24 03:35:22] Trained chunk 696 in 139.1s at 4276noun/s: lr=1.83e-04, loss=1.20e+00, top1=71.71%/71.756% +[ INFO][27-Jun-24 03:35:22] Chunk 697 = Batch 808753 = Sample 414081025 +[ INFO][27-Jun-24 03:37:42] Total gradient norm stats for 72 steps: 0.2516 <= 0.2689 + 0.01033z <= 0.3024 +[ INFO][27-Jun-24 03:37:42] Trained chunk 697 in 139.4s at 4268noun/s: lr=1.81e-04, loss=1.20e+00, top1=72.11%/71.758% +[ INFO][27-Jun-24 03:37:42] Chunk 698 = Batch 809915 = Sample 414675969 +[ INFO][27-Jun-24 03:40:01] Total gradient norm stats for 73 steps: 0.2509 <= 0.2662 + 0.009256z <= 0.299 +[ INFO][27-Jun-24 03:40:01] Trained chunk 698 in 139.1s at 4278noun/s: lr=1.79e-04, loss=1.20e+00, top1=72.13%/71.757% +[ INFO][27-Jun-24 03:40:01] Chunk 699 = Batch 811077 = Sample 415270913 +[ INFO][27-Jun-24 03:42:20] Total gradient norm stats for 72 steps: 0.2529 <= 0.2679 + 0.01013z <= 0.3135 +[ INFO][27-Jun-24 03:42:20] Trained chunk 699 in 139.3s at 4271noun/s: lr=1.78e-04, loss=1.20e+00, top1=71.13%/71.767% +[ INFO][27-Jun-24 03:42:20] Chunk 700 = Batch 812239 = Sample 415865857 +[ INFO][27-Jun-24 03:44:34] Epoch 14 finished in 6961.9s +[ INFO][27-Jun-24 03:44:34] -------------------------------------------------------------------------------- +[ INFO][27-Jun-24 03:44:34] Epoch 15 = Batch 813345 = Sample 416432129 +[ INFO][27-Jun-24 03:44:41] Total gradient norm stats for 73 steps: 0.2512 <= 0.2659 + 0.009842z <= 0.307 +[ INFO][27-Jun-24 03:44:41] Trained chunk 700 in 140.9s at 4223noun/s: lr=1.76e-04, loss=1.20e+00, top1=72.16%/71.773% +[ INFO][27-Jun-24 03:44:41] Chunk 701 = Batch 813401 = Sample 416460801 +[ INFO][27-Jun-24 03:47:00] Total gradient norm stats for 73 steps: 0.2531 <= 0.2645 + 0.007012z <= 0.2874 +[ INFO][27-Jun-24 03:47:00] Trained chunk 701 in 139.2s at 4274noun/s: lr=1.74e-04, loss=1.20e+00, top1=72.07%/71.780% +[ INFO][27-Jun-24 03:47:00] Chunk 702 = Batch 814563 = Sample 417055745 +[ INFO][27-Jun-24 03:49:20] Total gradient norm stats for 72 steps: 0.2523 <= 0.2745 + 0.0737z <= 0.8849 +[ INFO][27-Jun-24 03:49:20] Trained chunk 702 in 139.4s at 4267noun/s: lr=1.72e-04, loss=1.20e+00, top1=72.03%/71.789% +[ INFO][27-Jun-24 03:49:20] Chunk 703 = Batch 815725 = Sample 417650689 +[ INFO][27-Jun-24 03:51:39] Total gradient norm stats for 73 steps: 0.2529 <= 0.2654 + 0.00745z <= 0.2861 +[ INFO][27-Jun-24 03:51:39] Trained chunk 703 in 139.2s at 4274noun/s: lr=1.71e-04, loss=1.20e+00, top1=71.69%/71.792% +[ INFO][27-Jun-24 03:51:39] Chunk 704 = Batch 816887 = Sample 418245633 +[ INFO][27-Jun-24 03:53:58] Total gradient norm stats for 73 steps: 0.252 <= 0.2643 + 0.007445z <= 0.2985 +[ INFO][27-Jun-24 03:53:58] Trained chunk 704 in 139.2s at 4275noun/s: lr=1.69e-04, loss=1.20e+00, top1=71.67%/71.796% +[ INFO][27-Jun-24 03:53:58] Chunk 705 = Batch 818049 = Sample 418840577 +[ INFO][27-Jun-24 03:56:17] Total gradient norm stats for 72 steps: 0.2526 <= 0.2671 + 0.009312z <= 0.2933 +[ INFO][27-Jun-24 03:56:17] Trained chunk 705 in 139.3s at 4270noun/s: lr=1.67e-04, loss=1.20e+00, top1=72.30%/71.798% +[ INFO][27-Jun-24 03:56:17] Chunk 706 = Batch 819211 = Sample 419435521 +[ INFO][27-Jun-24 03:58:37] Total gradient norm stats for 73 steps: 0.2524 <= 0.2654 + 0.009866z <= 0.3066 +[ INFO][27-Jun-24 03:58:37] Trained chunk 706 in 139.5s at 4263noun/s: lr=1.66e-04, loss=1.20e+00, top1=70.98%/71.808% +[ INFO][27-Jun-24 03:58:37] Chunk 707 = Batch 820373 = Sample 420030465 +[ INFO][27-Jun-24 04:00:56] Total gradient norm stats for 72 steps: 0.2498 <= 0.2672 + 0.01281z <= 0.3228 +[ INFO][27-Jun-24 04:00:56] Trained chunk 707 in 139.3s at 4272noun/s: lr=1.64e-04, loss=1.20e+00, top1=70.96%/71.815% +[ INFO][27-Jun-24 04:00:56] Chunk 708 = Batch 821535 = Sample 420625409 +[ INFO][27-Jun-24 04:03:15] Total gradient norm stats for 73 steps: 0.2531 <= 0.2678 + 0.009908z <= 0.3136 +[ INFO][27-Jun-24 04:03:15] Trained chunk 708 in 139.2s at 4273noun/s: lr=1.63e-04, loss=1.20e+00, top1=71.50%/71.817% +[ INFO][27-Jun-24 04:03:15] Chunk 709 = Batch 822697 = Sample 421220353 +[ INFO][27-Jun-24 04:05:35] Total gradient norm stats for 73 steps: 0.2538 <= 0.3418 + 0.612z <= 5.498 (clipped to 1) +[ INFO][27-Jun-24 04:05:35] Trained chunk 709 in 139.6s at 4263noun/s: lr=1.61e-04, loss=1.20e+00, top1=71.74%/71.821% +[ INFO][27-Jun-24 04:05:35] Chunk 710 = Batch 823859 = Sample 421815297 +[ INFO][27-Jun-24 04:07:54] Total gradient norm stats for 72 steps: 0.2523 <= 0.2688 + 0.02572z <= 0.465 +[ INFO][27-Jun-24 04:07:54] Trained chunk 710 in 139.4s at 4268noun/s: lr=1.59e-04, loss=1.20e+00, top1=70.62%/71.824% +[ INFO][27-Jun-24 04:07:54] Chunk 711 = Batch 825021 = Sample 422410241 +[ INFO][27-Jun-24 04:10:13] Total gradient norm stats for 73 steps: 0.2539 <= 0.267 + 0.02632z <= 0.4834 +[ INFO][27-Jun-24 04:10:13] Trained chunk 711 in 139.1s at 4276noun/s: lr=1.58e-04, loss=1.19e+00, top1=73.15%/71.827% +[ INFO][27-Jun-24 04:10:13] Chunk 712 = Batch 826183 = Sample 423005185 +[ INFO][27-Jun-24 04:12:33] Total gradient norm stats for 73 steps: 0.2522 <= 0.2673 + 0.008463z <= 0.2916 +[ INFO][27-Jun-24 04:12:33] Trained chunk 712 in 139.4s at 4269noun/s: lr=1.56e-04, loss=1.19e+00, top1=70.64%/71.828% +[ INFO][27-Jun-24 04:12:33] Chunk 713 = Batch 827345 = Sample 423600129 +[ INFO][27-Jun-24 04:14:52] Total gradient norm stats for 72 steps: 0.2548 <= 0.2662 + 0.007066z <= 0.2856 +[ INFO][27-Jun-24 04:14:52] Trained chunk 713 in 139.5s at 4264noun/s: lr=1.55e-04, loss=1.19e+00, top1=71.47%/71.833% +[ INFO][27-Jun-24 04:14:52] Chunk 714 = Batch 828507 = Sample 424195073 +[ INFO][27-Jun-24 04:17:12] Total gradient norm stats for 73 steps: 0.254 <= 0.2685 + 0.008944z <= 0.2946 +[ INFO][27-Jun-24 04:17:12] Trained chunk 714 in 139.8s at 4256noun/s: lr=1.53e-04, loss=1.19e+00, top1=71.93%/71.830% +[ INFO][27-Jun-24 04:17:12] Chunk 715 = Batch 829669 = Sample 424790017 +[ INFO][27-Jun-24 04:19:31] Total gradient norm stats for 72 steps: 0.2522 <= 0.2659 + 0.009393z <= 0.2982 +[ INFO][27-Jun-24 04:19:31] Trained chunk 715 in 139.3s at 4272noun/s: lr=1.51e-04, loss=1.19e+00, top1=71.82%/71.831% +[ INFO][27-Jun-24 04:19:31] Chunk 716 = Batch 830831 = Sample 425384961 +[ INFO][27-Jun-24 04:21:51] Total gradient norm stats for 73 steps: 0.2544 <= 0.2648 + 0.006226z <= 0.2861 +[ INFO][27-Jun-24 04:21:51] Trained chunk 716 in 139.4s at 4267noun/s: lr=1.50e-04, loss=1.19e+00, top1=72.61%/71.841% +[ INFO][27-Jun-24 04:21:51] Chunk 717 = Batch 831993 = Sample 425979905 +[ INFO][27-Jun-24 04:24:10] Total gradient norm stats for 73 steps: 0.2523 <= 0.2688 + 0.008006z <= 0.2871 +[ INFO][27-Jun-24 04:24:10] Trained chunk 717 in 139.5s at 4265noun/s: lr=1.48e-04, loss=1.19e+00, top1=71.10%/71.845% +[ INFO][27-Jun-24 04:24:10] Chunk 718 = Batch 833155 = Sample 426574849 +[ INFO][27-Jun-24 04:26:30] Total gradient norm stats for 72 steps: 0.255 <= 0.2664 + 0.005876z <= 0.2819 +[ INFO][27-Jun-24 04:26:30] Trained chunk 718 in 139.3s at 4270noun/s: lr=1.47e-04, loss=1.19e+00, top1=72.91%/71.842% +[ INFO][27-Jun-24 04:26:30] Chunk 719 = Batch 834317 = Sample 427169793 +[ INFO][27-Jun-24 04:28:49] Total gradient norm stats for 73 steps: 0.2555 <= 0.2663 + 0.006117z <= 0.2873 +[ INFO][27-Jun-24 04:28:49] Trained chunk 719 in 139.2s at 4273noun/s: lr=1.45e-04, loss=1.19e+00, top1=72.98%/71.853% +[ INFO][27-Jun-24 04:28:49] Chunk 720 = Batch 835479 = Sample 427764737 +[ INFO][27-Jun-24 04:31:08] Total gradient norm stats for 73 steps: 0.2559 <= 0.2677 + 0.009521z <= 0.2932 +[ INFO][27-Jun-24 04:31:08] Trained chunk 720 in 139.1s at 4278noun/s: lr=1.44e-04, loss=1.19e+00, top1=71.39%/71.858% +[ INFO][27-Jun-24 04:31:08] Saved checkpoint: /data/strahl/Code/ovod/outputs/ovod_20240626_001447/ovod_chunk0720_20240627_043108.train +[ INFO][27-Jun-24 04:31:08] Chunk 721 = Batch 836641 = Sample 428359681 +[ INFO][27-Jun-24 04:33:27] Total gradient norm stats for 72 steps: 0.2546 <= 0.2656 + 0.006043z <= 0.2813 +[ INFO][27-Jun-24 04:33:27] Trained chunk 721 in 139.0s at 4282noun/s: lr=1.42e-04, loss=1.19e+00, top1=71.85%/71.861% +[ INFO][27-Jun-24 04:33:27] Chunk 722 = Batch 837803 = Sample 428954625 +[ INFO][27-Jun-24 04:35:46] Total gradient norm stats for 73 steps: 0.2562 <= 0.2663 + 0.007336z <= 0.2879 +[ INFO][27-Jun-24 04:35:46] Trained chunk 722 in 139.0s at 4279noun/s: lr=1.40e-04, loss=1.19e+00, top1=71.64%/71.866% +[ INFO][27-Jun-24 04:35:46] Chunk 723 = Batch 838965 = Sample 429549569 +[ INFO][27-Jun-24 04:38:05] Total gradient norm stats for 72 steps: 0.2552 <= 0.2675 + 0.01654z <= 0.3959 +[ INFO][27-Jun-24 04:38:05] Trained chunk 723 in 139.2s at 4275noun/s: lr=1.39e-04, loss=1.19e+00, top1=72.78%/71.865% +[ INFO][27-Jun-24 04:38:05] Chunk 724 = Batch 840127 = Sample 430144513 +[ INFO][27-Jun-24 04:40:25] Total gradient norm stats for 73 steps: 0.2545 <= 0.2682 + 0.01525z <= 0.3764 +[ INFO][27-Jun-24 04:40:25] Trained chunk 724 in 139.4s at 4268noun/s: lr=1.37e-04, loss=1.19e+00, top1=71.93%/71.864% +[ INFO][27-Jun-24 04:40:25] Chunk 725 = Batch 841289 = Sample 430739457 +[ INFO][27-Jun-24 04:42:44] Total gradient norm stats for 73 steps: 0.2539 <= 0.2657 + 0.008753z <= 0.3004 +[ INFO][27-Jun-24 04:42:44] Trained chunk 725 in 139.4s at 4267noun/s: lr=1.36e-04, loss=1.19e+00, top1=71.79%/71.869% +[ INFO][27-Jun-24 04:42:44] Chunk 726 = Batch 842451 = Sample 431334401 +[ INFO][27-Jun-24 04:45:04] Total gradient norm stats for 72 steps: 0.2505 <= 0.268 + 0.02681z <= 0.484 +[ INFO][27-Jun-24 04:45:04] Trained chunk 726 in 139.4s at 4267noun/s: lr=1.34e-04, loss=1.19e+00, top1=72.40%/71.876% +[ INFO][27-Jun-24 04:45:04] Chunk 727 = Batch 843613 = Sample 431929345 +[ INFO][27-Jun-24 04:47:23] Total gradient norm stats for 73 steps: 0.2516 <= 0.2679 + 0.01015z <= 0.3041 +[ INFO][27-Jun-24 04:47:23] Trained chunk 727 in 139.5s at 4265noun/s: lr=1.33e-04, loss=1.19e+00, top1=72.21%/71.877% +[ INFO][27-Jun-24 04:47:23] Chunk 728 = Batch 844775 = Sample 432524289 +[ INFO][27-Jun-24 04:49:42] Total gradient norm stats for 73 steps: 0.2547 <= 0.2675 + 0.008465z <= 0.3054 +[ INFO][27-Jun-24 04:49:42] Trained chunk 728 in 139.2s at 4275noun/s: lr=1.31e-04, loss=1.19e+00, top1=70.25%/71.872% +[ INFO][27-Jun-24 04:49:42] Chunk 729 = Batch 845937 = Sample 433119233 +[ INFO][27-Jun-24 04:52:01] Total gradient norm stats for 72 steps: 0.2541 <= 0.2676 + 0.008261z <= 0.3049 +[ INFO][27-Jun-24 04:52:01] Trained chunk 729 in 139.0s at 4280noun/s: lr=1.30e-04, loss=1.19e+00, top1=70.65%/71.878% +[ INFO][27-Jun-24 04:52:01] Chunk 730 = Batch 847099 = Sample 433714177 +[ INFO][27-Jun-24 04:54:20] Total gradient norm stats for 73 steps: 0.2544 <= 0.265 + 0.005723z <= 0.2806 +[ INFO][27-Jun-24 04:54:20] Trained chunk 730 in 139.1s at 4278noun/s: lr=1.28e-04, loss=1.19e+00, top1=71.40%/71.877% +[ INFO][27-Jun-24 04:54:20] Chunk 731 = Batch 848261 = Sample 434309121 +[ INFO][27-Jun-24 04:56:39] Total gradient norm stats for 72 steps: 0.2537 <= 0.267 + 0.006981z <= 0.296 +[ INFO][27-Jun-24 04:56:39] Trained chunk 731 in 139.1s at 4276noun/s: lr=1.27e-04, loss=1.19e+00, top1=72.07%/71.883% +[ INFO][27-Jun-24 04:56:39] Chunk 732 = Batch 849423 = Sample 434904065 +[ INFO][27-Jun-24 04:58:59] Total gradient norm stats for 73 steps: 0.2566 <= 0.2666 + 0.008117z <= 0.3101 +[ INFO][27-Jun-24 04:58:59] Trained chunk 732 in 139.3s at 4270noun/s: lr=1.26e-04, loss=1.19e+00, top1=71.02%/71.891% +[ INFO][27-Jun-24 04:58:59] Chunk 733 = Batch 850585 = Sample 435499009 +[ INFO][27-Jun-24 05:01:18] Total gradient norm stats for 73 steps: 0.2544 <= 0.269 + 0.0148z <= 0.3734 +[ INFO][27-Jun-24 05:01:18] Trained chunk 733 in 139.0s at 4279noun/s: lr=1.24e-04, loss=1.19e+00, top1=71.71%/71.897% +[ INFO][27-Jun-24 05:01:18] Chunk 734 = Batch 851747 = Sample 436093953 +[ INFO][27-Jun-24 05:03:37] Total gradient norm stats for 72 steps: 0.2567 <= 0.2664 + 0.005876z <= 0.2946 +[ INFO][27-Jun-24 05:03:37] Trained chunk 734 in 139.1s at 4276noun/s: lr=1.23e-04, loss=1.19e+00, top1=72.54%/71.903% +[ INFO][27-Jun-24 05:03:37] Chunk 735 = Batch 852909 = Sample 436688897 +[ INFO][27-Jun-24 05:05:56] Total gradient norm stats for 73 steps: 0.2581 <= 0.2729 + 0.0145z <= 0.3635 +[ INFO][27-Jun-24 05:05:56] Trained chunk 735 in 139.1s at 4279noun/s: lr=1.21e-04, loss=1.19e+00, top1=71.20%/71.903% +[ INFO][27-Jun-24 05:05:56] Chunk 736 = Batch 854071 = Sample 437283841 +[ INFO][27-Jun-24 05:08:15] Total gradient norm stats for 73 steps: 0.2575 <= 0.2702 + 0.02066z <= 0.4205 +[ INFO][27-Jun-24 05:08:15] Trained chunk 736 in 139.2s at 4273noun/s: lr=1.20e-04, loss=1.19e+00, top1=72.59%/71.909% +[ INFO][27-Jun-24 05:08:15] Chunk 737 = Batch 855233 = Sample 437878785 +[ INFO][27-Jun-24 05:10:35] Total gradient norm stats for 72 steps: 0.256 <= 0.2661 + 0.005698z <= 0.2857 +[ INFO][27-Jun-24 05:10:35] Trained chunk 737 in 139.3s at 4272noun/s: lr=1.18e-04, loss=1.19e+00, top1=72.53%/71.912% +[ INFO][27-Jun-24 05:10:35] Chunk 738 = Batch 856395 = Sample 438473729 +[ INFO][27-Jun-24 05:12:54] Total gradient norm stats for 73 steps: 0.2544 <= 0.2649 + 0.006743z <= 0.2986 +[ INFO][27-Jun-24 05:12:54] Trained chunk 738 in 139.1s at 4277noun/s: lr=1.17e-04, loss=1.19e+00, top1=72.52%/71.919% +[ INFO][27-Jun-24 05:12:54] Chunk 739 = Batch 857557 = Sample 439068673 +[ INFO][27-Jun-24 05:15:13] Total gradient norm stats for 72 steps: 0.2571 <= 0.2717 + 0.009951z <= 0.3014 +[ INFO][27-Jun-24 05:15:13] Trained chunk 739 in 139.3s at 4272noun/s: lr=1.16e-04, loss=1.19e+00, top1=71.51%/71.914% +[ INFO][27-Jun-24 05:15:13] Chunk 740 = Batch 858719 = Sample 439663617 +[ INFO][27-Jun-24 05:17:32] Total gradient norm stats for 73 steps: 0.257 <= 0.274 + 0.01164z <= 0.3051 +[ INFO][27-Jun-24 05:17:32] Trained chunk 740 in 139.5s at 4266noun/s: lr=1.14e-04, loss=1.19e+00, top1=71.89%/71.920% +[ INFO][27-Jun-24 05:17:32] Chunk 741 = Batch 859881 = Sample 440258561 +[ INFO][27-Jun-24 05:19:52] Total gradient norm stats for 73 steps: 0.2576 <= 0.267 + 0.006284z <= 0.2886 +[ INFO][27-Jun-24 05:19:52] Trained chunk 741 in 139.7s at 4260noun/s: lr=1.13e-04, loss=1.19e+00, top1=71.71%/71.925% +[ INFO][27-Jun-24 05:19:52] Chunk 742 = Batch 861043 = Sample 440853505 +[ INFO][27-Jun-24 05:22:12] Total gradient norm stats for 72 steps: 0.2534 <= 0.2668 + 0.006354z <= 0.2826 +[ INFO][27-Jun-24 05:22:12] Trained chunk 742 in 139.5s at 4265noun/s: lr=1.11e-04, loss=1.19e+00, top1=70.88%/71.933% +[ INFO][27-Jun-24 05:22:12] Chunk 743 = Batch 862205 = Sample 441448449 +[ INFO][27-Jun-24 05:24:31] Total gradient norm stats for 73 steps: 0.259 <= 0.2692 + 0.0141z <= 0.3767 +[ INFO][27-Jun-24 05:24:31] Trained chunk 743 in 139.6s at 4263noun/s: lr=1.10e-04, loss=1.19e+00, top1=70.64%/71.934% +[ INFO][27-Jun-24 05:24:31] Chunk 744 = Batch 863367 = Sample 442043393 +[ INFO][27-Jun-24 05:26:51] Total gradient norm stats for 73 steps: 0.2572 <= 0.2696 + 0.01125z <= 0.3481 +[ INFO][27-Jun-24 05:26:51] Trained chunk 744 in 139.9s at 4253noun/s: lr=1.09e-04, loss=1.19e+00, top1=72.27%/71.941% +[ INFO][27-Jun-24 05:26:51] Chunk 745 = Batch 864529 = Sample 442638337 +[ INFO][27-Jun-24 05:29:11] Total gradient norm stats for 72 steps: 0.2567 <= 0.2686 + 0.008574z <= 0.2999 +[ INFO][27-Jun-24 05:29:11] Trained chunk 745 in 139.7s at 4259noun/s: lr=1.07e-04, loss=1.19e+00, top1=72.61%/71.947% +[ INFO][27-Jun-24 05:29:11] Chunk 746 = Batch 865691 = Sample 443233281 +[ INFO][27-Jun-24 05:31:30] Total gradient norm stats for 73 steps: 0.2576 <= 0.2679 + 0.007769z <= 0.3054 +[ INFO][27-Jun-24 05:31:30] Trained chunk 746 in 139.5s at 4265noun/s: lr=1.06e-04, loss=1.19e+00, top1=71.61%/71.948% +[ INFO][27-Jun-24 05:31:30] Chunk 747 = Batch 866853 = Sample 443828225 +[ INFO][27-Jun-24 05:33:50] Total gradient norm stats for 72 steps: 0.258 <= 0.2698 + 0.009032z <= 0.3103 +[ INFO][27-Jun-24 05:33:50] Trained chunk 747 in 139.6s at 4262noun/s: lr=1.05e-04, loss=1.19e+00, top1=73.36%/71.954% +[ INFO][27-Jun-24 05:33:50] Chunk 748 = Batch 868015 = Sample 444423169 +[ INFO][27-Jun-24 05:36:10] Total gradient norm stats for 73 steps: 0.256 <= 0.2685 + 0.008904z <= 0.3016 +[ INFO][27-Jun-24 05:36:10] Trained chunk 748 in 139.8s at 4255noun/s: lr=1.03e-04, loss=1.19e+00, top1=71.21%/71.958% +[ INFO][27-Jun-24 05:36:10] Chunk 749 = Batch 869177 = Sample 445018113 +[ INFO][27-Jun-24 05:38:29] Total gradient norm stats for 73 steps: 0.2549 <= 0.2868 + 0.1517z <= 1.564 (clipped to 1) +[ INFO][27-Jun-24 05:38:29] Trained chunk 749 in 139.8s at 4256noun/s: lr=1.02e-04, loss=1.19e+00, top1=73.24%/71.958% +[ INFO][27-Jun-24 05:38:29] Chunk 750 = Batch 870339 = Sample 445613057 +[ INFO][27-Jun-24 05:40:43] Epoch 15 finished in 6968.6s +[ INFO][27-Jun-24 05:40:43] -------------------------------------------------------------------------------- +[ INFO][27-Jun-24 05:40:43] Epoch 16 = Batch 871441 = Sample 446177281 +[ INFO][27-Jun-24 05:40:50] Total gradient norm stats for 72 steps: 0.2573 <= 0.2688 + 0.009195z <= 0.3254 +[ INFO][27-Jun-24 05:40:50] Trained chunk 750 in 140.7s at 4230noun/s: lr=1.01e-04, loss=1.19e+00, top1=72.71%/71.959% +[ INFO][27-Jun-24 05:40:50] Chunk 751 = Batch 871501 = Sample 446208001 +[ INFO][27-Jun-24 05:43:10] Total gradient norm stats for 73 steps: 0.2589 <= 0.2708 + 0.009586z <= 0.3068 +[ INFO][27-Jun-24 05:43:10] Trained chunk 751 in 140.3s at 4239noun/s: lr=9.94e-05, loss=1.19e+00, top1=71.52%/71.967% +[ INFO][27-Jun-24 05:43:10] Chunk 752 = Batch 872663 = Sample 446802945 +[ INFO][27-Jun-24 05:45:30] Total gradient norm stats for 73 steps: 0.2557 <= 0.2709 + 0.0084z <= 0.2958 +[ INFO][27-Jun-24 05:45:30] Trained chunk 752 in 139.9s at 4252noun/s: lr=9.81e-05, loss=1.19e+00, top1=72.19%/71.968% +[ INFO][27-Jun-24 05:45:30] Chunk 753 = Batch 873825 = Sample 447397889 +[ INFO][27-Jun-24 05:47:50] Total gradient norm stats for 72 steps: 0.2548 <= 0.2679 + 0.006357z <= 0.2876 +[ INFO][27-Jun-24 05:47:50] Trained chunk 753 in 139.6s at 4261noun/s: lr=9.68e-05, loss=1.19e+00, top1=72.32%/71.972% +[ INFO][27-Jun-24 05:47:50] Chunk 754 = Batch 874987 = Sample 447992833 +[ INFO][27-Jun-24 05:50:10] Total gradient norm stats for 73 steps: 0.2575 <= 0.2715 + 0.03014z <= 0.5178 +[ INFO][27-Jun-24 05:50:10] Trained chunk 754 in 140.1s at 4246noun/s: lr=9.55e-05, loss=1.19e+00, top1=72.31%/71.976% +[ INFO][27-Jun-24 05:50:10] Chunk 755 = Batch 876149 = Sample 448587777 +[ INFO][27-Jun-24 05:52:30] Total gradient norm stats for 72 steps: 0.2552 <= 0.2672 + 0.005082z <= 0.2801 +[ INFO][27-Jun-24 05:52:30] Trained chunk 755 in 140.3s at 4242noun/s: lr=9.42e-05, loss=1.19e+00, top1=72.70%/71.985% +[ INFO][27-Jun-24 05:52:30] Chunk 756 = Batch 877311 = Sample 449182721 +[ INFO][27-Jun-24 05:54:51] Total gradient norm stats for 73 steps: 0.2555 <= 0.2679 + 0.007283z <= 0.2954 +[ INFO][27-Jun-24 05:54:51] Trained chunk 756 in 140.4s at 4237noun/s: lr=9.30e-05, loss=1.19e+00, top1=71.29%/71.993% +[ INFO][27-Jun-24 05:54:51] Chunk 757 = Batch 878473 = Sample 449777665 +[ INFO][27-Jun-24 05:57:11] Total gradient norm stats for 73 steps: 0.2558 <= 0.2677 + 0.005871z <= 0.2893 +[ INFO][27-Jun-24 05:57:11] Trained chunk 757 in 140.2s at 4243noun/s: lr=9.17e-05, loss=1.19e+00, top1=72.91%/71.996% +[ INFO][27-Jun-24 05:57:11] Chunk 758 = Batch 879635 = Sample 450372609 +[ INFO][27-Jun-24 05:59:31] Total gradient norm stats for 72 steps: 0.254 <= 0.2668 + 0.005819z <= 0.2827 +[ INFO][27-Jun-24 05:59:31] Trained chunk 758 in 140.0s at 4249noun/s: lr=9.05e-05, loss=1.19e+00, top1=72.55%/71.999% +[ INFO][27-Jun-24 05:59:31] Chunk 759 = Batch 880797 = Sample 450967553 +[ INFO][27-Jun-24 06:01:51] Total gradient norm stats for 73 steps: 0.2561 <= 0.268 + 0.00626z <= 0.288 +[ INFO][27-Jun-24 06:01:51] Trained chunk 759 in 140.2s at 4244noun/s: lr=8.92e-05, loss=1.19e+00, top1=73.21%/72.005% +[ INFO][27-Jun-24 06:01:51] Chunk 760 = Batch 881959 = Sample 451562497 +[ INFO][27-Jun-24 06:04:11] Total gradient norm stats for 73 steps: 0.2577 <= 0.2687 + 0.006855z <= 0.2891 +[ INFO][27-Jun-24 06:04:11] Trained chunk 760 in 140.0s at 4249noun/s: lr=8.80e-05, loss=1.19e+00, top1=71.96%/72.002% +[ INFO][27-Jun-24 06:04:11] Chunk 761 = Batch 883121 = Sample 452157441 +[ INFO][27-Jun-24 06:06:31] Total gradient norm stats for 72 steps: 0.2563 <= 0.2685 + 0.006419z <= 0.2957 +[ INFO][27-Jun-24 06:06:31] Trained chunk 761 in 139.7s at 4258noun/s: lr=8.68e-05, loss=1.19e+00, top1=72.73%/72.004% +[ INFO][27-Jun-24 06:06:31] Chunk 762 = Batch 884283 = Sample 452752385 +[ INFO][27-Jun-24 06:08:51] Total gradient norm stats for 73 steps: 0.258 <= 0.2695 + 0.007581z <= 0.2941 +[ INFO][27-Jun-24 06:08:51] Trained chunk 762 in 139.6s at 4262noun/s: lr=8.55e-05, loss=1.19e+00, top1=71.88%/72.006% +[ INFO][27-Jun-24 06:08:51] Chunk 763 = Batch 885445 = Sample 453347329 +[ INFO][27-Jun-24 06:11:11] Total gradient norm stats for 72 steps: 0.259 <= 0.2693 + 0.006554z <= 0.287 +[ INFO][27-Jun-24 06:11:11] Trained chunk 763 in 140.0s at 4250noun/s: lr=8.43e-05, loss=1.19e+00, top1=71.83%/72.005% +[ INFO][27-Jun-24 06:11:11] Chunk 764 = Batch 886607 = Sample 453942273 +[ INFO][27-Jun-24 06:13:30] Total gradient norm stats for 73 steps: 0.2597 <= 0.27 + 0.006867z <= 0.3062 +[ INFO][27-Jun-24 06:13:30] Trained chunk 764 in 139.8s at 4255noun/s: lr=8.31e-05, loss=1.19e+00, top1=72.27%/72.009% +[ INFO][27-Jun-24 06:13:30] Chunk 765 = Batch 887769 = Sample 454537217 +[ INFO][27-Jun-24 06:15:50] Total gradient norm stats for 73 steps: 0.2567 <= 0.268 + 0.006784z <= 0.2914 +[ INFO][27-Jun-24 06:15:50] Trained chunk 765 in 139.9s at 4252noun/s: lr=8.19e-05, loss=1.18e+00, top1=73.27%/72.016% +[ INFO][27-Jun-24 06:15:50] Chunk 766 = Batch 888931 = Sample 455132161 +[ INFO][27-Jun-24 06:18:10] Total gradient norm stats for 72 steps: 0.2548 <= 0.268 + 0.005912z <= 0.2918 +[ INFO][27-Jun-24 06:18:10] Trained chunk 766 in 139.6s at 4263noun/s: lr=8.07e-05, loss=1.18e+00, top1=72.83%/72.018% +[ INFO][27-Jun-24 06:18:10] Chunk 767 = Batch 890093 = Sample 455727105 +[ INFO][27-Jun-24 06:20:29] Total gradient norm stats for 73 steps: 0.2578 <= 0.2704 + 0.00618z <= 0.2876 +[ INFO][27-Jun-24 06:20:29] Trained chunk 767 in 139.3s at 4269noun/s: lr=7.96e-05, loss=1.18e+00, top1=71.92%/72.018% +[ INFO][27-Jun-24 06:20:29] Chunk 768 = Batch 891255 = Sample 456322049 +[ INFO][27-Jun-24 06:22:49] Total gradient norm stats for 73 steps: 0.258 <= 0.2706 + 0.0137z <= 0.3738 +[ INFO][27-Jun-24 06:22:49] Trained chunk 768 in 139.6s at 4261noun/s: lr=7.84e-05, loss=1.18e+00, top1=73.08%/72.020% +[ INFO][27-Jun-24 06:22:49] Saved checkpoint: /data/strahl/Code/ovod/outputs/ovod_20240626_001447/ovod_chunk0768_20240627_062249.train +[ INFO][27-Jun-24 06:22:49] Chunk 769 = Batch 892417 = Sample 456916993 +[ INFO][27-Jun-24 06:25:08] Total gradient norm stats for 72 steps: 0.2597 <= 0.2693 + 0.00572z <= 0.2917 +[ INFO][27-Jun-24 06:25:08] Trained chunk 769 in 139.1s at 4277noun/s: lr=7.72e-05, loss=1.18e+00, top1=71.54%/72.021% +[ INFO][27-Jun-24 06:25:08] Chunk 770 = Batch 893579 = Sample 457511937 +[ INFO][27-Jun-24 06:27:28] Total gradient norm stats for 73 steps: 0.2587 <= 0.2683 + 0.006344z <= 0.2877 +[ INFO][27-Jun-24 06:27:28] Trained chunk 770 in 139.4s at 4268noun/s: lr=7.61e-05, loss=1.18e+00, top1=71.34%/72.020% +[ INFO][27-Jun-24 06:27:28] Chunk 771 = Batch 894741 = Sample 458106881 +[ INFO][27-Jun-24 06:29:47] Total gradient norm stats for 72 steps: 0.2557 <= 0.2668 + 0.00532z <= 0.2867 +[ INFO][27-Jun-24 06:29:47] Trained chunk 771 in 139.2s at 4274noun/s: lr=7.49e-05, loss=1.18e+00, top1=71.36%/72.023% +[ INFO][27-Jun-24 06:29:47] Chunk 772 = Batch 895903 = Sample 458701825 +[ INFO][27-Jun-24 06:32:06] Total gradient norm stats for 73 steps: 0.2583 <= 0.2685 + 0.006812z <= 0.2954 +[ INFO][27-Jun-24 06:32:06] Trained chunk 772 in 139.0s at 4279noun/s: lr=7.38e-05, loss=1.18e+00, top1=71.51%/72.028% +[ INFO][27-Jun-24 06:32:06] Chunk 773 = Batch 897065 = Sample 459296769 +[ INFO][27-Jun-24 06:34:25] Total gradient norm stats for 73 steps: 0.2537 <= 0.2706 + 0.008105z <= 0.2947 +[ INFO][27-Jun-24 06:34:25] Trained chunk 773 in 139.6s at 4262noun/s: lr=7.27e-05, loss=1.18e+00, top1=72.07%/72.033% +[ INFO][27-Jun-24 06:34:25] Chunk 774 = Batch 898227 = Sample 459891713 +[ INFO][27-Jun-24 06:36:45] Total gradient norm stats for 72 steps: 0.2608 <= 0.2731 + 0.01338z <= 0.3481 +[ INFO][27-Jun-24 06:36:45] Trained chunk 774 in 139.5s at 4266noun/s: lr=7.15e-05, loss=1.18e+00, top1=71.56%/72.038% +[ INFO][27-Jun-24 06:36:45] Chunk 775 = Batch 899389 = Sample 460486657 +[ INFO][27-Jun-24 06:39:04] Total gradient norm stats for 73 steps: 0.2602 <= 0.2703 + 0.006218z <= 0.2926 +[ INFO][27-Jun-24 06:39:04] Trained chunk 775 in 139.6s at 4263noun/s: lr=7.04e-05, loss=1.18e+00, top1=71.87%/72.040% +[ INFO][27-Jun-24 06:39:04] Chunk 776 = Batch 900551 = Sample 461081601 +[ INFO][27-Jun-24 06:41:24] Total gradient norm stats for 73 steps: 0.2596 <= 0.3092 + 0.3303z <= 3.091 (clipped to 1) +[ INFO][27-Jun-24 06:41:24] Trained chunk 776 in 139.5s at 4263noun/s: lr=6.93e-05, loss=1.18e+00, top1=72.03%/72.038% +[ INFO][27-Jun-24 06:41:24] Chunk 777 = Batch 901713 = Sample 461676545 +[ INFO][27-Jun-24 06:43:43] Total gradient norm stats for 72 steps: 0.26 <= 0.2683 + 0.004506z <= 0.2776 +[ INFO][27-Jun-24 06:43:43] Trained chunk 777 in 139.1s at 4278noun/s: lr=6.82e-05, loss=1.18e+00, top1=71.91%/72.042% +[ INFO][27-Jun-24 06:43:43] Chunk 778 = Batch 902875 = Sample 462271489 +[ INFO][27-Jun-24 06:46:03] Total gradient norm stats for 73 steps: 0.2553 <= 0.2681 + 0.005855z <= 0.2839 +[ INFO][27-Jun-24 06:46:03] Trained chunk 778 in 139.5s at 4264noun/s: lr=6.71e-05, loss=1.18e+00, top1=72.52%/72.045% +[ INFO][27-Jun-24 06:46:03] Chunk 779 = Batch 904037 = Sample 462866433 +[ INFO][27-Jun-24 06:48:22] Total gradient norm stats for 72 steps: 0.2585 <= 0.2701 + 0.006284z <= 0.2894 +[ INFO][27-Jun-24 06:48:22] Trained chunk 779 in 139.8s at 4257noun/s: lr=6.61e-05, loss=1.18e+00, top1=72.97%/72.044% +[ INFO][27-Jun-24 06:48:22] Chunk 780 = Batch 905199 = Sample 463461377 +[ INFO][27-Jun-24 06:50:42] Total gradient norm stats for 73 steps: 0.2595 <= 0.271 + 0.009877z <= 0.3189 +[ INFO][27-Jun-24 06:50:42] Trained chunk 780 in 139.6s at 4261noun/s: lr=6.50e-05, loss=1.18e+00, top1=72.41%/72.053% +[ INFO][27-Jun-24 06:50:42] Chunk 781 = Batch 906361 = Sample 464056321 +[ INFO][27-Jun-24 06:53:01] Total gradient norm stats for 73 steps: 0.2604 <= 0.2754 + 0.04846z <= 0.6811 +[ INFO][27-Jun-24 06:53:01] Trained chunk 781 in 139.4s at 4269noun/s: lr=6.39e-05, loss=1.18e+00, top1=71.87%/72.051% +[ INFO][27-Jun-24 06:53:01] Chunk 782 = Batch 907523 = Sample 464651265 +[ INFO][27-Jun-24 06:55:21] Total gradient norm stats for 72 steps: 0.2597 <= 0.2772 + 0.04862z <= 0.6663 +[ INFO][27-Jun-24 06:55:21] Trained chunk 782 in 139.4s at 4269noun/s: lr=6.29e-05, loss=1.18e+00, top1=72.16%/72.054% +[ INFO][27-Jun-24 06:55:21] Chunk 783 = Batch 908685 = Sample 465246209 +[ INFO][27-Jun-24 06:57:40] Total gradient norm stats for 73 steps: 0.2586 <= 0.2694 + 0.006099z <= 0.2902 +[ INFO][27-Jun-24 06:57:40] Trained chunk 783 in 139.2s at 4274noun/s: lr=6.18e-05, loss=1.18e+00, top1=71.41%/72.060% +[ INFO][27-Jun-24 06:57:40] Chunk 784 = Batch 909847 = Sample 465841153 +[ INFO][27-Jun-24 06:59:59] Total gradient norm stats for 73 steps: 0.2574 <= 0.2687 + 0.005614z <= 0.2835 +[ INFO][27-Jun-24 06:59:59] Trained chunk 784 in 139.5s at 4266noun/s: lr=6.08e-05, loss=1.18e+00, top1=73.21%/72.057% +[ INFO][27-Jun-24 06:59:59] Chunk 785 = Batch 911009 = Sample 466436097 +[ INFO][27-Jun-24 07:02:19] Total gradient norm stats for 72 steps: 0.259 <= 0.2686 + 0.005079z <= 0.2892 +[ INFO][27-Jun-24 07:02:19] Trained chunk 785 in 139.2s at 4274noun/s: lr=5.98e-05, loss=1.18e+00, top1=71.05%/72.059% +[ INFO][27-Jun-24 07:02:19] Chunk 786 = Batch 912171 = Sample 467031041 +[ INFO][27-Jun-24 07:04:38] Total gradient norm stats for 73 steps: 0.2583 <= 0.2677 + 0.004929z <= 0.2844 +[ INFO][27-Jun-24 07:04:38] Trained chunk 786 in 139.4s at 4267noun/s: lr=5.87e-05, loss=1.18e+00, top1=73.69%/72.062% +[ INFO][27-Jun-24 07:04:38] Chunk 787 = Batch 913333 = Sample 467625985 +[ INFO][27-Jun-24 07:06:58] Total gradient norm stats for 72 steps: 0.2612 <= 0.2715 + 0.01854z <= 0.4207 +[ INFO][27-Jun-24 07:06:58] Trained chunk 787 in 139.4s at 4267noun/s: lr=5.77e-05, loss=1.18e+00, top1=73.57%/72.067% +[ INFO][27-Jun-24 07:06:58] Chunk 788 = Batch 914495 = Sample 468220929 +[ INFO][27-Jun-24 07:09:17] Total gradient norm stats for 73 steps: 0.2587 <= 0.2687 + 0.00532z <= 0.2818 +[ INFO][27-Jun-24 07:09:17] Trained chunk 788 in 139.6s at 4261noun/s: lr=5.67e-05, loss=1.18e+00, top1=72.03%/72.076% +[ INFO][27-Jun-24 07:09:17] Chunk 789 = Batch 915657 = Sample 468815873 +[ INFO][27-Jun-24 07:11:36] Total gradient norm stats for 73 steps: 0.2602 <= 0.273 + 0.008045z <= 0.3015 +[ INFO][27-Jun-24 07:11:36] Trained chunk 789 in 139.2s at 4274noun/s: lr=5.57e-05, loss=1.18e+00, top1=72.00%/72.077% +[ INFO][27-Jun-24 07:11:36] Chunk 790 = Batch 916819 = Sample 469410817 +[ INFO][27-Jun-24 07:13:56] Total gradient norm stats for 72 steps: 0.2589 <= 0.2683 + 0.004667z <= 0.2813 +[ INFO][27-Jun-24 07:13:56] Trained chunk 790 in 139.4s at 4267noun/s: lr=5.47e-05, loss=1.18e+00, top1=72.24%/72.078% +[ INFO][27-Jun-24 07:13:56] Chunk 791 = Batch 917981 = Sample 470005761 +[ INFO][27-Jun-24 07:16:15] Total gradient norm stats for 73 steps: 0.2582 <= 0.27 + 0.006499z <= 0.285 +[ INFO][27-Jun-24 07:16:15] Trained chunk 791 in 139.5s at 4266noun/s: lr=5.38e-05, loss=1.18e+00, top1=71.30%/72.085% +[ INFO][27-Jun-24 07:16:15] Chunk 792 = Batch 919143 = Sample 470600705 +[ INFO][27-Jun-24 07:18:34] Total gradient norm stats for 73 steps: 0.2552 <= 0.2727 + 0.02522z <= 0.4782 +[ INFO][27-Jun-24 07:18:34] Trained chunk 792 in 139.2s at 4275noun/s: lr=5.28e-05, loss=1.18e+00, top1=71.42%/72.087% +[ INFO][27-Jun-24 07:18:34] Chunk 793 = Batch 920305 = Sample 471195649 +[ INFO][27-Jun-24 07:20:54] Total gradient norm stats for 72 steps: 0.2607 <= 0.2712 + 0.01134z <= 0.3552 +[ INFO][27-Jun-24 07:20:54] Trained chunk 793 in 139.1s at 4276noun/s: lr=5.18e-05, loss=1.18e+00, top1=72.10%/72.089% +[ INFO][27-Jun-24 07:20:54] Chunk 794 = Batch 921467 = Sample 471790593 +[ INFO][27-Jun-24 07:23:13] Total gradient norm stats for 73 steps: 0.2592 <= 0.2703 + 0.006118z <= 0.2945 +[ INFO][27-Jun-24 07:23:13] Trained chunk 794 in 139.3s at 4272noun/s: lr=5.09e-05, loss=1.18e+00, top1=72.87%/72.085% +[ INFO][27-Jun-24 07:23:13] Chunk 795 = Batch 922629 = Sample 472385537 +[ INFO][27-Jun-24 07:25:32] Total gradient norm stats for 72 steps: 0.259 <= 0.2694 + 0.006397z <= 0.303 +[ INFO][27-Jun-24 07:25:32] Trained chunk 795 in 139.4s at 4269noun/s: lr=4.99e-05, loss=1.18e+00, top1=71.51%/72.090% +[ INFO][27-Jun-24 07:25:32] Chunk 796 = Batch 923791 = Sample 472980481 +[ INFO][27-Jun-24 07:27:52] Total gradient norm stats for 73 steps: 0.2593 <= 0.2705 + 0.006369z <= 0.2879 +[ INFO][27-Jun-24 07:27:52] Trained chunk 796 in 139.4s at 4269noun/s: lr=4.90e-05, loss=1.18e+00, top1=72.00%/72.090% +[ INFO][27-Jun-24 07:27:52] Chunk 797 = Batch 924953 = Sample 473575425 +[ INFO][27-Jun-24 07:30:11] Total gradient norm stats for 73 steps: 0.258 <= 0.2698 + 0.00577z <= 0.2864 +[ INFO][27-Jun-24 07:30:11] Trained chunk 797 in 139.6s at 4260noun/s: lr=4.81e-05, loss=1.18e+00, top1=72.92%/72.092% +[ INFO][27-Jun-24 07:30:11] Chunk 798 = Batch 926115 = Sample 474170369 +[ INFO][27-Jun-24 07:32:30] Total gradient norm stats for 72 steps: 0.2585 <= 0.2821 + 0.0922z <= 1.013 (clipped to 1) +[ INFO][27-Jun-24 07:32:30] Trained chunk 798 in 139.0s at 4280noun/s: lr=4.71e-05, loss=1.18e+00, top1=72.52%/72.097% +[ INFO][27-Jun-24 07:32:30] Chunk 799 = Batch 927277 = Sample 474765313 +[ INFO][27-Jun-24 07:34:49] Total gradient norm stats for 73 steps: 0.2611 <= 0.2701 + 0.005792z <= 0.2852 +[ INFO][27-Jun-24 07:34:49] Trained chunk 799 in 139.2s at 4273noun/s: lr=4.62e-05, loss=1.18e+00, top1=71.61%/72.092% +[ INFO][27-Jun-24 07:34:49] Chunk 800 = Batch 928439 = Sample 475360257 +[ INFO][27-Jun-24 07:37:02] Epoch 16 finished in 6979.6s +[ INFO][27-Jun-24 07:37:02] -------------------------------------------------------------------------------- +[ INFO][27-Jun-24 07:37:02] Epoch 17 = Batch 929537 = Sample 475922433 +[ INFO][27-Jun-24 07:37:10] Total gradient norm stats for 73 steps: 0.2616 <= 0.2707 + 0.007257z <= 0.3051 +[ INFO][27-Jun-24 07:37:10] Trained chunk 800 in 140.6s at 4232noun/s: lr=4.53e-05, loss=1.18e+00, top1=71.88%/72.094% +[ INFO][27-Jun-24 07:37:10] Chunk 801 = Batch 929601 = Sample 475955201 +[ INFO][27-Jun-24 07:39:30] Total gradient norm stats for 72 steps: 0.2587 <= 0.2703 + 0.01162z <= 0.3579 +[ INFO][27-Jun-24 07:39:30] Trained chunk 801 in 139.5s at 4264noun/s: lr=4.44e-05, loss=1.18e+00, top1=72.88%/72.094% +[ INFO][27-Jun-24 07:39:30] Chunk 802 = Batch 930763 = Sample 476550145 +[ INFO][27-Jun-24 07:41:49] Total gradient norm stats for 73 steps: 0.2566 <= 0.2786 + 0.07871z <= 0.9383 +[ INFO][27-Jun-24 07:41:49] Trained chunk 802 in 139.4s at 4269noun/s: lr=4.36e-05, loss=1.18e+00, top1=70.92%/72.099% +[ INFO][27-Jun-24 07:41:49] Chunk 803 = Batch 931925 = Sample 477145089 +[ INFO][27-Jun-24 07:44:08] Total gradient norm stats for 72 steps: 0.26 <= 0.2683 + 0.00539z <= 0.2855 +[ INFO][27-Jun-24 07:44:08] Trained chunk 803 in 139.5s at 4266noun/s: lr=4.27e-05, loss=1.18e+00, top1=71.97%/72.101% +[ INFO][27-Jun-24 07:44:08] Chunk 804 = Batch 933087 = Sample 477740033 +[ INFO][27-Jun-24 07:46:28] Total gradient norm stats for 73 steps: 0.2596 <= 0.2688 + 0.004627z <= 0.2833 +[ INFO][27-Jun-24 07:46:28] Trained chunk 804 in 139.6s at 4262noun/s: lr=4.18e-05, loss=1.18e+00, top1=72.52%/72.108% +[ INFO][27-Jun-24 07:46:28] Chunk 805 = Batch 934249 = Sample 478334977 +[ INFO][27-Jun-24 07:48:48] Total gradient norm stats for 73 steps: 0.2603 <= 0.2705 + 0.009499z <= 0.3395 +[ INFO][27-Jun-24 07:48:48] Trained chunk 805 in 139.9s at 4252noun/s: lr=4.10e-05, loss=1.18e+00, top1=73.08%/72.104% +[ INFO][27-Jun-24 07:48:48] Chunk 806 = Batch 935411 = Sample 478929921 +[ INFO][27-Jun-24 07:51:08] Total gradient norm stats for 72 steps: 0.261 <= 0.2695 + 0.005627z <= 0.2947 +[ INFO][27-Jun-24 07:51:08] Trained chunk 806 in 139.7s at 4260noun/s: lr=4.01e-05, loss=1.18e+00, top1=71.13%/72.106% +[ INFO][27-Jun-24 07:51:08] Chunk 807 = Batch 936573 = Sample 479524865 +[ INFO][27-Jun-24 07:53:27] Total gradient norm stats for 73 steps: 0.2596 <= 0.27 + 0.0172z <= 0.4099 +[ INFO][27-Jun-24 07:53:27] Trained chunk 807 in 139.9s at 4253noun/s: lr=3.93e-05, loss=1.18e+00, top1=70.94%/72.112% +[ INFO][27-Jun-24 07:53:27] Chunk 808 = Batch 937735 = Sample 480119809 +[ INFO][27-Jun-24 07:55:47] Total gradient norm stats for 73 steps: 0.2608 <= 0.27 + 0.004364z <= 0.2826 +[ INFO][27-Jun-24 07:55:47] Trained chunk 808 in 139.6s at 4262noun/s: lr=3.84e-05, loss=1.18e+00, top1=71.36%/72.112% +[ INFO][27-Jun-24 07:55:47] Chunk 809 = Batch 938897 = Sample 480714753 +[ INFO][27-Jun-24 07:58:07] Total gradient norm stats for 72 steps: 0.2604 <= 0.2764 + 0.04788z <= 0.6735 +[ INFO][27-Jun-24 07:58:07] Trained chunk 809 in 139.5s at 4266noun/s: lr=3.76e-05, loss=1.18e+00, top1=71.47%/72.118% +[ INFO][27-Jun-24 07:58:07] Chunk 810 = Batch 940059 = Sample 481309697 +[ INFO][27-Jun-24 08:00:26] Total gradient norm stats for 73 steps: 0.259 <= 0.2942 + 0.2054z <= 2.024 (clipped to 1) +[ INFO][27-Jun-24 08:00:26] Trained chunk 810 in 139.9s at 4254noun/s: lr=3.68e-05, loss=1.18e+00, top1=71.51%/72.125% +[ INFO][27-Jun-24 08:00:26] Chunk 811 = Batch 941221 = Sample 481904641 +[ INFO][27-Jun-24 08:02:46] Total gradient norm stats for 72 steps: 0.2611 <= 0.2703 + 0.005086z <= 0.2843 +[ INFO][27-Jun-24 08:02:46] Trained chunk 811 in 139.7s at 4258noun/s: lr=3.60e-05, loss=1.18e+00, top1=72.13%/72.123% +[ INFO][27-Jun-24 08:02:46] Chunk 812 = Batch 942383 = Sample 482499585 +[ INFO][27-Jun-24 08:05:05] Total gradient norm stats for 73 steps: 0.2611 <= 0.2724 + 0.0193z <= 0.429 +[ INFO][27-Jun-24 08:05:05] Trained chunk 812 in 139.3s at 4272noun/s: lr=3.52e-05, loss=1.18e+00, top1=71.67%/72.127% +[ INFO][27-Jun-24 08:05:05] Chunk 813 = Batch 943545 = Sample 483094529 +[ INFO][27-Jun-24 08:07:25] Total gradient norm stats for 73 steps: 0.261 <= 0.2692 + 0.006299z <= 0.3024 +[ INFO][27-Jun-24 08:07:25] Trained chunk 813 in 139.6s at 4263noun/s: lr=3.44e-05, loss=1.18e+00, top1=71.32%/72.131% +[ INFO][27-Jun-24 08:07:25] Chunk 814 = Batch 944707 = Sample 483689473 +[ INFO][27-Jun-24 08:09:45] Total gradient norm stats for 72 steps: 0.2605 <= 0.2697 + 0.006241z <= 0.2915 +[ INFO][27-Jun-24 08:09:45] Trained chunk 814 in 139.7s at 4259noun/s: lr=3.36e-05, loss=1.18e+00, top1=72.30%/72.131% +[ INFO][27-Jun-24 08:09:45] Chunk 815 = Batch 945869 = Sample 484284417 +[ INFO][27-Jun-24 08:12:05] Total gradient norm stats for 73 steps: 0.2605 <= 0.2689 + 0.004912z <= 0.2854 +[ INFO][27-Jun-24 08:12:05] Trained chunk 815 in 139.9s at 4254noun/s: lr=3.28e-05, loss=1.18e+00, top1=73.54%/72.135% +[ INFO][27-Jun-24 08:12:05] Chunk 816 = Batch 947031 = Sample 484879361 +[ INFO][27-Jun-24 08:14:24] Total gradient norm stats for 73 steps: 0.2599 <= 0.2836 + 0.1192z <= 1.287 (clipped to 1) +[ INFO][27-Jun-24 08:14:24] Trained chunk 816 in 139.5s at 4264noun/s: lr=3.21e-05, loss=1.18e+00, top1=73.31%/72.139% +[ INFO][27-Jun-24 08:14:24] Saved checkpoint: /data/strahl/Code/ovod/outputs/ovod_20240626_001447/ovod_chunk0816_20240627_081424.train +[ INFO][27-Jun-24 08:14:24] Chunk 817 = Batch 948193 = Sample 485474305 +[ INFO][27-Jun-24 08:16:44] Total gradient norm stats for 72 steps: 0.2604 <= 0.2694 + 0.005468z <= 0.2867 +[ INFO][27-Jun-24 08:16:44] Trained chunk 817 in 139.4s at 4269noun/s: lr=3.13e-05, loss=1.18e+00, top1=71.12%/72.133% +[ INFO][27-Jun-24 08:16:44] Chunk 818 = Batch 949355 = Sample 486069249 +[ INFO][27-Jun-24 08:19:03] Total gradient norm stats for 73 steps: 0.2561 <= 0.2701 + 0.01059z <= 0.3461 +[ INFO][27-Jun-24 08:19:03] Trained chunk 818 in 139.5s at 4265noun/s: lr=3.06e-05, loss=1.18e+00, top1=72.31%/72.136% +[ INFO][27-Jun-24 08:19:03] Chunk 819 = Batch 950517 = Sample 486664193 +[ INFO][27-Jun-24 08:21:23] Total gradient norm stats for 72 steps: 0.2593 <= 0.2718 + 0.01646z <= 0.3964 +[ INFO][27-Jun-24 08:21:23] Trained chunk 819 in 139.6s at 4261noun/s: lr=2.98e-05, loss=1.18e+00, top1=71.66%/72.140% +[ INFO][27-Jun-24 08:21:23] Chunk 820 = Batch 951679 = Sample 487259137 +[ INFO][27-Jun-24 08:23:43] Total gradient norm stats for 73 steps: 0.2609 <= 0.2691 + 0.004815z <= 0.2859 +[ INFO][27-Jun-24 08:23:43] Trained chunk 820 in 140.2s at 4243noun/s: lr=2.91e-05, loss=1.18e+00, top1=72.54%/72.147% +[ INFO][27-Jun-24 08:23:43] Chunk 821 = Batch 952841 = Sample 487854081 +[ INFO][27-Jun-24 08:26:02] Total gradient norm stats for 73 steps: 0.2607 <= 0.2728 + 0.02565z <= 0.4852 +[ INFO][27-Jun-24 08:26:02] Trained chunk 821 in 139.1s at 4277noun/s: lr=2.84e-05, loss=1.18e+00, top1=73.25%/72.152% +[ INFO][27-Jun-24 08:26:02] Chunk 822 = Batch 954003 = Sample 488449025 +[ INFO][27-Jun-24 08:28:21] Total gradient norm stats for 72 steps: 0.2598 <= 0.2694 + 0.004991z <= 0.282 +[ INFO][27-Jun-24 08:28:21] Trained chunk 822 in 139.2s at 4275noun/s: lr=2.77e-05, loss=1.18e+00, top1=73.08%/72.153% +[ INFO][27-Jun-24 08:28:21] Chunk 823 = Batch 955165 = Sample 489043969 +[ INFO][27-Jun-24 08:30:41] Total gradient norm stats for 73 steps: 0.2593 <= 0.2712 + 0.007694z <= 0.3087 +[ INFO][27-Jun-24 08:30:41] Trained chunk 823 in 139.5s at 4265noun/s: lr=2.70e-05, loss=1.18e+00, top1=71.56%/72.155% +[ INFO][27-Jun-24 08:30:41] Chunk 824 = Batch 956327 = Sample 489638913 +[ INFO][27-Jun-24 08:33:01] Total gradient norm stats for 73 steps: 0.2614 <= 0.2703 + 0.004642z <= 0.2858 +[ INFO][27-Jun-24 08:33:01] Trained chunk 824 in 140.1s at 4248noun/s: lr=2.63e-05, loss=1.18e+00, top1=73.54%/72.155% +[ INFO][27-Jun-24 08:33:01] Chunk 825 = Batch 957489 = Sample 490233857 +[ INFO][27-Jun-24 08:35:21] Total gradient norm stats for 72 steps: 0.2612 <= 0.2706 + 0.005099z <= 0.2856 +[ INFO][27-Jun-24 08:35:21] Trained chunk 825 in 140.5s at 4234noun/s: lr=2.56e-05, loss=1.18e+00, top1=72.16%/72.156% +[ INFO][27-Jun-24 08:35:21] Chunk 826 = Batch 958651 = Sample 490828801 +[ INFO][27-Jun-24 08:37:42] Total gradient norm stats for 73 steps: 0.2601 <= 0.2865 + 0.1301z <= 1.382 (clipped to 1) +[ INFO][27-Jun-24 08:37:42] Trained chunk 826 in 140.2s at 4244noun/s: lr=2.49e-05, loss=1.18e+00, top1=72.68%/72.156% +[ INFO][27-Jun-24 08:37:42] Chunk 827 = Batch 959813 = Sample 491423745 +[ INFO][27-Jun-24 08:40:02] Total gradient norm stats for 72 steps: 0.2614 <= 0.2693 + 0.00458z <= 0.2845 +[ INFO][27-Jun-24 08:40:02] Trained chunk 827 in 139.9s at 4253noun/s: lr=2.43e-05, loss=1.18e+00, top1=70.88%/72.158% +[ INFO][27-Jun-24 08:40:02] Chunk 828 = Batch 960975 = Sample 492018689 +[ INFO][27-Jun-24 08:42:21] Total gradient norm stats for 73 steps: 0.2584 <= 0.2732 + 0.01748z <= 0.3838 +[ INFO][27-Jun-24 08:42:21] Trained chunk 828 in 139.9s at 4253noun/s: lr=2.36e-05, loss=1.18e+00, top1=71.50%/72.156% +[ INFO][27-Jun-24 08:42:21] Chunk 829 = Batch 962137 = Sample 492613633 +[ INFO][27-Jun-24 08:44:42] Total gradient norm stats for 73 steps: 0.2619 <= 0.272 + 0.01221z <= 0.3473 +[ INFO][27-Jun-24 08:44:42] Trained chunk 829 in 140.3s at 4241noun/s: lr=2.30e-05, loss=1.18e+00, top1=71.60%/72.162% +[ INFO][27-Jun-24 08:44:42] Chunk 830 = Batch 963299 = Sample 493208577 +[ INFO][27-Jun-24 08:47:02] Total gradient norm stats for 72 steps: 0.2584 <= 0.2693 + 0.005461z <= 0.2881 +[ INFO][27-Jun-24 08:47:02] Trained chunk 830 in 140.2s at 4243noun/s: lr=2.23e-05, loss=1.18e+00, top1=72.27%/72.159% +[ INFO][27-Jun-24 08:47:02] Chunk 831 = Batch 964461 = Sample 493803521 +[ INFO][27-Jun-24 08:49:22] Total gradient norm stats for 73 steps: 0.2623 <= 0.2695 + 0.004267z <= 0.2816 +[ INFO][27-Jun-24 08:49:22] Trained chunk 831 in 140.0s at 4248noun/s: lr=2.17e-05, loss=1.18e+00, top1=71.32%/72.164% +[ INFO][27-Jun-24 08:49:22] Chunk 832 = Batch 965623 = Sample 494398465 +[ INFO][27-Jun-24 08:51:42] Total gradient norm stats for 73 steps: 0.2613 <= 0.2691 + 0.004746z <= 0.2851 +[ INFO][27-Jun-24 08:51:42] Trained chunk 832 in 140.0s at 4250noun/s: lr=2.11e-05, loss=1.18e+00, top1=72.66%/72.163% +[ INFO][27-Jun-24 08:51:42] Chunk 833 = Batch 966785 = Sample 494993409 +[ INFO][27-Jun-24 08:54:02] Total gradient norm stats for 72 steps: 0.2585 <= 0.2688 + 0.004772z <= 0.2885 +[ INFO][27-Jun-24 08:54:02] Trained chunk 833 in 140.0s at 4250noun/s: lr=2.05e-05, loss=1.18e+00, top1=72.97%/72.167% +[ INFO][27-Jun-24 08:54:02] Chunk 834 = Batch 967947 = Sample 495588353 +[ INFO][27-Jun-24 08:56:21] Total gradient norm stats for 73 steps: 0.2579 <= 0.269 + 0.00456z <= 0.279 +[ INFO][27-Jun-24 08:56:21] Trained chunk 834 in 139.1s at 4276noun/s: lr=1.99e-05, loss=1.18e+00, top1=71.16%/72.162% +[ INFO][27-Jun-24 08:56:21] Chunk 835 = Batch 969109 = Sample 496183297 +[ INFO][27-Jun-24 08:58:41] Total gradient norm stats for 72 steps: 0.2601 <= 0.2682 + 0.004266z <= 0.2828 +[ INFO][27-Jun-24 08:58:41] Trained chunk 835 in 139.5s at 4266noun/s: lr=1.93e-05, loss=1.18e+00, top1=72.23%/72.165% +[ INFO][27-Jun-24 08:58:41] Chunk 836 = Batch 970271 = Sample 496778241 +[ INFO][27-Jun-24 09:01:00] Total gradient norm stats for 73 steps: 0.2589 <= 0.2692 + 0.005037z <= 0.2867 +[ INFO][27-Jun-24 09:01:00] Trained chunk 836 in 139.2s at 4275noun/s: lr=1.87e-05, loss=1.18e+00, top1=71.66%/72.167% +[ INFO][27-Jun-24 09:01:00] Chunk 837 = Batch 971433 = Sample 497373185 +[ INFO][27-Jun-24 09:03:19] Total gradient norm stats for 73 steps: 0.2605 <= 0.269 + 0.004025z <= 0.2799 +[ INFO][27-Jun-24 09:03:19] Trained chunk 837 in 139.3s at 4271noun/s: lr=1.81e-05, loss=1.18e+00, top1=72.95%/72.168% +[ INFO][27-Jun-24 09:03:19] Chunk 838 = Batch 972595 = Sample 497968129 +[ INFO][27-Jun-24 09:05:38] Total gradient norm stats for 72 steps: 0.2619 <= 0.27 + 0.004229z <= 0.2809 +[ INFO][27-Jun-24 09:05:38] Trained chunk 838 in 139.2s at 4274noun/s: lr=1.75e-05, loss=1.18e+00, top1=71.86%/72.172% +[ INFO][27-Jun-24 09:05:38] Chunk 839 = Batch 973757 = Sample 498563073 +[ INFO][27-Jun-24 09:07:57] Total gradient norm stats for 73 steps: 0.2598 <= 0.2694 + 0.004541z <= 0.2844 +[ INFO][27-Jun-24 09:07:57] Trained chunk 839 in 138.8s at 4285noun/s: lr=1.70e-05, loss=1.18e+00, top1=73.35%/72.170% +[ INFO][27-Jun-24 09:07:57] Chunk 840 = Batch 974919 = Sample 499158017 +[ INFO][27-Jun-24 09:10:16] Total gradient norm stats for 73 steps: 0.2605 <= 0.27 + 0.005194z <= 0.2841 +[ INFO][27-Jun-24 09:10:16] Trained chunk 840 in 139.3s at 4270noun/s: lr=1.64e-05, loss=1.18e+00, top1=72.51%/72.169% +[ INFO][27-Jun-24 09:10:16] Chunk 841 = Batch 976081 = Sample 499752961 +[ INFO][27-Jun-24 09:12:36] Total gradient norm stats for 72 steps: 0.2615 <= 0.2696 + 0.00522z <= 0.2912 +[ INFO][27-Jun-24 09:12:36] Trained chunk 841 in 139.1s at 4277noun/s: lr=1.59e-05, loss=1.18e+00, top1=72.54%/72.162% +[ INFO][27-Jun-24 09:12:36] Chunk 842 = Batch 977243 = Sample 500347905 +[ INFO][27-Jun-24 09:14:55] Total gradient norm stats for 73 steps: 0.2627 <= 0.2717 + 0.007391z <= 0.3066 +[ INFO][27-Jun-24 09:14:55] Trained chunk 842 in 139.5s at 4266noun/s: lr=1.54e-05, loss=1.18e+00, top1=72.71%/72.167% +[ INFO][27-Jun-24 09:14:55] Chunk 843 = Batch 978405 = Sample 500942849 +[ INFO][27-Jun-24 09:17:14] Total gradient norm stats for 72 steps: 0.2617 <= 0.2706 + 0.005313z <= 0.2965 +[ INFO][27-Jun-24 09:17:14] Trained chunk 843 in 139.4s at 4268noun/s: lr=1.48e-05, loss=1.18e+00, top1=71.10%/72.176% +[ INFO][27-Jun-24 09:17:14] Chunk 844 = Batch 979567 = Sample 501537793 +[ INFO][27-Jun-24 09:19:33] Total gradient norm stats for 73 steps: 0.2635 <= 0.2719 + 0.01728z <= 0.4138 +[ INFO][27-Jun-24 09:19:33] Trained chunk 844 in 139.0s at 4281noun/s: lr=1.43e-05, loss=1.18e+00, top1=72.36%/72.176% +[ INFO][27-Jun-24 09:19:33] Chunk 845 = Batch 980729 = Sample 502132737 +[ INFO][27-Jun-24 09:21:53] Total gradient norm stats for 73 steps: 0.26 <= 0.2705 + 0.009647z <= 0.3398 +[ INFO][27-Jun-24 09:21:53] Trained chunk 845 in 139.7s at 4260noun/s: lr=1.38e-05, loss=1.18e+00, top1=72.37%/72.177% +[ INFO][27-Jun-24 09:21:53] Chunk 846 = Batch 981891 = Sample 502727681 +[ INFO][27-Jun-24 09:24:12] Total gradient norm stats for 72 steps: 0.2608 <= 0.2696 + 0.005147z <= 0.2833 +[ INFO][27-Jun-24 09:24:12] Trained chunk 846 in 139.4s at 4267noun/s: lr=1.33e-05, loss=1.18e+00, top1=72.07%/72.178% +[ INFO][27-Jun-24 09:24:12] Chunk 847 = Batch 983053 = Sample 503322625 +[ INFO][27-Jun-24 09:26:31] Total gradient norm stats for 73 steps: 0.2613 <= 0.2786 + 0.08405z <= 0.9859 +[ INFO][27-Jun-24 09:26:31] Trained chunk 847 in 138.8s at 4287noun/s: lr=1.28e-05, loss=1.18e+00, top1=72.03%/72.180% +[ INFO][27-Jun-24 09:26:31] Chunk 848 = Batch 984215 = Sample 503917569 +[ INFO][27-Jun-24 09:28:50] Total gradient norm stats for 73 steps: 0.2606 <= 0.2693 + 0.00693z <= 0.3093 +[ INFO][27-Jun-24 09:28:50] Trained chunk 848 in 139.1s at 4277noun/s: lr=1.23e-05, loss=1.18e+00, top1=71.25%/72.176% +[ INFO][27-Jun-24 09:28:50] Chunk 849 = Batch 985377 = Sample 504512513 +[ INFO][27-Jun-24 09:31:10] Total gradient norm stats for 72 steps: 0.2586 <= 0.269 + 0.004878z <= 0.2818 +[ INFO][27-Jun-24 09:31:10] Trained chunk 849 in 139.2s at 4274noun/s: lr=1.19e-05, loss=1.18e+00, top1=71.70%/72.177% +[ INFO][27-Jun-24 09:31:10] Chunk 850 = Batch 986539 = Sample 505107457 +[ INFO][27-Jun-24 09:33:22] Epoch 17 finished in 6979.7s +[ INFO][27-Jun-24 09:33:22] -------------------------------------------------------------------------------- +[ INFO][27-Jun-24 09:33:22] Epoch 18 = Batch 987633 = Sample 505667585 +[ INFO][27-Jun-24 09:33:30] Total gradient norm stats for 73 steps: 0.2593 <= 0.2696 + 0.007457z <= 0.3097 +[ INFO][27-Jun-24 09:33:30] Trained chunk 850 in 140.6s at 4231noun/s: lr=1.14e-05, loss=1.18e+00, top1=72.49%/72.177% +[ INFO][27-Jun-24 09:33:30] Chunk 851 = Batch 987701 = Sample 505702401 +[ INFO][27-Jun-24 09:35:49] Total gradient norm stats for 72 steps: 0.26 <= 0.2938 + 0.2105z <= 2.054 (clipped to 1) +[ INFO][27-Jun-24 09:35:49] Trained chunk 851 in 138.8s at 4286noun/s: lr=1.10e-05, loss=1.18e+00, top1=72.72%/72.179% +[ INFO][27-Jun-24 09:35:49] Chunk 852 = Batch 988863 = Sample 506297345 +[ INFO][27-Jun-24 09:38:08] Total gradient norm stats for 73 steps: 0.261 <= 0.2699 + 0.007549z <= 0.3122 +[ INFO][27-Jun-24 09:38:08] Trained chunk 852 in 139.5s at 4266noun/s: lr=1.05e-05, loss=1.18e+00, top1=72.54%/72.184% +[ INFO][27-Jun-24 09:38:08] Chunk 853 = Batch 990025 = Sample 506892289 +[ INFO][27-Jun-24 09:40:28] Total gradient norm stats for 73 steps: 0.2602 <= 0.2692 + 0.005492z <= 0.2915 +[ INFO][27-Jun-24 09:40:28] Trained chunk 853 in 139.5s at 4263noun/s: lr=1.01e-05, loss=1.18e+00, top1=71.00%/72.185% +[ INFO][27-Jun-24 09:40:28] Chunk 854 = Batch 991187 = Sample 507487233 +[ INFO][27-Jun-24 09:42:48] Total gradient norm stats for 72 steps: 0.2595 <= 0.2692 + 0.004615z <= 0.2853 +[ INFO][27-Jun-24 09:42:48] Trained chunk 854 in 140.0s at 4248noun/s: lr=9.67e-06, loss=1.18e+00, top1=71.49%/72.189% +[ INFO][27-Jun-24 09:42:48] Chunk 855 = Batch 992349 = Sample 508082177 +[ INFO][27-Jun-24 09:45:08] Total gradient norm stats for 73 steps: 0.2609 <= 0.2687 + 0.004082z <= 0.2796 +[ INFO][27-Jun-24 09:45:08] Trained chunk 855 in 139.8s at 4256noun/s: lr=9.25e-06, loss=1.17e+00, top1=71.77%/72.187% +[ INFO][27-Jun-24 09:45:08] Chunk 856 = Batch 993511 = Sample 508677121 +[ INFO][27-Jun-24 09:47:28] Total gradient norm stats for 73 steps: 0.2609 <= 0.2704 + 0.007283z <= 0.3094 +[ INFO][27-Jun-24 09:47:28] Trained chunk 856 in 139.9s at 4253noun/s: lr=8.85e-06, loss=1.17e+00, top1=71.64%/72.191% +[ INFO][27-Jun-24 09:47:28] Chunk 857 = Batch 994673 = Sample 509272065 +[ INFO][27-Jun-24 09:49:48] Total gradient norm stats for 72 steps: 0.2589 <= 0.2691 + 0.005429z <= 0.2858 +[ INFO][27-Jun-24 09:49:48] Trained chunk 857 in 140.1s at 4246noun/s: lr=8.45e-06, loss=1.17e+00, top1=70.91%/72.193% +[ INFO][27-Jun-24 09:49:48] Chunk 858 = Batch 995835 = Sample 509867009 +[ INFO][27-Jun-24 09:52:08] Total gradient norm stats for 73 steps: 0.2605 <= 0.2685 + 0.003911z <= 0.2787 +[ INFO][27-Jun-24 09:52:08] Trained chunk 858 in 140.1s at 4247noun/s: lr=8.06e-06, loss=1.17e+00, top1=72.65%/72.195% +[ INFO][27-Jun-24 09:52:08] Chunk 859 = Batch 996997 = Sample 510461953 +[ INFO][27-Jun-24 09:54:28] Total gradient norm stats for 72 steps: 0.2608 <= 0.2693 + 0.0043z <= 0.2868 +[ INFO][27-Jun-24 09:54:28] Trained chunk 859 in 140.4s at 4236noun/s: lr=7.68e-06, loss=1.17e+00, top1=71.29%/72.198% +[ INFO][27-Jun-24 09:54:28] Chunk 860 = Batch 998159 = Sample 511056897 +[ INFO][27-Jun-24 09:56:49] Total gradient norm stats for 73 steps: 0.2601 <= 0.2689 + 0.004543z <= 0.2788 +[ INFO][27-Jun-24 09:56:49] Trained chunk 860 in 140.4s at 4238noun/s: lr=7.32e-06, loss=1.18e+00, top1=72.84%/72.194% +[ INFO][27-Jun-24 09:56:49] Chunk 861 = Batch 999321 = Sample 511651841 +[ INFO][27-Jun-24 09:59:09] Total gradient norm stats for 73 steps: 0.2612 <= 0.2694 + 0.005268z <= 0.2909 +[ INFO][27-Jun-24 09:59:09] Trained chunk 861 in 140.2s at 4245noun/s: lr=6.95e-06, loss=1.18e+00, top1=71.61%/72.194% +[ INFO][27-Jun-24 09:59:09] Chunk 862 = Batch 1000483 = Sample 512246785 +[ INFO][27-Jun-24 10:01:30] Total gradient norm stats for 72 steps: 0.2605 <= 0.2702 + 0.00477z <= 0.2828 +[ INFO][27-Jun-24 10:01:30] Trained chunk 862 in 140.7s at 4230noun/s: lr=6.60e-06, loss=1.17e+00, top1=73.35%/72.194% +[ INFO][27-Jun-24 10:01:30] Chunk 863 = Batch 1001645 = Sample 512841729 +[ INFO][27-Jun-24 10:03:50] Total gradient norm stats for 73 steps: 0.2595 <= 0.2697 + 0.007927z <= 0.3212 +[ INFO][27-Jun-24 10:03:50] Trained chunk 863 in 140.3s at 4241noun/s: lr=6.26e-06, loss=1.17e+00, top1=71.94%/72.201% +[ INFO][27-Jun-24 10:03:50] Chunk 864 = Batch 1002807 = Sample 513436673 +[ INFO][27-Jun-24 10:06:10] Total gradient norm stats for 73 steps: 0.2613 <= 0.2742 + 0.03588z <= 0.5284 +[ INFO][27-Jun-24 10:06:10] Trained chunk 864 in 140.1s at 4247noun/s: lr=5.93e-06, loss=1.17e+00, top1=70.46%/72.203% +[ INFO][27-Jun-24 10:06:10] Saved checkpoint: /data/strahl/Code/ovod/outputs/ovod_20240626_001447/ovod_chunk0864_20240627_100610.train +[ INFO][27-Jun-24 10:06:10] Chunk 865 = Batch 1003969 = Sample 514031617 +[ INFO][27-Jun-24 10:08:30] Total gradient norm stats for 72 steps: 0.2622 <= 0.2787 + 0.0833z <= 0.9749 +[ INFO][27-Jun-24 10:08:30] Trained chunk 865 in 139.8s at 4256noun/s: lr=5.60e-06, loss=1.17e+00, top1=73.56%/72.202% +[ INFO][27-Jun-24 10:08:30] Chunk 866 = Batch 1005131 = Sample 514626561 +[ INFO][27-Jun-24 10:10:49] Total gradient norm stats for 73 steps: 0.2602 <= 0.2694 + 0.004751z <= 0.2842 +[ INFO][27-Jun-24 10:10:49] Trained chunk 866 in 139.4s at 4266noun/s: lr=5.29e-06, loss=1.17e+00, top1=72.62%/72.199% +[ INFO][27-Jun-24 10:10:49] Chunk 867 = Batch 1006293 = Sample 515221505 +[ INFO][27-Jun-24 10:13:09] Total gradient norm stats for 72 steps: 0.2594 <= 0.2687 + 0.004082z <= 0.2795 +[ INFO][27-Jun-24 10:13:09] Trained chunk 867 in 139.7s at 4260noun/s: lr=4.98e-06, loss=1.17e+00, top1=73.53%/72.197% +[ INFO][27-Jun-24 10:13:09] Chunk 868 = Batch 1007455 = Sample 515816449 +[ INFO][27-Jun-24 10:15:28] Total gradient norm stats for 73 steps: 0.2595 <= 0.3009 + 0.2325z <= 2.233 (clipped to 1) +[ INFO][27-Jun-24 10:15:28] Trained chunk 868 in 139.3s at 4270noun/s: lr=4.68e-06, loss=1.17e+00, top1=72.60%/72.201% +[ INFO][27-Jun-24 10:15:28] Chunk 869 = Batch 1008617 = Sample 516411393 +[ INFO][27-Jun-24 10:17:48] Total gradient norm stats for 73 steps: 0.2601 <= 0.2695 + 0.005518z <= 0.2845 +[ INFO][27-Jun-24 10:17:48] Trained chunk 869 in 139.8s at 4257noun/s: lr=4.40e-06, loss=1.17e+00, top1=72.60%/72.197% +[ INFO][27-Jun-24 10:17:48] Chunk 870 = Batch 1009779 = Sample 517006337 +[ INFO][27-Jun-24 10:20:08] Total gradient norm stats for 72 steps: 0.2578 <= 0.2686 + 0.004396z <= 0.2818 +[ INFO][27-Jun-24 10:20:08] Trained chunk 870 in 139.6s at 4262noun/s: lr=4.12e-06, loss=1.17e+00, top1=72.24%/72.206% +[ INFO][27-Jun-24 10:20:08] Chunk 871 = Batch 1010941 = Sample 517601281 +[ INFO][27-Jun-24 10:22:27] Total gradient norm stats for 73 steps: 0.26 <= 0.2694 + 0.004584z <= 0.2844 +[ INFO][27-Jun-24 10:22:27] Trained chunk 871 in 139.5s at 4265noun/s: lr=3.85e-06, loss=1.17e+00, top1=72.34%/72.204% +[ INFO][27-Jun-24 10:22:27] Chunk 872 = Batch 1012103 = Sample 518196225 +[ INFO][27-Jun-24 10:24:46] Total gradient norm stats for 73 steps: 0.2595 <= 0.2682 + 0.004806z <= 0.282 +[ INFO][27-Jun-24 10:24:46] Trained chunk 872 in 139.1s at 4276noun/s: lr=3.59e-06, loss=1.17e+00, top1=72.29%/72.206% +[ INFO][27-Jun-24 10:24:46] Chunk 873 = Batch 1013265 = Sample 518791169 +[ INFO][27-Jun-24 10:27:06] Total gradient norm stats for 72 steps: 0.2613 <= 0.2688 + 0.005186z <= 0.2926 +[ INFO][27-Jun-24 10:27:06] Trained chunk 873 in 139.7s at 4260noun/s: lr=3.34e-06, loss=1.17e+00, top1=70.89%/72.203% +[ INFO][27-Jun-24 10:27:06] Chunk 874 = Batch 1014427 = Sample 519386113 +[ INFO][27-Jun-24 10:29:26] Total gradient norm stats for 73 steps: 0.2593 <= 0.2677 + 0.003879z <= 0.2756 +[ INFO][27-Jun-24 10:29:26] Trained chunk 874 in 139.8s at 4257noun/s: lr=3.09e-06, loss=1.17e+00, top1=71.23%/72.202% +[ INFO][27-Jun-24 10:29:26] Chunk 875 = Batch 1015589 = Sample 519981057 +[ INFO][27-Jun-24 10:31:45] Total gradient norm stats for 72 steps: 0.2572 <= 0.2698 + 0.0057z <= 0.3012 +[ INFO][27-Jun-24 10:31:45] Trained chunk 875 in 139.1s at 4277noun/s: lr=2.86e-06, loss=1.17e+00, top1=71.24%/72.202% +[ INFO][27-Jun-24 10:31:45] Chunk 876 = Batch 1016751 = Sample 520576001 +[ INFO][27-Jun-24 10:34:04] Total gradient norm stats for 73 steps: 0.2605 <= 0.2691 + 0.005815z <= 0.3026 +[ INFO][27-Jun-24 10:34:04] Trained chunk 876 in 139.4s at 4267noun/s: lr=2.64e-06, loss=1.17e+00, top1=71.18%/72.203% +[ INFO][27-Jun-24 10:34:04] Chunk 877 = Batch 1017913 = Sample 521170945 +[ INFO][27-Jun-24 10:36:24] Total gradient norm stats for 73 steps: 0.2614 <= 0.2709 + 0.01227z <= 0.3673 +[ INFO][27-Jun-24 10:36:24] Trained chunk 877 in 139.4s at 4267noun/s: lr=2.42e-06, loss=1.17e+00, top1=72.96%/72.207% +[ INFO][27-Jun-24 10:36:24] Chunk 878 = Batch 1019075 = Sample 521765889 +[ INFO][27-Jun-24 10:38:44] Total gradient norm stats for 72 steps: 0.2617 <= 0.268 + 0.003516z <= 0.2773 +[ INFO][27-Jun-24 10:38:44] Trained chunk 878 in 139.7s at 4258noun/s: lr=2.22e-06, loss=1.17e+00, top1=71.75%/72.214% +[ INFO][27-Jun-24 10:38:44] Chunk 879 = Batch 1020237 = Sample 522360833 +[ INFO][27-Jun-24 10:41:03] Total gradient norm stats for 73 steps: 0.2612 <= 0.27 + 0.004834z <= 0.2859 +[ INFO][27-Jun-24 10:41:03] Trained chunk 879 in 139.1s at 4276noun/s: lr=2.02e-06, loss=1.17e+00, top1=72.41%/72.215% +[ INFO][27-Jun-24 10:41:03] Chunk 880 = Batch 1021399 = Sample 522955777 +[ INFO][27-Jun-24 10:43:22] Total gradient norm stats for 73 steps: 0.258 <= 0.2687 + 0.004382z <= 0.287 +[ INFO][27-Jun-24 10:43:22] Trained chunk 880 in 139.3s at 4271noun/s: lr=1.83e-06, loss=1.17e+00, top1=71.19%/72.213% +[ INFO][27-Jun-24 10:43:22] Chunk 881 = Batch 1022561 = Sample 523550721 +[ INFO][27-Jun-24 10:45:42] Total gradient norm stats for 72 steps: 0.2623 <= 0.2683 + 0.003434z <= 0.278 +[ INFO][27-Jun-24 10:45:42] Trained chunk 881 in 139.7s at 4258noun/s: lr=1.65e-06, loss=1.17e+00, top1=72.54%/72.215% +[ INFO][27-Jun-24 10:45:42] Chunk 882 = Batch 1023723 = Sample 524145665 +[ INFO][27-Jun-24 10:48:01] Total gradient norm stats for 73 steps: 0.2613 <= 0.2699 + 0.00485z <= 0.2908 +[ INFO][27-Jun-24 10:48:01] Trained chunk 882 in 139.0s at 4281noun/s: lr=1.48e-06, loss=1.17e+00, top1=71.89%/72.212% +[ INFO][27-Jun-24 10:48:01] Chunk 883 = Batch 1024885 = Sample 524740609 +[ INFO][27-Jun-24 10:50:20] Total gradient norm stats for 72 steps: 0.259 <= 0.2692 + 0.004752z <= 0.2842 +[ INFO][27-Jun-24 10:50:20] Trained chunk 883 in 139.0s at 4280noun/s: lr=1.32e-06, loss=1.17e+00, top1=72.43%/72.209% +[ INFO][27-Jun-24 10:50:20] Chunk 884 = Batch 1026047 = Sample 525335553 +[ INFO][27-Jun-24 10:52:39] Total gradient norm stats for 73 steps: 0.261 <= 0.2688 + 0.00429z <= 0.283 +[ INFO][27-Jun-24 10:52:39] Trained chunk 884 in 139.3s at 4270noun/s: lr=1.17e-06, loss=1.17e+00, top1=72.80%/72.213% +[ INFO][27-Jun-24 10:52:39] Chunk 885 = Batch 1027209 = Sample 525930497 +[ INFO][27-Jun-24 10:54:58] Total gradient norm stats for 73 steps: 0.2576 <= 0.2693 + 0.005613z <= 0.2832 +[ INFO][27-Jun-24 10:54:58] Trained chunk 885 in 139.3s at 4270noun/s: lr=1.03e-06, loss=1.17e+00, top1=71.15%/72.211% +[ INFO][27-Jun-24 10:54:58] Chunk 886 = Batch 1028371 = Sample 526525441 +[ INFO][27-Jun-24 10:57:17] Total gradient norm stats for 72 steps: 0.2619 <= 0.2694 + 0.005159z <= 0.2938 +[ INFO][27-Jun-24 10:57:17] Trained chunk 886 in 138.8s at 4286noun/s: lr=8.97e-07, loss=1.17e+00, top1=73.39%/72.207% +[ INFO][27-Jun-24 10:57:17] Chunk 887 = Batch 1029533 = Sample 527120385 +[ INFO][27-Jun-24 10:59:37] Total gradient norm stats for 73 steps: 0.2596 <= 0.2681 + 0.004694z <= 0.2834 +[ INFO][27-Jun-24 10:59:37] Trained chunk 887 in 139.4s at 4269noun/s: lr=7.74e-07, loss=1.17e+00, top1=71.63%/72.213% +[ INFO][27-Jun-24 10:59:37] Chunk 888 = Batch 1030695 = Sample 527715329 +[ INFO][27-Jun-24 11:01:56] Total gradient norm stats for 73 steps: 0.2582 <= 0.2689 + 0.005422z <= 0.2926 +[ INFO][27-Jun-24 11:01:56] Trained chunk 888 in 139.6s at 4263noun/s: lr=6.59e-07, loss=1.17e+00, top1=72.47%/72.216% +[ INFO][27-Jun-24 11:01:56] Chunk 889 = Batch 1031857 = Sample 528310273 +[ INFO][27-Jun-24 11:04:15] Total gradient norm stats for 72 steps: 0.2598 <= 0.2703 + 0.006622z <= 0.3067 +[ INFO][27-Jun-24 11:04:15] Trained chunk 889 in 139.1s at 4277noun/s: lr=5.54e-07, loss=1.17e+00, top1=72.98%/72.209% +[ INFO][27-Jun-24 11:04:15] Chunk 890 = Batch 1033019 = Sample 528905217 +[ INFO][27-Jun-24 11:06:34] Total gradient norm stats for 73 steps: 0.2607 <= 0.2689 + 0.004687z <= 0.2858 +[ INFO][27-Jun-24 11:06:34] Trained chunk 890 in 139.1s at 4278noun/s: lr=4.58e-07, loss=1.17e+00, top1=71.67%/72.204% +[ INFO][27-Jun-24 11:06:34] Chunk 891 = Batch 1034181 = Sample 529500161 +[ INFO][27-Jun-24 11:08:54] Total gradient norm stats for 72 steps: 0.261 <= 0.2692 + 0.004842z <= 0.2856 +[ INFO][27-Jun-24 11:08:54] Trained chunk 891 in 139.3s at 4271noun/s: lr=3.71e-07, loss=1.17e+00, top1=71.99%/72.204% +[ INFO][27-Jun-24 11:08:54] Chunk 892 = Batch 1035343 = Sample 530095105 +[ INFO][27-Jun-24 11:11:13] Total gradient norm stats for 73 steps: 0.2601 <= 0.2689 + 0.005483z <= 0.2872 +[ INFO][27-Jun-24 11:11:13] Trained chunk 892 in 139.9s at 4253noun/s: lr=2.93e-07, loss=1.17e+00, top1=73.34%/72.207% +[ INFO][27-Jun-24 11:11:13] Chunk 893 = Batch 1036505 = Sample 530690049 +[ INFO][27-Jun-24 11:13:33] Total gradient norm stats for 73 steps: 0.2618 <= 0.2696 + 0.004855z <= 0.286 +[ INFO][27-Jun-24 11:13:33] Trained chunk 893 in 139.7s at 4257noun/s: lr=2.24e-07, loss=1.17e+00, top1=72.04%/72.209% +[ INFO][27-Jun-24 11:13:33] Chunk 894 = Batch 1037667 = Sample 531284993 +[ INFO][27-Jun-24 11:15:52] Total gradient norm stats for 72 steps: 0.2604 <= 0.27 + 0.0126z <= 0.3681 +[ INFO][27-Jun-24 11:15:52] Trained chunk 894 in 139.2s at 4275noun/s: lr=1.65e-07, loss=1.17e+00, top1=71.23%/72.202% +[ INFO][27-Jun-24 11:15:52] Chunk 895 = Batch 1038829 = Sample 531879937 +[ INFO][27-Jun-24 11:18:12] Total gradient norm stats for 73 steps: 0.2602 <= 0.2677 + 0.003957z <= 0.2771 +[ INFO][27-Jun-24 11:18:12] Trained chunk 895 in 139.5s at 4266noun/s: lr=1.14e-07, loss=1.17e+00, top1=72.11%/72.204% +[ INFO][27-Jun-24 11:18:12] Chunk 896 = Batch 1039991 = Sample 532474881 +[ INFO][27-Jun-24 11:20:31] Total gradient norm stats for 73 steps: 0.2598 <= 0.2705 + 0.009224z <= 0.3345 +[ INFO][27-Jun-24 11:20:31] Trained chunk 896 in 139.2s at 4273noun/s: lr=7.33e-08, loss=1.17e+00, top1=72.79%/72.200% +[ INFO][27-Jun-24 11:20:31] Chunk 897 = Batch 1041153 = Sample 533069825 +[ INFO][27-Jun-24 11:22:51] Total gradient norm stats for 72 steps: 0.2605 <= 0.269 + 0.005331z <= 0.2861 +[ INFO][27-Jun-24 11:22:51] Trained chunk 897 in 139.6s at 4261noun/s: lr=4.12e-08, loss=1.17e+00, top1=72.78%/72.206% +[ INFO][27-Jun-24 11:22:51] Chunk 898 = Batch 1042315 = Sample 533664769 +[ INFO][27-Jun-24 11:25:10] Total gradient norm stats for 73 steps: 0.2609 <= 0.2706 + 0.009554z <= 0.333 +[ INFO][27-Jun-24 11:25:10] Trained chunk 898 in 139.0s at 4282noun/s: lr=1.83e-08, loss=1.17e+00, top1=71.57%/72.206% +[ INFO][27-Jun-24 11:25:10] Chunk 899 = Batch 1043477 = Sample 534259713 +[ INFO][27-Jun-24 11:27:29] Total gradient norm stats for 72 steps: 0.2585 <= 0.2678 + 0.003495z <= 0.2798 +[ INFO][27-Jun-24 11:27:29] Trained chunk 899 in 139.5s at 4265noun/s: lr=4.58e-09, loss=1.17e+00, top1=72.49%/72.204% +[ INFO][27-Jun-24 11:27:29] Saved checkpoint: /data/strahl/Code/ovod/outputs/ovod_20240626_001447/ovod_chunk0899_20240627_112729.train +[ INFO][27-Jun-24 11:27:30] -------------------------------------------------------------------------------- +[ INFO][27-Jun-24 11:27:30] Trained for 899 chunks (up to 18 epochs) in 126759.7s +[ INFO][27-Jun-24 11:27:30] Trained 1044638 batches = 534854656 samples +[ INFO][27-Jun-24 11:27:31] Unloaded and un-memory-mapped cache