Commit
·
d99a73a
1
Parent(s):
7145fd6
Add ablations
Browse files- ablations/combined_remaqe/tokenized_combined_remaqe.zip +3 -0
- ablations/combined_remaqe/trained_arm32_L4_combined_remaqe/beam1.zip +3 -0
- ablations/combined_remaqe/trained_arm32_L4_combined_remaqe/beam5.zip +3 -0
- ablations/combined_remaqe/trained_arm32_L4_combined_remaqe/checkpoint_best.pt +3 -0
- ablations/combined_remaqe/trained_arm32_L4_combined_remaqe/training.log +0 -0
- ablations/combined_remaqe/trained_arm32_base_combined_remaqe/beam1.zip +3 -0
- ablations/combined_remaqe/trained_arm32_base_combined_remaqe/beam5.zip +3 -0
- ablations/combined_remaqe/trained_arm32_base_combined_remaqe/checkpoint_best.pt +3 -0
- ablations/combined_remaqe/trained_arm32_base_combined_remaqe/training.log +0 -0
- ablations/noconst/tokenized_noconst.zip +3 -0
- ablations/noconst/trained_aarch64_best_noconst/beam1.zip +3 -0
- ablations/noconst/trained_aarch64_best_noconst/checkpoint_best.pt +3 -0
- ablations/noconst/trained_aarch64_best_noconst/training.log +230 -0
- ablations/noconst/trained_arm32_best_noconst/beam1.zip +3 -0
- ablations/noconst/trained_arm32_best_noconst/checkpoint_best.pt +3 -0
- ablations/noconst/trained_arm32_best_noconst/training.log +230 -0
- ablations/noconst/trained_x64_best_noconst/beam1.zip +3 -0
- ablations/noconst/trained_x64_best_noconst/checkpoint_best.pt +3 -0
- ablations/noconst/trained_x64_best_noconst/training.log +142 -0
- ablations/postfix/tokenized_postfix.zip +3 -0
- ablations/postfix/trained_aarch64_best_postfix/beam1.zip +3 -0
- ablations/postfix/trained_aarch64_best_postfix/checkpoint_best.pt +3 -0
- ablations/postfix/trained_aarch64_best_postfix/training.log +590 -0
- ablations/postfix/trained_arm32_best_postfix/beam1.zip +3 -0
- ablations/postfix/trained_arm32_best_postfix/checkpoint_best.pt +3 -0
- ablations/postfix/trained_arm32_best_postfix/training.log +0 -0
- ablations/postfix/trained_x64_best_postfix/beam1.zip +3 -0
- ablations/postfix/trained_x64_best_postfix/checkpoint_best.pt +3 -0
- ablations/postfix/trained_x64_best_postfix/training.log +385 -0
ablations/combined_remaqe/tokenized_combined_remaqe.zip
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:188557e5ab2e719ae03024d3c7cfbe7d09a7101cd7f1aaed34303d9a0c69a033
|
| 3 |
+
size 34307365
|
ablations/combined_remaqe/trained_arm32_L4_combined_remaqe/beam1.zip
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b70cc65a2c60296f8c4f6cf88c20b7f5e7f7303aa7c68656291b266efc69d1e9
|
| 3 |
+
size 1223170
|
ablations/combined_remaqe/trained_arm32_L4_combined_remaqe/beam5.zip
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f5aac3a1b6bc20dc0935c218369d456a3d19c42cb27e6b0297baf50234d1071b
|
| 3 |
+
size 1218948
|
ablations/combined_remaqe/trained_arm32_L4_combined_remaqe/checkpoint_best.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3a2905110e2eee764ea97e5baa00a39cbae684a9a6a2ebb17cd8b221bd515d6a
|
| 3 |
+
size 96298768
|
ablations/combined_remaqe/trained_arm32_L4_combined_remaqe/training.log
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
ablations/combined_remaqe/trained_arm32_base_combined_remaqe/beam1.zip
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a158061ff4accf7dddff828fcfec40d1d607a41cf85b2a04f84d4265a8155b01
|
| 3 |
+
size 1221321
|
ablations/combined_remaqe/trained_arm32_base_combined_remaqe/beam5.zip
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ded6f1e831902eb5a9af4971993a2c75fc934295e5f843ad1431d612f0668255
|
| 3 |
+
size 1213901
|
ablations/combined_remaqe/trained_arm32_base_combined_remaqe/checkpoint_best.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e8d01f00bd516f3d4a40b79725dd6a984aed68ba859083122a9151dee4916e6d
|
| 3 |
+
size 143875864
|
ablations/combined_remaqe/trained_arm32_base_combined_remaqe/training.log
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
ablations/noconst/tokenized_noconst.zip
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:74933be13ba3b2aa34911b97f4451c7b6c99bf35d4ba3daa0263ee9426c1aea0
|
| 3 |
+
size 138509503
|
ablations/noconst/trained_aarch64_best_noconst/beam1.zip
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ea4f4c1ff72a8e98b31b751d135eb61fd51776bd67a51fb8597236504de30060
|
| 3 |
+
size 3303135
|
ablations/noconst/trained_aarch64_best_noconst/checkpoint_best.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8d5b661e4f82f5dcd60fabe7e7b3c765181028afa0469b5c909143477d52cdc4
|
| 3 |
+
size 186110260
|
ablations/noconst/trained_aarch64_best_noconst/training.log
ADDED
|
@@ -0,0 +1,230 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 100, 'log_format': None, 'log_file': 'aarch64/trained_aarch64_best_noconst/training.log', 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': None, 'wandb_project': None, 'azureml_logging': False, 'seed': 1, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': False, 'memory_efficient_fp16': False, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 128, 'fp16_scale_window': None, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': None, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': None, 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quantization_config_path': None, 'profile': False, 'reset_logging': False, 'suppress_crashes': False, 'use_plasma_view': False, 'plasma_path': '/tmp/plasma'}, 'common_eval': {'_name': None, 'path': None, 'post_process': None, 'quiet': False, 'model_overrides': '{}', 'results_path': None}, 'distributed_training': {'_name': None, 'distributed_world_size': 1, 'distributed_num_procs': 1, 'distributed_rank': 0, 'distributed_backend': 'nccl', 'distributed_init_method': None, 'distributed_port': -1, 'device_id': 0, 'distributed_no_spawn': False, 'ddp_backend': 'pytorch_ddp', 'ddp_comm_hook': 'none', 'bucket_cap_mb': 25, 'fix_batches_to_gpus': False, 'find_unused_parameters': False, 'gradient_as_bucket_view': False, 'fast_stat_sync': False, 'heartbeat_timeout': -1, 'broadcast_buffers': False, 'slowmo_momentum': None, 'slowmo_base_algorithm': 'localsgd', 'localsgd_frequency': 3, 'nprocs_per_node': 1, 'pipeline_model_parallel': False, 'pipeline_balance': None, 'pipeline_devices': None, 'pipeline_chunks': 0, 'pipeline_encoder_balance': None, 'pipeline_encoder_devices': None, 'pipeline_decoder_balance': None, 'pipeline_decoder_devices': None, 'pipeline_checkpoint': 'never', 'zero_sharding': 'none', 'fp16': False, 'memory_efficient_fp16': False, 'tpu': False, 'no_reshard_after_forward': False, 'fp32_reduce_scatter': False, 'cpu_offload': False, 'use_sharded_state': False, 'not_fsdp_flatten_parameters': False}, 'dataset': {'_name': None, 'num_workers': 1, 'skip_invalid_size_inputs_valid_test': False, 'max_tokens': 20000, 'batch_size': None, 'required_batch_size_multiple': 8, 'required_seq_len_multiple': 1, 'dataset_impl': None, 'data_buffer_size': 10, 'train_subset': 'train', 'valid_subset': 'valid', 'combine_valid_subsets': None, 'ignore_unused_valid_subsets': False, 'validate_interval': 1, 'validate_interval_updates': 0, 'validate_after_updates': 0, 'fixed_validation_seed': None, 'disable_validation': False, 'max_tokens_valid': 20000, 'batch_size_valid': None, 'max_valid_steps': None, 'curriculum': 0, 'gen_subset': 'test', 'num_shards': 1, 'shard_id': 0, 'grouped_shuffling': False, 'update_epoch_batch_itr': False, 'update_ordered_indices_seed': False}, 'optimization': {'_name': None, 'max_epoch': 0, 'max_update': 100000, 'stop_time_hours': 0.0, 'clip_norm': 0.0, 'sentence_avg': False, 'update_freq': [1], 'lr': [0.0005], 'stop_min_lr': -1.0, 'use_bmuf': False, 'skip_remainder_batch': False}, 'checkpoint': {'_name': None, 'save_dir': 'aarch64/trained_aarch64_best_noconst', 'restore_file': 'checkpoint_last.pt', 'continue_once': None, 'finetune_from_model': None, 'reset_dataloader': False, 'reset_lr_scheduler': False, 'reset_meters': False, 'reset_optimizer': False, 'optimizer_overrides': '{}', 'save_interval': 1, 'save_interval_updates': 0, 'keep_interval_updates': -1, 'keep_interval_updates_pattern': -1, 'keep_last_epochs': -1, 'keep_best_checkpoints': 3, 'no_save': False, 'no_epoch_checkpoints': True, 'no_last_checkpoints': False, 'no_save_optimizer_state': False, 'best_checkpoint_metric': 'loss', 'maximize_best_checkpoint_metric': False, 'patience': 4, 'checkpoint_suffix': '', 'checkpoint_shard_count': 1, 'load_checkpoint_on_all_dp_ranks': False, 'write_checkpoints_asynchronously': False, 'model_parallel_size': 1}, 'bmuf': {'_name': None, 'block_lr': 1.0, 'block_momentum': 0.875, 'global_sync_iter': 50, 'warmup_iterations': 500, 'use_nbm': False, 'average_sync': False, 'distributed_world_size': 1}, 'generation': {'_name': None, 'beam': 5, 'nbest': 1, 'max_len_a': 0.0, 'max_len_b': 200, 'min_len': 1, 'match_source_len': False, 'unnormalized': False, 'no_early_stop': False, 'no_beamable_mm': False, 'lenpen': 1.0, 'unkpen': 0.0, 'replace_unk': None, 'sacrebleu': False, 'score_reference': False, 'prefix_size': 0, 'no_repeat_ngram_size': 0, 'sampling': False, 'sampling_topk': -1, 'sampling_topp': -1.0, 'constraints': None, 'temperature': 1.0, 'diverse_beam_groups': -1, 'diverse_beam_strength': 0.5, 'diversity_rate': -1.0, 'print_alignment': None, 'print_step': False, 'lm_path': None, 'lm_weight': 0.0, 'iter_decode_eos_penalty': 0.0, 'iter_decode_max_iter': 10, 'iter_decode_force_max_iter': False, 'iter_decode_with_beam': 1, 'iter_decode_with_external_reranker': False, 'retain_iter_history': False, 'retain_dropout': False, 'retain_dropout_modules': None, 'decoding_format': None, 'no_seed_provided': False, 'eos_token': None}, 'eval_lm': {'_name': None, 'output_word_probs': False, 'output_word_stats': False, 'context_window': 0, 'softmax_batch': 9223372036854775807}, 'interactive': {'_name': None, 'buffer_size': 0, 'input': '-'}, 'model': Namespace(_name='transformer', activation_dropout=0.0, activation_fn='relu', adam_betas=(0.9, 0.999), adam_eps=1e-08, adaptive_input=False, adaptive_softmax_cutoff=None, adaptive_softmax_dropout=0, aim_repo=None, aim_run_hash=None, all_gather_list_size=16384, amp=False, amp_batch_retries=2, amp_init_scale=128, amp_scale_window=None, arch='transformer', attention_dropout=0.0, azureml_logging=False, batch_size=None, batch_size_valid=None, best_checkpoint_metric='loss', bf16=False, bpe=None, broadcast_buffers=False, bucket_cap_mb=25, checkpoint_activations=False, checkpoint_shard_count=1, checkpoint_suffix='', clip_norm=0.0, combine_valid_subsets=None, continue_once=None, cpu=False, cpu_offload=False, criterion='cross_entropy', cross_self_attention=False, curriculum=0, data='aarch64/tokenized_dlsm_aarch64_noconst', data_buffer_size=10, dataset_impl=None, ddp_backend='pytorch_ddp', ddp_comm_hook='none', decoder_attention_heads=8, decoder_embed_dim=96, decoder_embed_path=None, decoder_ffn_embed_dim=384, decoder_input_dim=96, decoder_layerdrop=0, decoder_layers=6, decoder_layers_to_keep=None, decoder_learned_pos=False, decoder_normalize_before=False, decoder_output_dim='96', device_id=0, disable_validation=False, distributed_backend='nccl', distributed_init_method=None, distributed_no_spawn=False, distributed_num_procs=1, distributed_port=-1, distributed_rank=0, distributed_world_size=1, dropout=0.1, ema_decay=0.9999, ema_fp32=False, ema_seed_model=None, ema_start_update=0, ema_update_freq=1, empty_cache_freq=0, encoder_attention_heads=8, encoder_embed_dim=384, encoder_embed_path=None, encoder_ffn_embed_dim=1536, encoder_layerdrop=0, encoder_layers=6, encoder_layers_to_keep=None, encoder_learned_pos=False, encoder_normalize_before=False, eos=2, eval_bleu=False, eval_bleu_args='{}', eval_bleu_detok='space', eval_bleu_detok_args='{}', eval_bleu_print_samples=False, eval_bleu_remove_bpe=None, eval_tokenized_bleu=False, fast_stat_sync=False, find_unused_parameters=False, finetune_from_model=None, fix_batches_to_gpus=False, fixed_validation_seed=None, fp16=False, fp16_adam_stats=False, fp16_init_scale=128, fp16_no_flatten_grads=False, fp16_scale_tolerance=0.0, fp16_scale_window=None, fp32_reduce_scatter=False, gen_subset='test', gradient_as_bucket_view=False, grouped_shuffling=False, heartbeat_timeout=-1, ignore_unused_valid_subsets=False, keep_best_checkpoints=3, keep_interval_updates=-1, keep_interval_updates_pattern=-1, keep_last_epochs=-1, layernorm_embedding=False, left_pad_source=True, left_pad_target=False, load_alignments=False, load_checkpoint_on_all_dp_ranks=False, localsgd_frequency=3, log_file='aarch64/trained_aarch64_best_noconst/training.log', log_format=None, log_interval=100, lr=[0.0005], lr_scheduler='inverse_sqrt', max_epoch=0, max_source_positions=1024, max_target_positions=1024, max_tokens=20000, max_tokens_valid=20000, max_update=100000, max_valid_steps=None, maximize_best_checkpoint_metric=False, memory_efficient_bf16=False, memory_efficient_fp16=False, min_loss_scale=0.0001, model_parallel_size=1, no_cross_attention=False, no_epoch_checkpoints=True, no_last_checkpoints=False, no_progress_bar=False, no_reshard_after_forward=False, no_save=False, no_save_optimizer_state=False, no_scale_embedding=False, no_seed_provided=False, no_token_positional_embeddings=False, not_fsdp_flatten_parameters=False, nprocs_per_node=1, num_batch_buckets=0, num_shards=1, num_workers=1, offload_activations=False, on_cpu_convert_precision=False, optimizer='adam', optimizer_overrides='{}', pad=1, patience=4, pipeline_balance=None, pipeline_checkpoint='never', pipeline_chunks=0, pipeline_decoder_balance=None, pipeline_decoder_devices=None, pipeline_devices=None, pipeline_encoder_balance=None, pipeline_encoder_devices=None, pipeline_model_parallel=False, plasma_path='/tmp/plasma', profile=False, quant_noise_pq=0, quant_noise_pq_block_size=8, quant_noise_scalar=0, quantization_config_path=None, required_batch_size_multiple=8, required_seq_len_multiple=1, reset_dataloader=False, reset_logging=False, reset_lr_scheduler=False, reset_meters=False, reset_optimizer=False, restore_file='checkpoint_last.pt', save_dir='aarch64/trained_aarch64_best_noconst', save_interval=1, save_interval_updates=0, scoring='bleu', seed=1, sentence_avg=False, shard_id=0, share_all_embeddings=False, share_decoder_input_output_embed=False, skip_invalid_size_inputs_valid_test=False, skip_remainder_batch=False, slowmo_base_algorithm='localsgd', slowmo_momentum=None, source_lang=None, stop_min_lr=-1.0, stop_time_hours=0, store_ema=False, suppress_crashes=False, target_lang=None, task='translation', tensorboard_logdir=None, threshold_loss_scale=None, tie_adaptive_weights=False, tokenizer=None, tpu=False, train_subset='train', truncate_source=False, unk=3, update_epoch_batch_itr=False, update_freq=[1], update_ordered_indices_seed=False, upsample_primary=-1, use_bmuf=False, use_old_adam=False, use_plasma_view=False, use_sharded_state=False, user_dir=None, valid_subset='valid', validate_after_updates=0, validate_interval=1, validate_interval_updates=0, wandb_project=None, warmup_init_lr=-1, warmup_updates=4000, weight_decay=0.001, write_checkpoints_asynchronously=False, zero_sharding='none'), 'task': {'_name': 'translation', 'data': 'aarch64/tokenized_dlsm_aarch64_noconst', 'source_lang': None, 'target_lang': None, 'load_alignments': False, 'left_pad_source': True, 'left_pad_target': False, 'max_source_positions': 1024, 'max_target_positions': 1024, 'upsample_primary': -1, 'truncate_source': False, 'num_batch_buckets': 0, 'train_subset': 'train', 'dataset_impl': None, 'required_seq_len_multiple': 1, 'eval_bleu': False, 'eval_bleu_args': '{}', 'eval_bleu_detok': 'space', 'eval_bleu_detok_args': '{}', 'eval_tokenized_bleu': False, 'eval_bleu_remove_bpe': None, 'eval_bleu_print_samples': False}, 'criterion': {'_name': 'cross_entropy', 'sentence_avg': False}, 'optimizer': {'_name': 'adam', 'adam_betas': [0.9, 0.999], 'adam_eps': 1e-08, 'weight_decay': 0.001, 'use_old_adam': False, 'fp16_adam_stats': False, 'tpu': False, 'lr': [0.0005]}, 'lr_scheduler': {'_name': 'inverse_sqrt', 'warmup_updates': 4000, 'warmup_init_lr': -1.0, 'lr': [0.0005]}, 'scoring': {'_name': 'bleu', 'pad': 1, 'eos': 2, 'unk': 3}, 'bpe': None, 'tokenizer': None, 'ema': {'_name': None, 'store_ema': False, 'ema_decay': 0.9999, 'ema_start_update': 0, 'ema_seed_model': None, 'ema_update_freq': 1, 'ema_fp32': False}}
|
| 2 |
+
TransformerModel(
|
| 3 |
+
(encoder): TransformerEncoderBase(
|
| 4 |
+
(dropout_module): FairseqDropout()
|
| 5 |
+
(embed_tokens): Embedding(152, 384, padding_idx=1)
|
| 6 |
+
(embed_positions): SinusoidalPositionalEmbedding()
|
| 7 |
+
(layers): ModuleList(
|
| 8 |
+
(0-5): 6 x TransformerEncoderLayerBase(
|
| 9 |
+
(self_attn): MultiheadAttention(
|
| 10 |
+
(dropout_module): FairseqDropout()
|
| 11 |
+
(k_proj): Linear(in_features=384, out_features=384, bias=True)
|
| 12 |
+
(v_proj): Linear(in_features=384, out_features=384, bias=True)
|
| 13 |
+
(q_proj): Linear(in_features=384, out_features=384, bias=True)
|
| 14 |
+
(out_proj): Linear(in_features=384, out_features=384, bias=True)
|
| 15 |
+
)
|
| 16 |
+
(self_attn_layer_norm): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
|
| 17 |
+
(dropout_module): FairseqDropout()
|
| 18 |
+
(activation_dropout_module): FairseqDropout()
|
| 19 |
+
(fc1): Linear(in_features=384, out_features=1536, bias=True)
|
| 20 |
+
(fc2): Linear(in_features=1536, out_features=384, bias=True)
|
| 21 |
+
(final_layer_norm): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
|
| 22 |
+
)
|
| 23 |
+
)
|
| 24 |
+
)
|
| 25 |
+
(decoder): TransformerDecoderBase(
|
| 26 |
+
(dropout_module): FairseqDropout()
|
| 27 |
+
(embed_tokens): Embedding(40, 96, padding_idx=1)
|
| 28 |
+
(embed_positions): SinusoidalPositionalEmbedding()
|
| 29 |
+
(layers): ModuleList(
|
| 30 |
+
(0-5): 6 x TransformerDecoderLayerBase(
|
| 31 |
+
(dropout_module): FairseqDropout()
|
| 32 |
+
(self_attn): MultiheadAttention(
|
| 33 |
+
(dropout_module): FairseqDropout()
|
| 34 |
+
(k_proj): Linear(in_features=96, out_features=96, bias=True)
|
| 35 |
+
(v_proj): Linear(in_features=96, out_features=96, bias=True)
|
| 36 |
+
(q_proj): Linear(in_features=96, out_features=96, bias=True)
|
| 37 |
+
(out_proj): Linear(in_features=96, out_features=96, bias=True)
|
| 38 |
+
)
|
| 39 |
+
(activation_dropout_module): FairseqDropout()
|
| 40 |
+
(self_attn_layer_norm): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
|
| 41 |
+
(encoder_attn): MultiheadAttention(
|
| 42 |
+
(dropout_module): FairseqDropout()
|
| 43 |
+
(k_proj): Linear(in_features=384, out_features=96, bias=True)
|
| 44 |
+
(v_proj): Linear(in_features=384, out_features=96, bias=True)
|
| 45 |
+
(q_proj): Linear(in_features=96, out_features=96, bias=True)
|
| 46 |
+
(out_proj): Linear(in_features=96, out_features=96, bias=True)
|
| 47 |
+
)
|
| 48 |
+
(encoder_attn_layer_norm): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
|
| 49 |
+
(fc1): Linear(in_features=96, out_features=384, bias=True)
|
| 50 |
+
(fc2): Linear(in_features=384, out_features=96, bias=True)
|
| 51 |
+
(final_layer_norm): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
|
| 52 |
+
)
|
| 53 |
+
)
|
| 54 |
+
(output_projection): Linear(in_features=96, out_features=40, bias=False)
|
| 55 |
+
)
|
| 56 |
+
)
|
| 57 |
+
task: TranslationTask
|
| 58 |
+
model: TransformerModel
|
| 59 |
+
criterion: CrossEntropyCriterion
|
| 60 |
+
num. shared model params: 22,577,856 (num. trained: 22,577,856)
|
| 61 |
+
num. expert model params: 0 (num. trained: 0)
|
| 62 |
+
training on 1 devices (GPUs/TPUs)
|
| 63 |
+
max tokens per device = 20000 and max sentences per device = None
|
| 64 |
+
Start iterating over samples
|
| 65 |
+
begin validation on "valid" subset
|
| 66 |
+
epoch 001 | valid on 'valid' subset | loss 1.653 | ppl 3.14 | wps 26700.3 | wpb 2722.6 | bsz 101 | num_updates 4658
|
| 67 |
+
end of epoch 1 (average epoch stats below)
|
| 68 |
+
epoch 001 | loss 2.308 | ppl 4.95 | wps 8767 | ups 3.16 | wpb 2778.2 | bsz 101.6 | num_updates 4658 | lr 0.00046334 | gnorm 2.703 | train_wall 1452 | gb_free 12.9 | wall 1477
|
| 69 |
+
Start iterating over samples
|
| 70 |
+
begin validation on "valid" subset
|
| 71 |
+
epoch 002 | valid on 'valid' subset | loss 0.938 | ppl 1.92 | wps 27666.1 | wpb 2722.6 | bsz 101 | num_updates 9316 | best_loss 0.938
|
| 72 |
+
epoch 002 | valid on 'valid' subset | loss 0.938 | ppl 1.92 | wps 27666.1 | wpb 2722.6 | bsz 101 | num_updates 9316 | best_loss 0.938
|
| 73 |
+
end of epoch 2 (average epoch stats below)
|
| 74 |
+
epoch 002 | loss 1.284 | ppl 2.44 | wps 8920 | ups 3.21 | wpb 2778.2 | bsz 101.6 | num_updates 9316 | lr 0.000327631 | gnorm 1.316 | train_wall 1427 | gb_free 12.7 | wall 2928
|
| 75 |
+
epoch 002 | loss 1.284 | ppl 2.44 | wps 8920 | ups 3.21 | wpb 2778.2 | bsz 101.6 | num_updates 9316 | lr 0.000327631 | gnorm 1.316 | train_wall 1427 | gb_free 12.7 | wall 2928
|
| 76 |
+
Start iterating over samples
|
| 77 |
+
begin validation on "valid" subset
|
| 78 |
+
epoch 003 | valid on 'valid' subset | loss 0.715 | ppl 1.64 | wps 26749.8 | wpb 2722.6 | bsz 101 | num_updates 13974 | best_loss 0.715
|
| 79 |
+
epoch 003 | valid on 'valid' subset | loss 0.715 | ppl 1.64 | wps 26749.8 | wpb 2722.6 | bsz 101 | num_updates 13974 | best_loss 0.715
|
| 80 |
+
epoch 003 | valid on 'valid' subset | loss 0.715 | ppl 1.64 | wps 26749.8 | wpb 2722.6 | bsz 101 | num_updates 13974 | best_loss 0.715
|
| 81 |
+
end of epoch 3 (average epoch stats below)
|
| 82 |
+
epoch 003 | loss 0.803 | ppl 1.74 | wps 8866.1 | ups 3.19 | wpb 2778.2 | bsz 101.6 | num_updates 13974 | lr 0.00026751 | gnorm 1.088 | train_wall 1435 | gb_free 12.6 | wall 4387
|
| 83 |
+
epoch 003 | loss 0.803 | ppl 1.74 | wps 8866.1 | ups 3.19 | wpb 2778.2 | bsz 101.6 | num_updates 13974 | lr 0.00026751 | gnorm 1.088 | train_wall 1435 | gb_free 12.6 | wall 4387
|
| 84 |
+
epoch 003 | loss 0.803 | ppl 1.74 | wps 8866.1 | ups 3.19 | wpb 2778.2 | bsz 101.6 | num_updates 13974 | lr 0.00026751 | gnorm 1.088 | train_wall 1435 | gb_free 12.6 | wall 4387
|
| 85 |
+
Start iterating over samples
|
| 86 |
+
begin validation on "valid" subset
|
| 87 |
+
epoch 004 | valid on 'valid' subset | loss 0.621 | ppl 1.54 | wps 26630.6 | wpb 2722.6 | bsz 101 | num_updates 18632 | best_loss 0.621
|
| 88 |
+
epoch 004 | valid on 'valid' subset | loss 0.621 | ppl 1.54 | wps 26630.6 | wpb 2722.6 | bsz 101 | num_updates 18632 | best_loss 0.621
|
| 89 |
+
epoch 004 | valid on 'valid' subset | loss 0.621 | ppl 1.54 | wps 26630.6 | wpb 2722.6 | bsz 101 | num_updates 18632 | best_loss 0.621
|
| 90 |
+
epoch 004 | valid on 'valid' subset | loss 0.621 | ppl 1.54 | wps 26630.6 | wpb 2722.6 | bsz 101 | num_updates 18632 | best_loss 0.621
|
| 91 |
+
end of epoch 4 (average epoch stats below)
|
| 92 |
+
epoch 004 | loss 0.586 | ppl 1.5 | wps 8770.3 | ups 3.16 | wpb 2778.2 | bsz 101.6 | num_updates 18632 | lr 0.00023167 | gnorm 0.927 | train_wall 1450 | gb_free 12.9 | wall 5863
|
| 93 |
+
epoch 004 | loss 0.586 | ppl 1.5 | wps 8770.3 | ups 3.16 | wpb 2778.2 | bsz 101.6 | num_updates 18632 | lr 0.00023167 | gnorm 0.927 | train_wall 1450 | gb_free 12.9 | wall 5863
|
| 94 |
+
epoch 004 | loss 0.586 | ppl 1.5 | wps 8770.3 | ups 3.16 | wpb 2778.2 | bsz 101.6 | num_updates 18632 | lr 0.00023167 | gnorm 0.927 | train_wall 1450 | gb_free 12.9 | wall 5863
|
| 95 |
+
epoch 004 | loss 0.586 | ppl 1.5 | wps 8770.3 | ups 3.16 | wpb 2778.2 | bsz 101.6 | num_updates 18632 | lr 0.00023167 | gnorm 0.927 | train_wall 1450 | gb_free 12.9 | wall 5863
|
| 96 |
+
Start iterating over samples
|
| 97 |
+
begin validation on "valid" subset
|
| 98 |
+
epoch 005 | valid on 'valid' subset | loss 0.541 | ppl 1.45 | wps 26626.3 | wpb 2722.6 | bsz 101 | num_updates 23290 | best_loss 0.541
|
| 99 |
+
epoch 005 | valid on 'valid' subset | loss 0.541 | ppl 1.45 | wps 26626.3 | wpb 2722.6 | bsz 101 | num_updates 23290 | best_loss 0.541
|
| 100 |
+
epoch 005 | valid on 'valid' subset | loss 0.541 | ppl 1.45 | wps 26626.3 | wpb 2722.6 | bsz 101 | num_updates 23290 | best_loss 0.541
|
| 101 |
+
epoch 005 | valid on 'valid' subset | loss 0.541 | ppl 1.45 | wps 26626.3 | wpb 2722.6 | bsz 101 | num_updates 23290 | best_loss 0.541
|
| 102 |
+
epoch 005 | valid on 'valid' subset | loss 0.541 | ppl 1.45 | wps 26626.3 | wpb 2722.6 | bsz 101 | num_updates 23290 | best_loss 0.541
|
| 103 |
+
end of epoch 5 (average epoch stats below)
|
| 104 |
+
epoch 005 | loss 0.454 | ppl 1.37 | wps 8758.4 | ups 3.15 | wpb 2778.2 | bsz 101.6 | num_updates 23290 | lr 0.000207212 | gnorm 0.835 | train_wall 1452 | gb_free 12.9 | wall 7340
|
| 105 |
+
epoch 005 | loss 0.454 | ppl 1.37 | wps 8758.4 | ups 3.15 | wpb 2778.2 | bsz 101.6 | num_updates 23290 | lr 0.000207212 | gnorm 0.835 | train_wall 1452 | gb_free 12.9 | wall 7340
|
| 106 |
+
epoch 005 | loss 0.454 | ppl 1.37 | wps 8758.4 | ups 3.15 | wpb 2778.2 | bsz 101.6 | num_updates 23290 | lr 0.000207212 | gnorm 0.835 | train_wall 1452 | gb_free 12.9 | wall 7340
|
| 107 |
+
epoch 005 | loss 0.454 | ppl 1.37 | wps 8758.4 | ups 3.15 | wpb 2778.2 | bsz 101.6 | num_updates 23290 | lr 0.000207212 | gnorm 0.835 | train_wall 1452 | gb_free 12.9 | wall 7340
|
| 108 |
+
epoch 005 | loss 0.454 | ppl 1.37 | wps 8758.4 | ups 3.15 | wpb 2778.2 | bsz 101.6 | num_updates 23290 | lr 0.000207212 | gnorm 0.835 | train_wall 1452 | gb_free 12.9 | wall 7340
|
| 109 |
+
Start iterating over samples
|
| 110 |
+
begin validation on "valid" subset
|
| 111 |
+
epoch 006 | valid on 'valid' subset | loss 0.508 | ppl 1.42 | wps 26570.2 | wpb 2722.6 | bsz 101 | num_updates 27948 | best_loss 0.508
|
| 112 |
+
epoch 006 | valid on 'valid' subset | loss 0.508 | ppl 1.42 | wps 26570.2 | wpb 2722.6 | bsz 101 | num_updates 27948 | best_loss 0.508
|
| 113 |
+
epoch 006 | valid on 'valid' subset | loss 0.508 | ppl 1.42 | wps 26570.2 | wpb 2722.6 | bsz 101 | num_updates 27948 | best_loss 0.508
|
| 114 |
+
epoch 006 | valid on 'valid' subset | loss 0.508 | ppl 1.42 | wps 26570.2 | wpb 2722.6 | bsz 101 | num_updates 27948 | best_loss 0.508
|
| 115 |
+
epoch 006 | valid on 'valid' subset | loss 0.508 | ppl 1.42 | wps 26570.2 | wpb 2722.6 | bsz 101 | num_updates 27948 | best_loss 0.508
|
| 116 |
+
epoch 006 | valid on 'valid' subset | loss 0.508 | ppl 1.42 | wps 26570.2 | wpb 2722.6 | bsz 101 | num_updates 27948 | best_loss 0.508
|
| 117 |
+
end of epoch 6 (average epoch stats below)
|
| 118 |
+
epoch 006 | loss 0.357 | ppl 1.28 | wps 8760.3 | ups 3.15 | wpb 2778.2 | bsz 101.6 | num_updates 27948 | lr 0.000189158 | gnorm 0.782 | train_wall 1452 | gb_free 12.7 | wall 8818
|
| 119 |
+
epoch 006 | loss 0.357 | ppl 1.28 | wps 8760.3 | ups 3.15 | wpb 2778.2 | bsz 101.6 | num_updates 27948 | lr 0.000189158 | gnorm 0.782 | train_wall 1452 | gb_free 12.7 | wall 8818
|
| 120 |
+
epoch 006 | loss 0.357 | ppl 1.28 | wps 8760.3 | ups 3.15 | wpb 2778.2 | bsz 101.6 | num_updates 27948 | lr 0.000189158 | gnorm 0.782 | train_wall 1452 | gb_free 12.7 | wall 8818
|
| 121 |
+
epoch 006 | loss 0.357 | ppl 1.28 | wps 8760.3 | ups 3.15 | wpb 2778.2 | bsz 101.6 | num_updates 27948 | lr 0.000189158 | gnorm 0.782 | train_wall 1452 | gb_free 12.7 | wall 8818
|
| 122 |
+
epoch 006 | loss 0.357 | ppl 1.28 | wps 8760.3 | ups 3.15 | wpb 2778.2 | bsz 101.6 | num_updates 27948 | lr 0.000189158 | gnorm 0.782 | train_wall 1452 | gb_free 12.7 | wall 8818
|
| 123 |
+
epoch 006 | loss 0.357 | ppl 1.28 | wps 8760.3 | ups 3.15 | wpb 2778.2 | bsz 101.6 | num_updates 27948 | lr 0.000189158 | gnorm 0.782 | train_wall 1452 | gb_free 12.7 | wall 8818
|
| 124 |
+
Start iterating over samples
|
| 125 |
+
begin validation on "valid" subset
|
| 126 |
+
epoch 007 | valid on 'valid' subset | loss 0.492 | ppl 1.41 | wps 26701.1 | wpb 2722.6 | bsz 101 | num_updates 32606 | best_loss 0.492
|
| 127 |
+
epoch 007 | valid on 'valid' subset | loss 0.492 | ppl 1.41 | wps 26701.1 | wpb 2722.6 | bsz 101 | num_updates 32606 | best_loss 0.492
|
| 128 |
+
epoch 007 | valid on 'valid' subset | loss 0.492 | ppl 1.41 | wps 26701.1 | wpb 2722.6 | bsz 101 | num_updates 32606 | best_loss 0.492
|
| 129 |
+
epoch 007 | valid on 'valid' subset | loss 0.492 | ppl 1.41 | wps 26701.1 | wpb 2722.6 | bsz 101 | num_updates 32606 | best_loss 0.492
|
| 130 |
+
epoch 007 | valid on 'valid' subset | loss 0.492 | ppl 1.41 | wps 26701.1 | wpb 2722.6 | bsz 101 | num_updates 32606 | best_loss 0.492
|
| 131 |
+
epoch 007 | valid on 'valid' subset | loss 0.492 | ppl 1.41 | wps 26701.1 | wpb 2722.6 | bsz 101 | num_updates 32606 | best_loss 0.492
|
| 132 |
+
epoch 007 | valid on 'valid' subset | loss 0.492 | ppl 1.41 | wps 26701.1 | wpb 2722.6 | bsz 101 | num_updates 32606 | best_loss 0.492
|
| 133 |
+
end of epoch 7 (average epoch stats below)
|
| 134 |
+
epoch 007 | loss 0.28 | ppl 1.21 | wps 8757 | ups 3.15 | wpb 2778.2 | bsz 101.6 | num_updates 32606 | lr 0.000175126 | gnorm 0.739 | train_wall 1453 | gb_free 13 | wall 10295
|
| 135 |
+
epoch 007 | loss 0.28 | ppl 1.21 | wps 8757 | ups 3.15 | wpb 2778.2 | bsz 101.6 | num_updates 32606 | lr 0.000175126 | gnorm 0.739 | train_wall 1453 | gb_free 13 | wall 10295
|
| 136 |
+
epoch 007 | loss 0.28 | ppl 1.21 | wps 8757 | ups 3.15 | wpb 2778.2 | bsz 101.6 | num_updates 32606 | lr 0.000175126 | gnorm 0.739 | train_wall 1453 | gb_free 13 | wall 10295
|
| 137 |
+
epoch 007 | loss 0.28 | ppl 1.21 | wps 8757 | ups 3.15 | wpb 2778.2 | bsz 101.6 | num_updates 32606 | lr 0.000175126 | gnorm 0.739 | train_wall 1453 | gb_free 13 | wall 10295
|
| 138 |
+
epoch 007 | loss 0.28 | ppl 1.21 | wps 8757 | ups 3.15 | wpb 2778.2 | bsz 101.6 | num_updates 32606 | lr 0.000175126 | gnorm 0.739 | train_wall 1453 | gb_free 13 | wall 10295
|
| 139 |
+
epoch 007 | loss 0.28 | ppl 1.21 | wps 8757 | ups 3.15 | wpb 2778.2 | bsz 101.6 | num_updates 32606 | lr 0.000175126 | gnorm 0.739 | train_wall 1453 | gb_free 13 | wall 10295
|
| 140 |
+
epoch 007 | loss 0.28 | ppl 1.21 | wps 8757 | ups 3.15 | wpb 2778.2 | bsz 101.6 | num_updates 32606 | lr 0.000175126 | gnorm 0.739 | train_wall 1453 | gb_free 13 | wall 10295
|
| 141 |
+
Start iterating over samples
|
| 142 |
+
begin validation on "valid" subset
|
| 143 |
+
epoch 008 | valid on 'valid' subset | loss 0.52 | ppl 1.43 | wps 26826.2 | wpb 2722.6 | bsz 101 | num_updates 37264 | best_loss 0.492
|
| 144 |
+
epoch 008 | valid on 'valid' subset | loss 0.52 | ppl 1.43 | wps 26826.2 | wpb 2722.6 | bsz 101 | num_updates 37264 | best_loss 0.492
|
| 145 |
+
epoch 008 | valid on 'valid' subset | loss 0.52 | ppl 1.43 | wps 26826.2 | wpb 2722.6 | bsz 101 | num_updates 37264 | best_loss 0.492
|
| 146 |
+
epoch 008 | valid on 'valid' subset | loss 0.52 | ppl 1.43 | wps 26826.2 | wpb 2722.6 | bsz 101 | num_updates 37264 | best_loss 0.492
|
| 147 |
+
epoch 008 | valid on 'valid' subset | loss 0.52 | ppl 1.43 | wps 26826.2 | wpb 2722.6 | bsz 101 | num_updates 37264 | best_loss 0.492
|
| 148 |
+
epoch 008 | valid on 'valid' subset | loss 0.52 | ppl 1.43 | wps 26826.2 | wpb 2722.6 | bsz 101 | num_updates 37264 | best_loss 0.492
|
| 149 |
+
epoch 008 | valid on 'valid' subset | loss 0.52 | ppl 1.43 | wps 26826.2 | wpb 2722.6 | bsz 101 | num_updates 37264 | best_loss 0.492
|
| 150 |
+
epoch 008 | valid on 'valid' subset | loss 0.52 | ppl 1.43 | wps 26826.2 | wpb 2722.6 | bsz 101 | num_updates 37264 | best_loss 0.492
|
| 151 |
+
end of epoch 8 (average epoch stats below)
|
| 152 |
+
epoch 008 | loss 0.22 | ppl 1.16 | wps 8757.5 | ups 3.15 | wpb 2778.2 | bsz 101.6 | num_updates 37264 | lr 0.000163816 | gnorm 0.698 | train_wall 1453 | gb_free 12.7 | wall 11773
|
| 153 |
+
epoch 008 | loss 0.22 | ppl 1.16 | wps 8757.5 | ups 3.15 | wpb 2778.2 | bsz 101.6 | num_updates 37264 | lr 0.000163816 | gnorm 0.698 | train_wall 1453 | gb_free 12.7 | wall 11773
|
| 154 |
+
epoch 008 | loss 0.22 | ppl 1.16 | wps 8757.5 | ups 3.15 | wpb 2778.2 | bsz 101.6 | num_updates 37264 | lr 0.000163816 | gnorm 0.698 | train_wall 1453 | gb_free 12.7 | wall 11773
|
| 155 |
+
epoch 008 | loss 0.22 | ppl 1.16 | wps 8757.5 | ups 3.15 | wpb 2778.2 | bsz 101.6 | num_updates 37264 | lr 0.000163816 | gnorm 0.698 | train_wall 1453 | gb_free 12.7 | wall 11773
|
| 156 |
+
epoch 008 | loss 0.22 | ppl 1.16 | wps 8757.5 | ups 3.15 | wpb 2778.2 | bsz 101.6 | num_updates 37264 | lr 0.000163816 | gnorm 0.698 | train_wall 1453 | gb_free 12.7 | wall 11773
|
| 157 |
+
epoch 008 | loss 0.22 | ppl 1.16 | wps 8757.5 | ups 3.15 | wpb 2778.2 | bsz 101.6 | num_updates 37264 | lr 0.000163816 | gnorm 0.698 | train_wall 1453 | gb_free 12.7 | wall 11773
|
| 158 |
+
epoch 008 | loss 0.22 | ppl 1.16 | wps 8757.5 | ups 3.15 | wpb 2778.2 | bsz 101.6 | num_updates 37264 | lr 0.000163816 | gnorm 0.698 | train_wall 1453 | gb_free 12.7 | wall 11773
|
| 159 |
+
epoch 008 | loss 0.22 | ppl 1.16 | wps 8757.5 | ups 3.15 | wpb 2778.2 | bsz 101.6 | num_updates 37264 | lr 0.000163816 | gnorm 0.698 | train_wall 1453 | gb_free 12.7 | wall 11773
|
| 160 |
+
Start iterating over samples
|
| 161 |
+
begin validation on "valid" subset
|
| 162 |
+
epoch 009 | valid on 'valid' subset | loss 0.513 | ppl 1.43 | wps 27649.3 | wpb 2722.6 | bsz 101 | num_updates 41922 | best_loss 0.492
|
| 163 |
+
epoch 009 | valid on 'valid' subset | loss 0.513 | ppl 1.43 | wps 27649.3 | wpb 2722.6 | bsz 101 | num_updates 41922 | best_loss 0.492
|
| 164 |
+
epoch 009 | valid on 'valid' subset | loss 0.513 | ppl 1.43 | wps 27649.3 | wpb 2722.6 | bsz 101 | num_updates 41922 | best_loss 0.492
|
| 165 |
+
epoch 009 | valid on 'valid' subset | loss 0.513 | ppl 1.43 | wps 27649.3 | wpb 2722.6 | bsz 101 | num_updates 41922 | best_loss 0.492
|
| 166 |
+
epoch 009 | valid on 'valid' subset | loss 0.513 | ppl 1.43 | wps 27649.3 | wpb 2722.6 | bsz 101 | num_updates 41922 | best_loss 0.492
|
| 167 |
+
epoch 009 | valid on 'valid' subset | loss 0.513 | ppl 1.43 | wps 27649.3 | wpb 2722.6 | bsz 101 | num_updates 41922 | best_loss 0.492
|
| 168 |
+
epoch 009 | valid on 'valid' subset | loss 0.513 | ppl 1.43 | wps 27649.3 | wpb 2722.6 | bsz 101 | num_updates 41922 | best_loss 0.492
|
| 169 |
+
epoch 009 | valid on 'valid' subset | loss 0.513 | ppl 1.43 | wps 27649.3 | wpb 2722.6 | bsz 101 | num_updates 41922 | best_loss 0.492
|
| 170 |
+
epoch 009 | valid on 'valid' subset | loss 0.513 | ppl 1.43 | wps 27649.3 | wpb 2722.6 | bsz 101 | num_updates 41922 | best_loss 0.492
|
| 171 |
+
end of epoch 9 (average epoch stats below)
|
| 172 |
+
epoch 009 | loss 0.174 | ppl 1.13 | wps 8934.2 | ups 3.22 | wpb 2778.2 | bsz 101.6 | num_updates 41922 | lr 0.000154447 | gnorm 0.671 | train_wall 1425 | gb_free 12.7 | wall 13221
|
| 173 |
+
epoch 009 | loss 0.174 | ppl 1.13 | wps 8934.2 | ups 3.22 | wpb 2778.2 | bsz 101.6 | num_updates 41922 | lr 0.000154447 | gnorm 0.671 | train_wall 1425 | gb_free 12.7 | wall 13221
|
| 174 |
+
epoch 009 | loss 0.174 | ppl 1.13 | wps 8934.2 | ups 3.22 | wpb 2778.2 | bsz 101.6 | num_updates 41922 | lr 0.000154447 | gnorm 0.671 | train_wall 1425 | gb_free 12.7 | wall 13221
|
| 175 |
+
epoch 009 | loss 0.174 | ppl 1.13 | wps 8934.2 | ups 3.22 | wpb 2778.2 | bsz 101.6 | num_updates 41922 | lr 0.000154447 | gnorm 0.671 | train_wall 1425 | gb_free 12.7 | wall 13221
|
| 176 |
+
epoch 009 | loss 0.174 | ppl 1.13 | wps 8934.2 | ups 3.22 | wpb 2778.2 | bsz 101.6 | num_updates 41922 | lr 0.000154447 | gnorm 0.671 | train_wall 1425 | gb_free 12.7 | wall 13221
|
| 177 |
+
epoch 009 | loss 0.174 | ppl 1.13 | wps 8934.2 | ups 3.22 | wpb 2778.2 | bsz 101.6 | num_updates 41922 | lr 0.000154447 | gnorm 0.671 | train_wall 1425 | gb_free 12.7 | wall 13221
|
| 178 |
+
epoch 009 | loss 0.174 | ppl 1.13 | wps 8934.2 | ups 3.22 | wpb 2778.2 | bsz 101.6 | num_updates 41922 | lr 0.000154447 | gnorm 0.671 | train_wall 1425 | gb_free 12.7 | wall 13221
|
| 179 |
+
epoch 009 | loss 0.174 | ppl 1.13 | wps 8934.2 | ups 3.22 | wpb 2778.2 | bsz 101.6 | num_updates 41922 | lr 0.000154447 | gnorm 0.671 | train_wall 1425 | gb_free 12.7 | wall 13221
|
| 180 |
+
epoch 009 | loss 0.174 | ppl 1.13 | wps 8934.2 | ups 3.22 | wpb 2778.2 | bsz 101.6 | num_updates 41922 | lr 0.000154447 | gnorm 0.671 | train_wall 1425 | gb_free 12.7 | wall 13221
|
| 181 |
+
Start iterating over samples
|
| 182 |
+
begin validation on "valid" subset
|
| 183 |
+
epoch 010 | valid on 'valid' subset | loss 0.591 | ppl 1.51 | wps 27575.7 | wpb 2722.6 | bsz 101 | num_updates 46580 | best_loss 0.492
|
| 184 |
+
epoch 010 | valid on 'valid' subset | loss 0.591 | ppl 1.51 | wps 27575.7 | wpb 2722.6 | bsz 101 | num_updates 46580 | best_loss 0.492
|
| 185 |
+
epoch 010 | valid on 'valid' subset | loss 0.591 | ppl 1.51 | wps 27575.7 | wpb 2722.6 | bsz 101 | num_updates 46580 | best_loss 0.492
|
| 186 |
+
epoch 010 | valid on 'valid' subset | loss 0.591 | ppl 1.51 | wps 27575.7 | wpb 2722.6 | bsz 101 | num_updates 46580 | best_loss 0.492
|
| 187 |
+
epoch 010 | valid on 'valid' subset | loss 0.591 | ppl 1.51 | wps 27575.7 | wpb 2722.6 | bsz 101 | num_updates 46580 | best_loss 0.492
|
| 188 |
+
epoch 010 | valid on 'valid' subset | loss 0.591 | ppl 1.51 | wps 27575.7 | wpb 2722.6 | bsz 101 | num_updates 46580 | best_loss 0.492
|
| 189 |
+
epoch 010 | valid on 'valid' subset | loss 0.591 | ppl 1.51 | wps 27575.7 | wpb 2722.6 | bsz 101 | num_updates 46580 | best_loss 0.492
|
| 190 |
+
epoch 010 | valid on 'valid' subset | loss 0.591 | ppl 1.51 | wps 27575.7 | wpb 2722.6 | bsz 101 | num_updates 46580 | best_loss 0.492
|
| 191 |
+
epoch 010 | valid on 'valid' subset | loss 0.591 | ppl 1.51 | wps 27575.7 | wpb 2722.6 | bsz 101 | num_updates 46580 | best_loss 0.492
|
| 192 |
+
epoch 010 | valid on 'valid' subset | loss 0.591 | ppl 1.51 | wps 27575.7 | wpb 2722.6 | bsz 101 | num_updates 46580 | best_loss 0.492
|
| 193 |
+
end of epoch 10 (average epoch stats below)
|
| 194 |
+
epoch 010 | loss 0.137 | ppl 1.1 | wps 9058.9 | ups 3.26 | wpb 2778.2 | bsz 101.6 | num_updates 46580 | lr 0.000146521 | gnorm 0.643 | train_wall 1406 | gb_free 12.8 | wall 14650
|
| 195 |
+
epoch 010 | loss 0.137 | ppl 1.1 | wps 9058.9 | ups 3.26 | wpb 2778.2 | bsz 101.6 | num_updates 46580 | lr 0.000146521 | gnorm 0.643 | train_wall 1406 | gb_free 12.8 | wall 14650
|
| 196 |
+
epoch 010 | loss 0.137 | ppl 1.1 | wps 9058.9 | ups 3.26 | wpb 2778.2 | bsz 101.6 | num_updates 46580 | lr 0.000146521 | gnorm 0.643 | train_wall 1406 | gb_free 12.8 | wall 14650
|
| 197 |
+
epoch 010 | loss 0.137 | ppl 1.1 | wps 9058.9 | ups 3.26 | wpb 2778.2 | bsz 101.6 | num_updates 46580 | lr 0.000146521 | gnorm 0.643 | train_wall 1406 | gb_free 12.8 | wall 14650
|
| 198 |
+
epoch 010 | loss 0.137 | ppl 1.1 | wps 9058.9 | ups 3.26 | wpb 2778.2 | bsz 101.6 | num_updates 46580 | lr 0.000146521 | gnorm 0.643 | train_wall 1406 | gb_free 12.8 | wall 14650
|
| 199 |
+
epoch 010 | loss 0.137 | ppl 1.1 | wps 9058.9 | ups 3.26 | wpb 2778.2 | bsz 101.6 | num_updates 46580 | lr 0.000146521 | gnorm 0.643 | train_wall 1406 | gb_free 12.8 | wall 14650
|
| 200 |
+
epoch 010 | loss 0.137 | ppl 1.1 | wps 9058.9 | ups 3.26 | wpb 2778.2 | bsz 101.6 | num_updates 46580 | lr 0.000146521 | gnorm 0.643 | train_wall 1406 | gb_free 12.8 | wall 14650
|
| 201 |
+
epoch 010 | loss 0.137 | ppl 1.1 | wps 9058.9 | ups 3.26 | wpb 2778.2 | bsz 101.6 | num_updates 46580 | lr 0.000146521 | gnorm 0.643 | train_wall 1406 | gb_free 12.8 | wall 14650
|
| 202 |
+
epoch 010 | loss 0.137 | ppl 1.1 | wps 9058.9 | ups 3.26 | wpb 2778.2 | bsz 101.6 | num_updates 46580 | lr 0.000146521 | gnorm 0.643 | train_wall 1406 | gb_free 12.8 | wall 14650
|
| 203 |
+
epoch 010 | loss 0.137 | ppl 1.1 | wps 9058.9 | ups 3.26 | wpb 2778.2 | bsz 101.6 | num_updates 46580 | lr 0.000146521 | gnorm 0.643 | train_wall 1406 | gb_free 12.8 | wall 14650
|
| 204 |
+
Start iterating over samples
|
| 205 |
+
begin validation on "valid" subset
|
| 206 |
+
epoch 011 | valid on 'valid' subset | loss 0.593 | ppl 1.51 | wps 27763.5 | wpb 2722.6 | bsz 101 | num_updates 51238 | best_loss 0.492
|
| 207 |
+
epoch 011 | valid on 'valid' subset | loss 0.593 | ppl 1.51 | wps 27763.5 | wpb 2722.6 | bsz 101 | num_updates 51238 | best_loss 0.492
|
| 208 |
+
epoch 011 | valid on 'valid' subset | loss 0.593 | ppl 1.51 | wps 27763.5 | wpb 2722.6 | bsz 101 | num_updates 51238 | best_loss 0.492
|
| 209 |
+
epoch 011 | valid on 'valid' subset | loss 0.593 | ppl 1.51 | wps 27763.5 | wpb 2722.6 | bsz 101 | num_updates 51238 | best_loss 0.492
|
| 210 |
+
epoch 011 | valid on 'valid' subset | loss 0.593 | ppl 1.51 | wps 27763.5 | wpb 2722.6 | bsz 101 | num_updates 51238 | best_loss 0.492
|
| 211 |
+
epoch 011 | valid on 'valid' subset | loss 0.593 | ppl 1.51 | wps 27763.5 | wpb 2722.6 | bsz 101 | num_updates 51238 | best_loss 0.492
|
| 212 |
+
epoch 011 | valid on 'valid' subset | loss 0.593 | ppl 1.51 | wps 27763.5 | wpb 2722.6 | bsz 101 | num_updates 51238 | best_loss 0.492
|
| 213 |
+
epoch 011 | valid on 'valid' subset | loss 0.593 | ppl 1.51 | wps 27763.5 | wpb 2722.6 | bsz 101 | num_updates 51238 | best_loss 0.492
|
| 214 |
+
epoch 011 | valid on 'valid' subset | loss 0.593 | ppl 1.51 | wps 27763.5 | wpb 2722.6 | bsz 101 | num_updates 51238 | best_loss 0.492
|
| 215 |
+
epoch 011 | valid on 'valid' subset | loss 0.593 | ppl 1.51 | wps 27763.5 | wpb 2722.6 | bsz 101 | num_updates 51238 | best_loss 0.492
|
| 216 |
+
epoch 011 | valid on 'valid' subset | loss 0.593 | ppl 1.51 | wps 27763.5 | wpb 2722.6 | bsz 101 | num_updates 51238 | best_loss 0.492
|
| 217 |
+
early stop since valid performance hasn't improved for last 4 runs
|
| 218 |
+
end of epoch 11 (average epoch stats below)
|
| 219 |
+
epoch 011 | loss 0.11 | ppl 1.08 | wps 9065.3 | ups 3.26 | wpb 2778.2 | bsz 101.6 | num_updates 51238 | lr 0.000139702 | gnorm 0.618 | train_wall 1405 | gb_free 12.9 | wall 16078
|
| 220 |
+
epoch 011 | loss 0.11 | ppl 1.08 | wps 9065.3 | ups 3.26 | wpb 2778.2 | bsz 101.6 | num_updates 51238 | lr 0.000139702 | gnorm 0.618 | train_wall 1405 | gb_free 12.9 | wall 16078
|
| 221 |
+
epoch 011 | loss 0.11 | ppl 1.08 | wps 9065.3 | ups 3.26 | wpb 2778.2 | bsz 101.6 | num_updates 51238 | lr 0.000139702 | gnorm 0.618 | train_wall 1405 | gb_free 12.9 | wall 16078
|
| 222 |
+
epoch 011 | loss 0.11 | ppl 1.08 | wps 9065.3 | ups 3.26 | wpb 2778.2 | bsz 101.6 | num_updates 51238 | lr 0.000139702 | gnorm 0.618 | train_wall 1405 | gb_free 12.9 | wall 16078
|
| 223 |
+
epoch 011 | loss 0.11 | ppl 1.08 | wps 9065.3 | ups 3.26 | wpb 2778.2 | bsz 101.6 | num_updates 51238 | lr 0.000139702 | gnorm 0.618 | train_wall 1405 | gb_free 12.9 | wall 16078
|
| 224 |
+
epoch 011 | loss 0.11 | ppl 1.08 | wps 9065.3 | ups 3.26 | wpb 2778.2 | bsz 101.6 | num_updates 51238 | lr 0.000139702 | gnorm 0.618 | train_wall 1405 | gb_free 12.9 | wall 16078
|
| 225 |
+
epoch 011 | loss 0.11 | ppl 1.08 | wps 9065.3 | ups 3.26 | wpb 2778.2 | bsz 101.6 | num_updates 51238 | lr 0.000139702 | gnorm 0.618 | train_wall 1405 | gb_free 12.9 | wall 16078
|
| 226 |
+
epoch 011 | loss 0.11 | ppl 1.08 | wps 9065.3 | ups 3.26 | wpb 2778.2 | bsz 101.6 | num_updates 51238 | lr 0.000139702 | gnorm 0.618 | train_wall 1405 | gb_free 12.9 | wall 16078
|
| 227 |
+
epoch 011 | loss 0.11 | ppl 1.08 | wps 9065.3 | ups 3.26 | wpb 2778.2 | bsz 101.6 | num_updates 51238 | lr 0.000139702 | gnorm 0.618 | train_wall 1405 | gb_free 12.9 | wall 16078
|
| 228 |
+
epoch 011 | loss 0.11 | ppl 1.08 | wps 9065.3 | ups 3.26 | wpb 2778.2 | bsz 101.6 | num_updates 51238 | lr 0.000139702 | gnorm 0.618 | train_wall 1405 | gb_free 12.9 | wall 16078
|
| 229 |
+
epoch 011 | loss 0.11 | ppl 1.08 | wps 9065.3 | ups 3.26 | wpb 2778.2 | bsz 101.6 | num_updates 51238 | lr 0.000139702 | gnorm 0.618 | train_wall 1405 | gb_free 12.9 | wall 16078
|
| 230 |
+
done training in 16077.1 seconds
|
ablations/noconst/trained_arm32_best_noconst/beam1.zip
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:205eb1691e75be23c8760c7f9a257b2be617ee3a8ec97eb588d8e0e39c13bccd
|
| 3 |
+
size 2952656
|
ablations/noconst/trained_arm32_best_noconst/checkpoint_best.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b49e1926c18a8ff27057a4a94716be61dc5df10d16478eb64d94f76e9eeb3241
|
| 3 |
+
size 124639436
|
ablations/noconst/trained_arm32_best_noconst/training.log
ADDED
|
@@ -0,0 +1,230 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 100, 'log_format': None, 'log_file': 'arm32/trained_arm32_best_noconst/training.log', 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': None, 'wandb_project': None, 'azureml_logging': False, 'seed': 1, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': False, 'memory_efficient_fp16': False, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 128, 'fp16_scale_window': None, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': None, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': None, 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quantization_config_path': None, 'profile': False, 'reset_logging': False, 'suppress_crashes': False, 'use_plasma_view': False, 'plasma_path': '/tmp/plasma'}, 'common_eval': {'_name': None, 'path': None, 'post_process': None, 'quiet': False, 'model_overrides': '{}', 'results_path': None}, 'distributed_training': {'_name': None, 'distributed_world_size': 1, 'distributed_num_procs': 1, 'distributed_rank': 0, 'distributed_backend': 'nccl', 'distributed_init_method': None, 'distributed_port': -1, 'device_id': 0, 'distributed_no_spawn': False, 'ddp_backend': 'pytorch_ddp', 'ddp_comm_hook': 'none', 'bucket_cap_mb': 25, 'fix_batches_to_gpus': False, 'find_unused_parameters': False, 'gradient_as_bucket_view': False, 'fast_stat_sync': False, 'heartbeat_timeout': -1, 'broadcast_buffers': False, 'slowmo_momentum': None, 'slowmo_base_algorithm': 'localsgd', 'localsgd_frequency': 3, 'nprocs_per_node': 1, 'pipeline_model_parallel': False, 'pipeline_balance': None, 'pipeline_devices': None, 'pipeline_chunks': 0, 'pipeline_encoder_balance': None, 'pipeline_encoder_devices': None, 'pipeline_decoder_balance': None, 'pipeline_decoder_devices': None, 'pipeline_checkpoint': 'never', 'zero_sharding': 'none', 'fp16': False, 'memory_efficient_fp16': False, 'tpu': False, 'no_reshard_after_forward': False, 'fp32_reduce_scatter': False, 'cpu_offload': False, 'use_sharded_state': False, 'not_fsdp_flatten_parameters': False}, 'dataset': {'_name': None, 'num_workers': 1, 'skip_invalid_size_inputs_valid_test': False, 'max_tokens': 20000, 'batch_size': None, 'required_batch_size_multiple': 8, 'required_seq_len_multiple': 1, 'dataset_impl': None, 'data_buffer_size': 10, 'train_subset': 'train', 'valid_subset': 'valid', 'combine_valid_subsets': None, 'ignore_unused_valid_subsets': False, 'validate_interval': 1, 'validate_interval_updates': 0, 'validate_after_updates': 0, 'fixed_validation_seed': None, 'disable_validation': False, 'max_tokens_valid': 20000, 'batch_size_valid': None, 'max_valid_steps': None, 'curriculum': 0, 'gen_subset': 'test', 'num_shards': 1, 'shard_id': 0, 'grouped_shuffling': False, 'update_epoch_batch_itr': False, 'update_ordered_indices_seed': False}, 'optimization': {'_name': None, 'max_epoch': 0, 'max_update': 100000, 'stop_time_hours': 0.0, 'clip_norm': 0.0, 'sentence_avg': False, 'update_freq': [1], 'lr': [0.0005], 'stop_min_lr': -1.0, 'use_bmuf': False, 'skip_remainder_batch': False}, 'checkpoint': {'_name': None, 'save_dir': 'arm32/trained_arm32_best_noconst', 'restore_file': 'checkpoint_last.pt', 'continue_once': None, 'finetune_from_model': None, 'reset_dataloader': False, 'reset_lr_scheduler': False, 'reset_meters': False, 'reset_optimizer': False, 'optimizer_overrides': '{}', 'save_interval': 1, 'save_interval_updates': 0, 'keep_interval_updates': -1, 'keep_interval_updates_pattern': -1, 'keep_last_epochs': -1, 'keep_best_checkpoints': 3, 'no_save': False, 'no_epoch_checkpoints': True, 'no_last_checkpoints': False, 'no_save_optimizer_state': False, 'best_checkpoint_metric': 'loss', 'maximize_best_checkpoint_metric': False, 'patience': 4, 'checkpoint_suffix': '', 'checkpoint_shard_count': 1, 'load_checkpoint_on_all_dp_ranks': False, 'write_checkpoints_asynchronously': False, 'model_parallel_size': 1}, 'bmuf': {'_name': None, 'block_lr': 1.0, 'block_momentum': 0.875, 'global_sync_iter': 50, 'warmup_iterations': 500, 'use_nbm': False, 'average_sync': False, 'distributed_world_size': 1}, 'generation': {'_name': None, 'beam': 5, 'nbest': 1, 'max_len_a': 0.0, 'max_len_b': 200, 'min_len': 1, 'match_source_len': False, 'unnormalized': False, 'no_early_stop': False, 'no_beamable_mm': False, 'lenpen': 1.0, 'unkpen': 0.0, 'replace_unk': None, 'sacrebleu': False, 'score_reference': False, 'prefix_size': 0, 'no_repeat_ngram_size': 0, 'sampling': False, 'sampling_topk': -1, 'sampling_topp': -1.0, 'constraints': None, 'temperature': 1.0, 'diverse_beam_groups': -1, 'diverse_beam_strength': 0.5, 'diversity_rate': -1.0, 'print_alignment': None, 'print_step': False, 'lm_path': None, 'lm_weight': 0.0, 'iter_decode_eos_penalty': 0.0, 'iter_decode_max_iter': 10, 'iter_decode_force_max_iter': False, 'iter_decode_with_beam': 1, 'iter_decode_with_external_reranker': False, 'retain_iter_history': False, 'retain_dropout': False, 'retain_dropout_modules': None, 'decoding_format': None, 'no_seed_provided': False, 'eos_token': None}, 'eval_lm': {'_name': None, 'output_word_probs': False, 'output_word_stats': False, 'context_window': 0, 'softmax_batch': 9223372036854775807}, 'interactive': {'_name': None, 'buffer_size': 0, 'input': '-'}, 'model': Namespace(_name='transformer', activation_dropout=0.0, activation_fn='relu', adam_betas=(0.9, 0.999), adam_eps=1e-08, adaptive_input=False, adaptive_softmax_cutoff=None, adaptive_softmax_dropout=0, aim_repo=None, aim_run_hash=None, all_gather_list_size=16384, amp=False, amp_batch_retries=2, amp_init_scale=128, amp_scale_window=None, arch='transformer', attention_dropout=0.0, azureml_logging=False, batch_size=None, batch_size_valid=None, best_checkpoint_metric='loss', bf16=False, bpe=None, broadcast_buffers=False, bucket_cap_mb=25, checkpoint_activations=False, checkpoint_shard_count=1, checkpoint_suffix='', clip_norm=0.0, combine_valid_subsets=None, continue_once=None, cpu=False, cpu_offload=False, criterion='cross_entropy', cross_self_attention=False, curriculum=0, data='arm32/tokenized_dlsm_arm32_noconst', data_buffer_size=10, dataset_impl=None, ddp_backend='pytorch_ddp', ddp_comm_hook='none', decoder_attention_heads=8, decoder_embed_dim=96, decoder_embed_path=None, decoder_ffn_embed_dim=384, decoder_input_dim=96, decoder_layerdrop=0, decoder_layers=4, decoder_layers_to_keep=None, decoder_learned_pos=False, decoder_normalize_before=False, decoder_output_dim='96', device_id=0, disable_validation=False, distributed_backend='nccl', distributed_init_method=None, distributed_no_spawn=False, distributed_num_procs=1, distributed_port=-1, distributed_rank=0, distributed_world_size=1, dropout=0.05, ema_decay=0.9999, ema_fp32=False, ema_seed_model=None, ema_start_update=0, ema_update_freq=1, empty_cache_freq=0, encoder_attention_heads=8, encoder_embed_dim=384, encoder_embed_path=None, encoder_ffn_embed_dim=1536, encoder_layerdrop=0, encoder_layers=4, encoder_layers_to_keep=None, encoder_learned_pos=False, encoder_normalize_before=False, eos=2, eval_bleu=False, eval_bleu_args='{}', eval_bleu_detok='space', eval_bleu_detok_args='{}', eval_bleu_print_samples=False, eval_bleu_remove_bpe=None, eval_tokenized_bleu=False, fast_stat_sync=False, find_unused_parameters=False, finetune_from_model=None, fix_batches_to_gpus=False, fixed_validation_seed=None, fp16=False, fp16_adam_stats=False, fp16_init_scale=128, fp16_no_flatten_grads=False, fp16_scale_tolerance=0.0, fp16_scale_window=None, fp32_reduce_scatter=False, gen_subset='test', gradient_as_bucket_view=False, grouped_shuffling=False, heartbeat_timeout=-1, ignore_unused_valid_subsets=False, keep_best_checkpoints=3, keep_interval_updates=-1, keep_interval_updates_pattern=-1, keep_last_epochs=-1, layernorm_embedding=False, left_pad_source=True, left_pad_target=False, load_alignments=False, load_checkpoint_on_all_dp_ranks=False, localsgd_frequency=3, log_file='arm32/trained_arm32_best_noconst/training.log', log_format=None, log_interval=100, lr=[0.0005], lr_scheduler='inverse_sqrt', max_epoch=0, max_source_positions=1024, max_target_positions=1024, max_tokens=20000, max_tokens_valid=20000, max_update=100000, max_valid_steps=None, maximize_best_checkpoint_metric=False, memory_efficient_bf16=False, memory_efficient_fp16=False, min_loss_scale=0.0001, model_parallel_size=1, no_cross_attention=False, no_epoch_checkpoints=True, no_last_checkpoints=False, no_progress_bar=False, no_reshard_after_forward=False, no_save=False, no_save_optimizer_state=False, no_scale_embedding=False, no_seed_provided=False, no_token_positional_embeddings=False, not_fsdp_flatten_parameters=False, nprocs_per_node=1, num_batch_buckets=0, num_shards=1, num_workers=1, offload_activations=False, on_cpu_convert_precision=False, optimizer='adam', optimizer_overrides='{}', pad=1, patience=4, pipeline_balance=None, pipeline_checkpoint='never', pipeline_chunks=0, pipeline_decoder_balance=None, pipeline_decoder_devices=None, pipeline_devices=None, pipeline_encoder_balance=None, pipeline_encoder_devices=None, pipeline_model_parallel=False, plasma_path='/tmp/plasma', profile=False, quant_noise_pq=0, quant_noise_pq_block_size=8, quant_noise_scalar=0, quantization_config_path=None, required_batch_size_multiple=8, required_seq_len_multiple=1, reset_dataloader=False, reset_logging=False, reset_lr_scheduler=False, reset_meters=False, reset_optimizer=False, restore_file='checkpoint_last.pt', save_dir='arm32/trained_arm32_best_noconst', save_interval=1, save_interval_updates=0, scoring='bleu', seed=1, sentence_avg=False, shard_id=0, share_all_embeddings=False, share_decoder_input_output_embed=False, skip_invalid_size_inputs_valid_test=False, skip_remainder_batch=False, slowmo_base_algorithm='localsgd', slowmo_momentum=None, source_lang=None, stop_min_lr=-1.0, stop_time_hours=0, store_ema=False, suppress_crashes=False, target_lang=None, task='translation', tensorboard_logdir=None, threshold_loss_scale=None, tie_adaptive_weights=False, tokenizer=None, tpu=False, train_subset='train', truncate_source=False, unk=3, update_epoch_batch_itr=False, update_freq=[1], update_ordered_indices_seed=False, upsample_primary=-1, use_bmuf=False, use_old_adam=False, use_plasma_view=False, use_sharded_state=False, user_dir=None, valid_subset='valid', validate_after_updates=0, validate_interval=1, validate_interval_updates=0, wandb_project=None, warmup_init_lr=-1, warmup_updates=4000, weight_decay=0.001, write_checkpoints_asynchronously=False, zero_sharding='none'), 'task': {'_name': 'translation', 'data': 'arm32/tokenized_dlsm_arm32_noconst', 'source_lang': None, 'target_lang': None, 'load_alignments': False, 'left_pad_source': True, 'left_pad_target': False, 'max_source_positions': 1024, 'max_target_positions': 1024, 'upsample_primary': -1, 'truncate_source': False, 'num_batch_buckets': 0, 'train_subset': 'train', 'dataset_impl': None, 'required_seq_len_multiple': 1, 'eval_bleu': False, 'eval_bleu_args': '{}', 'eval_bleu_detok': 'space', 'eval_bleu_detok_args': '{}', 'eval_tokenized_bleu': False, 'eval_bleu_remove_bpe': None, 'eval_bleu_print_samples': False}, 'criterion': {'_name': 'cross_entropy', 'sentence_avg': False}, 'optimizer': {'_name': 'adam', 'adam_betas': [0.9, 0.999], 'adam_eps': 1e-08, 'weight_decay': 0.001, 'use_old_adam': False, 'fp16_adam_stats': False, 'tpu': False, 'lr': [0.0005]}, 'lr_scheduler': {'_name': 'inverse_sqrt', 'warmup_updates': 4000, 'warmup_init_lr': -1.0, 'lr': [0.0005]}, 'scoring': {'_name': 'bleu', 'pad': 1, 'eos': 2, 'unk': 3}, 'bpe': None, 'tokenizer': None, 'ema': {'_name': None, 'store_ema': False, 'ema_decay': 0.9999, 'ema_start_update': 0, 'ema_seed_model': None, 'ema_update_freq': 1, 'ema_fp32': False}}
|
| 2 |
+
TransformerModel(
|
| 3 |
+
(encoder): TransformerEncoderBase(
|
| 4 |
+
(dropout_module): FairseqDropout()
|
| 5 |
+
(embed_tokens): Embedding(216, 384, padding_idx=1)
|
| 6 |
+
(embed_positions): SinusoidalPositionalEmbedding()
|
| 7 |
+
(layers): ModuleList(
|
| 8 |
+
(0-3): 4 x TransformerEncoderLayerBase(
|
| 9 |
+
(self_attn): MultiheadAttention(
|
| 10 |
+
(dropout_module): FairseqDropout()
|
| 11 |
+
(k_proj): Linear(in_features=384, out_features=384, bias=True)
|
| 12 |
+
(v_proj): Linear(in_features=384, out_features=384, bias=True)
|
| 13 |
+
(q_proj): Linear(in_features=384, out_features=384, bias=True)
|
| 14 |
+
(out_proj): Linear(in_features=384, out_features=384, bias=True)
|
| 15 |
+
)
|
| 16 |
+
(self_attn_layer_norm): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
|
| 17 |
+
(dropout_module): FairseqDropout()
|
| 18 |
+
(activation_dropout_module): FairseqDropout()
|
| 19 |
+
(fc1): Linear(in_features=384, out_features=1536, bias=True)
|
| 20 |
+
(fc2): Linear(in_features=1536, out_features=384, bias=True)
|
| 21 |
+
(final_layer_norm): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
|
| 22 |
+
)
|
| 23 |
+
)
|
| 24 |
+
)
|
| 25 |
+
(decoder): TransformerDecoderBase(
|
| 26 |
+
(dropout_module): FairseqDropout()
|
| 27 |
+
(embed_tokens): Embedding(40, 96, padding_idx=1)
|
| 28 |
+
(embed_positions): SinusoidalPositionalEmbedding()
|
| 29 |
+
(layers): ModuleList(
|
| 30 |
+
(0-3): 4 x TransformerDecoderLayerBase(
|
| 31 |
+
(dropout_module): FairseqDropout()
|
| 32 |
+
(self_attn): MultiheadAttention(
|
| 33 |
+
(dropout_module): FairseqDropout()
|
| 34 |
+
(k_proj): Linear(in_features=96, out_features=96, bias=True)
|
| 35 |
+
(v_proj): Linear(in_features=96, out_features=96, bias=True)
|
| 36 |
+
(q_proj): Linear(in_features=96, out_features=96, bias=True)
|
| 37 |
+
(out_proj): Linear(in_features=96, out_features=96, bias=True)
|
| 38 |
+
)
|
| 39 |
+
(activation_dropout_module): FairseqDropout()
|
| 40 |
+
(self_attn_layer_norm): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
|
| 41 |
+
(encoder_attn): MultiheadAttention(
|
| 42 |
+
(dropout_module): FairseqDropout()
|
| 43 |
+
(k_proj): Linear(in_features=384, out_features=96, bias=True)
|
| 44 |
+
(v_proj): Linear(in_features=384, out_features=96, bias=True)
|
| 45 |
+
(q_proj): Linear(in_features=96, out_features=96, bias=True)
|
| 46 |
+
(out_proj): Linear(in_features=96, out_features=96, bias=True)
|
| 47 |
+
)
|
| 48 |
+
(encoder_attn_layer_norm): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
|
| 49 |
+
(fc1): Linear(in_features=96, out_features=384, bias=True)
|
| 50 |
+
(fc2): Linear(in_features=384, out_features=96, bias=True)
|
| 51 |
+
(final_layer_norm): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
|
| 52 |
+
)
|
| 53 |
+
)
|
| 54 |
+
(output_projection): Linear(in_features=96, out_features=40, bias=False)
|
| 55 |
+
)
|
| 56 |
+
)
|
| 57 |
+
task: TranslationTask
|
| 58 |
+
model: TransformerModel
|
| 59 |
+
criterion: CrossEntropyCriterion
|
| 60 |
+
num. shared model params: 15,098,496 (num. trained: 15,098,496)
|
| 61 |
+
num. expert model params: 0 (num. trained: 0)
|
| 62 |
+
training on 1 devices (GPUs/TPUs)
|
| 63 |
+
max tokens per device = 20000 and max sentences per device = None
|
| 64 |
+
Start iterating over samples
|
| 65 |
+
begin validation on "valid" subset
|
| 66 |
+
epoch 001 | valid on 'valid' subset | loss 1.532 | ppl 2.89 | wps 44149.4 | wpb 2858.9 | bsz 108.4 | num_updates 3926
|
| 67 |
+
end of epoch 1 (average epoch stats below)
|
| 68 |
+
epoch 001 | loss 2.22 | ppl 4.66 | wps 14250.5 | ups 4.81 | wpb 2961.7 | bsz 110.3 | num_updates 3926 | lr 0.00049075 | gnorm 2.478 | train_wall 800 | gb_free 13.9 | wall 817
|
| 69 |
+
Start iterating over samples
|
| 70 |
+
begin validation on "valid" subset
|
| 71 |
+
epoch 002 | valid on 'valid' subset | loss 0.887 | ppl 1.85 | wps 44076.7 | wpb 2858.9 | bsz 108.4 | num_updates 7852 | best_loss 0.887
|
| 72 |
+
epoch 002 | valid on 'valid' subset | loss 0.887 | ppl 1.85 | wps 44076.7 | wpb 2858.9 | bsz 108.4 | num_updates 7852 | best_loss 0.887
|
| 73 |
+
end of epoch 2 (average epoch stats below)
|
| 74 |
+
epoch 002 | loss 1.238 | ppl 2.36 | wps 14211.8 | ups 4.8 | wpb 2961.7 | bsz 110.3 | num_updates 7852 | lr 0.00035687 | gnorm 1.187 | train_wall 802 | gb_free 13.4 | wall 1635
|
| 75 |
+
epoch 002 | loss 1.238 | ppl 2.36 | wps 14211.8 | ups 4.8 | wpb 2961.7 | bsz 110.3 | num_updates 7852 | lr 0.00035687 | gnorm 1.187 | train_wall 802 | gb_free 13.4 | wall 1635
|
| 76 |
+
Start iterating over samples
|
| 77 |
+
begin validation on "valid" subset
|
| 78 |
+
epoch 003 | valid on 'valid' subset | loss 0.664 | ppl 1.58 | wps 43946.5 | wpb 2858.9 | bsz 108.4 | num_updates 11778 | best_loss 0.664
|
| 79 |
+
epoch 003 | valid on 'valid' subset | loss 0.664 | ppl 1.58 | wps 43946.5 | wpb 2858.9 | bsz 108.4 | num_updates 11778 | best_loss 0.664
|
| 80 |
+
epoch 003 | valid on 'valid' subset | loss 0.664 | ppl 1.58 | wps 43946.5 | wpb 2858.9 | bsz 108.4 | num_updates 11778 | best_loss 0.664
|
| 81 |
+
end of epoch 3 (average epoch stats below)
|
| 82 |
+
epoch 003 | loss 0.78 | ppl 1.72 | wps 14233.1 | ups 4.81 | wpb 2961.7 | bsz 110.3 | num_updates 11778 | lr 0.000291383 | gnorm 0.943 | train_wall 801 | gb_free 13 | wall 2452
|
| 83 |
+
epoch 003 | loss 0.78 | ppl 1.72 | wps 14233.1 | ups 4.81 | wpb 2961.7 | bsz 110.3 | num_updates 11778 | lr 0.000291383 | gnorm 0.943 | train_wall 801 | gb_free 13 | wall 2452
|
| 84 |
+
epoch 003 | loss 0.78 | ppl 1.72 | wps 14233.1 | ups 4.81 | wpb 2961.7 | bsz 110.3 | num_updates 11778 | lr 0.000291383 | gnorm 0.943 | train_wall 801 | gb_free 13 | wall 2452
|
| 85 |
+
Start iterating over samples
|
| 86 |
+
begin validation on "valid" subset
|
| 87 |
+
epoch 004 | valid on 'valid' subset | loss 0.554 | ppl 1.47 | wps 44110 | wpb 2858.9 | bsz 108.4 | num_updates 15704 | best_loss 0.554
|
| 88 |
+
epoch 004 | valid on 'valid' subset | loss 0.554 | ppl 1.47 | wps 44110 | wpb 2858.9 | bsz 108.4 | num_updates 15704 | best_loss 0.554
|
| 89 |
+
epoch 004 | valid on 'valid' subset | loss 0.554 | ppl 1.47 | wps 44110 | wpb 2858.9 | bsz 108.4 | num_updates 15704 | best_loss 0.554
|
| 90 |
+
epoch 004 | valid on 'valid' subset | loss 0.554 | ppl 1.47 | wps 44110 | wpb 2858.9 | bsz 108.4 | num_updates 15704 | best_loss 0.554
|
| 91 |
+
end of epoch 4 (average epoch stats below)
|
| 92 |
+
epoch 004 | loss 0.571 | ppl 1.49 | wps 14206.7 | ups 4.8 | wpb 2961.7 | bsz 110.3 | num_updates 15704 | lr 0.000252345 | gnorm 0.79 | train_wall 802 | gb_free 13.3 | wall 3270
|
| 93 |
+
epoch 004 | loss 0.571 | ppl 1.49 | wps 14206.7 | ups 4.8 | wpb 2961.7 | bsz 110.3 | num_updates 15704 | lr 0.000252345 | gnorm 0.79 | train_wall 802 | gb_free 13.3 | wall 3270
|
| 94 |
+
epoch 004 | loss 0.571 | ppl 1.49 | wps 14206.7 | ups 4.8 | wpb 2961.7 | bsz 110.3 | num_updates 15704 | lr 0.000252345 | gnorm 0.79 | train_wall 802 | gb_free 13.3 | wall 3270
|
| 95 |
+
epoch 004 | loss 0.571 | ppl 1.49 | wps 14206.7 | ups 4.8 | wpb 2961.7 | bsz 110.3 | num_updates 15704 | lr 0.000252345 | gnorm 0.79 | train_wall 802 | gb_free 13.3 | wall 3270
|
| 96 |
+
Start iterating over samples
|
| 97 |
+
begin validation on "valid" subset
|
| 98 |
+
epoch 005 | valid on 'valid' subset | loss 0.496 | ppl 1.41 | wps 43747.4 | wpb 2858.9 | bsz 108.4 | num_updates 19630 | best_loss 0.496
|
| 99 |
+
epoch 005 | valid on 'valid' subset | loss 0.496 | ppl 1.41 | wps 43747.4 | wpb 2858.9 | bsz 108.4 | num_updates 19630 | best_loss 0.496
|
| 100 |
+
epoch 005 | valid on 'valid' subset | loss 0.496 | ppl 1.41 | wps 43747.4 | wpb 2858.9 | bsz 108.4 | num_updates 19630 | best_loss 0.496
|
| 101 |
+
epoch 005 | valid on 'valid' subset | loss 0.496 | ppl 1.41 | wps 43747.4 | wpb 2858.9 | bsz 108.4 | num_updates 19630 | best_loss 0.496
|
| 102 |
+
epoch 005 | valid on 'valid' subset | loss 0.496 | ppl 1.41 | wps 43747.4 | wpb 2858.9 | bsz 108.4 | num_updates 19630 | best_loss 0.496
|
| 103 |
+
end of epoch 5 (average epoch stats below)
|
| 104 |
+
epoch 005 | loss 0.454 | ppl 1.37 | wps 14205.6 | ups 4.8 | wpb 2961.7 | bsz 110.3 | num_updates 19630 | lr 0.000225704 | gnorm 0.705 | train_wall 802 | gb_free 13.8 | wall 4089
|
| 105 |
+
epoch 005 | loss 0.454 | ppl 1.37 | wps 14205.6 | ups 4.8 | wpb 2961.7 | bsz 110.3 | num_updates 19630 | lr 0.000225704 | gnorm 0.705 | train_wall 802 | gb_free 13.8 | wall 4089
|
| 106 |
+
epoch 005 | loss 0.454 | ppl 1.37 | wps 14205.6 | ups 4.8 | wpb 2961.7 | bsz 110.3 | num_updates 19630 | lr 0.000225704 | gnorm 0.705 | train_wall 802 | gb_free 13.8 | wall 4089
|
| 107 |
+
epoch 005 | loss 0.454 | ppl 1.37 | wps 14205.6 | ups 4.8 | wpb 2961.7 | bsz 110.3 | num_updates 19630 | lr 0.000225704 | gnorm 0.705 | train_wall 802 | gb_free 13.8 | wall 4089
|
| 108 |
+
epoch 005 | loss 0.454 | ppl 1.37 | wps 14205.6 | ups 4.8 | wpb 2961.7 | bsz 110.3 | num_updates 19630 | lr 0.000225704 | gnorm 0.705 | train_wall 802 | gb_free 13.8 | wall 4089
|
| 109 |
+
Start iterating over samples
|
| 110 |
+
begin validation on "valid" subset
|
| 111 |
+
epoch 006 | valid on 'valid' subset | loss 0.49 | ppl 1.4 | wps 43929.7 | wpb 2858.9 | bsz 108.4 | num_updates 23556 | best_loss 0.49
|
| 112 |
+
epoch 006 | valid on 'valid' subset | loss 0.49 | ppl 1.4 | wps 43929.7 | wpb 2858.9 | bsz 108.4 | num_updates 23556 | best_loss 0.49
|
| 113 |
+
epoch 006 | valid on 'valid' subset | loss 0.49 | ppl 1.4 | wps 43929.7 | wpb 2858.9 | bsz 108.4 | num_updates 23556 | best_loss 0.49
|
| 114 |
+
epoch 006 | valid on 'valid' subset | loss 0.49 | ppl 1.4 | wps 43929.7 | wpb 2858.9 | bsz 108.4 | num_updates 23556 | best_loss 0.49
|
| 115 |
+
epoch 006 | valid on 'valid' subset | loss 0.49 | ppl 1.4 | wps 43929.7 | wpb 2858.9 | bsz 108.4 | num_updates 23556 | best_loss 0.49
|
| 116 |
+
epoch 006 | valid on 'valid' subset | loss 0.49 | ppl 1.4 | wps 43929.7 | wpb 2858.9 | bsz 108.4 | num_updates 23556 | best_loss 0.49
|
| 117 |
+
end of epoch 6 (average epoch stats below)
|
| 118 |
+
epoch 006 | loss 0.385 | ppl 1.31 | wps 14223.3 | ups 4.8 | wpb 2961.7 | bsz 110.3 | num_updates 23556 | lr 0.000206039 | gnorm 0.657 | train_wall 801 | gb_free 13.7 | wall 4906
|
| 119 |
+
epoch 006 | loss 0.385 | ppl 1.31 | wps 14223.3 | ups 4.8 | wpb 2961.7 | bsz 110.3 | num_updates 23556 | lr 0.000206039 | gnorm 0.657 | train_wall 801 | gb_free 13.7 | wall 4906
|
| 120 |
+
epoch 006 | loss 0.385 | ppl 1.31 | wps 14223.3 | ups 4.8 | wpb 2961.7 | bsz 110.3 | num_updates 23556 | lr 0.000206039 | gnorm 0.657 | train_wall 801 | gb_free 13.7 | wall 4906
|
| 121 |
+
epoch 006 | loss 0.385 | ppl 1.31 | wps 14223.3 | ups 4.8 | wpb 2961.7 | bsz 110.3 | num_updates 23556 | lr 0.000206039 | gnorm 0.657 | train_wall 801 | gb_free 13.7 | wall 4906
|
| 122 |
+
epoch 006 | loss 0.385 | ppl 1.31 | wps 14223.3 | ups 4.8 | wpb 2961.7 | bsz 110.3 | num_updates 23556 | lr 0.000206039 | gnorm 0.657 | train_wall 801 | gb_free 13.7 | wall 4906
|
| 123 |
+
epoch 006 | loss 0.385 | ppl 1.31 | wps 14223.3 | ups 4.8 | wpb 2961.7 | bsz 110.3 | num_updates 23556 | lr 0.000206039 | gnorm 0.657 | train_wall 801 | gb_free 13.7 | wall 4906
|
| 124 |
+
Start iterating over samples
|
| 125 |
+
begin validation on "valid" subset
|
| 126 |
+
epoch 007 | valid on 'valid' subset | loss 0.485 | ppl 1.4 | wps 43868.1 | wpb 2858.9 | bsz 108.4 | num_updates 27482 | best_loss 0.485
|
| 127 |
+
epoch 007 | valid on 'valid' subset | loss 0.485 | ppl 1.4 | wps 43868.1 | wpb 2858.9 | bsz 108.4 | num_updates 27482 | best_loss 0.485
|
| 128 |
+
epoch 007 | valid on 'valid' subset | loss 0.485 | ppl 1.4 | wps 43868.1 | wpb 2858.9 | bsz 108.4 | num_updates 27482 | best_loss 0.485
|
| 129 |
+
epoch 007 | valid on 'valid' subset | loss 0.485 | ppl 1.4 | wps 43868.1 | wpb 2858.9 | bsz 108.4 | num_updates 27482 | best_loss 0.485
|
| 130 |
+
epoch 007 | valid on 'valid' subset | loss 0.485 | ppl 1.4 | wps 43868.1 | wpb 2858.9 | bsz 108.4 | num_updates 27482 | best_loss 0.485
|
| 131 |
+
epoch 007 | valid on 'valid' subset | loss 0.485 | ppl 1.4 | wps 43868.1 | wpb 2858.9 | bsz 108.4 | num_updates 27482 | best_loss 0.485
|
| 132 |
+
epoch 007 | valid on 'valid' subset | loss 0.485 | ppl 1.4 | wps 43868.1 | wpb 2858.9 | bsz 108.4 | num_updates 27482 | best_loss 0.485
|
| 133 |
+
end of epoch 7 (average epoch stats below)
|
| 134 |
+
epoch 007 | loss 0.334 | ppl 1.26 | wps 14209.5 | ups 4.8 | wpb 2961.7 | bsz 110.3 | num_updates 27482 | lr 0.000190755 | gnorm 0.637 | train_wall 802 | gb_free 13.6 | wall 5725
|
| 135 |
+
epoch 007 | loss 0.334 | ppl 1.26 | wps 14209.5 | ups 4.8 | wpb 2961.7 | bsz 110.3 | num_updates 27482 | lr 0.000190755 | gnorm 0.637 | train_wall 802 | gb_free 13.6 | wall 5725
|
| 136 |
+
epoch 007 | loss 0.334 | ppl 1.26 | wps 14209.5 | ups 4.8 | wpb 2961.7 | bsz 110.3 | num_updates 27482 | lr 0.000190755 | gnorm 0.637 | train_wall 802 | gb_free 13.6 | wall 5725
|
| 137 |
+
epoch 007 | loss 0.334 | ppl 1.26 | wps 14209.5 | ups 4.8 | wpb 2961.7 | bsz 110.3 | num_updates 27482 | lr 0.000190755 | gnorm 0.637 | train_wall 802 | gb_free 13.6 | wall 5725
|
| 138 |
+
epoch 007 | loss 0.334 | ppl 1.26 | wps 14209.5 | ups 4.8 | wpb 2961.7 | bsz 110.3 | num_updates 27482 | lr 0.000190755 | gnorm 0.637 | train_wall 802 | gb_free 13.6 | wall 5725
|
| 139 |
+
epoch 007 | loss 0.334 | ppl 1.26 | wps 14209.5 | ups 4.8 | wpb 2961.7 | bsz 110.3 | num_updates 27482 | lr 0.000190755 | gnorm 0.637 | train_wall 802 | gb_free 13.6 | wall 5725
|
| 140 |
+
epoch 007 | loss 0.334 | ppl 1.26 | wps 14209.5 | ups 4.8 | wpb 2961.7 | bsz 110.3 | num_updates 27482 | lr 0.000190755 | gnorm 0.637 | train_wall 802 | gb_free 13.6 | wall 5725
|
| 141 |
+
Start iterating over samples
|
| 142 |
+
begin validation on "valid" subset
|
| 143 |
+
epoch 008 | valid on 'valid' subset | loss 0.503 | ppl 1.42 | wps 44034.7 | wpb 2858.9 | bsz 108.4 | num_updates 31408 | best_loss 0.485
|
| 144 |
+
epoch 008 | valid on 'valid' subset | loss 0.503 | ppl 1.42 | wps 44034.7 | wpb 2858.9 | bsz 108.4 | num_updates 31408 | best_loss 0.485
|
| 145 |
+
epoch 008 | valid on 'valid' subset | loss 0.503 | ppl 1.42 | wps 44034.7 | wpb 2858.9 | bsz 108.4 | num_updates 31408 | best_loss 0.485
|
| 146 |
+
epoch 008 | valid on 'valid' subset | loss 0.503 | ppl 1.42 | wps 44034.7 | wpb 2858.9 | bsz 108.4 | num_updates 31408 | best_loss 0.485
|
| 147 |
+
epoch 008 | valid on 'valid' subset | loss 0.503 | ppl 1.42 | wps 44034.7 | wpb 2858.9 | bsz 108.4 | num_updates 31408 | best_loss 0.485
|
| 148 |
+
epoch 008 | valid on 'valid' subset | loss 0.503 | ppl 1.42 | wps 44034.7 | wpb 2858.9 | bsz 108.4 | num_updates 31408 | best_loss 0.485
|
| 149 |
+
epoch 008 | valid on 'valid' subset | loss 0.503 | ppl 1.42 | wps 44034.7 | wpb 2858.9 | bsz 108.4 | num_updates 31408 | best_loss 0.485
|
| 150 |
+
epoch 008 | valid on 'valid' subset | loss 0.503 | ppl 1.42 | wps 44034.7 | wpb 2858.9 | bsz 108.4 | num_updates 31408 | best_loss 0.485
|
| 151 |
+
end of epoch 8 (average epoch stats below)
|
| 152 |
+
epoch 008 | loss 0.291 | ppl 1.22 | wps 14209.8 | ups 4.8 | wpb 2961.7 | bsz 110.3 | num_updates 31408 | lr 0.000178435 | gnorm 0.627 | train_wall 802 | gb_free 13.5 | wall 6543
|
| 153 |
+
epoch 008 | loss 0.291 | ppl 1.22 | wps 14209.8 | ups 4.8 | wpb 2961.7 | bsz 110.3 | num_updates 31408 | lr 0.000178435 | gnorm 0.627 | train_wall 802 | gb_free 13.5 | wall 6543
|
| 154 |
+
epoch 008 | loss 0.291 | ppl 1.22 | wps 14209.8 | ups 4.8 | wpb 2961.7 | bsz 110.3 | num_updates 31408 | lr 0.000178435 | gnorm 0.627 | train_wall 802 | gb_free 13.5 | wall 6543
|
| 155 |
+
epoch 008 | loss 0.291 | ppl 1.22 | wps 14209.8 | ups 4.8 | wpb 2961.7 | bsz 110.3 | num_updates 31408 | lr 0.000178435 | gnorm 0.627 | train_wall 802 | gb_free 13.5 | wall 6543
|
| 156 |
+
epoch 008 | loss 0.291 | ppl 1.22 | wps 14209.8 | ups 4.8 | wpb 2961.7 | bsz 110.3 | num_updates 31408 | lr 0.000178435 | gnorm 0.627 | train_wall 802 | gb_free 13.5 | wall 6543
|
| 157 |
+
epoch 008 | loss 0.291 | ppl 1.22 | wps 14209.8 | ups 4.8 | wpb 2961.7 | bsz 110.3 | num_updates 31408 | lr 0.000178435 | gnorm 0.627 | train_wall 802 | gb_free 13.5 | wall 6543
|
| 158 |
+
epoch 008 | loss 0.291 | ppl 1.22 | wps 14209.8 | ups 4.8 | wpb 2961.7 | bsz 110.3 | num_updates 31408 | lr 0.000178435 | gnorm 0.627 | train_wall 802 | gb_free 13.5 | wall 6543
|
| 159 |
+
epoch 008 | loss 0.291 | ppl 1.22 | wps 14209.8 | ups 4.8 | wpb 2961.7 | bsz 110.3 | num_updates 31408 | lr 0.000178435 | gnorm 0.627 | train_wall 802 | gb_free 13.5 | wall 6543
|
| 160 |
+
Start iterating over samples
|
| 161 |
+
begin validation on "valid" subset
|
| 162 |
+
epoch 009 | valid on 'valid' subset | loss 0.541 | ppl 1.46 | wps 43993.1 | wpb 2858.9 | bsz 108.4 | num_updates 35334 | best_loss 0.485
|
| 163 |
+
epoch 009 | valid on 'valid' subset | loss 0.541 | ppl 1.46 | wps 43993.1 | wpb 2858.9 | bsz 108.4 | num_updates 35334 | best_loss 0.485
|
| 164 |
+
epoch 009 | valid on 'valid' subset | loss 0.541 | ppl 1.46 | wps 43993.1 | wpb 2858.9 | bsz 108.4 | num_updates 35334 | best_loss 0.485
|
| 165 |
+
epoch 009 | valid on 'valid' subset | loss 0.541 | ppl 1.46 | wps 43993.1 | wpb 2858.9 | bsz 108.4 | num_updates 35334 | best_loss 0.485
|
| 166 |
+
epoch 009 | valid on 'valid' subset | loss 0.541 | ppl 1.46 | wps 43993.1 | wpb 2858.9 | bsz 108.4 | num_updates 35334 | best_loss 0.485
|
| 167 |
+
epoch 009 | valid on 'valid' subset | loss 0.541 | ppl 1.46 | wps 43993.1 | wpb 2858.9 | bsz 108.4 | num_updates 35334 | best_loss 0.485
|
| 168 |
+
epoch 009 | valid on 'valid' subset | loss 0.541 | ppl 1.46 | wps 43993.1 | wpb 2858.9 | bsz 108.4 | num_updates 35334 | best_loss 0.485
|
| 169 |
+
epoch 009 | valid on 'valid' subset | loss 0.541 | ppl 1.46 | wps 43993.1 | wpb 2858.9 | bsz 108.4 | num_updates 35334 | best_loss 0.485
|
| 170 |
+
epoch 009 | valid on 'valid' subset | loss 0.541 | ppl 1.46 | wps 43993.1 | wpb 2858.9 | bsz 108.4 | num_updates 35334 | best_loss 0.485
|
| 171 |
+
end of epoch 9 (average epoch stats below)
|
| 172 |
+
epoch 009 | loss 0.254 | ppl 1.19 | wps 14221.9 | ups 4.8 | wpb 2961.7 | bsz 110.3 | num_updates 35334 | lr 0.00016823 | gnorm 0.626 | train_wall 802 | gb_free 13.7 | wall 7361
|
| 173 |
+
epoch 009 | loss 0.254 | ppl 1.19 | wps 14221.9 | ups 4.8 | wpb 2961.7 | bsz 110.3 | num_updates 35334 | lr 0.00016823 | gnorm 0.626 | train_wall 802 | gb_free 13.7 | wall 7361
|
| 174 |
+
epoch 009 | loss 0.254 | ppl 1.19 | wps 14221.9 | ups 4.8 | wpb 2961.7 | bsz 110.3 | num_updates 35334 | lr 0.00016823 | gnorm 0.626 | train_wall 802 | gb_free 13.7 | wall 7361
|
| 175 |
+
epoch 009 | loss 0.254 | ppl 1.19 | wps 14221.9 | ups 4.8 | wpb 2961.7 | bsz 110.3 | num_updates 35334 | lr 0.00016823 | gnorm 0.626 | train_wall 802 | gb_free 13.7 | wall 7361
|
| 176 |
+
epoch 009 | loss 0.254 | ppl 1.19 | wps 14221.9 | ups 4.8 | wpb 2961.7 | bsz 110.3 | num_updates 35334 | lr 0.00016823 | gnorm 0.626 | train_wall 802 | gb_free 13.7 | wall 7361
|
| 177 |
+
epoch 009 | loss 0.254 | ppl 1.19 | wps 14221.9 | ups 4.8 | wpb 2961.7 | bsz 110.3 | num_updates 35334 | lr 0.00016823 | gnorm 0.626 | train_wall 802 | gb_free 13.7 | wall 7361
|
| 178 |
+
epoch 009 | loss 0.254 | ppl 1.19 | wps 14221.9 | ups 4.8 | wpb 2961.7 | bsz 110.3 | num_updates 35334 | lr 0.00016823 | gnorm 0.626 | train_wall 802 | gb_free 13.7 | wall 7361
|
| 179 |
+
epoch 009 | loss 0.254 | ppl 1.19 | wps 14221.9 | ups 4.8 | wpb 2961.7 | bsz 110.3 | num_updates 35334 | lr 0.00016823 | gnorm 0.626 | train_wall 802 | gb_free 13.7 | wall 7361
|
| 180 |
+
epoch 009 | loss 0.254 | ppl 1.19 | wps 14221.9 | ups 4.8 | wpb 2961.7 | bsz 110.3 | num_updates 35334 | lr 0.00016823 | gnorm 0.626 | train_wall 802 | gb_free 13.7 | wall 7361
|
| 181 |
+
Start iterating over samples
|
| 182 |
+
begin validation on "valid" subset
|
| 183 |
+
epoch 010 | valid on 'valid' subset | loss 0.57 | ppl 1.48 | wps 43885.5 | wpb 2858.9 | bsz 108.4 | num_updates 39260 | best_loss 0.485
|
| 184 |
+
epoch 010 | valid on 'valid' subset | loss 0.57 | ppl 1.48 | wps 43885.5 | wpb 2858.9 | bsz 108.4 | num_updates 39260 | best_loss 0.485
|
| 185 |
+
epoch 010 | valid on 'valid' subset | loss 0.57 | ppl 1.48 | wps 43885.5 | wpb 2858.9 | bsz 108.4 | num_updates 39260 | best_loss 0.485
|
| 186 |
+
epoch 010 | valid on 'valid' subset | loss 0.57 | ppl 1.48 | wps 43885.5 | wpb 2858.9 | bsz 108.4 | num_updates 39260 | best_loss 0.485
|
| 187 |
+
epoch 010 | valid on 'valid' subset | loss 0.57 | ppl 1.48 | wps 43885.5 | wpb 2858.9 | bsz 108.4 | num_updates 39260 | best_loss 0.485
|
| 188 |
+
epoch 010 | valid on 'valid' subset | loss 0.57 | ppl 1.48 | wps 43885.5 | wpb 2858.9 | bsz 108.4 | num_updates 39260 | best_loss 0.485
|
| 189 |
+
epoch 010 | valid on 'valid' subset | loss 0.57 | ppl 1.48 | wps 43885.5 | wpb 2858.9 | bsz 108.4 | num_updates 39260 | best_loss 0.485
|
| 190 |
+
epoch 010 | valid on 'valid' subset | loss 0.57 | ppl 1.48 | wps 43885.5 | wpb 2858.9 | bsz 108.4 | num_updates 39260 | best_loss 0.485
|
| 191 |
+
epoch 010 | valid on 'valid' subset | loss 0.57 | ppl 1.48 | wps 43885.5 | wpb 2858.9 | bsz 108.4 | num_updates 39260 | best_loss 0.485
|
| 192 |
+
epoch 010 | valid on 'valid' subset | loss 0.57 | ppl 1.48 | wps 43885.5 | wpb 2858.9 | bsz 108.4 | num_updates 39260 | best_loss 0.485
|
| 193 |
+
end of epoch 10 (average epoch stats below)
|
| 194 |
+
epoch 010 | loss 0.22 | ppl 1.16 | wps 14234.6 | ups 4.81 | wpb 2961.7 | bsz 110.3 | num_updates 39260 | lr 0.000159597 | gnorm 0.62 | train_wall 801 | gb_free 13.7 | wall 8177
|
| 195 |
+
epoch 010 | loss 0.22 | ppl 1.16 | wps 14234.6 | ups 4.81 | wpb 2961.7 | bsz 110.3 | num_updates 39260 | lr 0.000159597 | gnorm 0.62 | train_wall 801 | gb_free 13.7 | wall 8177
|
| 196 |
+
epoch 010 | loss 0.22 | ppl 1.16 | wps 14234.6 | ups 4.81 | wpb 2961.7 | bsz 110.3 | num_updates 39260 | lr 0.000159597 | gnorm 0.62 | train_wall 801 | gb_free 13.7 | wall 8177
|
| 197 |
+
epoch 010 | loss 0.22 | ppl 1.16 | wps 14234.6 | ups 4.81 | wpb 2961.7 | bsz 110.3 | num_updates 39260 | lr 0.000159597 | gnorm 0.62 | train_wall 801 | gb_free 13.7 | wall 8177
|
| 198 |
+
epoch 010 | loss 0.22 | ppl 1.16 | wps 14234.6 | ups 4.81 | wpb 2961.7 | bsz 110.3 | num_updates 39260 | lr 0.000159597 | gnorm 0.62 | train_wall 801 | gb_free 13.7 | wall 8177
|
| 199 |
+
epoch 010 | loss 0.22 | ppl 1.16 | wps 14234.6 | ups 4.81 | wpb 2961.7 | bsz 110.3 | num_updates 39260 | lr 0.000159597 | gnorm 0.62 | train_wall 801 | gb_free 13.7 | wall 8177
|
| 200 |
+
epoch 010 | loss 0.22 | ppl 1.16 | wps 14234.6 | ups 4.81 | wpb 2961.7 | bsz 110.3 | num_updates 39260 | lr 0.000159597 | gnorm 0.62 | train_wall 801 | gb_free 13.7 | wall 8177
|
| 201 |
+
epoch 010 | loss 0.22 | ppl 1.16 | wps 14234.6 | ups 4.81 | wpb 2961.7 | bsz 110.3 | num_updates 39260 | lr 0.000159597 | gnorm 0.62 | train_wall 801 | gb_free 13.7 | wall 8177
|
| 202 |
+
epoch 010 | loss 0.22 | ppl 1.16 | wps 14234.6 | ups 4.81 | wpb 2961.7 | bsz 110.3 | num_updates 39260 | lr 0.000159597 | gnorm 0.62 | train_wall 801 | gb_free 13.7 | wall 8177
|
| 203 |
+
epoch 010 | loss 0.22 | ppl 1.16 | wps 14234.6 | ups 4.81 | wpb 2961.7 | bsz 110.3 | num_updates 39260 | lr 0.000159597 | gnorm 0.62 | train_wall 801 | gb_free 13.7 | wall 8177
|
| 204 |
+
Start iterating over samples
|
| 205 |
+
begin validation on "valid" subset
|
| 206 |
+
epoch 011 | valid on 'valid' subset | loss 0.598 | ppl 1.51 | wps 44069.5 | wpb 2858.9 | bsz 108.4 | num_updates 43186 | best_loss 0.485
|
| 207 |
+
epoch 011 | valid on 'valid' subset | loss 0.598 | ppl 1.51 | wps 44069.5 | wpb 2858.9 | bsz 108.4 | num_updates 43186 | best_loss 0.485
|
| 208 |
+
epoch 011 | valid on 'valid' subset | loss 0.598 | ppl 1.51 | wps 44069.5 | wpb 2858.9 | bsz 108.4 | num_updates 43186 | best_loss 0.485
|
| 209 |
+
epoch 011 | valid on 'valid' subset | loss 0.598 | ppl 1.51 | wps 44069.5 | wpb 2858.9 | bsz 108.4 | num_updates 43186 | best_loss 0.485
|
| 210 |
+
epoch 011 | valid on 'valid' subset | loss 0.598 | ppl 1.51 | wps 44069.5 | wpb 2858.9 | bsz 108.4 | num_updates 43186 | best_loss 0.485
|
| 211 |
+
epoch 011 | valid on 'valid' subset | loss 0.598 | ppl 1.51 | wps 44069.5 | wpb 2858.9 | bsz 108.4 | num_updates 43186 | best_loss 0.485
|
| 212 |
+
epoch 011 | valid on 'valid' subset | loss 0.598 | ppl 1.51 | wps 44069.5 | wpb 2858.9 | bsz 108.4 | num_updates 43186 | best_loss 0.485
|
| 213 |
+
epoch 011 | valid on 'valid' subset | loss 0.598 | ppl 1.51 | wps 44069.5 | wpb 2858.9 | bsz 108.4 | num_updates 43186 | best_loss 0.485
|
| 214 |
+
epoch 011 | valid on 'valid' subset | loss 0.598 | ppl 1.51 | wps 44069.5 | wpb 2858.9 | bsz 108.4 | num_updates 43186 | best_loss 0.485
|
| 215 |
+
epoch 011 | valid on 'valid' subset | loss 0.598 | ppl 1.51 | wps 44069.5 | wpb 2858.9 | bsz 108.4 | num_updates 43186 | best_loss 0.485
|
| 216 |
+
epoch 011 | valid on 'valid' subset | loss 0.598 | ppl 1.51 | wps 44069.5 | wpb 2858.9 | bsz 108.4 | num_updates 43186 | best_loss 0.485
|
| 217 |
+
early stop since valid performance hasn't improved for last 4 runs
|
| 218 |
+
end of epoch 11 (average epoch stats below)
|
| 219 |
+
epoch 011 | loss 0.19 | ppl 1.14 | wps 14240.1 | ups 4.81 | wpb 2961.7 | bsz 110.3 | num_updates 43186 | lr 0.00015217 | gnorm 0.623 | train_wall 801 | gb_free 13.7 | wall 8994
|
| 220 |
+
epoch 011 | loss 0.19 | ppl 1.14 | wps 14240.1 | ups 4.81 | wpb 2961.7 | bsz 110.3 | num_updates 43186 | lr 0.00015217 | gnorm 0.623 | train_wall 801 | gb_free 13.7 | wall 8994
|
| 221 |
+
epoch 011 | loss 0.19 | ppl 1.14 | wps 14240.1 | ups 4.81 | wpb 2961.7 | bsz 110.3 | num_updates 43186 | lr 0.00015217 | gnorm 0.623 | train_wall 801 | gb_free 13.7 | wall 8994
|
| 222 |
+
epoch 011 | loss 0.19 | ppl 1.14 | wps 14240.1 | ups 4.81 | wpb 2961.7 | bsz 110.3 | num_updates 43186 | lr 0.00015217 | gnorm 0.623 | train_wall 801 | gb_free 13.7 | wall 8994
|
| 223 |
+
epoch 011 | loss 0.19 | ppl 1.14 | wps 14240.1 | ups 4.81 | wpb 2961.7 | bsz 110.3 | num_updates 43186 | lr 0.00015217 | gnorm 0.623 | train_wall 801 | gb_free 13.7 | wall 8994
|
| 224 |
+
epoch 011 | loss 0.19 | ppl 1.14 | wps 14240.1 | ups 4.81 | wpb 2961.7 | bsz 110.3 | num_updates 43186 | lr 0.00015217 | gnorm 0.623 | train_wall 801 | gb_free 13.7 | wall 8994
|
| 225 |
+
epoch 011 | loss 0.19 | ppl 1.14 | wps 14240.1 | ups 4.81 | wpb 2961.7 | bsz 110.3 | num_updates 43186 | lr 0.00015217 | gnorm 0.623 | train_wall 801 | gb_free 13.7 | wall 8994
|
| 226 |
+
epoch 011 | loss 0.19 | ppl 1.14 | wps 14240.1 | ups 4.81 | wpb 2961.7 | bsz 110.3 | num_updates 43186 | lr 0.00015217 | gnorm 0.623 | train_wall 801 | gb_free 13.7 | wall 8994
|
| 227 |
+
epoch 011 | loss 0.19 | ppl 1.14 | wps 14240.1 | ups 4.81 | wpb 2961.7 | bsz 110.3 | num_updates 43186 | lr 0.00015217 | gnorm 0.623 | train_wall 801 | gb_free 13.7 | wall 8994
|
| 228 |
+
epoch 011 | loss 0.19 | ppl 1.14 | wps 14240.1 | ups 4.81 | wpb 2961.7 | bsz 110.3 | num_updates 43186 | lr 0.00015217 | gnorm 0.623 | train_wall 801 | gb_free 13.7 | wall 8994
|
| 229 |
+
epoch 011 | loss 0.19 | ppl 1.14 | wps 14240.1 | ups 4.81 | wpb 2961.7 | bsz 110.3 | num_updates 43186 | lr 0.00015217 | gnorm 0.623 | train_wall 801 | gb_free 13.7 | wall 8994
|
| 230 |
+
done training in 8993.5 seconds
|
ablations/noconst/trained_x64_best_noconst/beam1.zip
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c0f7b91663e22fd7f7950507091546a1df281e3a31fb8944eeaeb8e1859b835c
|
| 3 |
+
size 3801344
|
ablations/noconst/trained_x64_best_noconst/checkpoint_best.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:82e62bf86ed73833a72344216deda861a86ae321a9485edb6b583591e2df2602
|
| 3 |
+
size 185999604
|
ablations/noconst/trained_x64_best_noconst/training.log
ADDED
|
@@ -0,0 +1,142 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 100, 'log_format': None, 'log_file': 'x64/trained_x64_best_noconst/training.log', 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': None, 'wandb_project': None, 'azureml_logging': False, 'seed': 1, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': False, 'memory_efficient_fp16': False, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 128, 'fp16_scale_window': None, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': None, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': None, 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quantization_config_path': None, 'profile': False, 'reset_logging': False, 'suppress_crashes': False, 'use_plasma_view': False, 'plasma_path': '/tmp/plasma'}, 'common_eval': {'_name': None, 'path': None, 'post_process': None, 'quiet': False, 'model_overrides': '{}', 'results_path': None}, 'distributed_training': {'_name': None, 'distributed_world_size': 1, 'distributed_num_procs': 1, 'distributed_rank': 0, 'distributed_backend': 'nccl', 'distributed_init_method': None, 'distributed_port': -1, 'device_id': 0, 'distributed_no_spawn': False, 'ddp_backend': 'pytorch_ddp', 'ddp_comm_hook': 'none', 'bucket_cap_mb': 25, 'fix_batches_to_gpus': False, 'find_unused_parameters': False, 'gradient_as_bucket_view': False, 'fast_stat_sync': False, 'heartbeat_timeout': -1, 'broadcast_buffers': False, 'slowmo_momentum': None, 'slowmo_base_algorithm': 'localsgd', 'localsgd_frequency': 3, 'nprocs_per_node': 1, 'pipeline_model_parallel': False, 'pipeline_balance': None, 'pipeline_devices': None, 'pipeline_chunks': 0, 'pipeline_encoder_balance': None, 'pipeline_encoder_devices': None, 'pipeline_decoder_balance': None, 'pipeline_decoder_devices': None, 'pipeline_checkpoint': 'never', 'zero_sharding': 'none', 'fp16': False, 'memory_efficient_fp16': False, 'tpu': False, 'no_reshard_after_forward': False, 'fp32_reduce_scatter': False, 'cpu_offload': False, 'use_sharded_state': False, 'not_fsdp_flatten_parameters': False}, 'dataset': {'_name': None, 'num_workers': 1, 'skip_invalid_size_inputs_valid_test': False, 'max_tokens': 20000, 'batch_size': None, 'required_batch_size_multiple': 8, 'required_seq_len_multiple': 1, 'dataset_impl': None, 'data_buffer_size': 10, 'train_subset': 'train', 'valid_subset': 'valid', 'combine_valid_subsets': None, 'ignore_unused_valid_subsets': False, 'validate_interval': 1, 'validate_interval_updates': 0, 'validate_after_updates': 0, 'fixed_validation_seed': None, 'disable_validation': False, 'max_tokens_valid': 20000, 'batch_size_valid': None, 'max_valid_steps': None, 'curriculum': 0, 'gen_subset': 'test', 'num_shards': 1, 'shard_id': 0, 'grouped_shuffling': False, 'update_epoch_batch_itr': False, 'update_ordered_indices_seed': False}, 'optimization': {'_name': None, 'max_epoch': 0, 'max_update': 100000, 'stop_time_hours': 0.0, 'clip_norm': 0.0, 'sentence_avg': False, 'update_freq': [1], 'lr': [0.0005], 'stop_min_lr': -1.0, 'use_bmuf': False, 'skip_remainder_batch': False}, 'checkpoint': {'_name': None, 'save_dir': 'x64/trained_x64_best_noconst', 'restore_file': 'checkpoint_last.pt', 'continue_once': None, 'finetune_from_model': None, 'reset_dataloader': False, 'reset_lr_scheduler': False, 'reset_meters': False, 'reset_optimizer': False, 'optimizer_overrides': '{}', 'save_interval': 1, 'save_interval_updates': 0, 'keep_interval_updates': -1, 'keep_interval_updates_pattern': -1, 'keep_last_epochs': -1, 'keep_best_checkpoints': 3, 'no_save': False, 'no_epoch_checkpoints': True, 'no_last_checkpoints': False, 'no_save_optimizer_state': False, 'best_checkpoint_metric': 'loss', 'maximize_best_checkpoint_metric': False, 'patience': 4, 'checkpoint_suffix': '', 'checkpoint_shard_count': 1, 'load_checkpoint_on_all_dp_ranks': False, 'write_checkpoints_asynchronously': False, 'model_parallel_size': 1}, 'bmuf': {'_name': None, 'block_lr': 1.0, 'block_momentum': 0.875, 'global_sync_iter': 50, 'warmup_iterations': 500, 'use_nbm': False, 'average_sync': False, 'distributed_world_size': 1}, 'generation': {'_name': None, 'beam': 5, 'nbest': 1, 'max_len_a': 0.0, 'max_len_b': 200, 'min_len': 1, 'match_source_len': False, 'unnormalized': False, 'no_early_stop': False, 'no_beamable_mm': False, 'lenpen': 1.0, 'unkpen': 0.0, 'replace_unk': None, 'sacrebleu': False, 'score_reference': False, 'prefix_size': 0, 'no_repeat_ngram_size': 0, 'sampling': False, 'sampling_topk': -1, 'sampling_topp': -1.0, 'constraints': None, 'temperature': 1.0, 'diverse_beam_groups': -1, 'diverse_beam_strength': 0.5, 'diversity_rate': -1.0, 'print_alignment': None, 'print_step': False, 'lm_path': None, 'lm_weight': 0.0, 'iter_decode_eos_penalty': 0.0, 'iter_decode_max_iter': 10, 'iter_decode_force_max_iter': False, 'iter_decode_with_beam': 1, 'iter_decode_with_external_reranker': False, 'retain_iter_history': False, 'retain_dropout': False, 'retain_dropout_modules': None, 'decoding_format': None, 'no_seed_provided': False, 'eos_token': None}, 'eval_lm': {'_name': None, 'output_word_probs': False, 'output_word_stats': False, 'context_window': 0, 'softmax_batch': 9223372036854775807}, 'interactive': {'_name': None, 'buffer_size': 0, 'input': '-'}, 'model': Namespace(_name='transformer', activation_dropout=0.0, activation_fn='relu', adam_betas=(0.9, 0.999), adam_eps=1e-08, adaptive_input=False, adaptive_softmax_cutoff=None, adaptive_softmax_dropout=0, aim_repo=None, aim_run_hash=None, all_gather_list_size=16384, amp=False, amp_batch_retries=2, amp_init_scale=128, amp_scale_window=None, arch='transformer', attention_dropout=0.0, azureml_logging=False, batch_size=None, batch_size_valid=None, best_checkpoint_metric='loss', bf16=False, bpe=None, broadcast_buffers=False, bucket_cap_mb=25, checkpoint_activations=False, checkpoint_shard_count=1, checkpoint_suffix='', clip_norm=0.0, combine_valid_subsets=None, continue_once=None, cpu=False, cpu_offload=False, criterion='cross_entropy', cross_self_attention=False, curriculum=0, data='x64/tokenized_dlsm_x64_noconst', data_buffer_size=10, dataset_impl=None, ddp_backend='pytorch_ddp', ddp_comm_hook='none', decoder_attention_heads=8, decoder_embed_dim=96, decoder_embed_path=None, decoder_ffn_embed_dim=384, decoder_input_dim=96, decoder_layerdrop=0, decoder_layers=6, decoder_layers_to_keep=None, decoder_learned_pos=False, decoder_normalize_before=False, decoder_output_dim='96', device_id=0, disable_validation=False, distributed_backend='nccl', distributed_init_method=None, distributed_no_spawn=False, distributed_num_procs=1, distributed_port=-1, distributed_rank=0, distributed_world_size=1, dropout=0.05, ema_decay=0.9999, ema_fp32=False, ema_seed_model=None, ema_start_update=0, ema_update_freq=1, empty_cache_freq=0, encoder_attention_heads=8, encoder_embed_dim=384, encoder_embed_path=None, encoder_ffn_embed_dim=1536, encoder_layerdrop=0, encoder_layers=6, encoder_layers_to_keep=None, encoder_learned_pos=False, encoder_normalize_before=False, eos=2, eval_bleu=False, eval_bleu_args='{}', eval_bleu_detok='space', eval_bleu_detok_args='{}', eval_bleu_print_samples=False, eval_bleu_remove_bpe=None, eval_tokenized_bleu=False, fast_stat_sync=False, find_unused_parameters=False, finetune_from_model=None, fix_batches_to_gpus=False, fixed_validation_seed=None, fp16=False, fp16_adam_stats=False, fp16_init_scale=128, fp16_no_flatten_grads=False, fp16_scale_tolerance=0.0, fp16_scale_window=None, fp32_reduce_scatter=False, gen_subset='test', gradient_as_bucket_view=False, grouped_shuffling=False, heartbeat_timeout=-1, ignore_unused_valid_subsets=False, keep_best_checkpoints=3, keep_interval_updates=-1, keep_interval_updates_pattern=-1, keep_last_epochs=-1, layernorm_embedding=False, left_pad_source=True, left_pad_target=False, load_alignments=False, load_checkpoint_on_all_dp_ranks=False, localsgd_frequency=3, log_file='x64/trained_x64_best_noconst/training.log', log_format=None, log_interval=100, lr=[0.0005], lr_scheduler='inverse_sqrt', max_epoch=0, max_source_positions=1024, max_target_positions=1024, max_tokens=20000, max_tokens_valid=20000, max_update=100000, max_valid_steps=None, maximize_best_checkpoint_metric=False, memory_efficient_bf16=False, memory_efficient_fp16=False, min_loss_scale=0.0001, model_parallel_size=1, no_cross_attention=False, no_epoch_checkpoints=True, no_last_checkpoints=False, no_progress_bar=False, no_reshard_after_forward=False, no_save=False, no_save_optimizer_state=False, no_scale_embedding=False, no_seed_provided=False, no_token_positional_embeddings=False, not_fsdp_flatten_parameters=False, nprocs_per_node=1, num_batch_buckets=0, num_shards=1, num_workers=1, offload_activations=False, on_cpu_convert_precision=False, optimizer='adam', optimizer_overrides='{}', pad=1, patience=4, pipeline_balance=None, pipeline_checkpoint='never', pipeline_chunks=0, pipeline_decoder_balance=None, pipeline_decoder_devices=None, pipeline_devices=None, pipeline_encoder_balance=None, pipeline_encoder_devices=None, pipeline_model_parallel=False, plasma_path='/tmp/plasma', profile=False, quant_noise_pq=0, quant_noise_pq_block_size=8, quant_noise_scalar=0, quantization_config_path=None, required_batch_size_multiple=8, required_seq_len_multiple=1, reset_dataloader=False, reset_logging=False, reset_lr_scheduler=False, reset_meters=False, reset_optimizer=False, restore_file='checkpoint_last.pt', save_dir='x64/trained_x64_best_noconst', save_interval=1, save_interval_updates=0, scoring='bleu', seed=1, sentence_avg=False, shard_id=0, share_all_embeddings=False, share_decoder_input_output_embed=False, skip_invalid_size_inputs_valid_test=False, skip_remainder_batch=False, slowmo_base_algorithm='localsgd', slowmo_momentum=None, source_lang=None, stop_min_lr=-1.0, stop_time_hours=0, store_ema=False, suppress_crashes=False, target_lang=None, task='translation', tensorboard_logdir=None, threshold_loss_scale=None, tie_adaptive_weights=False, tokenizer=None, tpu=False, train_subset='train', truncate_source=False, unk=3, update_epoch_batch_itr=False, update_freq=[1], update_ordered_indices_seed=False, upsample_primary=-1, use_bmuf=False, use_old_adam=False, use_plasma_view=False, use_sharded_state=False, user_dir=None, valid_subset='valid', validate_after_updates=0, validate_interval=1, validate_interval_updates=0, wandb_project=None, warmup_init_lr=-1, warmup_updates=4000, weight_decay=0.005, write_checkpoints_asynchronously=False, zero_sharding='none'), 'task': {'_name': 'translation', 'data': 'x64/tokenized_dlsm_x64_noconst', 'source_lang': None, 'target_lang': None, 'load_alignments': False, 'left_pad_source': True, 'left_pad_target': False, 'max_source_positions': 1024, 'max_target_positions': 1024, 'upsample_primary': -1, 'truncate_source': False, 'num_batch_buckets': 0, 'train_subset': 'train', 'dataset_impl': None, 'required_seq_len_multiple': 1, 'eval_bleu': False, 'eval_bleu_args': '{}', 'eval_bleu_detok': 'space', 'eval_bleu_detok_args': '{}', 'eval_tokenized_bleu': False, 'eval_bleu_remove_bpe': None, 'eval_bleu_print_samples': False}, 'criterion': {'_name': 'cross_entropy', 'sentence_avg': False}, 'optimizer': {'_name': 'adam', 'adam_betas': [0.9, 0.999], 'adam_eps': 1e-08, 'weight_decay': 0.005, 'use_old_adam': False, 'fp16_adam_stats': False, 'tpu': False, 'lr': [0.0005]}, 'lr_scheduler': {'_name': 'inverse_sqrt', 'warmup_updates': 4000, 'warmup_init_lr': -1.0, 'lr': [0.0005]}, 'scoring': {'_name': 'bleu', 'pad': 1, 'eos': 2, 'unk': 3}, 'bpe': None, 'tokenizer': None, 'ema': {'_name': None, 'store_ema': False, 'ema_decay': 0.9999, 'ema_start_update': 0, 'ema_seed_model': None, 'ema_update_freq': 1, 'ema_fp32': False}}
|
| 2 |
+
TransformerModel(
|
| 3 |
+
(encoder): TransformerEncoderBase(
|
| 4 |
+
(dropout_module): FairseqDropout()
|
| 5 |
+
(embed_tokens): Embedding(128, 384, padding_idx=1)
|
| 6 |
+
(embed_positions): SinusoidalPositionalEmbedding()
|
| 7 |
+
(layers): ModuleList(
|
| 8 |
+
(0-5): 6 x TransformerEncoderLayerBase(
|
| 9 |
+
(self_attn): MultiheadAttention(
|
| 10 |
+
(dropout_module): FairseqDropout()
|
| 11 |
+
(k_proj): Linear(in_features=384, out_features=384, bias=True)
|
| 12 |
+
(v_proj): Linear(in_features=384, out_features=384, bias=True)
|
| 13 |
+
(q_proj): Linear(in_features=384, out_features=384, bias=True)
|
| 14 |
+
(out_proj): Linear(in_features=384, out_features=384, bias=True)
|
| 15 |
+
)
|
| 16 |
+
(self_attn_layer_norm): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
|
| 17 |
+
(dropout_module): FairseqDropout()
|
| 18 |
+
(activation_dropout_module): FairseqDropout()
|
| 19 |
+
(fc1): Linear(in_features=384, out_features=1536, bias=True)
|
| 20 |
+
(fc2): Linear(in_features=1536, out_features=384, bias=True)
|
| 21 |
+
(final_layer_norm): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
|
| 22 |
+
)
|
| 23 |
+
)
|
| 24 |
+
)
|
| 25 |
+
(decoder): TransformerDecoderBase(
|
| 26 |
+
(dropout_module): FairseqDropout()
|
| 27 |
+
(embed_tokens): Embedding(40, 96, padding_idx=1)
|
| 28 |
+
(embed_positions): SinusoidalPositionalEmbedding()
|
| 29 |
+
(layers): ModuleList(
|
| 30 |
+
(0-5): 6 x TransformerDecoderLayerBase(
|
| 31 |
+
(dropout_module): FairseqDropout()
|
| 32 |
+
(self_attn): MultiheadAttention(
|
| 33 |
+
(dropout_module): FairseqDropout()
|
| 34 |
+
(k_proj): Linear(in_features=96, out_features=96, bias=True)
|
| 35 |
+
(v_proj): Linear(in_features=96, out_features=96, bias=True)
|
| 36 |
+
(q_proj): Linear(in_features=96, out_features=96, bias=True)
|
| 37 |
+
(out_proj): Linear(in_features=96, out_features=96, bias=True)
|
| 38 |
+
)
|
| 39 |
+
(activation_dropout_module): FairseqDropout()
|
| 40 |
+
(self_attn_layer_norm): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
|
| 41 |
+
(encoder_attn): MultiheadAttention(
|
| 42 |
+
(dropout_module): FairseqDropout()
|
| 43 |
+
(k_proj): Linear(in_features=384, out_features=96, bias=True)
|
| 44 |
+
(v_proj): Linear(in_features=384, out_features=96, bias=True)
|
| 45 |
+
(q_proj): Linear(in_features=96, out_features=96, bias=True)
|
| 46 |
+
(out_proj): Linear(in_features=96, out_features=96, bias=True)
|
| 47 |
+
)
|
| 48 |
+
(encoder_attn_layer_norm): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
|
| 49 |
+
(fc1): Linear(in_features=96, out_features=384, bias=True)
|
| 50 |
+
(fc2): Linear(in_features=384, out_features=96, bias=True)
|
| 51 |
+
(final_layer_norm): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
|
| 52 |
+
)
|
| 53 |
+
)
|
| 54 |
+
(output_projection): Linear(in_features=96, out_features=40, bias=False)
|
| 55 |
+
)
|
| 56 |
+
)
|
| 57 |
+
task: TranslationTask
|
| 58 |
+
model: TransformerModel
|
| 59 |
+
criterion: CrossEntropyCriterion
|
| 60 |
+
num. shared model params: 22,568,640 (num. trained: 22,568,640)
|
| 61 |
+
num. expert model params: 0 (num. trained: 0)
|
| 62 |
+
training on 1 devices (GPUs/TPUs)
|
| 63 |
+
max tokens per device = 20000 and max sentences per device = None
|
| 64 |
+
Start iterating over samples
|
| 65 |
+
begin validation on "valid" subset
|
| 66 |
+
epoch 001 | valid on 'valid' subset | loss 1.81 | ppl 3.51 | wps 27721.3 | wpb 2679.5 | bsz 97.1 | num_updates 4211
|
| 67 |
+
end of epoch 1 (average epoch stats below)
|
| 68 |
+
epoch 001 | loss 2.368 | ppl 5.16 | wps 9077.7 | ups 3.25 | wpb 2793.3 | bsz 100.7 | num_updates 4211 | lr 0.000487312 | gnorm 2.758 | train_wall 1275 | gb_free 13.1 | wall 1296
|
| 69 |
+
Start iterating over samples
|
| 70 |
+
begin validation on "valid" subset
|
| 71 |
+
epoch 002 | valid on 'valid' subset | loss 1.272 | ppl 2.42 | wps 27624.5 | wpb 2679.5 | bsz 97.1 | num_updates 8422 | best_loss 1.272
|
| 72 |
+
epoch 002 | valid on 'valid' subset | loss 1.272 | ppl 2.42 | wps 27624.5 | wpb 2679.5 | bsz 97.1 | num_updates 8422 | best_loss 1.272
|
| 73 |
+
end of epoch 2 (average epoch stats below)
|
| 74 |
+
epoch 002 | loss 1.507 | ppl 2.84 | wps 9061.9 | ups 3.24 | wpb 2793.3 | bsz 100.7 | num_updates 8422 | lr 0.000344582 | gnorm 1.357 | train_wall 1277 | gb_free 12.9 | wall 2594
|
| 75 |
+
epoch 002 | loss 1.507 | ppl 2.84 | wps 9061.9 | ups 3.24 | wpb 2793.3 | bsz 100.7 | num_updates 8422 | lr 0.000344582 | gnorm 1.357 | train_wall 1277 | gb_free 12.9 | wall 2594
|
| 76 |
+
Start iterating over samples
|
| 77 |
+
begin validation on "valid" subset
|
| 78 |
+
epoch 003 | valid on 'valid' subset | loss 1.17 | ppl 2.25 | wps 27577.4 | wpb 2679.5 | bsz 97.1 | num_updates 12633 | best_loss 1.17
|
| 79 |
+
epoch 003 | valid on 'valid' subset | loss 1.17 | ppl 2.25 | wps 27577.4 | wpb 2679.5 | bsz 97.1 | num_updates 12633 | best_loss 1.17
|
| 80 |
+
epoch 003 | valid on 'valid' subset | loss 1.17 | ppl 2.25 | wps 27577.4 | wpb 2679.5 | bsz 97.1 | num_updates 12633 | best_loss 1.17
|
| 81 |
+
end of epoch 3 (average epoch stats below)
|
| 82 |
+
epoch 003 | loss 1.155 | ppl 2.23 | wps 9060.7 | ups 3.24 | wpb 2793.3 | bsz 100.7 | num_updates 12633 | lr 0.00028135 | gnorm 1.172 | train_wall 1277 | gb_free 12.8 | wall 3893
|
| 83 |
+
epoch 003 | loss 1.155 | ppl 2.23 | wps 9060.7 | ups 3.24 | wpb 2793.3 | bsz 100.7 | num_updates 12633 | lr 0.00028135 | gnorm 1.172 | train_wall 1277 | gb_free 12.8 | wall 3893
|
| 84 |
+
epoch 003 | loss 1.155 | ppl 2.23 | wps 9060.7 | ups 3.24 | wpb 2793.3 | bsz 100.7 | num_updates 12633 | lr 0.00028135 | gnorm 1.172 | train_wall 1277 | gb_free 12.8 | wall 3893
|
| 85 |
+
Start iterating over samples
|
| 86 |
+
begin validation on "valid" subset
|
| 87 |
+
epoch 004 | valid on 'valid' subset | loss 1.201 | ppl 2.3 | wps 27719.2 | wpb 2679.5 | bsz 97.1 | num_updates 16844 | best_loss 1.17
|
| 88 |
+
epoch 004 | valid on 'valid' subset | loss 1.201 | ppl 2.3 | wps 27719.2 | wpb 2679.5 | bsz 97.1 | num_updates 16844 | best_loss 1.17
|
| 89 |
+
epoch 004 | valid on 'valid' subset | loss 1.201 | ppl 2.3 | wps 27719.2 | wpb 2679.5 | bsz 97.1 | num_updates 16844 | best_loss 1.17
|
| 90 |
+
epoch 004 | valid on 'valid' subset | loss 1.201 | ppl 2.3 | wps 27719.2 | wpb 2679.5 | bsz 97.1 | num_updates 16844 | best_loss 1.17
|
| 91 |
+
end of epoch 4 (average epoch stats below)
|
| 92 |
+
epoch 004 | loss 0.976 | ppl 1.97 | wps 9063.1 | ups 3.24 | wpb 2793.3 | bsz 100.7 | num_updates 16844 | lr 0.000243656 | gnorm 1.093 | train_wall 1277 | gb_free 13.1 | wall 5190
|
| 93 |
+
epoch 004 | loss 0.976 | ppl 1.97 | wps 9063.1 | ups 3.24 | wpb 2793.3 | bsz 100.7 | num_updates 16844 | lr 0.000243656 | gnorm 1.093 | train_wall 1277 | gb_free 13.1 | wall 5190
|
| 94 |
+
epoch 004 | loss 0.976 | ppl 1.97 | wps 9063.1 | ups 3.24 | wpb 2793.3 | bsz 100.7 | num_updates 16844 | lr 0.000243656 | gnorm 1.093 | train_wall 1277 | gb_free 13.1 | wall 5190
|
| 95 |
+
epoch 004 | loss 0.976 | ppl 1.97 | wps 9063.1 | ups 3.24 | wpb 2793.3 | bsz 100.7 | num_updates 16844 | lr 0.000243656 | gnorm 1.093 | train_wall 1277 | gb_free 13.1 | wall 5190
|
| 96 |
+
Start iterating over samples
|
| 97 |
+
begin validation on "valid" subset
|
| 98 |
+
epoch 005 | valid on 'valid' subset | loss 1.267 | ppl 2.41 | wps 27584.2 | wpb 2679.5 | bsz 97.1 | num_updates 21055 | best_loss 1.17
|
| 99 |
+
epoch 005 | valid on 'valid' subset | loss 1.267 | ppl 2.41 | wps 27584.2 | wpb 2679.5 | bsz 97.1 | num_updates 21055 | best_loss 1.17
|
| 100 |
+
epoch 005 | valid on 'valid' subset | loss 1.267 | ppl 2.41 | wps 27584.2 | wpb 2679.5 | bsz 97.1 | num_updates 21055 | best_loss 1.17
|
| 101 |
+
epoch 005 | valid on 'valid' subset | loss 1.267 | ppl 2.41 | wps 27584.2 | wpb 2679.5 | bsz 97.1 | num_updates 21055 | best_loss 1.17
|
| 102 |
+
epoch 005 | valid on 'valid' subset | loss 1.267 | ppl 2.41 | wps 27584.2 | wpb 2679.5 | bsz 97.1 | num_updates 21055 | best_loss 1.17
|
| 103 |
+
end of epoch 5 (average epoch stats below)
|
| 104 |
+
epoch 005 | loss 0.847 | ppl 1.8 | wps 9051.5 | ups 3.24 | wpb 2793.3 | bsz 100.7 | num_updates 21055 | lr 0.000217933 | gnorm 1.075 | train_wall 1279 | gb_free 12.7 | wall 6490
|
| 105 |
+
epoch 005 | loss 0.847 | ppl 1.8 | wps 9051.5 | ups 3.24 | wpb 2793.3 | bsz 100.7 | num_updates 21055 | lr 0.000217933 | gnorm 1.075 | train_wall 1279 | gb_free 12.7 | wall 6490
|
| 106 |
+
epoch 005 | loss 0.847 | ppl 1.8 | wps 9051.5 | ups 3.24 | wpb 2793.3 | bsz 100.7 | num_updates 21055 | lr 0.000217933 | gnorm 1.075 | train_wall 1279 | gb_free 12.7 | wall 6490
|
| 107 |
+
epoch 005 | loss 0.847 | ppl 1.8 | wps 9051.5 | ups 3.24 | wpb 2793.3 | bsz 100.7 | num_updates 21055 | lr 0.000217933 | gnorm 1.075 | train_wall 1279 | gb_free 12.7 | wall 6490
|
| 108 |
+
epoch 005 | loss 0.847 | ppl 1.8 | wps 9051.5 | ups 3.24 | wpb 2793.3 | bsz 100.7 | num_updates 21055 | lr 0.000217933 | gnorm 1.075 | train_wall 1279 | gb_free 12.7 | wall 6490
|
| 109 |
+
Start iterating over samples
|
| 110 |
+
begin validation on "valid" subset
|
| 111 |
+
epoch 006 | valid on 'valid' subset | loss 1.371 | ppl 2.59 | wps 27709.9 | wpb 2679.5 | bsz 97.1 | num_updates 25266 | best_loss 1.17
|
| 112 |
+
epoch 006 | valid on 'valid' subset | loss 1.371 | ppl 2.59 | wps 27709.9 | wpb 2679.5 | bsz 97.1 | num_updates 25266 | best_loss 1.17
|
| 113 |
+
epoch 006 | valid on 'valid' subset | loss 1.371 | ppl 2.59 | wps 27709.9 | wpb 2679.5 | bsz 97.1 | num_updates 25266 | best_loss 1.17
|
| 114 |
+
epoch 006 | valid on 'valid' subset | loss 1.371 | ppl 2.59 | wps 27709.9 | wpb 2679.5 | bsz 97.1 | num_updates 25266 | best_loss 1.17
|
| 115 |
+
epoch 006 | valid on 'valid' subset | loss 1.371 | ppl 2.59 | wps 27709.9 | wpb 2679.5 | bsz 97.1 | num_updates 25266 | best_loss 1.17
|
| 116 |
+
epoch 006 | valid on 'valid' subset | loss 1.371 | ppl 2.59 | wps 27709.9 | wpb 2679.5 | bsz 97.1 | num_updates 25266 | best_loss 1.17
|
| 117 |
+
end of epoch 6 (average epoch stats below)
|
| 118 |
+
epoch 006 | loss 0.737 | ppl 1.67 | wps 9050.9 | ups 3.24 | wpb 2793.3 | bsz 100.7 | num_updates 25266 | lr 0.000198944 | gnorm 1.103 | train_wall 1278 | gb_free 12.9 | wall 7790
|
| 119 |
+
epoch 006 | loss 0.737 | ppl 1.67 | wps 9050.9 | ups 3.24 | wpb 2793.3 | bsz 100.7 | num_updates 25266 | lr 0.000198944 | gnorm 1.103 | train_wall 1278 | gb_free 12.9 | wall 7790
|
| 120 |
+
epoch 006 | loss 0.737 | ppl 1.67 | wps 9050.9 | ups 3.24 | wpb 2793.3 | bsz 100.7 | num_updates 25266 | lr 0.000198944 | gnorm 1.103 | train_wall 1278 | gb_free 12.9 | wall 7790
|
| 121 |
+
epoch 006 | loss 0.737 | ppl 1.67 | wps 9050.9 | ups 3.24 | wpb 2793.3 | bsz 100.7 | num_updates 25266 | lr 0.000198944 | gnorm 1.103 | train_wall 1278 | gb_free 12.9 | wall 7790
|
| 122 |
+
epoch 006 | loss 0.737 | ppl 1.67 | wps 9050.9 | ups 3.24 | wpb 2793.3 | bsz 100.7 | num_updates 25266 | lr 0.000198944 | gnorm 1.103 | train_wall 1278 | gb_free 12.9 | wall 7790
|
| 123 |
+
epoch 006 | loss 0.737 | ppl 1.67 | wps 9050.9 | ups 3.24 | wpb 2793.3 | bsz 100.7 | num_updates 25266 | lr 0.000198944 | gnorm 1.103 | train_wall 1278 | gb_free 12.9 | wall 7790
|
| 124 |
+
Start iterating over samples
|
| 125 |
+
begin validation on "valid" subset
|
| 126 |
+
epoch 007 | valid on 'valid' subset | loss 1.475 | ppl 2.78 | wps 27733.8 | wpb 2679.5 | bsz 97.1 | num_updates 29477 | best_loss 1.17
|
| 127 |
+
epoch 007 | valid on 'valid' subset | loss 1.475 | ppl 2.78 | wps 27733.8 | wpb 2679.5 | bsz 97.1 | num_updates 29477 | best_loss 1.17
|
| 128 |
+
epoch 007 | valid on 'valid' subset | loss 1.475 | ppl 2.78 | wps 27733.8 | wpb 2679.5 | bsz 97.1 | num_updates 29477 | best_loss 1.17
|
| 129 |
+
epoch 007 | valid on 'valid' subset | loss 1.475 | ppl 2.78 | wps 27733.8 | wpb 2679.5 | bsz 97.1 | num_updates 29477 | best_loss 1.17
|
| 130 |
+
epoch 007 | valid on 'valid' subset | loss 1.475 | ppl 2.78 | wps 27733.8 | wpb 2679.5 | bsz 97.1 | num_updates 29477 | best_loss 1.17
|
| 131 |
+
epoch 007 | valid on 'valid' subset | loss 1.475 | ppl 2.78 | wps 27733.8 | wpb 2679.5 | bsz 97.1 | num_updates 29477 | best_loss 1.17
|
| 132 |
+
epoch 007 | valid on 'valid' subset | loss 1.475 | ppl 2.78 | wps 27733.8 | wpb 2679.5 | bsz 97.1 | num_updates 29477 | best_loss 1.17
|
| 133 |
+
early stop since valid performance hasn't improved for last 4 runs
|
| 134 |
+
end of epoch 7 (average epoch stats below)
|
| 135 |
+
epoch 007 | loss 0.644 | ppl 1.56 | wps 9051.9 | ups 3.24 | wpb 2793.3 | bsz 100.7 | num_updates 29477 | lr 0.000184187 | gnorm 1.147 | train_wall 1277 | gb_free 12.9 | wall 9089
|
| 136 |
+
epoch 007 | loss 0.644 | ppl 1.56 | wps 9051.9 | ups 3.24 | wpb 2793.3 | bsz 100.7 | num_updates 29477 | lr 0.000184187 | gnorm 1.147 | train_wall 1277 | gb_free 12.9 | wall 9089
|
| 137 |
+
epoch 007 | loss 0.644 | ppl 1.56 | wps 9051.9 | ups 3.24 | wpb 2793.3 | bsz 100.7 | num_updates 29477 | lr 0.000184187 | gnorm 1.147 | train_wall 1277 | gb_free 12.9 | wall 9089
|
| 138 |
+
epoch 007 | loss 0.644 | ppl 1.56 | wps 9051.9 | ups 3.24 | wpb 2793.3 | bsz 100.7 | num_updates 29477 | lr 0.000184187 | gnorm 1.147 | train_wall 1277 | gb_free 12.9 | wall 9089
|
| 139 |
+
epoch 007 | loss 0.644 | ppl 1.56 | wps 9051.9 | ups 3.24 | wpb 2793.3 | bsz 100.7 | num_updates 29477 | lr 0.000184187 | gnorm 1.147 | train_wall 1277 | gb_free 12.9 | wall 9089
|
| 140 |
+
epoch 007 | loss 0.644 | ppl 1.56 | wps 9051.9 | ups 3.24 | wpb 2793.3 | bsz 100.7 | num_updates 29477 | lr 0.000184187 | gnorm 1.147 | train_wall 1277 | gb_free 12.9 | wall 9089
|
| 141 |
+
epoch 007 | loss 0.644 | ppl 1.56 | wps 9051.9 | ups 3.24 | wpb 2793.3 | bsz 100.7 | num_updates 29477 | lr 0.000184187 | gnorm 1.147 | train_wall 1277 | gb_free 12.9 | wall 9089
|
| 142 |
+
done training in 9088.6 seconds
|
ablations/postfix/tokenized_postfix.zip
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6c91cb78de78a69a17dc9e41d0d9e1881e09a63f05a3f27e6ef65405bcaef8e2
|
| 3 |
+
size 121303562
|
ablations/postfix/trained_aarch64_best_postfix/beam1.zip
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7d87d55da3a5b55535dadead1bf429a4dbff610bea36df3283ea40973c4cf16a
|
| 3 |
+
size 2538195
|
ablations/postfix/trained_aarch64_best_postfix/checkpoint_best.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ec8724cbe0491471936204fadd4d30a8382581e02534189053cf415ac101f7f5
|
| 3 |
+
size 186184500
|
ablations/postfix/trained_aarch64_best_postfix/training.log
ADDED
|
@@ -0,0 +1,590 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 100, 'log_format': None, 'log_file': 'aarch64/trained_aarch64_best_postfix/training.log', 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': None, 'wandb_project': None, 'azureml_logging': False, 'seed': 1, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': False, 'memory_efficient_fp16': False, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 128, 'fp16_scale_window': None, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': None, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': None, 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quantization_config_path': None, 'profile': False, 'reset_logging': False, 'suppress_crashes': False, 'use_plasma_view': False, 'plasma_path': '/tmp/plasma'}, 'common_eval': {'_name': None, 'path': None, 'post_process': None, 'quiet': False, 'model_overrides': '{}', 'results_path': None}, 'distributed_training': {'_name': None, 'distributed_world_size': 1, 'distributed_num_procs': 1, 'distributed_rank': 0, 'distributed_backend': 'nccl', 'distributed_init_method': None, 'distributed_port': -1, 'device_id': 0, 'distributed_no_spawn': False, 'ddp_backend': 'pytorch_ddp', 'ddp_comm_hook': 'none', 'bucket_cap_mb': 25, 'fix_batches_to_gpus': False, 'find_unused_parameters': False, 'gradient_as_bucket_view': False, 'fast_stat_sync': False, 'heartbeat_timeout': -1, 'broadcast_buffers': False, 'slowmo_momentum': None, 'slowmo_base_algorithm': 'localsgd', 'localsgd_frequency': 3, 'nprocs_per_node': 1, 'pipeline_model_parallel': False, 'pipeline_balance': None, 'pipeline_devices': None, 'pipeline_chunks': 0, 'pipeline_encoder_balance': None, 'pipeline_encoder_devices': None, 'pipeline_decoder_balance': None, 'pipeline_decoder_devices': None, 'pipeline_checkpoint': 'never', 'zero_sharding': 'none', 'fp16': False, 'memory_efficient_fp16': False, 'tpu': False, 'no_reshard_after_forward': False, 'fp32_reduce_scatter': False, 'cpu_offload': False, 'use_sharded_state': False, 'not_fsdp_flatten_parameters': False}, 'dataset': {'_name': None, 'num_workers': 1, 'skip_invalid_size_inputs_valid_test': False, 'max_tokens': 20000, 'batch_size': None, 'required_batch_size_multiple': 8, 'required_seq_len_multiple': 1, 'dataset_impl': None, 'data_buffer_size': 10, 'train_subset': 'train', 'valid_subset': 'valid', 'combine_valid_subsets': None, 'ignore_unused_valid_subsets': False, 'validate_interval': 1, 'validate_interval_updates': 0, 'validate_after_updates': 0, 'fixed_validation_seed': None, 'disable_validation': False, 'max_tokens_valid': 20000, 'batch_size_valid': None, 'max_valid_steps': None, 'curriculum': 0, 'gen_subset': 'test', 'num_shards': 1, 'shard_id': 0, 'grouped_shuffling': False, 'update_epoch_batch_itr': False, 'update_ordered_indices_seed': False}, 'optimization': {'_name': None, 'max_epoch': 0, 'max_update': 100000, 'stop_time_hours': 0.0, 'clip_norm': 0.0, 'sentence_avg': False, 'update_freq': [1], 'lr': [0.0005], 'stop_min_lr': -1.0, 'use_bmuf': False, 'skip_remainder_batch': False}, 'checkpoint': {'_name': None, 'save_dir': 'aarch64/trained_aarch64_best_postfix', 'restore_file': 'checkpoint_last.pt', 'continue_once': None, 'finetune_from_model': None, 'reset_dataloader': False, 'reset_lr_scheduler': False, 'reset_meters': False, 'reset_optimizer': False, 'optimizer_overrides': '{}', 'save_interval': 1, 'save_interval_updates': 0, 'keep_interval_updates': -1, 'keep_interval_updates_pattern': -1, 'keep_last_epochs': -1, 'keep_best_checkpoints': 3, 'no_save': False, 'no_epoch_checkpoints': True, 'no_last_checkpoints': False, 'no_save_optimizer_state': False, 'best_checkpoint_metric': 'loss', 'maximize_best_checkpoint_metric': False, 'patience': 4, 'checkpoint_suffix': '', 'checkpoint_shard_count': 1, 'load_checkpoint_on_all_dp_ranks': False, 'write_checkpoints_asynchronously': False, 'model_parallel_size': 1}, 'bmuf': {'_name': None, 'block_lr': 1.0, 'block_momentum': 0.875, 'global_sync_iter': 50, 'warmup_iterations': 500, 'use_nbm': False, 'average_sync': False, 'distributed_world_size': 1}, 'generation': {'_name': None, 'beam': 5, 'nbest': 1, 'max_len_a': 0.0, 'max_len_b': 200, 'min_len': 1, 'match_source_len': False, 'unnormalized': False, 'no_early_stop': False, 'no_beamable_mm': False, 'lenpen': 1.0, 'unkpen': 0.0, 'replace_unk': None, 'sacrebleu': False, 'score_reference': False, 'prefix_size': 0, 'no_repeat_ngram_size': 0, 'sampling': False, 'sampling_topk': -1, 'sampling_topp': -1.0, 'constraints': None, 'temperature': 1.0, 'diverse_beam_groups': -1, 'diverse_beam_strength': 0.5, 'diversity_rate': -1.0, 'print_alignment': None, 'print_step': False, 'lm_path': None, 'lm_weight': 0.0, 'iter_decode_eos_penalty': 0.0, 'iter_decode_max_iter': 10, 'iter_decode_force_max_iter': False, 'iter_decode_with_beam': 1, 'iter_decode_with_external_reranker': False, 'retain_iter_history': False, 'retain_dropout': False, 'retain_dropout_modules': None, 'decoding_format': None, 'no_seed_provided': False, 'eos_token': None}, 'eval_lm': {'_name': None, 'output_word_probs': False, 'output_word_stats': False, 'context_window': 0, 'softmax_batch': 9223372036854775807}, 'interactive': {'_name': None, 'buffer_size': 0, 'input': '-'}, 'model': Namespace(_name='transformer', activation_dropout=0.0, activation_fn='relu', adam_betas=(0.9, 0.999), adam_eps=1e-08, adaptive_input=False, adaptive_softmax_cutoff=None, adaptive_softmax_dropout=0, aim_repo=None, aim_run_hash=None, all_gather_list_size=16384, amp=False, amp_batch_retries=2, amp_init_scale=128, amp_scale_window=None, arch='transformer', attention_dropout=0.0, azureml_logging=False, batch_size=None, batch_size_valid=None, best_checkpoint_metric='loss', bf16=False, bpe=None, broadcast_buffers=False, bucket_cap_mb=25, checkpoint_activations=False, checkpoint_shard_count=1, checkpoint_suffix='', clip_norm=0.0, combine_valid_subsets=None, continue_once=None, cpu=False, cpu_offload=False, criterion='cross_entropy', cross_self_attention=False, curriculum=0, data='aarch64/tokenized_dlsm_aarch64_postfix', data_buffer_size=10, dataset_impl=None, ddp_backend='pytorch_ddp', ddp_comm_hook='none', decoder_attention_heads=8, decoder_embed_dim=96, decoder_embed_path=None, decoder_ffn_embed_dim=384, decoder_input_dim=96, decoder_layerdrop=0, decoder_layers=6, decoder_layers_to_keep=None, decoder_learned_pos=False, decoder_normalize_before=False, decoder_output_dim='96', device_id=0, disable_validation=False, distributed_backend='nccl', distributed_init_method=None, distributed_no_spawn=False, distributed_num_procs=1, distributed_port=-1, distributed_rank=0, distributed_world_size=1, dropout=0.1, ema_decay=0.9999, ema_fp32=False, ema_seed_model=None, ema_start_update=0, ema_update_freq=1, empty_cache_freq=0, encoder_attention_heads=8, encoder_embed_dim=384, encoder_embed_path=None, encoder_ffn_embed_dim=1536, encoder_layerdrop=0, encoder_layers=6, encoder_layers_to_keep=None, encoder_learned_pos=False, encoder_normalize_before=False, eos=2, eval_bleu=False, eval_bleu_args='{}', eval_bleu_detok='space', eval_bleu_detok_args='{}', eval_bleu_print_samples=False, eval_bleu_remove_bpe=None, eval_tokenized_bleu=False, fast_stat_sync=False, find_unused_parameters=False, finetune_from_model=None, fix_batches_to_gpus=False, fixed_validation_seed=None, fp16=False, fp16_adam_stats=False, fp16_init_scale=128, fp16_no_flatten_grads=False, fp16_scale_tolerance=0.0, fp16_scale_window=None, fp32_reduce_scatter=False, gen_subset='test', gradient_as_bucket_view=False, grouped_shuffling=False, heartbeat_timeout=-1, ignore_unused_valid_subsets=False, keep_best_checkpoints=3, keep_interval_updates=-1, keep_interval_updates_pattern=-1, keep_last_epochs=-1, layernorm_embedding=False, left_pad_source=True, left_pad_target=False, load_alignments=False, load_checkpoint_on_all_dp_ranks=False, localsgd_frequency=3, log_file='aarch64/trained_aarch64_best_postfix/training.log', log_format=None, log_interval=100, lr=[0.0005], lr_scheduler='inverse_sqrt', max_epoch=0, max_source_positions=1024, max_target_positions=1024, max_tokens=20000, max_tokens_valid=20000, max_update=100000, max_valid_steps=None, maximize_best_checkpoint_metric=False, memory_efficient_bf16=False, memory_efficient_fp16=False, min_loss_scale=0.0001, model_parallel_size=1, no_cross_attention=False, no_epoch_checkpoints=True, no_last_checkpoints=False, no_progress_bar=False, no_reshard_after_forward=False, no_save=False, no_save_optimizer_state=False, no_scale_embedding=False, no_seed_provided=False, no_token_positional_embeddings=False, not_fsdp_flatten_parameters=False, nprocs_per_node=1, num_batch_buckets=0, num_shards=1, num_workers=1, offload_activations=False, on_cpu_convert_precision=False, optimizer='adam', optimizer_overrides='{}', pad=1, patience=4, pipeline_balance=None, pipeline_checkpoint='never', pipeline_chunks=0, pipeline_decoder_balance=None, pipeline_decoder_devices=None, pipeline_devices=None, pipeline_encoder_balance=None, pipeline_encoder_devices=None, pipeline_model_parallel=False, plasma_path='/tmp/plasma', profile=False, quant_noise_pq=0, quant_noise_pq_block_size=8, quant_noise_scalar=0, quantization_config_path=None, required_batch_size_multiple=8, required_seq_len_multiple=1, reset_dataloader=False, reset_logging=False, reset_lr_scheduler=False, reset_meters=False, reset_optimizer=False, restore_file='checkpoint_last.pt', save_dir='aarch64/trained_aarch64_best_postfix', save_interval=1, save_interval_updates=0, scoring='bleu', seed=1, sentence_avg=False, shard_id=0, share_all_embeddings=False, share_decoder_input_output_embed=False, skip_invalid_size_inputs_valid_test=False, skip_remainder_batch=False, slowmo_base_algorithm='localsgd', slowmo_momentum=None, source_lang=None, stop_min_lr=-1.0, stop_time_hours=0, store_ema=False, suppress_crashes=False, target_lang=None, task='translation', tensorboard_logdir=None, threshold_loss_scale=None, tie_adaptive_weights=False, tokenizer=None, tpu=False, train_subset='train', truncate_source=False, unk=3, update_epoch_batch_itr=False, update_freq=[1], update_ordered_indices_seed=False, upsample_primary=-1, use_bmuf=False, use_old_adam=False, use_plasma_view=False, use_sharded_state=False, user_dir=None, valid_subset='valid', validate_after_updates=0, validate_interval=1, validate_interval_updates=0, wandb_project=None, warmup_init_lr=-1, warmup_updates=4000, weight_decay=0.001, write_checkpoints_asynchronously=False, zero_sharding='none'), 'task': {'_name': 'translation', 'data': 'aarch64/tokenized_dlsm_aarch64_postfix', 'source_lang': None, 'target_lang': None, 'load_alignments': False, 'left_pad_source': True, 'left_pad_target': False, 'max_source_positions': 1024, 'max_target_positions': 1024, 'upsample_primary': -1, 'truncate_source': False, 'num_batch_buckets': 0, 'train_subset': 'train', 'dataset_impl': None, 'required_seq_len_multiple': 1, 'eval_bleu': False, 'eval_bleu_args': '{}', 'eval_bleu_detok': 'space', 'eval_bleu_detok_args': '{}', 'eval_tokenized_bleu': False, 'eval_bleu_remove_bpe': None, 'eval_bleu_print_samples': False}, 'criterion': {'_name': 'cross_entropy', 'sentence_avg': False}, 'optimizer': {'_name': 'adam', 'adam_betas': [0.9, 0.999], 'adam_eps': 1e-08, 'weight_decay': 0.001, 'use_old_adam': False, 'fp16_adam_stats': False, 'tpu': False, 'lr': [0.0005]}, 'lr_scheduler': {'_name': 'inverse_sqrt', 'warmup_updates': 4000, 'warmup_init_lr': -1.0, 'lr': [0.0005]}, 'scoring': {'_name': 'bleu', 'pad': 1, 'eos': 2, 'unk': 3}, 'bpe': None, 'tokenizer': None, 'ema': {'_name': None, 'store_ema': False, 'ema_decay': 0.9999, 'ema_start_update': 0, 'ema_seed_model': None, 'ema_update_freq': 1, 'ema_fp32': False}}
|
| 2 |
+
TransformerModel(
|
| 3 |
+
(encoder): TransformerEncoderBase(
|
| 4 |
+
(dropout_module): FairseqDropout()
|
| 5 |
+
(embed_tokens): Embedding(168, 384, padding_idx=1)
|
| 6 |
+
(embed_positions): SinusoidalPositionalEmbedding()
|
| 7 |
+
(layers): ModuleList(
|
| 8 |
+
(0-5): 6 x TransformerEncoderLayerBase(
|
| 9 |
+
(self_attn): MultiheadAttention(
|
| 10 |
+
(dropout_module): FairseqDropout()
|
| 11 |
+
(k_proj): Linear(in_features=384, out_features=384, bias=True)
|
| 12 |
+
(v_proj): Linear(in_features=384, out_features=384, bias=True)
|
| 13 |
+
(q_proj): Linear(in_features=384, out_features=384, bias=True)
|
| 14 |
+
(out_proj): Linear(in_features=384, out_features=384, bias=True)
|
| 15 |
+
)
|
| 16 |
+
(self_attn_layer_norm): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
|
| 17 |
+
(dropout_module): FairseqDropout()
|
| 18 |
+
(activation_dropout_module): FairseqDropout()
|
| 19 |
+
(fc1): Linear(in_features=384, out_features=1536, bias=True)
|
| 20 |
+
(fc2): Linear(in_features=1536, out_features=384, bias=True)
|
| 21 |
+
(final_layer_norm): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
|
| 22 |
+
)
|
| 23 |
+
)
|
| 24 |
+
)
|
| 25 |
+
(decoder): TransformerDecoderBase(
|
| 26 |
+
(dropout_module): FairseqDropout()
|
| 27 |
+
(embed_tokens): Embedding(40, 96, padding_idx=1)
|
| 28 |
+
(embed_positions): SinusoidalPositionalEmbedding()
|
| 29 |
+
(layers): ModuleList(
|
| 30 |
+
(0-5): 6 x TransformerDecoderLayerBase(
|
| 31 |
+
(dropout_module): FairseqDropout()
|
| 32 |
+
(self_attn): MultiheadAttention(
|
| 33 |
+
(dropout_module): FairseqDropout()
|
| 34 |
+
(k_proj): Linear(in_features=96, out_features=96, bias=True)
|
| 35 |
+
(v_proj): Linear(in_features=96, out_features=96, bias=True)
|
| 36 |
+
(q_proj): Linear(in_features=96, out_features=96, bias=True)
|
| 37 |
+
(out_proj): Linear(in_features=96, out_features=96, bias=True)
|
| 38 |
+
)
|
| 39 |
+
(activation_dropout_module): FairseqDropout()
|
| 40 |
+
(self_attn_layer_norm): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
|
| 41 |
+
(encoder_attn): MultiheadAttention(
|
| 42 |
+
(dropout_module): FairseqDropout()
|
| 43 |
+
(k_proj): Linear(in_features=384, out_features=96, bias=True)
|
| 44 |
+
(v_proj): Linear(in_features=384, out_features=96, bias=True)
|
| 45 |
+
(q_proj): Linear(in_features=96, out_features=96, bias=True)
|
| 46 |
+
(out_proj): Linear(in_features=96, out_features=96, bias=True)
|
| 47 |
+
)
|
| 48 |
+
(encoder_attn_layer_norm): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
|
| 49 |
+
(fc1): Linear(in_features=96, out_features=384, bias=True)
|
| 50 |
+
(fc2): Linear(in_features=384, out_features=96, bias=True)
|
| 51 |
+
(final_layer_norm): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
|
| 52 |
+
)
|
| 53 |
+
)
|
| 54 |
+
(output_projection): Linear(in_features=96, out_features=40, bias=False)
|
| 55 |
+
)
|
| 56 |
+
)
|
| 57 |
+
task: TranslationTask
|
| 58 |
+
model: TransformerModel
|
| 59 |
+
criterion: CrossEntropyCriterion
|
| 60 |
+
num. shared model params: 22,584,000 (num. trained: 22,584,000)
|
| 61 |
+
num. expert model params: 0 (num. trained: 0)
|
| 62 |
+
training on 1 devices (GPUs/TPUs)
|
| 63 |
+
max tokens per device = 20000 and max sentences per device = None
|
| 64 |
+
Start iterating over samples
|
| 65 |
+
begin validation on "valid" subset
|
| 66 |
+
epoch 001 | valid on 'valid' subset | loss 1.037 | ppl 2.05 | wps 21510.2 | wpb 1943.9 | bsz 93.9 | num_updates 4943
|
| 67 |
+
end of epoch 1 (average epoch stats below)
|
| 68 |
+
epoch 001 | loss 1.718 | ppl 3.29 | wps 6524 | ups 3.31 | wpb 1971.8 | bsz 95.4 | num_updates 4943 | lr 0.000449785 | gnorm 2.84 | train_wall 1470 | gb_free 13 | wall 1494
|
| 69 |
+
Start iterating over samples
|
| 70 |
+
begin validation on "valid" subset
|
| 71 |
+
epoch 002 | valid on 'valid' subset | loss 0.526 | ppl 1.44 | wps 21452 | wpb 1943.9 | bsz 93.9 | num_updates 9886 | best_loss 0.526
|
| 72 |
+
epoch 002 | valid on 'valid' subset | loss 0.526 | ppl 1.44 | wps 21452 | wpb 1943.9 | bsz 93.9 | num_updates 9886 | best_loss 0.526
|
| 73 |
+
end of epoch 2 (average epoch stats below)
|
| 74 |
+
epoch 002 | loss 0.745 | ppl 1.68 | wps 6532.6 | ups 3.31 | wpb 1971.8 | bsz 95.4 | num_updates 9886 | lr 0.000318046 | gnorm 1.305 | train_wall 1468 | gb_free 13 | wall 2986
|
| 75 |
+
epoch 002 | loss 0.745 | ppl 1.68 | wps 6532.6 | ups 3.31 | wpb 1971.8 | bsz 95.4 | num_updates 9886 | lr 0.000318046 | gnorm 1.305 | train_wall 1468 | gb_free 13 | wall 2986
|
| 76 |
+
Start iterating over samples
|
| 77 |
+
begin validation on "valid" subset
|
| 78 |
+
epoch 003 | valid on 'valid' subset | loss 0.347 | ppl 1.27 | wps 21573.1 | wpb 1943.9 | bsz 93.9 | num_updates 14829 | best_loss 0.347
|
| 79 |
+
epoch 003 | valid on 'valid' subset | loss 0.347 | ppl 1.27 | wps 21573.1 | wpb 1943.9 | bsz 93.9 | num_updates 14829 | best_loss 0.347
|
| 80 |
+
epoch 003 | valid on 'valid' subset | loss 0.347 | ppl 1.27 | wps 21573.1 | wpb 1943.9 | bsz 93.9 | num_updates 14829 | best_loss 0.347
|
| 81 |
+
end of epoch 3 (average epoch stats below)
|
| 82 |
+
epoch 003 | loss 0.402 | ppl 1.32 | wps 6533.6 | ups 3.31 | wpb 1971.8 | bsz 95.4 | num_updates 14829 | lr 0.000259683 | gnorm 0.979 | train_wall 1467 | gb_free 12.7 | wall 4478
|
| 83 |
+
epoch 003 | loss 0.402 | ppl 1.32 | wps 6533.6 | ups 3.31 | wpb 1971.8 | bsz 95.4 | num_updates 14829 | lr 0.000259683 | gnorm 0.979 | train_wall 1467 | gb_free 12.7 | wall 4478
|
| 84 |
+
epoch 003 | loss 0.402 | ppl 1.32 | wps 6533.6 | ups 3.31 | wpb 1971.8 | bsz 95.4 | num_updates 14829 | lr 0.000259683 | gnorm 0.979 | train_wall 1467 | gb_free 12.7 | wall 4478
|
| 85 |
+
Start iterating over samples
|
| 86 |
+
begin validation on "valid" subset
|
| 87 |
+
epoch 004 | valid on 'valid' subset | loss 0.264 | ppl 1.2 | wps 21501.3 | wpb 1943.9 | bsz 93.9 | num_updates 19772 | best_loss 0.264
|
| 88 |
+
epoch 004 | valid on 'valid' subset | loss 0.264 | ppl 1.2 | wps 21501.3 | wpb 1943.9 | bsz 93.9 | num_updates 19772 | best_loss 0.264
|
| 89 |
+
epoch 004 | valid on 'valid' subset | loss 0.264 | ppl 1.2 | wps 21501.3 | wpb 1943.9 | bsz 93.9 | num_updates 19772 | best_loss 0.264
|
| 90 |
+
epoch 004 | valid on 'valid' subset | loss 0.264 | ppl 1.2 | wps 21501.3 | wpb 1943.9 | bsz 93.9 | num_updates 19772 | best_loss 0.264
|
| 91 |
+
end of epoch 4 (average epoch stats below)
|
| 92 |
+
epoch 004 | loss 0.24 | ppl 1.18 | wps 6540.8 | ups 3.32 | wpb 1971.8 | bsz 95.4 | num_updates 19772 | lr 0.000224892 | gnorm 0.813 | train_wall 1466 | gb_free 13 | wall 5968
|
| 93 |
+
epoch 004 | loss 0.24 | ppl 1.18 | wps 6540.8 | ups 3.32 | wpb 1971.8 | bsz 95.4 | num_updates 19772 | lr 0.000224892 | gnorm 0.813 | train_wall 1466 | gb_free 13 | wall 5968
|
| 94 |
+
epoch 004 | loss 0.24 | ppl 1.18 | wps 6540.8 | ups 3.32 | wpb 1971.8 | bsz 95.4 | num_updates 19772 | lr 0.000224892 | gnorm 0.813 | train_wall 1466 | gb_free 13 | wall 5968
|
| 95 |
+
epoch 004 | loss 0.24 | ppl 1.18 | wps 6540.8 | ups 3.32 | wpb 1971.8 | bsz 95.4 | num_updates 19772 | lr 0.000224892 | gnorm 0.813 | train_wall 1466 | gb_free 13 | wall 5968
|
| 96 |
+
Start iterating over samples
|
| 97 |
+
begin validation on "valid" subset
|
| 98 |
+
epoch 005 | valid on 'valid' subset | loss 0.194 | ppl 1.14 | wps 21456.9 | wpb 1943.9 | bsz 93.9 | num_updates 24715 | best_loss 0.194
|
| 99 |
+
epoch 005 | valid on 'valid' subset | loss 0.194 | ppl 1.14 | wps 21456.9 | wpb 1943.9 | bsz 93.9 | num_updates 24715 | best_loss 0.194
|
| 100 |
+
epoch 005 | valid on 'valid' subset | loss 0.194 | ppl 1.14 | wps 21456.9 | wpb 1943.9 | bsz 93.9 | num_updates 24715 | best_loss 0.194
|
| 101 |
+
epoch 005 | valid on 'valid' subset | loss 0.194 | ppl 1.14 | wps 21456.9 | wpb 1943.9 | bsz 93.9 | num_updates 24715 | best_loss 0.194
|
| 102 |
+
epoch 005 | valid on 'valid' subset | loss 0.194 | ppl 1.14 | wps 21456.9 | wpb 1943.9 | bsz 93.9 | num_updates 24715 | best_loss 0.194
|
| 103 |
+
end of epoch 5 (average epoch stats below)
|
| 104 |
+
epoch 005 | loss 0.14 | ppl 1.1 | wps 6454.8 | ups 3.27 | wpb 1971.8 | bsz 95.4 | num_updates 24715 | lr 0.00020115 | gnorm 0.705 | train_wall 1469 | gb_free 12.9 | wall 7478
|
| 105 |
+
epoch 005 | loss 0.14 | ppl 1.1 | wps 6454.8 | ups 3.27 | wpb 1971.8 | bsz 95.4 | num_updates 24715 | lr 0.00020115 | gnorm 0.705 | train_wall 1469 | gb_free 12.9 | wall 7478
|
| 106 |
+
epoch 005 | loss 0.14 | ppl 1.1 | wps 6454.8 | ups 3.27 | wpb 1971.8 | bsz 95.4 | num_updates 24715 | lr 0.00020115 | gnorm 0.705 | train_wall 1469 | gb_free 12.9 | wall 7478
|
| 107 |
+
epoch 005 | loss 0.14 | ppl 1.1 | wps 6454.8 | ups 3.27 | wpb 1971.8 | bsz 95.4 | num_updates 24715 | lr 0.00020115 | gnorm 0.705 | train_wall 1469 | gb_free 12.9 | wall 7478
|
| 108 |
+
epoch 005 | loss 0.14 | ppl 1.1 | wps 6454.8 | ups 3.27 | wpb 1971.8 | bsz 95.4 | num_updates 24715 | lr 0.00020115 | gnorm 0.705 | train_wall 1469 | gb_free 12.9 | wall 7478
|
| 109 |
+
Start iterating over samples
|
| 110 |
+
begin validation on "valid" subset
|
| 111 |
+
epoch 006 | valid on 'valid' subset | loss 0.146 | ppl 1.11 | wps 21466.7 | wpb 1943.9 | bsz 93.9 | num_updates 29658 | best_loss 0.146
|
| 112 |
+
epoch 006 | valid on 'valid' subset | loss 0.146 | ppl 1.11 | wps 21466.7 | wpb 1943.9 | bsz 93.9 | num_updates 29658 | best_loss 0.146
|
| 113 |
+
epoch 006 | valid on 'valid' subset | loss 0.146 | ppl 1.11 | wps 21466.7 | wpb 1943.9 | bsz 93.9 | num_updates 29658 | best_loss 0.146
|
| 114 |
+
epoch 006 | valid on 'valid' subset | loss 0.146 | ppl 1.11 | wps 21466.7 | wpb 1943.9 | bsz 93.9 | num_updates 29658 | best_loss 0.146
|
| 115 |
+
epoch 006 | valid on 'valid' subset | loss 0.146 | ppl 1.11 | wps 21466.7 | wpb 1943.9 | bsz 93.9 | num_updates 29658 | best_loss 0.146
|
| 116 |
+
epoch 006 | valid on 'valid' subset | loss 0.146 | ppl 1.11 | wps 21466.7 | wpb 1943.9 | bsz 93.9 | num_updates 29658 | best_loss 0.146
|
| 117 |
+
end of epoch 6 (average epoch stats below)
|
| 118 |
+
epoch 006 | loss 0.081 | ppl 1.06 | wps 6502.7 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 29658 | lr 0.000183624 | gnorm 0.609 | train_wall 1474 | gb_free 12.8 | wall 8977
|
| 119 |
+
epoch 006 | loss 0.081 | ppl 1.06 | wps 6502.7 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 29658 | lr 0.000183624 | gnorm 0.609 | train_wall 1474 | gb_free 12.8 | wall 8977
|
| 120 |
+
epoch 006 | loss 0.081 | ppl 1.06 | wps 6502.7 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 29658 | lr 0.000183624 | gnorm 0.609 | train_wall 1474 | gb_free 12.8 | wall 8977
|
| 121 |
+
epoch 006 | loss 0.081 | ppl 1.06 | wps 6502.7 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 29658 | lr 0.000183624 | gnorm 0.609 | train_wall 1474 | gb_free 12.8 | wall 8977
|
| 122 |
+
epoch 006 | loss 0.081 | ppl 1.06 | wps 6502.7 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 29658 | lr 0.000183624 | gnorm 0.609 | train_wall 1474 | gb_free 12.8 | wall 8977
|
| 123 |
+
epoch 006 | loss 0.081 | ppl 1.06 | wps 6502.7 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 29658 | lr 0.000183624 | gnorm 0.609 | train_wall 1474 | gb_free 12.8 | wall 8977
|
| 124 |
+
Start iterating over samples
|
| 125 |
+
begin validation on "valid" subset
|
| 126 |
+
epoch 007 | valid on 'valid' subset | loss 0.138 | ppl 1.1 | wps 21354.2 | wpb 1943.9 | bsz 93.9 | num_updates 34601 | best_loss 0.138
|
| 127 |
+
epoch 007 | valid on 'valid' subset | loss 0.138 | ppl 1.1 | wps 21354.2 | wpb 1943.9 | bsz 93.9 | num_updates 34601 | best_loss 0.138
|
| 128 |
+
epoch 007 | valid on 'valid' subset | loss 0.138 | ppl 1.1 | wps 21354.2 | wpb 1943.9 | bsz 93.9 | num_updates 34601 | best_loss 0.138
|
| 129 |
+
epoch 007 | valid on 'valid' subset | loss 0.138 | ppl 1.1 | wps 21354.2 | wpb 1943.9 | bsz 93.9 | num_updates 34601 | best_loss 0.138
|
| 130 |
+
epoch 007 | valid on 'valid' subset | loss 0.138 | ppl 1.1 | wps 21354.2 | wpb 1943.9 | bsz 93.9 | num_updates 34601 | best_loss 0.138
|
| 131 |
+
epoch 007 | valid on 'valid' subset | loss 0.138 | ppl 1.1 | wps 21354.2 | wpb 1943.9 | bsz 93.9 | num_updates 34601 | best_loss 0.138
|
| 132 |
+
epoch 007 | valid on 'valid' subset | loss 0.138 | ppl 1.1 | wps 21354.2 | wpb 1943.9 | bsz 93.9 | num_updates 34601 | best_loss 0.138
|
| 133 |
+
end of epoch 7 (average epoch stats below)
|
| 134 |
+
epoch 007 | loss 0.051 | ppl 1.04 | wps 6479.2 | ups 3.29 | wpb 1971.8 | bsz 95.4 | num_updates 34601 | lr 0.000170003 | gnorm 0.544 | train_wall 1478 | gb_free 13.2 | wall 10481
|
| 135 |
+
epoch 007 | loss 0.051 | ppl 1.04 | wps 6479.2 | ups 3.29 | wpb 1971.8 | bsz 95.4 | num_updates 34601 | lr 0.000170003 | gnorm 0.544 | train_wall 1478 | gb_free 13.2 | wall 10481
|
| 136 |
+
epoch 007 | loss 0.051 | ppl 1.04 | wps 6479.2 | ups 3.29 | wpb 1971.8 | bsz 95.4 | num_updates 34601 | lr 0.000170003 | gnorm 0.544 | train_wall 1478 | gb_free 13.2 | wall 10481
|
| 137 |
+
epoch 007 | loss 0.051 | ppl 1.04 | wps 6479.2 | ups 3.29 | wpb 1971.8 | bsz 95.4 | num_updates 34601 | lr 0.000170003 | gnorm 0.544 | train_wall 1478 | gb_free 13.2 | wall 10481
|
| 138 |
+
epoch 007 | loss 0.051 | ppl 1.04 | wps 6479.2 | ups 3.29 | wpb 1971.8 | bsz 95.4 | num_updates 34601 | lr 0.000170003 | gnorm 0.544 | train_wall 1478 | gb_free 13.2 | wall 10481
|
| 139 |
+
epoch 007 | loss 0.051 | ppl 1.04 | wps 6479.2 | ups 3.29 | wpb 1971.8 | bsz 95.4 | num_updates 34601 | lr 0.000170003 | gnorm 0.544 | train_wall 1478 | gb_free 13.2 | wall 10481
|
| 140 |
+
epoch 007 | loss 0.051 | ppl 1.04 | wps 6479.2 | ups 3.29 | wpb 1971.8 | bsz 95.4 | num_updates 34601 | lr 0.000170003 | gnorm 0.544 | train_wall 1478 | gb_free 13.2 | wall 10481
|
| 141 |
+
Start iterating over samples
|
| 142 |
+
begin validation on "valid" subset
|
| 143 |
+
epoch 008 | valid on 'valid' subset | loss 0.111 | ppl 1.08 | wps 21464.3 | wpb 1943.9 | bsz 93.9 | num_updates 39544 | best_loss 0.111
|
| 144 |
+
epoch 008 | valid on 'valid' subset | loss 0.111 | ppl 1.08 | wps 21464.3 | wpb 1943.9 | bsz 93.9 | num_updates 39544 | best_loss 0.111
|
| 145 |
+
epoch 008 | valid on 'valid' subset | loss 0.111 | ppl 1.08 | wps 21464.3 | wpb 1943.9 | bsz 93.9 | num_updates 39544 | best_loss 0.111
|
| 146 |
+
epoch 008 | valid on 'valid' subset | loss 0.111 | ppl 1.08 | wps 21464.3 | wpb 1943.9 | bsz 93.9 | num_updates 39544 | best_loss 0.111
|
| 147 |
+
epoch 008 | valid on 'valid' subset | loss 0.111 | ppl 1.08 | wps 21464.3 | wpb 1943.9 | bsz 93.9 | num_updates 39544 | best_loss 0.111
|
| 148 |
+
epoch 008 | valid on 'valid' subset | loss 0.111 | ppl 1.08 | wps 21464.3 | wpb 1943.9 | bsz 93.9 | num_updates 39544 | best_loss 0.111
|
| 149 |
+
epoch 008 | valid on 'valid' subset | loss 0.111 | ppl 1.08 | wps 21464.3 | wpb 1943.9 | bsz 93.9 | num_updates 39544 | best_loss 0.111
|
| 150 |
+
epoch 008 | valid on 'valid' subset | loss 0.111 | ppl 1.08 | wps 21464.3 | wpb 1943.9 | bsz 93.9 | num_updates 39544 | best_loss 0.111
|
| 151 |
+
end of epoch 8 (average epoch stats below)
|
| 152 |
+
epoch 008 | loss 0.034 | ppl 1.02 | wps 6517.4 | ups 3.31 | wpb 1971.8 | bsz 95.4 | num_updates 39544 | lr 0.000159023 | gnorm 0.47 | train_wall 1471 | gb_free 12 | wall 11977
|
| 153 |
+
epoch 008 | loss 0.034 | ppl 1.02 | wps 6517.4 | ups 3.31 | wpb 1971.8 | bsz 95.4 | num_updates 39544 | lr 0.000159023 | gnorm 0.47 | train_wall 1471 | gb_free 12 | wall 11977
|
| 154 |
+
epoch 008 | loss 0.034 | ppl 1.02 | wps 6517.4 | ups 3.31 | wpb 1971.8 | bsz 95.4 | num_updates 39544 | lr 0.000159023 | gnorm 0.47 | train_wall 1471 | gb_free 12 | wall 11977
|
| 155 |
+
epoch 008 | loss 0.034 | ppl 1.02 | wps 6517.4 | ups 3.31 | wpb 1971.8 | bsz 95.4 | num_updates 39544 | lr 0.000159023 | gnorm 0.47 | train_wall 1471 | gb_free 12 | wall 11977
|
| 156 |
+
epoch 008 | loss 0.034 | ppl 1.02 | wps 6517.4 | ups 3.31 | wpb 1971.8 | bsz 95.4 | num_updates 39544 | lr 0.000159023 | gnorm 0.47 | train_wall 1471 | gb_free 12 | wall 11977
|
| 157 |
+
epoch 008 | loss 0.034 | ppl 1.02 | wps 6517.4 | ups 3.31 | wpb 1971.8 | bsz 95.4 | num_updates 39544 | lr 0.000159023 | gnorm 0.47 | train_wall 1471 | gb_free 12 | wall 11977
|
| 158 |
+
epoch 008 | loss 0.034 | ppl 1.02 | wps 6517.4 | ups 3.31 | wpb 1971.8 | bsz 95.4 | num_updates 39544 | lr 0.000159023 | gnorm 0.47 | train_wall 1471 | gb_free 12 | wall 11977
|
| 159 |
+
epoch 008 | loss 0.034 | ppl 1.02 | wps 6517.4 | ups 3.31 | wpb 1971.8 | bsz 95.4 | num_updates 39544 | lr 0.000159023 | gnorm 0.47 | train_wall 1471 | gb_free 12 | wall 11977
|
| 160 |
+
Start iterating over samples
|
| 161 |
+
begin validation on "valid" subset
|
| 162 |
+
epoch 009 | valid on 'valid' subset | loss 0.11 | ppl 1.08 | wps 21374 | wpb 1943.9 | bsz 93.9 | num_updates 44487 | best_loss 0.11
|
| 163 |
+
epoch 009 | valid on 'valid' subset | loss 0.11 | ppl 1.08 | wps 21374 | wpb 1943.9 | bsz 93.9 | num_updates 44487 | best_loss 0.11
|
| 164 |
+
epoch 009 | valid on 'valid' subset | loss 0.11 | ppl 1.08 | wps 21374 | wpb 1943.9 | bsz 93.9 | num_updates 44487 | best_loss 0.11
|
| 165 |
+
epoch 009 | valid on 'valid' subset | loss 0.11 | ppl 1.08 | wps 21374 | wpb 1943.9 | bsz 93.9 | num_updates 44487 | best_loss 0.11
|
| 166 |
+
epoch 009 | valid on 'valid' subset | loss 0.11 | ppl 1.08 | wps 21374 | wpb 1943.9 | bsz 93.9 | num_updates 44487 | best_loss 0.11
|
| 167 |
+
epoch 009 | valid on 'valid' subset | loss 0.11 | ppl 1.08 | wps 21374 | wpb 1943.9 | bsz 93.9 | num_updates 44487 | best_loss 0.11
|
| 168 |
+
epoch 009 | valid on 'valid' subset | loss 0.11 | ppl 1.08 | wps 21374 | wpb 1943.9 | bsz 93.9 | num_updates 44487 | best_loss 0.11
|
| 169 |
+
epoch 009 | valid on 'valid' subset | loss 0.11 | ppl 1.08 | wps 21374 | wpb 1943.9 | bsz 93.9 | num_updates 44487 | best_loss 0.11
|
| 170 |
+
epoch 009 | valid on 'valid' subset | loss 0.11 | ppl 1.08 | wps 21374 | wpb 1943.9 | bsz 93.9 | num_updates 44487 | best_loss 0.11
|
| 171 |
+
end of epoch 9 (average epoch stats below)
|
| 172 |
+
epoch 009 | loss 0.026 | ppl 1.02 | wps 6516.5 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 44487 | lr 0.000149928 | gnorm 0.429 | train_wall 1471 | gb_free 12.6 | wall 13472
|
| 173 |
+
epoch 009 | loss 0.026 | ppl 1.02 | wps 6516.5 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 44487 | lr 0.000149928 | gnorm 0.429 | train_wall 1471 | gb_free 12.6 | wall 13472
|
| 174 |
+
epoch 009 | loss 0.026 | ppl 1.02 | wps 6516.5 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 44487 | lr 0.000149928 | gnorm 0.429 | train_wall 1471 | gb_free 12.6 | wall 13472
|
| 175 |
+
epoch 009 | loss 0.026 | ppl 1.02 | wps 6516.5 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 44487 | lr 0.000149928 | gnorm 0.429 | train_wall 1471 | gb_free 12.6 | wall 13472
|
| 176 |
+
epoch 009 | loss 0.026 | ppl 1.02 | wps 6516.5 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 44487 | lr 0.000149928 | gnorm 0.429 | train_wall 1471 | gb_free 12.6 | wall 13472
|
| 177 |
+
epoch 009 | loss 0.026 | ppl 1.02 | wps 6516.5 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 44487 | lr 0.000149928 | gnorm 0.429 | train_wall 1471 | gb_free 12.6 | wall 13472
|
| 178 |
+
epoch 009 | loss 0.026 | ppl 1.02 | wps 6516.5 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 44487 | lr 0.000149928 | gnorm 0.429 | train_wall 1471 | gb_free 12.6 | wall 13472
|
| 179 |
+
epoch 009 | loss 0.026 | ppl 1.02 | wps 6516.5 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 44487 | lr 0.000149928 | gnorm 0.429 | train_wall 1471 | gb_free 12.6 | wall 13472
|
| 180 |
+
epoch 009 | loss 0.026 | ppl 1.02 | wps 6516.5 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 44487 | lr 0.000149928 | gnorm 0.429 | train_wall 1471 | gb_free 12.6 | wall 13472
|
| 181 |
+
Start iterating over samples
|
| 182 |
+
begin validation on "valid" subset
|
| 183 |
+
epoch 010 | valid on 'valid' subset | loss 0.113 | ppl 1.08 | wps 21458.6 | wpb 1943.9 | bsz 93.9 | num_updates 49430 | best_loss 0.11
|
| 184 |
+
epoch 010 | valid on 'valid' subset | loss 0.113 | ppl 1.08 | wps 21458.6 | wpb 1943.9 | bsz 93.9 | num_updates 49430 | best_loss 0.11
|
| 185 |
+
epoch 010 | valid on 'valid' subset | loss 0.113 | ppl 1.08 | wps 21458.6 | wpb 1943.9 | bsz 93.9 | num_updates 49430 | best_loss 0.11
|
| 186 |
+
epoch 010 | valid on 'valid' subset | loss 0.113 | ppl 1.08 | wps 21458.6 | wpb 1943.9 | bsz 93.9 | num_updates 49430 | best_loss 0.11
|
| 187 |
+
epoch 010 | valid on 'valid' subset | loss 0.113 | ppl 1.08 | wps 21458.6 | wpb 1943.9 | bsz 93.9 | num_updates 49430 | best_loss 0.11
|
| 188 |
+
epoch 010 | valid on 'valid' subset | loss 0.113 | ppl 1.08 | wps 21458.6 | wpb 1943.9 | bsz 93.9 | num_updates 49430 | best_loss 0.11
|
| 189 |
+
epoch 010 | valid on 'valid' subset | loss 0.113 | ppl 1.08 | wps 21458.6 | wpb 1943.9 | bsz 93.9 | num_updates 49430 | best_loss 0.11
|
| 190 |
+
epoch 010 | valid on 'valid' subset | loss 0.113 | ppl 1.08 | wps 21458.6 | wpb 1943.9 | bsz 93.9 | num_updates 49430 | best_loss 0.11
|
| 191 |
+
epoch 010 | valid on 'valid' subset | loss 0.113 | ppl 1.08 | wps 21458.6 | wpb 1943.9 | bsz 93.9 | num_updates 49430 | best_loss 0.11
|
| 192 |
+
epoch 010 | valid on 'valid' subset | loss 0.113 | ppl 1.08 | wps 21458.6 | wpb 1943.9 | bsz 93.9 | num_updates 49430 | best_loss 0.11
|
| 193 |
+
end of epoch 10 (average epoch stats below)
|
| 194 |
+
epoch 010 | loss 0.02 | ppl 1.01 | wps 6514.4 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 49430 | lr 0.000142234 | gnorm 0.394 | train_wall 1472 | gb_free 12.7 | wall 14968
|
| 195 |
+
epoch 010 | loss 0.02 | ppl 1.01 | wps 6514.4 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 49430 | lr 0.000142234 | gnorm 0.394 | train_wall 1472 | gb_free 12.7 | wall 14968
|
| 196 |
+
epoch 010 | loss 0.02 | ppl 1.01 | wps 6514.4 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 49430 | lr 0.000142234 | gnorm 0.394 | train_wall 1472 | gb_free 12.7 | wall 14968
|
| 197 |
+
epoch 010 | loss 0.02 | ppl 1.01 | wps 6514.4 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 49430 | lr 0.000142234 | gnorm 0.394 | train_wall 1472 | gb_free 12.7 | wall 14968
|
| 198 |
+
epoch 010 | loss 0.02 | ppl 1.01 | wps 6514.4 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 49430 | lr 0.000142234 | gnorm 0.394 | train_wall 1472 | gb_free 12.7 | wall 14968
|
| 199 |
+
epoch 010 | loss 0.02 | ppl 1.01 | wps 6514.4 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 49430 | lr 0.000142234 | gnorm 0.394 | train_wall 1472 | gb_free 12.7 | wall 14968
|
| 200 |
+
epoch 010 | loss 0.02 | ppl 1.01 | wps 6514.4 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 49430 | lr 0.000142234 | gnorm 0.394 | train_wall 1472 | gb_free 12.7 | wall 14968
|
| 201 |
+
epoch 010 | loss 0.02 | ppl 1.01 | wps 6514.4 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 49430 | lr 0.000142234 | gnorm 0.394 | train_wall 1472 | gb_free 12.7 | wall 14968
|
| 202 |
+
epoch 010 | loss 0.02 | ppl 1.01 | wps 6514.4 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 49430 | lr 0.000142234 | gnorm 0.394 | train_wall 1472 | gb_free 12.7 | wall 14968
|
| 203 |
+
epoch 010 | loss 0.02 | ppl 1.01 | wps 6514.4 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 49430 | lr 0.000142234 | gnorm 0.394 | train_wall 1472 | gb_free 12.7 | wall 14968
|
| 204 |
+
Start iterating over samples
|
| 205 |
+
begin validation on "valid" subset
|
| 206 |
+
epoch 011 | valid on 'valid' subset | loss 0.123 | ppl 1.09 | wps 21419.1 | wpb 1943.9 | bsz 93.9 | num_updates 54373 | best_loss 0.11
|
| 207 |
+
epoch 011 | valid on 'valid' subset | loss 0.123 | ppl 1.09 | wps 21419.1 | wpb 1943.9 | bsz 93.9 | num_updates 54373 | best_loss 0.11
|
| 208 |
+
epoch 011 | valid on 'valid' subset | loss 0.123 | ppl 1.09 | wps 21419.1 | wpb 1943.9 | bsz 93.9 | num_updates 54373 | best_loss 0.11
|
| 209 |
+
epoch 011 | valid on 'valid' subset | loss 0.123 | ppl 1.09 | wps 21419.1 | wpb 1943.9 | bsz 93.9 | num_updates 54373 | best_loss 0.11
|
| 210 |
+
epoch 011 | valid on 'valid' subset | loss 0.123 | ppl 1.09 | wps 21419.1 | wpb 1943.9 | bsz 93.9 | num_updates 54373 | best_loss 0.11
|
| 211 |
+
epoch 011 | valid on 'valid' subset | loss 0.123 | ppl 1.09 | wps 21419.1 | wpb 1943.9 | bsz 93.9 | num_updates 54373 | best_loss 0.11
|
| 212 |
+
epoch 011 | valid on 'valid' subset | loss 0.123 | ppl 1.09 | wps 21419.1 | wpb 1943.9 | bsz 93.9 | num_updates 54373 | best_loss 0.11
|
| 213 |
+
epoch 011 | valid on 'valid' subset | loss 0.123 | ppl 1.09 | wps 21419.1 | wpb 1943.9 | bsz 93.9 | num_updates 54373 | best_loss 0.11
|
| 214 |
+
epoch 011 | valid on 'valid' subset | loss 0.123 | ppl 1.09 | wps 21419.1 | wpb 1943.9 | bsz 93.9 | num_updates 54373 | best_loss 0.11
|
| 215 |
+
epoch 011 | valid on 'valid' subset | loss 0.123 | ppl 1.09 | wps 21419.1 | wpb 1943.9 | bsz 93.9 | num_updates 54373 | best_loss 0.11
|
| 216 |
+
epoch 011 | valid on 'valid' subset | loss 0.123 | ppl 1.09 | wps 21419.1 | wpb 1943.9 | bsz 93.9 | num_updates 54373 | best_loss 0.11
|
| 217 |
+
end of epoch 11 (average epoch stats below)
|
| 218 |
+
epoch 011 | loss 0.017 | ppl 1.01 | wps 6514.5 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 54373 | lr 0.000135615 | gnorm 0.361 | train_wall 1472 | gb_free 12.8 | wall 16464
|
| 219 |
+
epoch 011 | loss 0.017 | ppl 1.01 | wps 6514.5 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 54373 | lr 0.000135615 | gnorm 0.361 | train_wall 1472 | gb_free 12.8 | wall 16464
|
| 220 |
+
epoch 011 | loss 0.017 | ppl 1.01 | wps 6514.5 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 54373 | lr 0.000135615 | gnorm 0.361 | train_wall 1472 | gb_free 12.8 | wall 16464
|
| 221 |
+
epoch 011 | loss 0.017 | ppl 1.01 | wps 6514.5 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 54373 | lr 0.000135615 | gnorm 0.361 | train_wall 1472 | gb_free 12.8 | wall 16464
|
| 222 |
+
epoch 011 | loss 0.017 | ppl 1.01 | wps 6514.5 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 54373 | lr 0.000135615 | gnorm 0.361 | train_wall 1472 | gb_free 12.8 | wall 16464
|
| 223 |
+
epoch 011 | loss 0.017 | ppl 1.01 | wps 6514.5 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 54373 | lr 0.000135615 | gnorm 0.361 | train_wall 1472 | gb_free 12.8 | wall 16464
|
| 224 |
+
epoch 011 | loss 0.017 | ppl 1.01 | wps 6514.5 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 54373 | lr 0.000135615 | gnorm 0.361 | train_wall 1472 | gb_free 12.8 | wall 16464
|
| 225 |
+
epoch 011 | loss 0.017 | ppl 1.01 | wps 6514.5 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 54373 | lr 0.000135615 | gnorm 0.361 | train_wall 1472 | gb_free 12.8 | wall 16464
|
| 226 |
+
epoch 011 | loss 0.017 | ppl 1.01 | wps 6514.5 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 54373 | lr 0.000135615 | gnorm 0.361 | train_wall 1472 | gb_free 12.8 | wall 16464
|
| 227 |
+
epoch 011 | loss 0.017 | ppl 1.01 | wps 6514.5 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 54373 | lr 0.000135615 | gnorm 0.361 | train_wall 1472 | gb_free 12.8 | wall 16464
|
| 228 |
+
epoch 011 | loss 0.017 | ppl 1.01 | wps 6514.5 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 54373 | lr 0.000135615 | gnorm 0.361 | train_wall 1472 | gb_free 12.8 | wall 16464
|
| 229 |
+
Start iterating over samples
|
| 230 |
+
begin validation on "valid" subset
|
| 231 |
+
epoch 012 | valid on 'valid' subset | loss 0.108 | ppl 1.08 | wps 21329.7 | wpb 1943.9 | bsz 93.9 | num_updates 59316 | best_loss 0.108
|
| 232 |
+
epoch 012 | valid on 'valid' subset | loss 0.108 | ppl 1.08 | wps 21329.7 | wpb 1943.9 | bsz 93.9 | num_updates 59316 | best_loss 0.108
|
| 233 |
+
epoch 012 | valid on 'valid' subset | loss 0.108 | ppl 1.08 | wps 21329.7 | wpb 1943.9 | bsz 93.9 | num_updates 59316 | best_loss 0.108
|
| 234 |
+
epoch 012 | valid on 'valid' subset | loss 0.108 | ppl 1.08 | wps 21329.7 | wpb 1943.9 | bsz 93.9 | num_updates 59316 | best_loss 0.108
|
| 235 |
+
epoch 012 | valid on 'valid' subset | loss 0.108 | ppl 1.08 | wps 21329.7 | wpb 1943.9 | bsz 93.9 | num_updates 59316 | best_loss 0.108
|
| 236 |
+
epoch 012 | valid on 'valid' subset | loss 0.108 | ppl 1.08 | wps 21329.7 | wpb 1943.9 | bsz 93.9 | num_updates 59316 | best_loss 0.108
|
| 237 |
+
epoch 012 | valid on 'valid' subset | loss 0.108 | ppl 1.08 | wps 21329.7 | wpb 1943.9 | bsz 93.9 | num_updates 59316 | best_loss 0.108
|
| 238 |
+
epoch 012 | valid on 'valid' subset | loss 0.108 | ppl 1.08 | wps 21329.7 | wpb 1943.9 | bsz 93.9 | num_updates 59316 | best_loss 0.108
|
| 239 |
+
epoch 012 | valid on 'valid' subset | loss 0.108 | ppl 1.08 | wps 21329.7 | wpb 1943.9 | bsz 93.9 | num_updates 59316 | best_loss 0.108
|
| 240 |
+
epoch 012 | valid on 'valid' subset | loss 0.108 | ppl 1.08 | wps 21329.7 | wpb 1943.9 | bsz 93.9 | num_updates 59316 | best_loss 0.108
|
| 241 |
+
epoch 012 | valid on 'valid' subset | loss 0.108 | ppl 1.08 | wps 21329.7 | wpb 1943.9 | bsz 93.9 | num_updates 59316 | best_loss 0.108
|
| 242 |
+
epoch 012 | valid on 'valid' subset | loss 0.108 | ppl 1.08 | wps 21329.7 | wpb 1943.9 | bsz 93.9 | num_updates 59316 | best_loss 0.108
|
| 243 |
+
end of epoch 12 (average epoch stats below)
|
| 244 |
+
epoch 012 | loss 0.014 | ppl 1.01 | wps 6511.6 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 59316 | lr 0.000129842 | gnorm 0.334 | train_wall 1472 | gb_free 12.8 | wall 17961
|
| 245 |
+
epoch 012 | loss 0.014 | ppl 1.01 | wps 6511.6 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 59316 | lr 0.000129842 | gnorm 0.334 | train_wall 1472 | gb_free 12.8 | wall 17961
|
| 246 |
+
epoch 012 | loss 0.014 | ppl 1.01 | wps 6511.6 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 59316 | lr 0.000129842 | gnorm 0.334 | train_wall 1472 | gb_free 12.8 | wall 17961
|
| 247 |
+
epoch 012 | loss 0.014 | ppl 1.01 | wps 6511.6 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 59316 | lr 0.000129842 | gnorm 0.334 | train_wall 1472 | gb_free 12.8 | wall 17961
|
| 248 |
+
epoch 012 | loss 0.014 | ppl 1.01 | wps 6511.6 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 59316 | lr 0.000129842 | gnorm 0.334 | train_wall 1472 | gb_free 12.8 | wall 17961
|
| 249 |
+
epoch 012 | loss 0.014 | ppl 1.01 | wps 6511.6 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 59316 | lr 0.000129842 | gnorm 0.334 | train_wall 1472 | gb_free 12.8 | wall 17961
|
| 250 |
+
epoch 012 | loss 0.014 | ppl 1.01 | wps 6511.6 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 59316 | lr 0.000129842 | gnorm 0.334 | train_wall 1472 | gb_free 12.8 | wall 17961
|
| 251 |
+
epoch 012 | loss 0.014 | ppl 1.01 | wps 6511.6 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 59316 | lr 0.000129842 | gnorm 0.334 | train_wall 1472 | gb_free 12.8 | wall 17961
|
| 252 |
+
epoch 012 | loss 0.014 | ppl 1.01 | wps 6511.6 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 59316 | lr 0.000129842 | gnorm 0.334 | train_wall 1472 | gb_free 12.8 | wall 17961
|
| 253 |
+
epoch 012 | loss 0.014 | ppl 1.01 | wps 6511.6 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 59316 | lr 0.000129842 | gnorm 0.334 | train_wall 1472 | gb_free 12.8 | wall 17961
|
| 254 |
+
epoch 012 | loss 0.014 | ppl 1.01 | wps 6511.6 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 59316 | lr 0.000129842 | gnorm 0.334 | train_wall 1472 | gb_free 12.8 | wall 17961
|
| 255 |
+
epoch 012 | loss 0.014 | ppl 1.01 | wps 6511.6 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 59316 | lr 0.000129842 | gnorm 0.334 | train_wall 1472 | gb_free 12.8 | wall 17961
|
| 256 |
+
Start iterating over samples
|
| 257 |
+
begin validation on "valid" subset
|
| 258 |
+
epoch 013 | valid on 'valid' subset | loss 0.108 | ppl 1.08 | wps 21359.7 | wpb 1943.9 | bsz 93.9 | num_updates 64259 | best_loss 0.108
|
| 259 |
+
epoch 013 | valid on 'valid' subset | loss 0.108 | ppl 1.08 | wps 21359.7 | wpb 1943.9 | bsz 93.9 | num_updates 64259 | best_loss 0.108
|
| 260 |
+
epoch 013 | valid on 'valid' subset | loss 0.108 | ppl 1.08 | wps 21359.7 | wpb 1943.9 | bsz 93.9 | num_updates 64259 | best_loss 0.108
|
| 261 |
+
epoch 013 | valid on 'valid' subset | loss 0.108 | ppl 1.08 | wps 21359.7 | wpb 1943.9 | bsz 93.9 | num_updates 64259 | best_loss 0.108
|
| 262 |
+
epoch 013 | valid on 'valid' subset | loss 0.108 | ppl 1.08 | wps 21359.7 | wpb 1943.9 | bsz 93.9 | num_updates 64259 | best_loss 0.108
|
| 263 |
+
epoch 013 | valid on 'valid' subset | loss 0.108 | ppl 1.08 | wps 21359.7 | wpb 1943.9 | bsz 93.9 | num_updates 64259 | best_loss 0.108
|
| 264 |
+
epoch 013 | valid on 'valid' subset | loss 0.108 | ppl 1.08 | wps 21359.7 | wpb 1943.9 | bsz 93.9 | num_updates 64259 | best_loss 0.108
|
| 265 |
+
epoch 013 | valid on 'valid' subset | loss 0.108 | ppl 1.08 | wps 21359.7 | wpb 1943.9 | bsz 93.9 | num_updates 64259 | best_loss 0.108
|
| 266 |
+
epoch 013 | valid on 'valid' subset | loss 0.108 | ppl 1.08 | wps 21359.7 | wpb 1943.9 | bsz 93.9 | num_updates 64259 | best_loss 0.108
|
| 267 |
+
epoch 013 | valid on 'valid' subset | loss 0.108 | ppl 1.08 | wps 21359.7 | wpb 1943.9 | bsz 93.9 | num_updates 64259 | best_loss 0.108
|
| 268 |
+
epoch 013 | valid on 'valid' subset | loss 0.108 | ppl 1.08 | wps 21359.7 | wpb 1943.9 | bsz 93.9 | num_updates 64259 | best_loss 0.108
|
| 269 |
+
epoch 013 | valid on 'valid' subset | loss 0.108 | ppl 1.08 | wps 21359.7 | wpb 1943.9 | bsz 93.9 | num_updates 64259 | best_loss 0.108
|
| 270 |
+
epoch 013 | valid on 'valid' subset | loss 0.108 | ppl 1.08 | wps 21359.7 | wpb 1943.9 | bsz 93.9 | num_updates 64259 | best_loss 0.108
|
| 271 |
+
end of epoch 13 (average epoch stats below)
|
| 272 |
+
epoch 013 | loss 0.012 | ppl 1.01 | wps 6506.3 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 64259 | lr 0.000124748 | gnorm 0.323 | train_wall 1474 | gb_free 13 | wall 19459
|
| 273 |
+
epoch 013 | loss 0.012 | ppl 1.01 | wps 6506.3 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 64259 | lr 0.000124748 | gnorm 0.323 | train_wall 1474 | gb_free 13 | wall 19459
|
| 274 |
+
epoch 013 | loss 0.012 | ppl 1.01 | wps 6506.3 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 64259 | lr 0.000124748 | gnorm 0.323 | train_wall 1474 | gb_free 13 | wall 19459
|
| 275 |
+
epoch 013 | loss 0.012 | ppl 1.01 | wps 6506.3 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 64259 | lr 0.000124748 | gnorm 0.323 | train_wall 1474 | gb_free 13 | wall 19459
|
| 276 |
+
epoch 013 | loss 0.012 | ppl 1.01 | wps 6506.3 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 64259 | lr 0.000124748 | gnorm 0.323 | train_wall 1474 | gb_free 13 | wall 19459
|
| 277 |
+
epoch 013 | loss 0.012 | ppl 1.01 | wps 6506.3 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 64259 | lr 0.000124748 | gnorm 0.323 | train_wall 1474 | gb_free 13 | wall 19459
|
| 278 |
+
epoch 013 | loss 0.012 | ppl 1.01 | wps 6506.3 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 64259 | lr 0.000124748 | gnorm 0.323 | train_wall 1474 | gb_free 13 | wall 19459
|
| 279 |
+
epoch 013 | loss 0.012 | ppl 1.01 | wps 6506.3 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 64259 | lr 0.000124748 | gnorm 0.323 | train_wall 1474 | gb_free 13 | wall 19459
|
| 280 |
+
epoch 013 | loss 0.012 | ppl 1.01 | wps 6506.3 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 64259 | lr 0.000124748 | gnorm 0.323 | train_wall 1474 | gb_free 13 | wall 19459
|
| 281 |
+
epoch 013 | loss 0.012 | ppl 1.01 | wps 6506.3 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 64259 | lr 0.000124748 | gnorm 0.323 | train_wall 1474 | gb_free 13 | wall 19459
|
| 282 |
+
epoch 013 | loss 0.012 | ppl 1.01 | wps 6506.3 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 64259 | lr 0.000124748 | gnorm 0.323 | train_wall 1474 | gb_free 13 | wall 19459
|
| 283 |
+
epoch 013 | loss 0.012 | ppl 1.01 | wps 6506.3 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 64259 | lr 0.000124748 | gnorm 0.323 | train_wall 1474 | gb_free 13 | wall 19459
|
| 284 |
+
epoch 013 | loss 0.012 | ppl 1.01 | wps 6506.3 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 64259 | lr 0.000124748 | gnorm 0.323 | train_wall 1474 | gb_free 13 | wall 19459
|
| 285 |
+
Start iterating over samples
|
| 286 |
+
begin validation on "valid" subset
|
| 287 |
+
epoch 014 | valid on 'valid' subset | loss 0.113 | ppl 1.08 | wps 21403.9 | wpb 1943.9 | bsz 93.9 | num_updates 69202 | best_loss 0.108
|
| 288 |
+
epoch 014 | valid on 'valid' subset | loss 0.113 | ppl 1.08 | wps 21403.9 | wpb 1943.9 | bsz 93.9 | num_updates 69202 | best_loss 0.108
|
| 289 |
+
epoch 014 | valid on 'valid' subset | loss 0.113 | ppl 1.08 | wps 21403.9 | wpb 1943.9 | bsz 93.9 | num_updates 69202 | best_loss 0.108
|
| 290 |
+
epoch 014 | valid on 'valid' subset | loss 0.113 | ppl 1.08 | wps 21403.9 | wpb 1943.9 | bsz 93.9 | num_updates 69202 | best_loss 0.108
|
| 291 |
+
epoch 014 | valid on 'valid' subset | loss 0.113 | ppl 1.08 | wps 21403.9 | wpb 1943.9 | bsz 93.9 | num_updates 69202 | best_loss 0.108
|
| 292 |
+
epoch 014 | valid on 'valid' subset | loss 0.113 | ppl 1.08 | wps 21403.9 | wpb 1943.9 | bsz 93.9 | num_updates 69202 | best_loss 0.108
|
| 293 |
+
epoch 014 | valid on 'valid' subset | loss 0.113 | ppl 1.08 | wps 21403.9 | wpb 1943.9 | bsz 93.9 | num_updates 69202 | best_loss 0.108
|
| 294 |
+
epoch 014 | valid on 'valid' subset | loss 0.113 | ppl 1.08 | wps 21403.9 | wpb 1943.9 | bsz 93.9 | num_updates 69202 | best_loss 0.108
|
| 295 |
+
epoch 014 | valid on 'valid' subset | loss 0.113 | ppl 1.08 | wps 21403.9 | wpb 1943.9 | bsz 93.9 | num_updates 69202 | best_loss 0.108
|
| 296 |
+
epoch 014 | valid on 'valid' subset | loss 0.113 | ppl 1.08 | wps 21403.9 | wpb 1943.9 | bsz 93.9 | num_updates 69202 | best_loss 0.108
|
| 297 |
+
epoch 014 | valid on 'valid' subset | loss 0.113 | ppl 1.08 | wps 21403.9 | wpb 1943.9 | bsz 93.9 | num_updates 69202 | best_loss 0.108
|
| 298 |
+
epoch 014 | valid on 'valid' subset | loss 0.113 | ppl 1.08 | wps 21403.9 | wpb 1943.9 | bsz 93.9 | num_updates 69202 | best_loss 0.108
|
| 299 |
+
epoch 014 | valid on 'valid' subset | loss 0.113 | ppl 1.08 | wps 21403.9 | wpb 1943.9 | bsz 93.9 | num_updates 69202 | best_loss 0.108
|
| 300 |
+
epoch 014 | valid on 'valid' subset | loss 0.113 | ppl 1.08 | wps 21403.9 | wpb 1943.9 | bsz 93.9 | num_updates 69202 | best_loss 0.108
|
| 301 |
+
end of epoch 14 (average epoch stats below)
|
| 302 |
+
epoch 014 | loss 0.01 | ppl 1.01 | wps 6512.7 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 69202 | lr 0.00012021 | gnorm 0.295 | train_wall 1473 | gb_free 12.8 | wall 20956
|
| 303 |
+
epoch 014 | loss 0.01 | ppl 1.01 | wps 6512.7 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 69202 | lr 0.00012021 | gnorm 0.295 | train_wall 1473 | gb_free 12.8 | wall 20956
|
| 304 |
+
epoch 014 | loss 0.01 | ppl 1.01 | wps 6512.7 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 69202 | lr 0.00012021 | gnorm 0.295 | train_wall 1473 | gb_free 12.8 | wall 20956
|
| 305 |
+
epoch 014 | loss 0.01 | ppl 1.01 | wps 6512.7 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 69202 | lr 0.00012021 | gnorm 0.295 | train_wall 1473 | gb_free 12.8 | wall 20956
|
| 306 |
+
epoch 014 | loss 0.01 | ppl 1.01 | wps 6512.7 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 69202 | lr 0.00012021 | gnorm 0.295 | train_wall 1473 | gb_free 12.8 | wall 20956
|
| 307 |
+
epoch 014 | loss 0.01 | ppl 1.01 | wps 6512.7 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 69202 | lr 0.00012021 | gnorm 0.295 | train_wall 1473 | gb_free 12.8 | wall 20956
|
| 308 |
+
epoch 014 | loss 0.01 | ppl 1.01 | wps 6512.7 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 69202 | lr 0.00012021 | gnorm 0.295 | train_wall 1473 | gb_free 12.8 | wall 20956
|
| 309 |
+
epoch 014 | loss 0.01 | ppl 1.01 | wps 6512.7 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 69202 | lr 0.00012021 | gnorm 0.295 | train_wall 1473 | gb_free 12.8 | wall 20956
|
| 310 |
+
epoch 014 | loss 0.01 | ppl 1.01 | wps 6512.7 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 69202 | lr 0.00012021 | gnorm 0.295 | train_wall 1473 | gb_free 12.8 | wall 20956
|
| 311 |
+
epoch 014 | loss 0.01 | ppl 1.01 | wps 6512.7 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 69202 | lr 0.00012021 | gnorm 0.295 | train_wall 1473 | gb_free 12.8 | wall 20956
|
| 312 |
+
epoch 014 | loss 0.01 | ppl 1.01 | wps 6512.7 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 69202 | lr 0.00012021 | gnorm 0.295 | train_wall 1473 | gb_free 12.8 | wall 20956
|
| 313 |
+
epoch 014 | loss 0.01 | ppl 1.01 | wps 6512.7 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 69202 | lr 0.00012021 | gnorm 0.295 | train_wall 1473 | gb_free 12.8 | wall 20956
|
| 314 |
+
epoch 014 | loss 0.01 | ppl 1.01 | wps 6512.7 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 69202 | lr 0.00012021 | gnorm 0.295 | train_wall 1473 | gb_free 12.8 | wall 20956
|
| 315 |
+
epoch 014 | loss 0.01 | ppl 1.01 | wps 6512.7 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 69202 | lr 0.00012021 | gnorm 0.295 | train_wall 1473 | gb_free 12.8 | wall 20956
|
| 316 |
+
Start iterating over samples
|
| 317 |
+
begin validation on "valid" subset
|
| 318 |
+
epoch 015 | valid on 'valid' subset | loss 0.106 | ppl 1.08 | wps 21394.9 | wpb 1943.9 | bsz 93.9 | num_updates 74145 | best_loss 0.106
|
| 319 |
+
epoch 015 | valid on 'valid' subset | loss 0.106 | ppl 1.08 | wps 21394.9 | wpb 1943.9 | bsz 93.9 | num_updates 74145 | best_loss 0.106
|
| 320 |
+
epoch 015 | valid on 'valid' subset | loss 0.106 | ppl 1.08 | wps 21394.9 | wpb 1943.9 | bsz 93.9 | num_updates 74145 | best_loss 0.106
|
| 321 |
+
epoch 015 | valid on 'valid' subset | loss 0.106 | ppl 1.08 | wps 21394.9 | wpb 1943.9 | bsz 93.9 | num_updates 74145 | best_loss 0.106
|
| 322 |
+
epoch 015 | valid on 'valid' subset | loss 0.106 | ppl 1.08 | wps 21394.9 | wpb 1943.9 | bsz 93.9 | num_updates 74145 | best_loss 0.106
|
| 323 |
+
epoch 015 | valid on 'valid' subset | loss 0.106 | ppl 1.08 | wps 21394.9 | wpb 1943.9 | bsz 93.9 | num_updates 74145 | best_loss 0.106
|
| 324 |
+
epoch 015 | valid on 'valid' subset | loss 0.106 | ppl 1.08 | wps 21394.9 | wpb 1943.9 | bsz 93.9 | num_updates 74145 | best_loss 0.106
|
| 325 |
+
epoch 015 | valid on 'valid' subset | loss 0.106 | ppl 1.08 | wps 21394.9 | wpb 1943.9 | bsz 93.9 | num_updates 74145 | best_loss 0.106
|
| 326 |
+
epoch 015 | valid on 'valid' subset | loss 0.106 | ppl 1.08 | wps 21394.9 | wpb 1943.9 | bsz 93.9 | num_updates 74145 | best_loss 0.106
|
| 327 |
+
epoch 015 | valid on 'valid' subset | loss 0.106 | ppl 1.08 | wps 21394.9 | wpb 1943.9 | bsz 93.9 | num_updates 74145 | best_loss 0.106
|
| 328 |
+
epoch 015 | valid on 'valid' subset | loss 0.106 | ppl 1.08 | wps 21394.9 | wpb 1943.9 | bsz 93.9 | num_updates 74145 | best_loss 0.106
|
| 329 |
+
epoch 015 | valid on 'valid' subset | loss 0.106 | ppl 1.08 | wps 21394.9 | wpb 1943.9 | bsz 93.9 | num_updates 74145 | best_loss 0.106
|
| 330 |
+
epoch 015 | valid on 'valid' subset | loss 0.106 | ppl 1.08 | wps 21394.9 | wpb 1943.9 | bsz 93.9 | num_updates 74145 | best_loss 0.106
|
| 331 |
+
epoch 015 | valid on 'valid' subset | loss 0.106 | ppl 1.08 | wps 21394.9 | wpb 1943.9 | bsz 93.9 | num_updates 74145 | best_loss 0.106
|
| 332 |
+
epoch 015 | valid on 'valid' subset | loss 0.106 | ppl 1.08 | wps 21394.9 | wpb 1943.9 | bsz 93.9 | num_updates 74145 | best_loss 0.106
|
| 333 |
+
end of epoch 15 (average epoch stats below)
|
| 334 |
+
epoch 015 | loss 0.009 | ppl 1.01 | wps 6509.6 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 74145 | lr 0.000116134 | gnorm 0.276 | train_wall 1473 | gb_free 12.8 | wall 22453
|
| 335 |
+
epoch 015 | loss 0.009 | ppl 1.01 | wps 6509.6 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 74145 | lr 0.000116134 | gnorm 0.276 | train_wall 1473 | gb_free 12.8 | wall 22453
|
| 336 |
+
epoch 015 | loss 0.009 | ppl 1.01 | wps 6509.6 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 74145 | lr 0.000116134 | gnorm 0.276 | train_wall 1473 | gb_free 12.8 | wall 22453
|
| 337 |
+
epoch 015 | loss 0.009 | ppl 1.01 | wps 6509.6 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 74145 | lr 0.000116134 | gnorm 0.276 | train_wall 1473 | gb_free 12.8 | wall 22453
|
| 338 |
+
epoch 015 | loss 0.009 | ppl 1.01 | wps 6509.6 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 74145 | lr 0.000116134 | gnorm 0.276 | train_wall 1473 | gb_free 12.8 | wall 22453
|
| 339 |
+
epoch 015 | loss 0.009 | ppl 1.01 | wps 6509.6 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 74145 | lr 0.000116134 | gnorm 0.276 | train_wall 1473 | gb_free 12.8 | wall 22453
|
| 340 |
+
epoch 015 | loss 0.009 | ppl 1.01 | wps 6509.6 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 74145 | lr 0.000116134 | gnorm 0.276 | train_wall 1473 | gb_free 12.8 | wall 22453
|
| 341 |
+
epoch 015 | loss 0.009 | ppl 1.01 | wps 6509.6 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 74145 | lr 0.000116134 | gnorm 0.276 | train_wall 1473 | gb_free 12.8 | wall 22453
|
| 342 |
+
epoch 015 | loss 0.009 | ppl 1.01 | wps 6509.6 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 74145 | lr 0.000116134 | gnorm 0.276 | train_wall 1473 | gb_free 12.8 | wall 22453
|
| 343 |
+
epoch 015 | loss 0.009 | ppl 1.01 | wps 6509.6 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 74145 | lr 0.000116134 | gnorm 0.276 | train_wall 1473 | gb_free 12.8 | wall 22453
|
| 344 |
+
epoch 015 | loss 0.009 | ppl 1.01 | wps 6509.6 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 74145 | lr 0.000116134 | gnorm 0.276 | train_wall 1473 | gb_free 12.8 | wall 22453
|
| 345 |
+
epoch 015 | loss 0.009 | ppl 1.01 | wps 6509.6 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 74145 | lr 0.000116134 | gnorm 0.276 | train_wall 1473 | gb_free 12.8 | wall 22453
|
| 346 |
+
epoch 015 | loss 0.009 | ppl 1.01 | wps 6509.6 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 74145 | lr 0.000116134 | gnorm 0.276 | train_wall 1473 | gb_free 12.8 | wall 22453
|
| 347 |
+
epoch 015 | loss 0.009 | ppl 1.01 | wps 6509.6 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 74145 | lr 0.000116134 | gnorm 0.276 | train_wall 1473 | gb_free 12.8 | wall 22453
|
| 348 |
+
epoch 015 | loss 0.009 | ppl 1.01 | wps 6509.6 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 74145 | lr 0.000116134 | gnorm 0.276 | train_wall 1473 | gb_free 12.8 | wall 22453
|
| 349 |
+
Start iterating over samples
|
| 350 |
+
begin validation on "valid" subset
|
| 351 |
+
epoch 016 | valid on 'valid' subset | loss 0.102 | ppl 1.07 | wps 21274.8 | wpb 1943.9 | bsz 93.9 | num_updates 79088 | best_loss 0.102
|
| 352 |
+
epoch 016 | valid on 'valid' subset | loss 0.102 | ppl 1.07 | wps 21274.8 | wpb 1943.9 | bsz 93.9 | num_updates 79088 | best_loss 0.102
|
| 353 |
+
epoch 016 | valid on 'valid' subset | loss 0.102 | ppl 1.07 | wps 21274.8 | wpb 1943.9 | bsz 93.9 | num_updates 79088 | best_loss 0.102
|
| 354 |
+
epoch 016 | valid on 'valid' subset | loss 0.102 | ppl 1.07 | wps 21274.8 | wpb 1943.9 | bsz 93.9 | num_updates 79088 | best_loss 0.102
|
| 355 |
+
epoch 016 | valid on 'valid' subset | loss 0.102 | ppl 1.07 | wps 21274.8 | wpb 1943.9 | bsz 93.9 | num_updates 79088 | best_loss 0.102
|
| 356 |
+
epoch 016 | valid on 'valid' subset | loss 0.102 | ppl 1.07 | wps 21274.8 | wpb 1943.9 | bsz 93.9 | num_updates 79088 | best_loss 0.102
|
| 357 |
+
epoch 016 | valid on 'valid' subset | loss 0.102 | ppl 1.07 | wps 21274.8 | wpb 1943.9 | bsz 93.9 | num_updates 79088 | best_loss 0.102
|
| 358 |
+
epoch 016 | valid on 'valid' subset | loss 0.102 | ppl 1.07 | wps 21274.8 | wpb 1943.9 | bsz 93.9 | num_updates 79088 | best_loss 0.102
|
| 359 |
+
epoch 016 | valid on 'valid' subset | loss 0.102 | ppl 1.07 | wps 21274.8 | wpb 1943.9 | bsz 93.9 | num_updates 79088 | best_loss 0.102
|
| 360 |
+
epoch 016 | valid on 'valid' subset | loss 0.102 | ppl 1.07 | wps 21274.8 | wpb 1943.9 | bsz 93.9 | num_updates 79088 | best_loss 0.102
|
| 361 |
+
epoch 016 | valid on 'valid' subset | loss 0.102 | ppl 1.07 | wps 21274.8 | wpb 1943.9 | bsz 93.9 | num_updates 79088 | best_loss 0.102
|
| 362 |
+
epoch 016 | valid on 'valid' subset | loss 0.102 | ppl 1.07 | wps 21274.8 | wpb 1943.9 | bsz 93.9 | num_updates 79088 | best_loss 0.102
|
| 363 |
+
epoch 016 | valid on 'valid' subset | loss 0.102 | ppl 1.07 | wps 21274.8 | wpb 1943.9 | bsz 93.9 | num_updates 79088 | best_loss 0.102
|
| 364 |
+
epoch 016 | valid on 'valid' subset | loss 0.102 | ppl 1.07 | wps 21274.8 | wpb 1943.9 | bsz 93.9 | num_updates 79088 | best_loss 0.102
|
| 365 |
+
epoch 016 | valid on 'valid' subset | loss 0.102 | ppl 1.07 | wps 21274.8 | wpb 1943.9 | bsz 93.9 | num_updates 79088 | best_loss 0.102
|
| 366 |
+
epoch 016 | valid on 'valid' subset | loss 0.102 | ppl 1.07 | wps 21274.8 | wpb 1943.9 | bsz 93.9 | num_updates 79088 | best_loss 0.102
|
| 367 |
+
end of epoch 16 (average epoch stats below)
|
| 368 |
+
epoch 016 | loss 0.008 | ppl 1.01 | wps 6507.7 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 79088 | lr 0.000112446 | gnorm 0.268 | train_wall 1473 | gb_free 12.9 | wall 23951
|
| 369 |
+
epoch 016 | loss 0.008 | ppl 1.01 | wps 6507.7 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 79088 | lr 0.000112446 | gnorm 0.268 | train_wall 1473 | gb_free 12.9 | wall 23951
|
| 370 |
+
epoch 016 | loss 0.008 | ppl 1.01 | wps 6507.7 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 79088 | lr 0.000112446 | gnorm 0.268 | train_wall 1473 | gb_free 12.9 | wall 23951
|
| 371 |
+
epoch 016 | loss 0.008 | ppl 1.01 | wps 6507.7 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 79088 | lr 0.000112446 | gnorm 0.268 | train_wall 1473 | gb_free 12.9 | wall 23951
|
| 372 |
+
epoch 016 | loss 0.008 | ppl 1.01 | wps 6507.7 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 79088 | lr 0.000112446 | gnorm 0.268 | train_wall 1473 | gb_free 12.9 | wall 23951
|
| 373 |
+
epoch 016 | loss 0.008 | ppl 1.01 | wps 6507.7 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 79088 | lr 0.000112446 | gnorm 0.268 | train_wall 1473 | gb_free 12.9 | wall 23951
|
| 374 |
+
epoch 016 | loss 0.008 | ppl 1.01 | wps 6507.7 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 79088 | lr 0.000112446 | gnorm 0.268 | train_wall 1473 | gb_free 12.9 | wall 23951
|
| 375 |
+
epoch 016 | loss 0.008 | ppl 1.01 | wps 6507.7 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 79088 | lr 0.000112446 | gnorm 0.268 | train_wall 1473 | gb_free 12.9 | wall 23951
|
| 376 |
+
epoch 016 | loss 0.008 | ppl 1.01 | wps 6507.7 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 79088 | lr 0.000112446 | gnorm 0.268 | train_wall 1473 | gb_free 12.9 | wall 23951
|
| 377 |
+
epoch 016 | loss 0.008 | ppl 1.01 | wps 6507.7 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 79088 | lr 0.000112446 | gnorm 0.268 | train_wall 1473 | gb_free 12.9 | wall 23951
|
| 378 |
+
epoch 016 | loss 0.008 | ppl 1.01 | wps 6507.7 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 79088 | lr 0.000112446 | gnorm 0.268 | train_wall 1473 | gb_free 12.9 | wall 23951
|
| 379 |
+
epoch 016 | loss 0.008 | ppl 1.01 | wps 6507.7 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 79088 | lr 0.000112446 | gnorm 0.268 | train_wall 1473 | gb_free 12.9 | wall 23951
|
| 380 |
+
epoch 016 | loss 0.008 | ppl 1.01 | wps 6507.7 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 79088 | lr 0.000112446 | gnorm 0.268 | train_wall 1473 | gb_free 12.9 | wall 23951
|
| 381 |
+
epoch 016 | loss 0.008 | ppl 1.01 | wps 6507.7 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 79088 | lr 0.000112446 | gnorm 0.268 | train_wall 1473 | gb_free 12.9 | wall 23951
|
| 382 |
+
epoch 016 | loss 0.008 | ppl 1.01 | wps 6507.7 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 79088 | lr 0.000112446 | gnorm 0.268 | train_wall 1473 | gb_free 12.9 | wall 23951
|
| 383 |
+
epoch 016 | loss 0.008 | ppl 1.01 | wps 6507.7 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 79088 | lr 0.000112446 | gnorm 0.268 | train_wall 1473 | gb_free 12.9 | wall 23951
|
| 384 |
+
Start iterating over samples
|
| 385 |
+
begin validation on "valid" subset
|
| 386 |
+
epoch 017 | valid on 'valid' subset | loss 0.11 | ppl 1.08 | wps 21457.6 | wpb 1943.9 | bsz 93.9 | num_updates 84031 | best_loss 0.102
|
| 387 |
+
epoch 017 | valid on 'valid' subset | loss 0.11 | ppl 1.08 | wps 21457.6 | wpb 1943.9 | bsz 93.9 | num_updates 84031 | best_loss 0.102
|
| 388 |
+
epoch 017 | valid on 'valid' subset | loss 0.11 | ppl 1.08 | wps 21457.6 | wpb 1943.9 | bsz 93.9 | num_updates 84031 | best_loss 0.102
|
| 389 |
+
epoch 017 | valid on 'valid' subset | loss 0.11 | ppl 1.08 | wps 21457.6 | wpb 1943.9 | bsz 93.9 | num_updates 84031 | best_loss 0.102
|
| 390 |
+
epoch 017 | valid on 'valid' subset | loss 0.11 | ppl 1.08 | wps 21457.6 | wpb 1943.9 | bsz 93.9 | num_updates 84031 | best_loss 0.102
|
| 391 |
+
epoch 017 | valid on 'valid' subset | loss 0.11 | ppl 1.08 | wps 21457.6 | wpb 1943.9 | bsz 93.9 | num_updates 84031 | best_loss 0.102
|
| 392 |
+
epoch 017 | valid on 'valid' subset | loss 0.11 | ppl 1.08 | wps 21457.6 | wpb 1943.9 | bsz 93.9 | num_updates 84031 | best_loss 0.102
|
| 393 |
+
epoch 017 | valid on 'valid' subset | loss 0.11 | ppl 1.08 | wps 21457.6 | wpb 1943.9 | bsz 93.9 | num_updates 84031 | best_loss 0.102
|
| 394 |
+
epoch 017 | valid on 'valid' subset | loss 0.11 | ppl 1.08 | wps 21457.6 | wpb 1943.9 | bsz 93.9 | num_updates 84031 | best_loss 0.102
|
| 395 |
+
epoch 017 | valid on 'valid' subset | loss 0.11 | ppl 1.08 | wps 21457.6 | wpb 1943.9 | bsz 93.9 | num_updates 84031 | best_loss 0.102
|
| 396 |
+
epoch 017 | valid on 'valid' subset | loss 0.11 | ppl 1.08 | wps 21457.6 | wpb 1943.9 | bsz 93.9 | num_updates 84031 | best_loss 0.102
|
| 397 |
+
epoch 017 | valid on 'valid' subset | loss 0.11 | ppl 1.08 | wps 21457.6 | wpb 1943.9 | bsz 93.9 | num_updates 84031 | best_loss 0.102
|
| 398 |
+
epoch 017 | valid on 'valid' subset | loss 0.11 | ppl 1.08 | wps 21457.6 | wpb 1943.9 | bsz 93.9 | num_updates 84031 | best_loss 0.102
|
| 399 |
+
epoch 017 | valid on 'valid' subset | loss 0.11 | ppl 1.08 | wps 21457.6 | wpb 1943.9 | bsz 93.9 | num_updates 84031 | best_loss 0.102
|
| 400 |
+
epoch 017 | valid on 'valid' subset | loss 0.11 | ppl 1.08 | wps 21457.6 | wpb 1943.9 | bsz 93.9 | num_updates 84031 | best_loss 0.102
|
| 401 |
+
epoch 017 | valid on 'valid' subset | loss 0.11 | ppl 1.08 | wps 21457.6 | wpb 1943.9 | bsz 93.9 | num_updates 84031 | best_loss 0.102
|
| 402 |
+
epoch 017 | valid on 'valid' subset | loss 0.11 | ppl 1.08 | wps 21457.6 | wpb 1943.9 | bsz 93.9 | num_updates 84031 | best_loss 0.102
|
| 403 |
+
end of epoch 17 (average epoch stats below)
|
| 404 |
+
epoch 017 | loss 0.007 | ppl 1.01 | wps 6509.5 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 84031 | lr 0.000109089 | gnorm 0.246 | train_wall 1473 | gb_free 13 | wall 25448
|
| 405 |
+
epoch 017 | loss 0.007 | ppl 1.01 | wps 6509.5 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 84031 | lr 0.000109089 | gnorm 0.246 | train_wall 1473 | gb_free 13 | wall 25448
|
| 406 |
+
epoch 017 | loss 0.007 | ppl 1.01 | wps 6509.5 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 84031 | lr 0.000109089 | gnorm 0.246 | train_wall 1473 | gb_free 13 | wall 25448
|
| 407 |
+
epoch 017 | loss 0.007 | ppl 1.01 | wps 6509.5 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 84031 | lr 0.000109089 | gnorm 0.246 | train_wall 1473 | gb_free 13 | wall 25448
|
| 408 |
+
epoch 017 | loss 0.007 | ppl 1.01 | wps 6509.5 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 84031 | lr 0.000109089 | gnorm 0.246 | train_wall 1473 | gb_free 13 | wall 25448
|
| 409 |
+
epoch 017 | loss 0.007 | ppl 1.01 | wps 6509.5 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 84031 | lr 0.000109089 | gnorm 0.246 | train_wall 1473 | gb_free 13 | wall 25448
|
| 410 |
+
epoch 017 | loss 0.007 | ppl 1.01 | wps 6509.5 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 84031 | lr 0.000109089 | gnorm 0.246 | train_wall 1473 | gb_free 13 | wall 25448
|
| 411 |
+
epoch 017 | loss 0.007 | ppl 1.01 | wps 6509.5 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 84031 | lr 0.000109089 | gnorm 0.246 | train_wall 1473 | gb_free 13 | wall 25448
|
| 412 |
+
epoch 017 | loss 0.007 | ppl 1.01 | wps 6509.5 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 84031 | lr 0.000109089 | gnorm 0.246 | train_wall 1473 | gb_free 13 | wall 25448
|
| 413 |
+
epoch 017 | loss 0.007 | ppl 1.01 | wps 6509.5 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 84031 | lr 0.000109089 | gnorm 0.246 | train_wall 1473 | gb_free 13 | wall 25448
|
| 414 |
+
epoch 017 | loss 0.007 | ppl 1.01 | wps 6509.5 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 84031 | lr 0.000109089 | gnorm 0.246 | train_wall 1473 | gb_free 13 | wall 25448
|
| 415 |
+
epoch 017 | loss 0.007 | ppl 1.01 | wps 6509.5 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 84031 | lr 0.000109089 | gnorm 0.246 | train_wall 1473 | gb_free 13 | wall 25448
|
| 416 |
+
epoch 017 | loss 0.007 | ppl 1.01 | wps 6509.5 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 84031 | lr 0.000109089 | gnorm 0.246 | train_wall 1473 | gb_free 13 | wall 25448
|
| 417 |
+
epoch 017 | loss 0.007 | ppl 1.01 | wps 6509.5 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 84031 | lr 0.000109089 | gnorm 0.246 | train_wall 1473 | gb_free 13 | wall 25448
|
| 418 |
+
epoch 017 | loss 0.007 | ppl 1.01 | wps 6509.5 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 84031 | lr 0.000109089 | gnorm 0.246 | train_wall 1473 | gb_free 13 | wall 25448
|
| 419 |
+
epoch 017 | loss 0.007 | ppl 1.01 | wps 6509.5 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 84031 | lr 0.000109089 | gnorm 0.246 | train_wall 1473 | gb_free 13 | wall 25448
|
| 420 |
+
epoch 017 | loss 0.007 | ppl 1.01 | wps 6509.5 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 84031 | lr 0.000109089 | gnorm 0.246 | train_wall 1473 | gb_free 13 | wall 25448
|
| 421 |
+
Start iterating over samples
|
| 422 |
+
begin validation on "valid" subset
|
| 423 |
+
epoch 018 | valid on 'valid' subset | loss 0.097 | ppl 1.07 | wps 21448.5 | wpb 1943.9 | bsz 93.9 | num_updates 88974 | best_loss 0.097
|
| 424 |
+
epoch 018 | valid on 'valid' subset | loss 0.097 | ppl 1.07 | wps 21448.5 | wpb 1943.9 | bsz 93.9 | num_updates 88974 | best_loss 0.097
|
| 425 |
+
epoch 018 | valid on 'valid' subset | loss 0.097 | ppl 1.07 | wps 21448.5 | wpb 1943.9 | bsz 93.9 | num_updates 88974 | best_loss 0.097
|
| 426 |
+
epoch 018 | valid on 'valid' subset | loss 0.097 | ppl 1.07 | wps 21448.5 | wpb 1943.9 | bsz 93.9 | num_updates 88974 | best_loss 0.097
|
| 427 |
+
epoch 018 | valid on 'valid' subset | loss 0.097 | ppl 1.07 | wps 21448.5 | wpb 1943.9 | bsz 93.9 | num_updates 88974 | best_loss 0.097
|
| 428 |
+
epoch 018 | valid on 'valid' subset | loss 0.097 | ppl 1.07 | wps 21448.5 | wpb 1943.9 | bsz 93.9 | num_updates 88974 | best_loss 0.097
|
| 429 |
+
epoch 018 | valid on 'valid' subset | loss 0.097 | ppl 1.07 | wps 21448.5 | wpb 1943.9 | bsz 93.9 | num_updates 88974 | best_loss 0.097
|
| 430 |
+
epoch 018 | valid on 'valid' subset | loss 0.097 | ppl 1.07 | wps 21448.5 | wpb 1943.9 | bsz 93.9 | num_updates 88974 | best_loss 0.097
|
| 431 |
+
epoch 018 | valid on 'valid' subset | loss 0.097 | ppl 1.07 | wps 21448.5 | wpb 1943.9 | bsz 93.9 | num_updates 88974 | best_loss 0.097
|
| 432 |
+
epoch 018 | valid on 'valid' subset | loss 0.097 | ppl 1.07 | wps 21448.5 | wpb 1943.9 | bsz 93.9 | num_updates 88974 | best_loss 0.097
|
| 433 |
+
epoch 018 | valid on 'valid' subset | loss 0.097 | ppl 1.07 | wps 21448.5 | wpb 1943.9 | bsz 93.9 | num_updates 88974 | best_loss 0.097
|
| 434 |
+
epoch 018 | valid on 'valid' subset | loss 0.097 | ppl 1.07 | wps 21448.5 | wpb 1943.9 | bsz 93.9 | num_updates 88974 | best_loss 0.097
|
| 435 |
+
epoch 018 | valid on 'valid' subset | loss 0.097 | ppl 1.07 | wps 21448.5 | wpb 1943.9 | bsz 93.9 | num_updates 88974 | best_loss 0.097
|
| 436 |
+
epoch 018 | valid on 'valid' subset | loss 0.097 | ppl 1.07 | wps 21448.5 | wpb 1943.9 | bsz 93.9 | num_updates 88974 | best_loss 0.097
|
| 437 |
+
epoch 018 | valid on 'valid' subset | loss 0.097 | ppl 1.07 | wps 21448.5 | wpb 1943.9 | bsz 93.9 | num_updates 88974 | best_loss 0.097
|
| 438 |
+
epoch 018 | valid on 'valid' subset | loss 0.097 | ppl 1.07 | wps 21448.5 | wpb 1943.9 | bsz 93.9 | num_updates 88974 | best_loss 0.097
|
| 439 |
+
epoch 018 | valid on 'valid' subset | loss 0.097 | ppl 1.07 | wps 21448.5 | wpb 1943.9 | bsz 93.9 | num_updates 88974 | best_loss 0.097
|
| 440 |
+
epoch 018 | valid on 'valid' subset | loss 0.097 | ppl 1.07 | wps 21448.5 | wpb 1943.9 | bsz 93.9 | num_updates 88974 | best_loss 0.097
|
| 441 |
+
end of epoch 18 (average epoch stats below)
|
| 442 |
+
epoch 018 | loss 0.007 | ppl 1 | wps 6510.3 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 88974 | lr 0.000106015 | gnorm 0.238 | train_wall 1473 | gb_free 12.9 | wall 26945
|
| 443 |
+
epoch 018 | loss 0.007 | ppl 1 | wps 6510.3 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 88974 | lr 0.000106015 | gnorm 0.238 | train_wall 1473 | gb_free 12.9 | wall 26945
|
| 444 |
+
epoch 018 | loss 0.007 | ppl 1 | wps 6510.3 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 88974 | lr 0.000106015 | gnorm 0.238 | train_wall 1473 | gb_free 12.9 | wall 26945
|
| 445 |
+
epoch 018 | loss 0.007 | ppl 1 | wps 6510.3 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 88974 | lr 0.000106015 | gnorm 0.238 | train_wall 1473 | gb_free 12.9 | wall 26945
|
| 446 |
+
epoch 018 | loss 0.007 | ppl 1 | wps 6510.3 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 88974 | lr 0.000106015 | gnorm 0.238 | train_wall 1473 | gb_free 12.9 | wall 26945
|
| 447 |
+
epoch 018 | loss 0.007 | ppl 1 | wps 6510.3 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 88974 | lr 0.000106015 | gnorm 0.238 | train_wall 1473 | gb_free 12.9 | wall 26945
|
| 448 |
+
epoch 018 | loss 0.007 | ppl 1 | wps 6510.3 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 88974 | lr 0.000106015 | gnorm 0.238 | train_wall 1473 | gb_free 12.9 | wall 26945
|
| 449 |
+
epoch 018 | loss 0.007 | ppl 1 | wps 6510.3 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 88974 | lr 0.000106015 | gnorm 0.238 | train_wall 1473 | gb_free 12.9 | wall 26945
|
| 450 |
+
epoch 018 | loss 0.007 | ppl 1 | wps 6510.3 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 88974 | lr 0.000106015 | gnorm 0.238 | train_wall 1473 | gb_free 12.9 | wall 26945
|
| 451 |
+
epoch 018 | loss 0.007 | ppl 1 | wps 6510.3 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 88974 | lr 0.000106015 | gnorm 0.238 | train_wall 1473 | gb_free 12.9 | wall 26945
|
| 452 |
+
epoch 018 | loss 0.007 | ppl 1 | wps 6510.3 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 88974 | lr 0.000106015 | gnorm 0.238 | train_wall 1473 | gb_free 12.9 | wall 26945
|
| 453 |
+
epoch 018 | loss 0.007 | ppl 1 | wps 6510.3 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 88974 | lr 0.000106015 | gnorm 0.238 | train_wall 1473 | gb_free 12.9 | wall 26945
|
| 454 |
+
epoch 018 | loss 0.007 | ppl 1 | wps 6510.3 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 88974 | lr 0.000106015 | gnorm 0.238 | train_wall 1473 | gb_free 12.9 | wall 26945
|
| 455 |
+
epoch 018 | loss 0.007 | ppl 1 | wps 6510.3 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 88974 | lr 0.000106015 | gnorm 0.238 | train_wall 1473 | gb_free 12.9 | wall 26945
|
| 456 |
+
epoch 018 | loss 0.007 | ppl 1 | wps 6510.3 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 88974 | lr 0.000106015 | gnorm 0.238 | train_wall 1473 | gb_free 12.9 | wall 26945
|
| 457 |
+
epoch 018 | loss 0.007 | ppl 1 | wps 6510.3 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 88974 | lr 0.000106015 | gnorm 0.238 | train_wall 1473 | gb_free 12.9 | wall 26945
|
| 458 |
+
epoch 018 | loss 0.007 | ppl 1 | wps 6510.3 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 88974 | lr 0.000106015 | gnorm 0.238 | train_wall 1473 | gb_free 12.9 | wall 26945
|
| 459 |
+
epoch 018 | loss 0.007 | ppl 1 | wps 6510.3 | ups 3.3 | wpb 1971.8 | bsz 95.4 | num_updates 88974 | lr 0.000106015 | gnorm 0.238 | train_wall 1473 | gb_free 12.9 | wall 26945
|
| 460 |
+
Start iterating over samples
|
| 461 |
+
begin validation on "valid" subset
|
| 462 |
+
epoch 019 | valid on 'valid' subset | loss 0.098 | ppl 1.07 | wps 21517.5 | wpb 1943.9 | bsz 93.9 | num_updates 93917 | best_loss 0.097
|
| 463 |
+
epoch 019 | valid on 'valid' subset | loss 0.098 | ppl 1.07 | wps 21517.5 | wpb 1943.9 | bsz 93.9 | num_updates 93917 | best_loss 0.097
|
| 464 |
+
epoch 019 | valid on 'valid' subset | loss 0.098 | ppl 1.07 | wps 21517.5 | wpb 1943.9 | bsz 93.9 | num_updates 93917 | best_loss 0.097
|
| 465 |
+
epoch 019 | valid on 'valid' subset | loss 0.098 | ppl 1.07 | wps 21517.5 | wpb 1943.9 | bsz 93.9 | num_updates 93917 | best_loss 0.097
|
| 466 |
+
epoch 019 | valid on 'valid' subset | loss 0.098 | ppl 1.07 | wps 21517.5 | wpb 1943.9 | bsz 93.9 | num_updates 93917 | best_loss 0.097
|
| 467 |
+
epoch 019 | valid on 'valid' subset | loss 0.098 | ppl 1.07 | wps 21517.5 | wpb 1943.9 | bsz 93.9 | num_updates 93917 | best_loss 0.097
|
| 468 |
+
epoch 019 | valid on 'valid' subset | loss 0.098 | ppl 1.07 | wps 21517.5 | wpb 1943.9 | bsz 93.9 | num_updates 93917 | best_loss 0.097
|
| 469 |
+
epoch 019 | valid on 'valid' subset | loss 0.098 | ppl 1.07 | wps 21517.5 | wpb 1943.9 | bsz 93.9 | num_updates 93917 | best_loss 0.097
|
| 470 |
+
epoch 019 | valid on 'valid' subset | loss 0.098 | ppl 1.07 | wps 21517.5 | wpb 1943.9 | bsz 93.9 | num_updates 93917 | best_loss 0.097
|
| 471 |
+
epoch 019 | valid on 'valid' subset | loss 0.098 | ppl 1.07 | wps 21517.5 | wpb 1943.9 | bsz 93.9 | num_updates 93917 | best_loss 0.097
|
| 472 |
+
epoch 019 | valid on 'valid' subset | loss 0.098 | ppl 1.07 | wps 21517.5 | wpb 1943.9 | bsz 93.9 | num_updates 93917 | best_loss 0.097
|
| 473 |
+
epoch 019 | valid on 'valid' subset | loss 0.098 | ppl 1.07 | wps 21517.5 | wpb 1943.9 | bsz 93.9 | num_updates 93917 | best_loss 0.097
|
| 474 |
+
epoch 019 | valid on 'valid' subset | loss 0.098 | ppl 1.07 | wps 21517.5 | wpb 1943.9 | bsz 93.9 | num_updates 93917 | best_loss 0.097
|
| 475 |
+
epoch 019 | valid on 'valid' subset | loss 0.098 | ppl 1.07 | wps 21517.5 | wpb 1943.9 | bsz 93.9 | num_updates 93917 | best_loss 0.097
|
| 476 |
+
epoch 019 | valid on 'valid' subset | loss 0.098 | ppl 1.07 | wps 21517.5 | wpb 1943.9 | bsz 93.9 | num_updates 93917 | best_loss 0.097
|
| 477 |
+
epoch 019 | valid on 'valid' subset | loss 0.098 | ppl 1.07 | wps 21517.5 | wpb 1943.9 | bsz 93.9 | num_updates 93917 | best_loss 0.097
|
| 478 |
+
epoch 019 | valid on 'valid' subset | loss 0.098 | ppl 1.07 | wps 21517.5 | wpb 1943.9 | bsz 93.9 | num_updates 93917 | best_loss 0.097
|
| 479 |
+
epoch 019 | valid on 'valid' subset | loss 0.098 | ppl 1.07 | wps 21517.5 | wpb 1943.9 | bsz 93.9 | num_updates 93917 | best_loss 0.097
|
| 480 |
+
epoch 019 | valid on 'valid' subset | loss 0.098 | ppl 1.07 | wps 21517.5 | wpb 1943.9 | bsz 93.9 | num_updates 93917 | best_loss 0.097
|
| 481 |
+
end of epoch 19 (average epoch stats below)
|
| 482 |
+
epoch 019 | loss 0.006 | ppl 1 | wps 6530.4 | ups 3.31 | wpb 1971.8 | bsz 95.4 | num_updates 93917 | lr 0.000103188 | gnorm 0.226 | train_wall 1468 | gb_free 12.7 | wall 28438
|
| 483 |
+
epoch 019 | loss 0.006 | ppl 1 | wps 6530.4 | ups 3.31 | wpb 1971.8 | bsz 95.4 | num_updates 93917 | lr 0.000103188 | gnorm 0.226 | train_wall 1468 | gb_free 12.7 | wall 28438
|
| 484 |
+
epoch 019 | loss 0.006 | ppl 1 | wps 6530.4 | ups 3.31 | wpb 1971.8 | bsz 95.4 | num_updates 93917 | lr 0.000103188 | gnorm 0.226 | train_wall 1468 | gb_free 12.7 | wall 28438
|
| 485 |
+
epoch 019 | loss 0.006 | ppl 1 | wps 6530.4 | ups 3.31 | wpb 1971.8 | bsz 95.4 | num_updates 93917 | lr 0.000103188 | gnorm 0.226 | train_wall 1468 | gb_free 12.7 | wall 28438
|
| 486 |
+
epoch 019 | loss 0.006 | ppl 1 | wps 6530.4 | ups 3.31 | wpb 1971.8 | bsz 95.4 | num_updates 93917 | lr 0.000103188 | gnorm 0.226 | train_wall 1468 | gb_free 12.7 | wall 28438
|
| 487 |
+
epoch 019 | loss 0.006 | ppl 1 | wps 6530.4 | ups 3.31 | wpb 1971.8 | bsz 95.4 | num_updates 93917 | lr 0.000103188 | gnorm 0.226 | train_wall 1468 | gb_free 12.7 | wall 28438
|
| 488 |
+
epoch 019 | loss 0.006 | ppl 1 | wps 6530.4 | ups 3.31 | wpb 1971.8 | bsz 95.4 | num_updates 93917 | lr 0.000103188 | gnorm 0.226 | train_wall 1468 | gb_free 12.7 | wall 28438
|
| 489 |
+
epoch 019 | loss 0.006 | ppl 1 | wps 6530.4 | ups 3.31 | wpb 1971.8 | bsz 95.4 | num_updates 93917 | lr 0.000103188 | gnorm 0.226 | train_wall 1468 | gb_free 12.7 | wall 28438
|
| 490 |
+
epoch 019 | loss 0.006 | ppl 1 | wps 6530.4 | ups 3.31 | wpb 1971.8 | bsz 95.4 | num_updates 93917 | lr 0.000103188 | gnorm 0.226 | train_wall 1468 | gb_free 12.7 | wall 28438
|
| 491 |
+
epoch 019 | loss 0.006 | ppl 1 | wps 6530.4 | ups 3.31 | wpb 1971.8 | bsz 95.4 | num_updates 93917 | lr 0.000103188 | gnorm 0.226 | train_wall 1468 | gb_free 12.7 | wall 28438
|
| 492 |
+
epoch 019 | loss 0.006 | ppl 1 | wps 6530.4 | ups 3.31 | wpb 1971.8 | bsz 95.4 | num_updates 93917 | lr 0.000103188 | gnorm 0.226 | train_wall 1468 | gb_free 12.7 | wall 28438
|
| 493 |
+
epoch 019 | loss 0.006 | ppl 1 | wps 6530.4 | ups 3.31 | wpb 1971.8 | bsz 95.4 | num_updates 93917 | lr 0.000103188 | gnorm 0.226 | train_wall 1468 | gb_free 12.7 | wall 28438
|
| 494 |
+
epoch 019 | loss 0.006 | ppl 1 | wps 6530.4 | ups 3.31 | wpb 1971.8 | bsz 95.4 | num_updates 93917 | lr 0.000103188 | gnorm 0.226 | train_wall 1468 | gb_free 12.7 | wall 28438
|
| 495 |
+
epoch 019 | loss 0.006 | ppl 1 | wps 6530.4 | ups 3.31 | wpb 1971.8 | bsz 95.4 | num_updates 93917 | lr 0.000103188 | gnorm 0.226 | train_wall 1468 | gb_free 12.7 | wall 28438
|
| 496 |
+
epoch 019 | loss 0.006 | ppl 1 | wps 6530.4 | ups 3.31 | wpb 1971.8 | bsz 95.4 | num_updates 93917 | lr 0.000103188 | gnorm 0.226 | train_wall 1468 | gb_free 12.7 | wall 28438
|
| 497 |
+
epoch 019 | loss 0.006 | ppl 1 | wps 6530.4 | ups 3.31 | wpb 1971.8 | bsz 95.4 | num_updates 93917 | lr 0.000103188 | gnorm 0.226 | train_wall 1468 | gb_free 12.7 | wall 28438
|
| 498 |
+
epoch 019 | loss 0.006 | ppl 1 | wps 6530.4 | ups 3.31 | wpb 1971.8 | bsz 95.4 | num_updates 93917 | lr 0.000103188 | gnorm 0.226 | train_wall 1468 | gb_free 12.7 | wall 28438
|
| 499 |
+
epoch 019 | loss 0.006 | ppl 1 | wps 6530.4 | ups 3.31 | wpb 1971.8 | bsz 95.4 | num_updates 93917 | lr 0.000103188 | gnorm 0.226 | train_wall 1468 | gb_free 12.7 | wall 28438
|
| 500 |
+
epoch 019 | loss 0.006 | ppl 1 | wps 6530.4 | ups 3.31 | wpb 1971.8 | bsz 95.4 | num_updates 93917 | lr 0.000103188 | gnorm 0.226 | train_wall 1468 | gb_free 12.7 | wall 28438
|
| 501 |
+
Start iterating over samples
|
| 502 |
+
begin validation on "valid" subset
|
| 503 |
+
epoch 020 | valid on 'valid' subset | loss 0.1 | ppl 1.07 | wps 21536.6 | wpb 1943.9 | bsz 93.9 | num_updates 98860 | best_loss 0.097
|
| 504 |
+
epoch 020 | valid on 'valid' subset | loss 0.1 | ppl 1.07 | wps 21536.6 | wpb 1943.9 | bsz 93.9 | num_updates 98860 | best_loss 0.097
|
| 505 |
+
epoch 020 | valid on 'valid' subset | loss 0.1 | ppl 1.07 | wps 21536.6 | wpb 1943.9 | bsz 93.9 | num_updates 98860 | best_loss 0.097
|
| 506 |
+
epoch 020 | valid on 'valid' subset | loss 0.1 | ppl 1.07 | wps 21536.6 | wpb 1943.9 | bsz 93.9 | num_updates 98860 | best_loss 0.097
|
| 507 |
+
epoch 020 | valid on 'valid' subset | loss 0.1 | ppl 1.07 | wps 21536.6 | wpb 1943.9 | bsz 93.9 | num_updates 98860 | best_loss 0.097
|
| 508 |
+
epoch 020 | valid on 'valid' subset | loss 0.1 | ppl 1.07 | wps 21536.6 | wpb 1943.9 | bsz 93.9 | num_updates 98860 | best_loss 0.097
|
| 509 |
+
epoch 020 | valid on 'valid' subset | loss 0.1 | ppl 1.07 | wps 21536.6 | wpb 1943.9 | bsz 93.9 | num_updates 98860 | best_loss 0.097
|
| 510 |
+
epoch 020 | valid on 'valid' subset | loss 0.1 | ppl 1.07 | wps 21536.6 | wpb 1943.9 | bsz 93.9 | num_updates 98860 | best_loss 0.097
|
| 511 |
+
epoch 020 | valid on 'valid' subset | loss 0.1 | ppl 1.07 | wps 21536.6 | wpb 1943.9 | bsz 93.9 | num_updates 98860 | best_loss 0.097
|
| 512 |
+
epoch 020 | valid on 'valid' subset | loss 0.1 | ppl 1.07 | wps 21536.6 | wpb 1943.9 | bsz 93.9 | num_updates 98860 | best_loss 0.097
|
| 513 |
+
epoch 020 | valid on 'valid' subset | loss 0.1 | ppl 1.07 | wps 21536.6 | wpb 1943.9 | bsz 93.9 | num_updates 98860 | best_loss 0.097
|
| 514 |
+
epoch 020 | valid on 'valid' subset | loss 0.1 | ppl 1.07 | wps 21536.6 | wpb 1943.9 | bsz 93.9 | num_updates 98860 | best_loss 0.097
|
| 515 |
+
epoch 020 | valid on 'valid' subset | loss 0.1 | ppl 1.07 | wps 21536.6 | wpb 1943.9 | bsz 93.9 | num_updates 98860 | best_loss 0.097
|
| 516 |
+
epoch 020 | valid on 'valid' subset | loss 0.1 | ppl 1.07 | wps 21536.6 | wpb 1943.9 | bsz 93.9 | num_updates 98860 | best_loss 0.097
|
| 517 |
+
epoch 020 | valid on 'valid' subset | loss 0.1 | ppl 1.07 | wps 21536.6 | wpb 1943.9 | bsz 93.9 | num_updates 98860 | best_loss 0.097
|
| 518 |
+
epoch 020 | valid on 'valid' subset | loss 0.1 | ppl 1.07 | wps 21536.6 | wpb 1943.9 | bsz 93.9 | num_updates 98860 | best_loss 0.097
|
| 519 |
+
epoch 020 | valid on 'valid' subset | loss 0.1 | ppl 1.07 | wps 21536.6 | wpb 1943.9 | bsz 93.9 | num_updates 98860 | best_loss 0.097
|
| 520 |
+
epoch 020 | valid on 'valid' subset | loss 0.1 | ppl 1.07 | wps 21536.6 | wpb 1943.9 | bsz 93.9 | num_updates 98860 | best_loss 0.097
|
| 521 |
+
epoch 020 | valid on 'valid' subset | loss 0.1 | ppl 1.07 | wps 21536.6 | wpb 1943.9 | bsz 93.9 | num_updates 98860 | best_loss 0.097
|
| 522 |
+
epoch 020 | valid on 'valid' subset | loss 0.1 | ppl 1.07 | wps 21536.6 | wpb 1943.9 | bsz 93.9 | num_updates 98860 | best_loss 0.097
|
| 523 |
+
end of epoch 20 (average epoch stats below)
|
| 524 |
+
epoch 020 | loss 0.006 | ppl 1 | wps 6546.6 | ups 3.32 | wpb 1971.8 | bsz 95.4 | num_updates 98860 | lr 0.000100575 | gnorm 0.214 | train_wall 1465 | gb_free 13 | wall 29927
|
| 525 |
+
epoch 020 | loss 0.006 | ppl 1 | wps 6546.6 | ups 3.32 | wpb 1971.8 | bsz 95.4 | num_updates 98860 | lr 0.000100575 | gnorm 0.214 | train_wall 1465 | gb_free 13 | wall 29927
|
| 526 |
+
epoch 020 | loss 0.006 | ppl 1 | wps 6546.6 | ups 3.32 | wpb 1971.8 | bsz 95.4 | num_updates 98860 | lr 0.000100575 | gnorm 0.214 | train_wall 1465 | gb_free 13 | wall 29927
|
| 527 |
+
epoch 020 | loss 0.006 | ppl 1 | wps 6546.6 | ups 3.32 | wpb 1971.8 | bsz 95.4 | num_updates 98860 | lr 0.000100575 | gnorm 0.214 | train_wall 1465 | gb_free 13 | wall 29927
|
| 528 |
+
epoch 020 | loss 0.006 | ppl 1 | wps 6546.6 | ups 3.32 | wpb 1971.8 | bsz 95.4 | num_updates 98860 | lr 0.000100575 | gnorm 0.214 | train_wall 1465 | gb_free 13 | wall 29927
|
| 529 |
+
epoch 020 | loss 0.006 | ppl 1 | wps 6546.6 | ups 3.32 | wpb 1971.8 | bsz 95.4 | num_updates 98860 | lr 0.000100575 | gnorm 0.214 | train_wall 1465 | gb_free 13 | wall 29927
|
| 530 |
+
epoch 020 | loss 0.006 | ppl 1 | wps 6546.6 | ups 3.32 | wpb 1971.8 | bsz 95.4 | num_updates 98860 | lr 0.000100575 | gnorm 0.214 | train_wall 1465 | gb_free 13 | wall 29927
|
| 531 |
+
epoch 020 | loss 0.006 | ppl 1 | wps 6546.6 | ups 3.32 | wpb 1971.8 | bsz 95.4 | num_updates 98860 | lr 0.000100575 | gnorm 0.214 | train_wall 1465 | gb_free 13 | wall 29927
|
| 532 |
+
epoch 020 | loss 0.006 | ppl 1 | wps 6546.6 | ups 3.32 | wpb 1971.8 | bsz 95.4 | num_updates 98860 | lr 0.000100575 | gnorm 0.214 | train_wall 1465 | gb_free 13 | wall 29927
|
| 533 |
+
epoch 020 | loss 0.006 | ppl 1 | wps 6546.6 | ups 3.32 | wpb 1971.8 | bsz 95.4 | num_updates 98860 | lr 0.000100575 | gnorm 0.214 | train_wall 1465 | gb_free 13 | wall 29927
|
| 534 |
+
epoch 020 | loss 0.006 | ppl 1 | wps 6546.6 | ups 3.32 | wpb 1971.8 | bsz 95.4 | num_updates 98860 | lr 0.000100575 | gnorm 0.214 | train_wall 1465 | gb_free 13 | wall 29927
|
| 535 |
+
epoch 020 | loss 0.006 | ppl 1 | wps 6546.6 | ups 3.32 | wpb 1971.8 | bsz 95.4 | num_updates 98860 | lr 0.000100575 | gnorm 0.214 | train_wall 1465 | gb_free 13 | wall 29927
|
| 536 |
+
epoch 020 | loss 0.006 | ppl 1 | wps 6546.6 | ups 3.32 | wpb 1971.8 | bsz 95.4 | num_updates 98860 | lr 0.000100575 | gnorm 0.214 | train_wall 1465 | gb_free 13 | wall 29927
|
| 537 |
+
epoch 020 | loss 0.006 | ppl 1 | wps 6546.6 | ups 3.32 | wpb 1971.8 | bsz 95.4 | num_updates 98860 | lr 0.000100575 | gnorm 0.214 | train_wall 1465 | gb_free 13 | wall 29927
|
| 538 |
+
epoch 020 | loss 0.006 | ppl 1 | wps 6546.6 | ups 3.32 | wpb 1971.8 | bsz 95.4 | num_updates 98860 | lr 0.000100575 | gnorm 0.214 | train_wall 1465 | gb_free 13 | wall 29927
|
| 539 |
+
epoch 020 | loss 0.006 | ppl 1 | wps 6546.6 | ups 3.32 | wpb 1971.8 | bsz 95.4 | num_updates 98860 | lr 0.000100575 | gnorm 0.214 | train_wall 1465 | gb_free 13 | wall 29927
|
| 540 |
+
epoch 020 | loss 0.006 | ppl 1 | wps 6546.6 | ups 3.32 | wpb 1971.8 | bsz 95.4 | num_updates 98860 | lr 0.000100575 | gnorm 0.214 | train_wall 1465 | gb_free 13 | wall 29927
|
| 541 |
+
epoch 020 | loss 0.006 | ppl 1 | wps 6546.6 | ups 3.32 | wpb 1971.8 | bsz 95.4 | num_updates 98860 | lr 0.000100575 | gnorm 0.214 | train_wall 1465 | gb_free 13 | wall 29927
|
| 542 |
+
epoch 020 | loss 0.006 | ppl 1 | wps 6546.6 | ups 3.32 | wpb 1971.8 | bsz 95.4 | num_updates 98860 | lr 0.000100575 | gnorm 0.214 | train_wall 1465 | gb_free 13 | wall 29927
|
| 543 |
+
epoch 020 | loss 0.006 | ppl 1 | wps 6546.6 | ups 3.32 | wpb 1971.8 | bsz 95.4 | num_updates 98860 | lr 0.000100575 | gnorm 0.214 | train_wall 1465 | gb_free 13 | wall 29927
|
| 544 |
+
Start iterating over samples
|
| 545 |
+
Stopping training due to num_updates: 100000 >= max_update: 100000
|
| 546 |
+
begin validation on "valid" subset
|
| 547 |
+
epoch 021 | valid on 'valid' subset | loss 0.106 | ppl 1.08 | wps 21391.8 | wpb 1943.9 | bsz 93.9 | num_updates 100000 | best_loss 0.097
|
| 548 |
+
epoch 021 | valid on 'valid' subset | loss 0.106 | ppl 1.08 | wps 21391.8 | wpb 1943.9 | bsz 93.9 | num_updates 100000 | best_loss 0.097
|
| 549 |
+
epoch 021 | valid on 'valid' subset | loss 0.106 | ppl 1.08 | wps 21391.8 | wpb 1943.9 | bsz 93.9 | num_updates 100000 | best_loss 0.097
|
| 550 |
+
epoch 021 | valid on 'valid' subset | loss 0.106 | ppl 1.08 | wps 21391.8 | wpb 1943.9 | bsz 93.9 | num_updates 100000 | best_loss 0.097
|
| 551 |
+
epoch 021 | valid on 'valid' subset | loss 0.106 | ppl 1.08 | wps 21391.8 | wpb 1943.9 | bsz 93.9 | num_updates 100000 | best_loss 0.097
|
| 552 |
+
epoch 021 | valid on 'valid' subset | loss 0.106 | ppl 1.08 | wps 21391.8 | wpb 1943.9 | bsz 93.9 | num_updates 100000 | best_loss 0.097
|
| 553 |
+
epoch 021 | valid on 'valid' subset | loss 0.106 | ppl 1.08 | wps 21391.8 | wpb 1943.9 | bsz 93.9 | num_updates 100000 | best_loss 0.097
|
| 554 |
+
epoch 021 | valid on 'valid' subset | loss 0.106 | ppl 1.08 | wps 21391.8 | wpb 1943.9 | bsz 93.9 | num_updates 100000 | best_loss 0.097
|
| 555 |
+
epoch 021 | valid on 'valid' subset | loss 0.106 | ppl 1.08 | wps 21391.8 | wpb 1943.9 | bsz 93.9 | num_updates 100000 | best_loss 0.097
|
| 556 |
+
epoch 021 | valid on 'valid' subset | loss 0.106 | ppl 1.08 | wps 21391.8 | wpb 1943.9 | bsz 93.9 | num_updates 100000 | best_loss 0.097
|
| 557 |
+
epoch 021 | valid on 'valid' subset | loss 0.106 | ppl 1.08 | wps 21391.8 | wpb 1943.9 | bsz 93.9 | num_updates 100000 | best_loss 0.097
|
| 558 |
+
epoch 021 | valid on 'valid' subset | loss 0.106 | ppl 1.08 | wps 21391.8 | wpb 1943.9 | bsz 93.9 | num_updates 100000 | best_loss 0.097
|
| 559 |
+
epoch 021 | valid on 'valid' subset | loss 0.106 | ppl 1.08 | wps 21391.8 | wpb 1943.9 | bsz 93.9 | num_updates 100000 | best_loss 0.097
|
| 560 |
+
epoch 021 | valid on 'valid' subset | loss 0.106 | ppl 1.08 | wps 21391.8 | wpb 1943.9 | bsz 93.9 | num_updates 100000 | best_loss 0.097
|
| 561 |
+
epoch 021 | valid on 'valid' subset | loss 0.106 | ppl 1.08 | wps 21391.8 | wpb 1943.9 | bsz 93.9 | num_updates 100000 | best_loss 0.097
|
| 562 |
+
epoch 021 | valid on 'valid' subset | loss 0.106 | ppl 1.08 | wps 21391.8 | wpb 1943.9 | bsz 93.9 | num_updates 100000 | best_loss 0.097
|
| 563 |
+
epoch 021 | valid on 'valid' subset | loss 0.106 | ppl 1.08 | wps 21391.8 | wpb 1943.9 | bsz 93.9 | num_updates 100000 | best_loss 0.097
|
| 564 |
+
epoch 021 | valid on 'valid' subset | loss 0.106 | ppl 1.08 | wps 21391.8 | wpb 1943.9 | bsz 93.9 | num_updates 100000 | best_loss 0.097
|
| 565 |
+
epoch 021 | valid on 'valid' subset | loss 0.106 | ppl 1.08 | wps 21391.8 | wpb 1943.9 | bsz 93.9 | num_updates 100000 | best_loss 0.097
|
| 566 |
+
epoch 021 | valid on 'valid' subset | loss 0.106 | ppl 1.08 | wps 21391.8 | wpb 1943.9 | bsz 93.9 | num_updates 100000 | best_loss 0.097
|
| 567 |
+
epoch 021 | valid on 'valid' subset | loss 0.106 | ppl 1.08 | wps 21391.8 | wpb 1943.9 | bsz 93.9 | num_updates 100000 | best_loss 0.097
|
| 568 |
+
end of epoch 21 (average epoch stats below)
|
| 569 |
+
epoch 021 | loss 0.006 | ppl 1 | wps 6330.8 | ups 3.22 | wpb 1963.6 | bsz 95 | num_updates 100000 | lr 0.0001 | gnorm 0.235 | train_wall 339 | gb_free 12.5 | wall 30280
|
| 570 |
+
epoch 021 | loss 0.006 | ppl 1 | wps 6330.8 | ups 3.22 | wpb 1963.6 | bsz 95 | num_updates 100000 | lr 0.0001 | gnorm 0.235 | train_wall 339 | gb_free 12.5 | wall 30280
|
| 571 |
+
epoch 021 | loss 0.006 | ppl 1 | wps 6330.8 | ups 3.22 | wpb 1963.6 | bsz 95 | num_updates 100000 | lr 0.0001 | gnorm 0.235 | train_wall 339 | gb_free 12.5 | wall 30280
|
| 572 |
+
epoch 021 | loss 0.006 | ppl 1 | wps 6330.8 | ups 3.22 | wpb 1963.6 | bsz 95 | num_updates 100000 | lr 0.0001 | gnorm 0.235 | train_wall 339 | gb_free 12.5 | wall 30280
|
| 573 |
+
epoch 021 | loss 0.006 | ppl 1 | wps 6330.8 | ups 3.22 | wpb 1963.6 | bsz 95 | num_updates 100000 | lr 0.0001 | gnorm 0.235 | train_wall 339 | gb_free 12.5 | wall 30280
|
| 574 |
+
epoch 021 | loss 0.006 | ppl 1 | wps 6330.8 | ups 3.22 | wpb 1963.6 | bsz 95 | num_updates 100000 | lr 0.0001 | gnorm 0.235 | train_wall 339 | gb_free 12.5 | wall 30280
|
| 575 |
+
epoch 021 | loss 0.006 | ppl 1 | wps 6330.8 | ups 3.22 | wpb 1963.6 | bsz 95 | num_updates 100000 | lr 0.0001 | gnorm 0.235 | train_wall 339 | gb_free 12.5 | wall 30280
|
| 576 |
+
epoch 021 | loss 0.006 | ppl 1 | wps 6330.8 | ups 3.22 | wpb 1963.6 | bsz 95 | num_updates 100000 | lr 0.0001 | gnorm 0.235 | train_wall 339 | gb_free 12.5 | wall 30280
|
| 577 |
+
epoch 021 | loss 0.006 | ppl 1 | wps 6330.8 | ups 3.22 | wpb 1963.6 | bsz 95 | num_updates 100000 | lr 0.0001 | gnorm 0.235 | train_wall 339 | gb_free 12.5 | wall 30280
|
| 578 |
+
epoch 021 | loss 0.006 | ppl 1 | wps 6330.8 | ups 3.22 | wpb 1963.6 | bsz 95 | num_updates 100000 | lr 0.0001 | gnorm 0.235 | train_wall 339 | gb_free 12.5 | wall 30280
|
| 579 |
+
epoch 021 | loss 0.006 | ppl 1 | wps 6330.8 | ups 3.22 | wpb 1963.6 | bsz 95 | num_updates 100000 | lr 0.0001 | gnorm 0.235 | train_wall 339 | gb_free 12.5 | wall 30280
|
| 580 |
+
epoch 021 | loss 0.006 | ppl 1 | wps 6330.8 | ups 3.22 | wpb 1963.6 | bsz 95 | num_updates 100000 | lr 0.0001 | gnorm 0.235 | train_wall 339 | gb_free 12.5 | wall 30280
|
| 581 |
+
epoch 021 | loss 0.006 | ppl 1 | wps 6330.8 | ups 3.22 | wpb 1963.6 | bsz 95 | num_updates 100000 | lr 0.0001 | gnorm 0.235 | train_wall 339 | gb_free 12.5 | wall 30280
|
| 582 |
+
epoch 021 | loss 0.006 | ppl 1 | wps 6330.8 | ups 3.22 | wpb 1963.6 | bsz 95 | num_updates 100000 | lr 0.0001 | gnorm 0.235 | train_wall 339 | gb_free 12.5 | wall 30280
|
| 583 |
+
epoch 021 | loss 0.006 | ppl 1 | wps 6330.8 | ups 3.22 | wpb 1963.6 | bsz 95 | num_updates 100000 | lr 0.0001 | gnorm 0.235 | train_wall 339 | gb_free 12.5 | wall 30280
|
| 584 |
+
epoch 021 | loss 0.006 | ppl 1 | wps 6330.8 | ups 3.22 | wpb 1963.6 | bsz 95 | num_updates 100000 | lr 0.0001 | gnorm 0.235 | train_wall 339 | gb_free 12.5 | wall 30280
|
| 585 |
+
epoch 021 | loss 0.006 | ppl 1 | wps 6330.8 | ups 3.22 | wpb 1963.6 | bsz 95 | num_updates 100000 | lr 0.0001 | gnorm 0.235 | train_wall 339 | gb_free 12.5 | wall 30280
|
| 586 |
+
epoch 021 | loss 0.006 | ppl 1 | wps 6330.8 | ups 3.22 | wpb 1963.6 | bsz 95 | num_updates 100000 | lr 0.0001 | gnorm 0.235 | train_wall 339 | gb_free 12.5 | wall 30280
|
| 587 |
+
epoch 021 | loss 0.006 | ppl 1 | wps 6330.8 | ups 3.22 | wpb 1963.6 | bsz 95 | num_updates 100000 | lr 0.0001 | gnorm 0.235 | train_wall 339 | gb_free 12.5 | wall 30280
|
| 588 |
+
epoch 021 | loss 0.006 | ppl 1 | wps 6330.8 | ups 3.22 | wpb 1963.6 | bsz 95 | num_updates 100000 | lr 0.0001 | gnorm 0.235 | train_wall 339 | gb_free 12.5 | wall 30280
|
| 589 |
+
epoch 021 | loss 0.006 | ppl 1 | wps 6330.8 | ups 3.22 | wpb 1963.6 | bsz 95 | num_updates 100000 | lr 0.0001 | gnorm 0.235 | train_wall 339 | gb_free 12.5 | wall 30280
|
| 590 |
+
done training in 30279.7 seconds
|
ablations/postfix/trained_arm32_best_postfix/beam1.zip
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:65006183aa175b82c3d02c31a6c5ae1b5990df997a5661640bf78cc42aa803c4
|
| 3 |
+
size 2223373
|
ablations/postfix/trained_arm32_best_postfix/checkpoint_best.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2b72d91990f0117dfdb5d61cceff6c279405e56ee786115a656584cdd9f0bae2
|
| 3 |
+
size 124676620
|
ablations/postfix/trained_arm32_best_postfix/training.log
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
ablations/postfix/trained_x64_best_postfix/beam1.zip
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1ef2c6fb16c63beccb0fda4a30a7b71ad213945996583cdfa7998e66e734bd67
|
| 3 |
+
size 2732416
|
ablations/postfix/trained_x64_best_postfix/checkpoint_best.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:038881962540bf4707fb4ac19c18e7e3669f986feb243cf53267063de088ebc5
|
| 3 |
+
size 186036468
|
ablations/postfix/trained_x64_best_postfix/training.log
ADDED
|
@@ -0,0 +1,385 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 100, 'log_format': None, 'log_file': 'x64/trained_x64_best_postfix/training.log', 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': None, 'wandb_project': None, 'azureml_logging': False, 'seed': 1, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': False, 'memory_efficient_fp16': False, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 128, 'fp16_scale_window': None, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': None, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': None, 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quantization_config_path': None, 'profile': False, 'reset_logging': False, 'suppress_crashes': False, 'use_plasma_view': False, 'plasma_path': '/tmp/plasma'}, 'common_eval': {'_name': None, 'path': None, 'post_process': None, 'quiet': False, 'model_overrides': '{}', 'results_path': None}, 'distributed_training': {'_name': None, 'distributed_world_size': 1, 'distributed_num_procs': 1, 'distributed_rank': 0, 'distributed_backend': 'nccl', 'distributed_init_method': None, 'distributed_port': -1, 'device_id': 0, 'distributed_no_spawn': False, 'ddp_backend': 'pytorch_ddp', 'ddp_comm_hook': 'none', 'bucket_cap_mb': 25, 'fix_batches_to_gpus': False, 'find_unused_parameters': False, 'gradient_as_bucket_view': False, 'fast_stat_sync': False, 'heartbeat_timeout': -1, 'broadcast_buffers': False, 'slowmo_momentum': None, 'slowmo_base_algorithm': 'localsgd', 'localsgd_frequency': 3, 'nprocs_per_node': 1, 'pipeline_model_parallel': False, 'pipeline_balance': None, 'pipeline_devices': None, 'pipeline_chunks': 0, 'pipeline_encoder_balance': None, 'pipeline_encoder_devices': None, 'pipeline_decoder_balance': None, 'pipeline_decoder_devices': None, 'pipeline_checkpoint': 'never', 'zero_sharding': 'none', 'fp16': False, 'memory_efficient_fp16': False, 'tpu': False, 'no_reshard_after_forward': False, 'fp32_reduce_scatter': False, 'cpu_offload': False, 'use_sharded_state': False, 'not_fsdp_flatten_parameters': False}, 'dataset': {'_name': None, 'num_workers': 1, 'skip_invalid_size_inputs_valid_test': False, 'max_tokens': 20000, 'batch_size': None, 'required_batch_size_multiple': 8, 'required_seq_len_multiple': 1, 'dataset_impl': None, 'data_buffer_size': 10, 'train_subset': 'train', 'valid_subset': 'valid', 'combine_valid_subsets': None, 'ignore_unused_valid_subsets': False, 'validate_interval': 1, 'validate_interval_updates': 0, 'validate_after_updates': 0, 'fixed_validation_seed': None, 'disable_validation': False, 'max_tokens_valid': 20000, 'batch_size_valid': None, 'max_valid_steps': None, 'curriculum': 0, 'gen_subset': 'test', 'num_shards': 1, 'shard_id': 0, 'grouped_shuffling': False, 'update_epoch_batch_itr': False, 'update_ordered_indices_seed': False}, 'optimization': {'_name': None, 'max_epoch': 0, 'max_update': 100000, 'stop_time_hours': 0.0, 'clip_norm': 0.0, 'sentence_avg': False, 'update_freq': [1], 'lr': [0.0005], 'stop_min_lr': -1.0, 'use_bmuf': False, 'skip_remainder_batch': False}, 'checkpoint': {'_name': None, 'save_dir': 'x64/trained_x64_best_postfix', 'restore_file': 'checkpoint_last.pt', 'continue_once': None, 'finetune_from_model': None, 'reset_dataloader': False, 'reset_lr_scheduler': False, 'reset_meters': False, 'reset_optimizer': False, 'optimizer_overrides': '{}', 'save_interval': 1, 'save_interval_updates': 0, 'keep_interval_updates': -1, 'keep_interval_updates_pattern': -1, 'keep_last_epochs': -1, 'keep_best_checkpoints': 3, 'no_save': False, 'no_epoch_checkpoints': True, 'no_last_checkpoints': False, 'no_save_optimizer_state': False, 'best_checkpoint_metric': 'loss', 'maximize_best_checkpoint_metric': False, 'patience': 4, 'checkpoint_suffix': '', 'checkpoint_shard_count': 1, 'load_checkpoint_on_all_dp_ranks': False, 'write_checkpoints_asynchronously': False, 'model_parallel_size': 1}, 'bmuf': {'_name': None, 'block_lr': 1.0, 'block_momentum': 0.875, 'global_sync_iter': 50, 'warmup_iterations': 500, 'use_nbm': False, 'average_sync': False, 'distributed_world_size': 1}, 'generation': {'_name': None, 'beam': 5, 'nbest': 1, 'max_len_a': 0.0, 'max_len_b': 200, 'min_len': 1, 'match_source_len': False, 'unnormalized': False, 'no_early_stop': False, 'no_beamable_mm': False, 'lenpen': 1.0, 'unkpen': 0.0, 'replace_unk': None, 'sacrebleu': False, 'score_reference': False, 'prefix_size': 0, 'no_repeat_ngram_size': 0, 'sampling': False, 'sampling_topk': -1, 'sampling_topp': -1.0, 'constraints': None, 'temperature': 1.0, 'diverse_beam_groups': -1, 'diverse_beam_strength': 0.5, 'diversity_rate': -1.0, 'print_alignment': None, 'print_step': False, 'lm_path': None, 'lm_weight': 0.0, 'iter_decode_eos_penalty': 0.0, 'iter_decode_max_iter': 10, 'iter_decode_force_max_iter': False, 'iter_decode_with_beam': 1, 'iter_decode_with_external_reranker': False, 'retain_iter_history': False, 'retain_dropout': False, 'retain_dropout_modules': None, 'decoding_format': None, 'no_seed_provided': False, 'eos_token': None}, 'eval_lm': {'_name': None, 'output_word_probs': False, 'output_word_stats': False, 'context_window': 0, 'softmax_batch': 9223372036854775807}, 'interactive': {'_name': None, 'buffer_size': 0, 'input': '-'}, 'model': Namespace(_name='transformer', activation_dropout=0.0, activation_fn='relu', adam_betas=(0.9, 0.999), adam_eps=1e-08, adaptive_input=False, adaptive_softmax_cutoff=None, adaptive_softmax_dropout=0, aim_repo=None, aim_run_hash=None, all_gather_list_size=16384, amp=False, amp_batch_retries=2, amp_init_scale=128, amp_scale_window=None, arch='transformer', attention_dropout=0.0, azureml_logging=False, batch_size=None, batch_size_valid=None, best_checkpoint_metric='loss', bf16=False, bpe=None, broadcast_buffers=False, bucket_cap_mb=25, checkpoint_activations=False, checkpoint_shard_count=1, checkpoint_suffix='', clip_norm=0.0, combine_valid_subsets=None, continue_once=None, cpu=False, cpu_offload=False, criterion='cross_entropy', cross_self_attention=False, curriculum=0, data='x64/tokenized_dlsm_x64_postfix', data_buffer_size=10, dataset_impl=None, ddp_backend='pytorch_ddp', ddp_comm_hook='none', decoder_attention_heads=8, decoder_embed_dim=96, decoder_embed_path=None, decoder_ffn_embed_dim=384, decoder_input_dim=96, decoder_layerdrop=0, decoder_layers=6, decoder_layers_to_keep=None, decoder_learned_pos=False, decoder_normalize_before=False, decoder_output_dim='96', device_id=0, disable_validation=False, distributed_backend='nccl', distributed_init_method=None, distributed_no_spawn=False, distributed_num_procs=1, distributed_port=-1, distributed_rank=0, distributed_world_size=1, dropout=0.05, ema_decay=0.9999, ema_fp32=False, ema_seed_model=None, ema_start_update=0, ema_update_freq=1, empty_cache_freq=0, encoder_attention_heads=8, encoder_embed_dim=384, encoder_embed_path=None, encoder_ffn_embed_dim=1536, encoder_layerdrop=0, encoder_layers=6, encoder_layers_to_keep=None, encoder_learned_pos=False, encoder_normalize_before=False, eos=2, eval_bleu=False, eval_bleu_args='{}', eval_bleu_detok='space', eval_bleu_detok_args='{}', eval_bleu_print_samples=False, eval_bleu_remove_bpe=None, eval_tokenized_bleu=False, fast_stat_sync=False, find_unused_parameters=False, finetune_from_model=None, fix_batches_to_gpus=False, fixed_validation_seed=None, fp16=False, fp16_adam_stats=False, fp16_init_scale=128, fp16_no_flatten_grads=False, fp16_scale_tolerance=0.0, fp16_scale_window=None, fp32_reduce_scatter=False, gen_subset='test', gradient_as_bucket_view=False, grouped_shuffling=False, heartbeat_timeout=-1, ignore_unused_valid_subsets=False, keep_best_checkpoints=3, keep_interval_updates=-1, keep_interval_updates_pattern=-1, keep_last_epochs=-1, layernorm_embedding=False, left_pad_source=True, left_pad_target=False, load_alignments=False, load_checkpoint_on_all_dp_ranks=False, localsgd_frequency=3, log_file='x64/trained_x64_best_postfix/training.log', log_format=None, log_interval=100, lr=[0.0005], lr_scheduler='inverse_sqrt', max_epoch=0, max_source_positions=1024, max_target_positions=1024, max_tokens=20000, max_tokens_valid=20000, max_update=100000, max_valid_steps=None, maximize_best_checkpoint_metric=False, memory_efficient_bf16=False, memory_efficient_fp16=False, min_loss_scale=0.0001, model_parallel_size=1, no_cross_attention=False, no_epoch_checkpoints=True, no_last_checkpoints=False, no_progress_bar=False, no_reshard_after_forward=False, no_save=False, no_save_optimizer_state=False, no_scale_embedding=False, no_seed_provided=False, no_token_positional_embeddings=False, not_fsdp_flatten_parameters=False, nprocs_per_node=1, num_batch_buckets=0, num_shards=1, num_workers=1, offload_activations=False, on_cpu_convert_precision=False, optimizer='adam', optimizer_overrides='{}', pad=1, patience=4, pipeline_balance=None, pipeline_checkpoint='never', pipeline_chunks=0, pipeline_decoder_balance=None, pipeline_decoder_devices=None, pipeline_devices=None, pipeline_encoder_balance=None, pipeline_encoder_devices=None, pipeline_model_parallel=False, plasma_path='/tmp/plasma', profile=False, quant_noise_pq=0, quant_noise_pq_block_size=8, quant_noise_scalar=0, quantization_config_path=None, required_batch_size_multiple=8, required_seq_len_multiple=1, reset_dataloader=False, reset_logging=False, reset_lr_scheduler=False, reset_meters=False, reset_optimizer=False, restore_file='checkpoint_last.pt', save_dir='x64/trained_x64_best_postfix', save_interval=1, save_interval_updates=0, scoring='bleu', seed=1, sentence_avg=False, shard_id=0, share_all_embeddings=False, share_decoder_input_output_embed=False, skip_invalid_size_inputs_valid_test=False, skip_remainder_batch=False, slowmo_base_algorithm='localsgd', slowmo_momentum=None, source_lang=None, stop_min_lr=-1.0, stop_time_hours=0, store_ema=False, suppress_crashes=False, target_lang=None, task='translation', tensorboard_logdir=None, threshold_loss_scale=None, tie_adaptive_weights=False, tokenizer=None, tpu=False, train_subset='train', truncate_source=False, unk=3, update_epoch_batch_itr=False, update_freq=[1], update_ordered_indices_seed=False, upsample_primary=-1, use_bmuf=False, use_old_adam=False, use_plasma_view=False, use_sharded_state=False, user_dir=None, valid_subset='valid', validate_after_updates=0, validate_interval=1, validate_interval_updates=0, wandb_project=None, warmup_init_lr=-1, warmup_updates=4000, weight_decay=0.005, write_checkpoints_asynchronously=False, zero_sharding='none'), 'task': {'_name': 'translation', 'data': 'x64/tokenized_dlsm_x64_postfix', 'source_lang': None, 'target_lang': None, 'load_alignments': False, 'left_pad_source': True, 'left_pad_target': False, 'max_source_positions': 1024, 'max_target_positions': 1024, 'upsample_primary': -1, 'truncate_source': False, 'num_batch_buckets': 0, 'train_subset': 'train', 'dataset_impl': None, 'required_seq_len_multiple': 1, 'eval_bleu': False, 'eval_bleu_args': '{}', 'eval_bleu_detok': 'space', 'eval_bleu_detok_args': '{}', 'eval_tokenized_bleu': False, 'eval_bleu_remove_bpe': None, 'eval_bleu_print_samples': False}, 'criterion': {'_name': 'cross_entropy', 'sentence_avg': False}, 'optimizer': {'_name': 'adam', 'adam_betas': [0.9, 0.999], 'adam_eps': 1e-08, 'weight_decay': 0.005, 'use_old_adam': False, 'fp16_adam_stats': False, 'tpu': False, 'lr': [0.0005]}, 'lr_scheduler': {'_name': 'inverse_sqrt', 'warmup_updates': 4000, 'warmup_init_lr': -1.0, 'lr': [0.0005]}, 'scoring': {'_name': 'bleu', 'pad': 1, 'eos': 2, 'unk': 3}, 'bpe': None, 'tokenizer': None, 'ema': {'_name': None, 'store_ema': False, 'ema_decay': 0.9999, 'ema_start_update': 0, 'ema_seed_model': None, 'ema_update_freq': 1, 'ema_fp32': False}}
|
| 2 |
+
TransformerModel(
|
| 3 |
+
(encoder): TransformerEncoderBase(
|
| 4 |
+
(dropout_module): FairseqDropout()
|
| 5 |
+
(embed_tokens): Embedding(136, 384, padding_idx=1)
|
| 6 |
+
(embed_positions): SinusoidalPositionalEmbedding()
|
| 7 |
+
(layers): ModuleList(
|
| 8 |
+
(0-5): 6 x TransformerEncoderLayerBase(
|
| 9 |
+
(self_attn): MultiheadAttention(
|
| 10 |
+
(dropout_module): FairseqDropout()
|
| 11 |
+
(k_proj): Linear(in_features=384, out_features=384, bias=True)
|
| 12 |
+
(v_proj): Linear(in_features=384, out_features=384, bias=True)
|
| 13 |
+
(q_proj): Linear(in_features=384, out_features=384, bias=True)
|
| 14 |
+
(out_proj): Linear(in_features=384, out_features=384, bias=True)
|
| 15 |
+
)
|
| 16 |
+
(self_attn_layer_norm): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
|
| 17 |
+
(dropout_module): FairseqDropout()
|
| 18 |
+
(activation_dropout_module): FairseqDropout()
|
| 19 |
+
(fc1): Linear(in_features=384, out_features=1536, bias=True)
|
| 20 |
+
(fc2): Linear(in_features=1536, out_features=384, bias=True)
|
| 21 |
+
(final_layer_norm): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
|
| 22 |
+
)
|
| 23 |
+
)
|
| 24 |
+
)
|
| 25 |
+
(decoder): TransformerDecoderBase(
|
| 26 |
+
(dropout_module): FairseqDropout()
|
| 27 |
+
(embed_tokens): Embedding(40, 96, padding_idx=1)
|
| 28 |
+
(embed_positions): SinusoidalPositionalEmbedding()
|
| 29 |
+
(layers): ModuleList(
|
| 30 |
+
(0-5): 6 x TransformerDecoderLayerBase(
|
| 31 |
+
(dropout_module): FairseqDropout()
|
| 32 |
+
(self_attn): MultiheadAttention(
|
| 33 |
+
(dropout_module): FairseqDropout()
|
| 34 |
+
(k_proj): Linear(in_features=96, out_features=96, bias=True)
|
| 35 |
+
(v_proj): Linear(in_features=96, out_features=96, bias=True)
|
| 36 |
+
(q_proj): Linear(in_features=96, out_features=96, bias=True)
|
| 37 |
+
(out_proj): Linear(in_features=96, out_features=96, bias=True)
|
| 38 |
+
)
|
| 39 |
+
(activation_dropout_module): FairseqDropout()
|
| 40 |
+
(self_attn_layer_norm): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
|
| 41 |
+
(encoder_attn): MultiheadAttention(
|
| 42 |
+
(dropout_module): FairseqDropout()
|
| 43 |
+
(k_proj): Linear(in_features=384, out_features=96, bias=True)
|
| 44 |
+
(v_proj): Linear(in_features=384, out_features=96, bias=True)
|
| 45 |
+
(q_proj): Linear(in_features=96, out_features=96, bias=True)
|
| 46 |
+
(out_proj): Linear(in_features=96, out_features=96, bias=True)
|
| 47 |
+
)
|
| 48 |
+
(encoder_attn_layer_norm): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
|
| 49 |
+
(fc1): Linear(in_features=96, out_features=384, bias=True)
|
| 50 |
+
(fc2): Linear(in_features=384, out_features=96, bias=True)
|
| 51 |
+
(final_layer_norm): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
|
| 52 |
+
)
|
| 53 |
+
)
|
| 54 |
+
(output_projection): Linear(in_features=96, out_features=40, bias=False)
|
| 55 |
+
)
|
| 56 |
+
)
|
| 57 |
+
task: TranslationTask
|
| 58 |
+
model: TransformerModel
|
| 59 |
+
criterion: CrossEntropyCriterion
|
| 60 |
+
num. shared model params: 22,571,712 (num. trained: 22,571,712)
|
| 61 |
+
num. expert model params: 0 (num. trained: 0)
|
| 62 |
+
training on 1 devices (GPUs/TPUs)
|
| 63 |
+
max tokens per device = 20000 and max sentences per device = None
|
| 64 |
+
Start iterating over samples
|
| 65 |
+
begin validation on "valid" subset
|
| 66 |
+
epoch 001 | valid on 'valid' subset | loss 0.843 | ppl 1.79 | wps 21756.6 | wpb 1954.1 | bsz 93.8 | num_updates 4852
|
| 67 |
+
end of epoch 1 (average epoch stats below)
|
| 68 |
+
epoch 001 | loss 1.658 | ppl 3.16 | wps 6582.3 | ups 3.32 | wpb 1984.3 | bsz 95.9 | num_updates 4852 | lr 0.000453983 | gnorm 2.832 | train_wall 1440 | gb_free 13 | wall 1463
|
| 69 |
+
Start iterating over samples
|
| 70 |
+
begin validation on "valid" subset
|
| 71 |
+
epoch 002 | valid on 'valid' subset | loss 0.391 | ppl 1.31 | wps 21776.9 | wpb 1954.1 | bsz 93.8 | num_updates 9704 | best_loss 0.391
|
| 72 |
+
epoch 002 | valid on 'valid' subset | loss 0.391 | ppl 1.31 | wps 21776.9 | wpb 1954.1 | bsz 93.8 | num_updates 9704 | best_loss 0.391
|
| 73 |
+
end of epoch 2 (average epoch stats below)
|
| 74 |
+
epoch 002 | loss 0.56 | ppl 1.47 | wps 6573.7 | ups 3.31 | wpb 1984.3 | bsz 95.9 | num_updates 9704 | lr 0.000321014 | gnorm 1.203 | train_wall 1441 | gb_free 12.8 | wall 2928
|
| 75 |
+
epoch 002 | loss 0.56 | ppl 1.47 | wps 6573.7 | ups 3.31 | wpb 1984.3 | bsz 95.9 | num_updates 9704 | lr 0.000321014 | gnorm 1.203 | train_wall 1441 | gb_free 12.8 | wall 2928
|
| 76 |
+
Start iterating over samples
|
| 77 |
+
begin validation on "valid" subset
|
| 78 |
+
epoch 003 | valid on 'valid' subset | loss 0.242 | ppl 1.18 | wps 21774.3 | wpb 1954.1 | bsz 93.8 | num_updates 14556 | best_loss 0.242
|
| 79 |
+
epoch 003 | valid on 'valid' subset | loss 0.242 | ppl 1.18 | wps 21774.3 | wpb 1954.1 | bsz 93.8 | num_updates 14556 | best_loss 0.242
|
| 80 |
+
epoch 003 | valid on 'valid' subset | loss 0.242 | ppl 1.18 | wps 21774.3 | wpb 1954.1 | bsz 93.8 | num_updates 14556 | best_loss 0.242
|
| 81 |
+
end of epoch 3 (average epoch stats below)
|
| 82 |
+
epoch 003 | loss 0.277 | ppl 1.21 | wps 6573.3 | ups 3.31 | wpb 1984.3 | bsz 95.9 | num_updates 14556 | lr 0.000262107 | gnorm 0.892 | train_wall 1441 | gb_free 12.9 | wall 4393
|
| 83 |
+
epoch 003 | loss 0.277 | ppl 1.21 | wps 6573.3 | ups 3.31 | wpb 1984.3 | bsz 95.9 | num_updates 14556 | lr 0.000262107 | gnorm 0.892 | train_wall 1441 | gb_free 12.9 | wall 4393
|
| 84 |
+
epoch 003 | loss 0.277 | ppl 1.21 | wps 6573.3 | ups 3.31 | wpb 1984.3 | bsz 95.9 | num_updates 14556 | lr 0.000262107 | gnorm 0.892 | train_wall 1441 | gb_free 12.9 | wall 4393
|
| 85 |
+
Start iterating over samples
|
| 86 |
+
begin validation on "valid" subset
|
| 87 |
+
epoch 004 | valid on 'valid' subset | loss 0.144 | ppl 1.11 | wps 21742.1 | wpb 1954.1 | bsz 93.8 | num_updates 19408 | best_loss 0.144
|
| 88 |
+
epoch 004 | valid on 'valid' subset | loss 0.144 | ppl 1.11 | wps 21742.1 | wpb 1954.1 | bsz 93.8 | num_updates 19408 | best_loss 0.144
|
| 89 |
+
epoch 004 | valid on 'valid' subset | loss 0.144 | ppl 1.11 | wps 21742.1 | wpb 1954.1 | bsz 93.8 | num_updates 19408 | best_loss 0.144
|
| 90 |
+
epoch 004 | valid on 'valid' subset | loss 0.144 | ppl 1.11 | wps 21742.1 | wpb 1954.1 | bsz 93.8 | num_updates 19408 | best_loss 0.144
|
| 91 |
+
end of epoch 4 (average epoch stats below)
|
| 92 |
+
epoch 004 | loss 0.151 | ppl 1.11 | wps 6570 | ups 3.31 | wpb 1984.3 | bsz 95.9 | num_updates 19408 | lr 0.000226992 | gnorm 0.728 | train_wall 1442 | gb_free 13 | wall 5858
|
| 93 |
+
epoch 004 | loss 0.151 | ppl 1.11 | wps 6570 | ups 3.31 | wpb 1984.3 | bsz 95.9 | num_updates 19408 | lr 0.000226992 | gnorm 0.728 | train_wall 1442 | gb_free 13 | wall 5858
|
| 94 |
+
epoch 004 | loss 0.151 | ppl 1.11 | wps 6570 | ups 3.31 | wpb 1984.3 | bsz 95.9 | num_updates 19408 | lr 0.000226992 | gnorm 0.728 | train_wall 1442 | gb_free 13 | wall 5858
|
| 95 |
+
epoch 004 | loss 0.151 | ppl 1.11 | wps 6570 | ups 3.31 | wpb 1984.3 | bsz 95.9 | num_updates 19408 | lr 0.000226992 | gnorm 0.728 | train_wall 1442 | gb_free 13 | wall 5858
|
| 96 |
+
Start iterating over samples
|
| 97 |
+
begin validation on "valid" subset
|
| 98 |
+
epoch 005 | valid on 'valid' subset | loss 0.11 | ppl 1.08 | wps 21789.1 | wpb 1954.1 | bsz 93.8 | num_updates 24260 | best_loss 0.11
|
| 99 |
+
epoch 005 | valid on 'valid' subset | loss 0.11 | ppl 1.08 | wps 21789.1 | wpb 1954.1 | bsz 93.8 | num_updates 24260 | best_loss 0.11
|
| 100 |
+
epoch 005 | valid on 'valid' subset | loss 0.11 | ppl 1.08 | wps 21789.1 | wpb 1954.1 | bsz 93.8 | num_updates 24260 | best_loss 0.11
|
| 101 |
+
epoch 005 | valid on 'valid' subset | loss 0.11 | ppl 1.08 | wps 21789.1 | wpb 1954.1 | bsz 93.8 | num_updates 24260 | best_loss 0.11
|
| 102 |
+
epoch 005 | valid on 'valid' subset | loss 0.11 | ppl 1.08 | wps 21789.1 | wpb 1954.1 | bsz 93.8 | num_updates 24260 | best_loss 0.11
|
| 103 |
+
end of epoch 5 (average epoch stats below)
|
| 104 |
+
epoch 005 | loss 0.087 | ppl 1.06 | wps 6567.9 | ups 3.31 | wpb 1984.3 | bsz 95.9 | num_updates 24260 | lr 0.000203027 | gnorm 0.626 | train_wall 1443 | gb_free 12.8 | wall 7324
|
| 105 |
+
epoch 005 | loss 0.087 | ppl 1.06 | wps 6567.9 | ups 3.31 | wpb 1984.3 | bsz 95.9 | num_updates 24260 | lr 0.000203027 | gnorm 0.626 | train_wall 1443 | gb_free 12.8 | wall 7324
|
| 106 |
+
epoch 005 | loss 0.087 | ppl 1.06 | wps 6567.9 | ups 3.31 | wpb 1984.3 | bsz 95.9 | num_updates 24260 | lr 0.000203027 | gnorm 0.626 | train_wall 1443 | gb_free 12.8 | wall 7324
|
| 107 |
+
epoch 005 | loss 0.087 | ppl 1.06 | wps 6567.9 | ups 3.31 | wpb 1984.3 | bsz 95.9 | num_updates 24260 | lr 0.000203027 | gnorm 0.626 | train_wall 1443 | gb_free 12.8 | wall 7324
|
| 108 |
+
epoch 005 | loss 0.087 | ppl 1.06 | wps 6567.9 | ups 3.31 | wpb 1984.3 | bsz 95.9 | num_updates 24260 | lr 0.000203027 | gnorm 0.626 | train_wall 1443 | gb_free 12.8 | wall 7324
|
| 109 |
+
Start iterating over samples
|
| 110 |
+
begin validation on "valid" subset
|
| 111 |
+
epoch 006 | valid on 'valid' subset | loss 0.104 | ppl 1.07 | wps 21782.7 | wpb 1954.1 | bsz 93.8 | num_updates 29112 | best_loss 0.104
|
| 112 |
+
epoch 006 | valid on 'valid' subset | loss 0.104 | ppl 1.07 | wps 21782.7 | wpb 1954.1 | bsz 93.8 | num_updates 29112 | best_loss 0.104
|
| 113 |
+
epoch 006 | valid on 'valid' subset | loss 0.104 | ppl 1.07 | wps 21782.7 | wpb 1954.1 | bsz 93.8 | num_updates 29112 | best_loss 0.104
|
| 114 |
+
epoch 006 | valid on 'valid' subset | loss 0.104 | ppl 1.07 | wps 21782.7 | wpb 1954.1 | bsz 93.8 | num_updates 29112 | best_loss 0.104
|
| 115 |
+
epoch 006 | valid on 'valid' subset | loss 0.104 | ppl 1.07 | wps 21782.7 | wpb 1954.1 | bsz 93.8 | num_updates 29112 | best_loss 0.104
|
| 116 |
+
epoch 006 | valid on 'valid' subset | loss 0.104 | ppl 1.07 | wps 21782.7 | wpb 1954.1 | bsz 93.8 | num_updates 29112 | best_loss 0.104
|
| 117 |
+
end of epoch 6 (average epoch stats below)
|
| 118 |
+
epoch 006 | loss 0.055 | ppl 1.04 | wps 6564.9 | ups 3.31 | wpb 1984.3 | bsz 95.9 | num_updates 29112 | lr 0.000185338 | gnorm 0.541 | train_wall 1443 | gb_free 13 | wall 8790
|
| 119 |
+
epoch 006 | loss 0.055 | ppl 1.04 | wps 6564.9 | ups 3.31 | wpb 1984.3 | bsz 95.9 | num_updates 29112 | lr 0.000185338 | gnorm 0.541 | train_wall 1443 | gb_free 13 | wall 8790
|
| 120 |
+
epoch 006 | loss 0.055 | ppl 1.04 | wps 6564.9 | ups 3.31 | wpb 1984.3 | bsz 95.9 | num_updates 29112 | lr 0.000185338 | gnorm 0.541 | train_wall 1443 | gb_free 13 | wall 8790
|
| 121 |
+
epoch 006 | loss 0.055 | ppl 1.04 | wps 6564.9 | ups 3.31 | wpb 1984.3 | bsz 95.9 | num_updates 29112 | lr 0.000185338 | gnorm 0.541 | train_wall 1443 | gb_free 13 | wall 8790
|
| 122 |
+
epoch 006 | loss 0.055 | ppl 1.04 | wps 6564.9 | ups 3.31 | wpb 1984.3 | bsz 95.9 | num_updates 29112 | lr 0.000185338 | gnorm 0.541 | train_wall 1443 | gb_free 13 | wall 8790
|
| 123 |
+
epoch 006 | loss 0.055 | ppl 1.04 | wps 6564.9 | ups 3.31 | wpb 1984.3 | bsz 95.9 | num_updates 29112 | lr 0.000185338 | gnorm 0.541 | train_wall 1443 | gb_free 13 | wall 8790
|
| 124 |
+
Start iterating over samples
|
| 125 |
+
begin validation on "valid" subset
|
| 126 |
+
epoch 007 | valid on 'valid' subset | loss 0.087 | ppl 1.06 | wps 21687 | wpb 1954.1 | bsz 93.8 | num_updates 33964 | best_loss 0.087
|
| 127 |
+
epoch 007 | valid on 'valid' subset | loss 0.087 | ppl 1.06 | wps 21687 | wpb 1954.1 | bsz 93.8 | num_updates 33964 | best_loss 0.087
|
| 128 |
+
epoch 007 | valid on 'valid' subset | loss 0.087 | ppl 1.06 | wps 21687 | wpb 1954.1 | bsz 93.8 | num_updates 33964 | best_loss 0.087
|
| 129 |
+
epoch 007 | valid on 'valid' subset | loss 0.087 | ppl 1.06 | wps 21687 | wpb 1954.1 | bsz 93.8 | num_updates 33964 | best_loss 0.087
|
| 130 |
+
epoch 007 | valid on 'valid' subset | loss 0.087 | ppl 1.06 | wps 21687 | wpb 1954.1 | bsz 93.8 | num_updates 33964 | best_loss 0.087
|
| 131 |
+
epoch 007 | valid on 'valid' subset | loss 0.087 | ppl 1.06 | wps 21687 | wpb 1954.1 | bsz 93.8 | num_updates 33964 | best_loss 0.087
|
| 132 |
+
epoch 007 | valid on 'valid' subset | loss 0.087 | ppl 1.06 | wps 21687 | wpb 1954.1 | bsz 93.8 | num_updates 33964 | best_loss 0.087
|
| 133 |
+
end of epoch 7 (average epoch stats below)
|
| 134 |
+
epoch 007 | loss 0.038 | ppl 1.03 | wps 6567.8 | ups 3.31 | wpb 1984.3 | bsz 95.9 | num_updates 33964 | lr 0.000171589 | gnorm 0.486 | train_wall 1442 | gb_free 13.3 | wall 10256
|
| 135 |
+
epoch 007 | loss 0.038 | ppl 1.03 | wps 6567.8 | ups 3.31 | wpb 1984.3 | bsz 95.9 | num_updates 33964 | lr 0.000171589 | gnorm 0.486 | train_wall 1442 | gb_free 13.3 | wall 10256
|
| 136 |
+
epoch 007 | loss 0.038 | ppl 1.03 | wps 6567.8 | ups 3.31 | wpb 1984.3 | bsz 95.9 | num_updates 33964 | lr 0.000171589 | gnorm 0.486 | train_wall 1442 | gb_free 13.3 | wall 10256
|
| 137 |
+
epoch 007 | loss 0.038 | ppl 1.03 | wps 6567.8 | ups 3.31 | wpb 1984.3 | bsz 95.9 | num_updates 33964 | lr 0.000171589 | gnorm 0.486 | train_wall 1442 | gb_free 13.3 | wall 10256
|
| 138 |
+
epoch 007 | loss 0.038 | ppl 1.03 | wps 6567.8 | ups 3.31 | wpb 1984.3 | bsz 95.9 | num_updates 33964 | lr 0.000171589 | gnorm 0.486 | train_wall 1442 | gb_free 13.3 | wall 10256
|
| 139 |
+
epoch 007 | loss 0.038 | ppl 1.03 | wps 6567.8 | ups 3.31 | wpb 1984.3 | bsz 95.9 | num_updates 33964 | lr 0.000171589 | gnorm 0.486 | train_wall 1442 | gb_free 13.3 | wall 10256
|
| 140 |
+
epoch 007 | loss 0.038 | ppl 1.03 | wps 6567.8 | ups 3.31 | wpb 1984.3 | bsz 95.9 | num_updates 33964 | lr 0.000171589 | gnorm 0.486 | train_wall 1442 | gb_free 13.3 | wall 10256
|
| 141 |
+
Start iterating over samples
|
| 142 |
+
begin validation on "valid" subset
|
| 143 |
+
epoch 008 | valid on 'valid' subset | loss 0.085 | ppl 1.06 | wps 21767.2 | wpb 1954.1 | bsz 93.8 | num_updates 38816 | best_loss 0.085
|
| 144 |
+
epoch 008 | valid on 'valid' subset | loss 0.085 | ppl 1.06 | wps 21767.2 | wpb 1954.1 | bsz 93.8 | num_updates 38816 | best_loss 0.085
|
| 145 |
+
epoch 008 | valid on 'valid' subset | loss 0.085 | ppl 1.06 | wps 21767.2 | wpb 1954.1 | bsz 93.8 | num_updates 38816 | best_loss 0.085
|
| 146 |
+
epoch 008 | valid on 'valid' subset | loss 0.085 | ppl 1.06 | wps 21767.2 | wpb 1954.1 | bsz 93.8 | num_updates 38816 | best_loss 0.085
|
| 147 |
+
epoch 008 | valid on 'valid' subset | loss 0.085 | ppl 1.06 | wps 21767.2 | wpb 1954.1 | bsz 93.8 | num_updates 38816 | best_loss 0.085
|
| 148 |
+
epoch 008 | valid on 'valid' subset | loss 0.085 | ppl 1.06 | wps 21767.2 | wpb 1954.1 | bsz 93.8 | num_updates 38816 | best_loss 0.085
|
| 149 |
+
epoch 008 | valid on 'valid' subset | loss 0.085 | ppl 1.06 | wps 21767.2 | wpb 1954.1 | bsz 93.8 | num_updates 38816 | best_loss 0.085
|
| 150 |
+
epoch 008 | valid on 'valid' subset | loss 0.085 | ppl 1.06 | wps 21767.2 | wpb 1954.1 | bsz 93.8 | num_updates 38816 | best_loss 0.085
|
| 151 |
+
end of epoch 8 (average epoch stats below)
|
| 152 |
+
epoch 008 | loss 0.028 | ppl 1.02 | wps 6567.8 | ups 3.31 | wpb 1984.3 | bsz 95.9 | num_updates 38816 | lr 0.000160507 | gnorm 0.44 | train_wall 1443 | gb_free 12.8 | wall 11722
|
| 153 |
+
epoch 008 | loss 0.028 | ppl 1.02 | wps 6567.8 | ups 3.31 | wpb 1984.3 | bsz 95.9 | num_updates 38816 | lr 0.000160507 | gnorm 0.44 | train_wall 1443 | gb_free 12.8 | wall 11722
|
| 154 |
+
epoch 008 | loss 0.028 | ppl 1.02 | wps 6567.8 | ups 3.31 | wpb 1984.3 | bsz 95.9 | num_updates 38816 | lr 0.000160507 | gnorm 0.44 | train_wall 1443 | gb_free 12.8 | wall 11722
|
| 155 |
+
epoch 008 | loss 0.028 | ppl 1.02 | wps 6567.8 | ups 3.31 | wpb 1984.3 | bsz 95.9 | num_updates 38816 | lr 0.000160507 | gnorm 0.44 | train_wall 1443 | gb_free 12.8 | wall 11722
|
| 156 |
+
epoch 008 | loss 0.028 | ppl 1.02 | wps 6567.8 | ups 3.31 | wpb 1984.3 | bsz 95.9 | num_updates 38816 | lr 0.000160507 | gnorm 0.44 | train_wall 1443 | gb_free 12.8 | wall 11722
|
| 157 |
+
epoch 008 | loss 0.028 | ppl 1.02 | wps 6567.8 | ups 3.31 | wpb 1984.3 | bsz 95.9 | num_updates 38816 | lr 0.000160507 | gnorm 0.44 | train_wall 1443 | gb_free 12.8 | wall 11722
|
| 158 |
+
epoch 008 | loss 0.028 | ppl 1.02 | wps 6567.8 | ups 3.31 | wpb 1984.3 | bsz 95.9 | num_updates 38816 | lr 0.000160507 | gnorm 0.44 | train_wall 1443 | gb_free 12.8 | wall 11722
|
| 159 |
+
epoch 008 | loss 0.028 | ppl 1.02 | wps 6567.8 | ups 3.31 | wpb 1984.3 | bsz 95.9 | num_updates 38816 | lr 0.000160507 | gnorm 0.44 | train_wall 1443 | gb_free 12.8 | wall 11722
|
| 160 |
+
Start iterating over samples
|
| 161 |
+
begin validation on "valid" subset
|
| 162 |
+
epoch 009 | valid on 'valid' subset | loss 0.083 | ppl 1.06 | wps 21736.2 | wpb 1954.1 | bsz 93.8 | num_updates 43668 | best_loss 0.083
|
| 163 |
+
epoch 009 | valid on 'valid' subset | loss 0.083 | ppl 1.06 | wps 21736.2 | wpb 1954.1 | bsz 93.8 | num_updates 43668 | best_loss 0.083
|
| 164 |
+
epoch 009 | valid on 'valid' subset | loss 0.083 | ppl 1.06 | wps 21736.2 | wpb 1954.1 | bsz 93.8 | num_updates 43668 | best_loss 0.083
|
| 165 |
+
epoch 009 | valid on 'valid' subset | loss 0.083 | ppl 1.06 | wps 21736.2 | wpb 1954.1 | bsz 93.8 | num_updates 43668 | best_loss 0.083
|
| 166 |
+
epoch 009 | valid on 'valid' subset | loss 0.083 | ppl 1.06 | wps 21736.2 | wpb 1954.1 | bsz 93.8 | num_updates 43668 | best_loss 0.083
|
| 167 |
+
epoch 009 | valid on 'valid' subset | loss 0.083 | ppl 1.06 | wps 21736.2 | wpb 1954.1 | bsz 93.8 | num_updates 43668 | best_loss 0.083
|
| 168 |
+
epoch 009 | valid on 'valid' subset | loss 0.083 | ppl 1.06 | wps 21736.2 | wpb 1954.1 | bsz 93.8 | num_updates 43668 | best_loss 0.083
|
| 169 |
+
epoch 009 | valid on 'valid' subset | loss 0.083 | ppl 1.06 | wps 21736.2 | wpb 1954.1 | bsz 93.8 | num_updates 43668 | best_loss 0.083
|
| 170 |
+
epoch 009 | valid on 'valid' subset | loss 0.083 | ppl 1.06 | wps 21736.2 | wpb 1954.1 | bsz 93.8 | num_updates 43668 | best_loss 0.083
|
| 171 |
+
end of epoch 9 (average epoch stats below)
|
| 172 |
+
epoch 009 | loss 0.022 | ppl 1.02 | wps 6564.3 | ups 3.31 | wpb 1984.3 | bsz 95.9 | num_updates 43668 | lr 0.000151328 | gnorm 0.401 | train_wall 1443 | gb_free 12.9 | wall 13189
|
| 173 |
+
epoch 009 | loss 0.022 | ppl 1.02 | wps 6564.3 | ups 3.31 | wpb 1984.3 | bsz 95.9 | num_updates 43668 | lr 0.000151328 | gnorm 0.401 | train_wall 1443 | gb_free 12.9 | wall 13189
|
| 174 |
+
epoch 009 | loss 0.022 | ppl 1.02 | wps 6564.3 | ups 3.31 | wpb 1984.3 | bsz 95.9 | num_updates 43668 | lr 0.000151328 | gnorm 0.401 | train_wall 1443 | gb_free 12.9 | wall 13189
|
| 175 |
+
epoch 009 | loss 0.022 | ppl 1.02 | wps 6564.3 | ups 3.31 | wpb 1984.3 | bsz 95.9 | num_updates 43668 | lr 0.000151328 | gnorm 0.401 | train_wall 1443 | gb_free 12.9 | wall 13189
|
| 176 |
+
epoch 009 | loss 0.022 | ppl 1.02 | wps 6564.3 | ups 3.31 | wpb 1984.3 | bsz 95.9 | num_updates 43668 | lr 0.000151328 | gnorm 0.401 | train_wall 1443 | gb_free 12.9 | wall 13189
|
| 177 |
+
epoch 009 | loss 0.022 | ppl 1.02 | wps 6564.3 | ups 3.31 | wpb 1984.3 | bsz 95.9 | num_updates 43668 | lr 0.000151328 | gnorm 0.401 | train_wall 1443 | gb_free 12.9 | wall 13189
|
| 178 |
+
epoch 009 | loss 0.022 | ppl 1.02 | wps 6564.3 | ups 3.31 | wpb 1984.3 | bsz 95.9 | num_updates 43668 | lr 0.000151328 | gnorm 0.401 | train_wall 1443 | gb_free 12.9 | wall 13189
|
| 179 |
+
epoch 009 | loss 0.022 | ppl 1.02 | wps 6564.3 | ups 3.31 | wpb 1984.3 | bsz 95.9 | num_updates 43668 | lr 0.000151328 | gnorm 0.401 | train_wall 1443 | gb_free 12.9 | wall 13189
|
| 180 |
+
epoch 009 | loss 0.022 | ppl 1.02 | wps 6564.3 | ups 3.31 | wpb 1984.3 | bsz 95.9 | num_updates 43668 | lr 0.000151328 | gnorm 0.401 | train_wall 1443 | gb_free 12.9 | wall 13189
|
| 181 |
+
Start iterating over samples
|
| 182 |
+
begin validation on "valid" subset
|
| 183 |
+
epoch 010 | valid on 'valid' subset | loss 0.084 | ppl 1.06 | wps 21769.7 | wpb 1954.1 | bsz 93.8 | num_updates 48520 | best_loss 0.083
|
| 184 |
+
epoch 010 | valid on 'valid' subset | loss 0.084 | ppl 1.06 | wps 21769.7 | wpb 1954.1 | bsz 93.8 | num_updates 48520 | best_loss 0.083
|
| 185 |
+
epoch 010 | valid on 'valid' subset | loss 0.084 | ppl 1.06 | wps 21769.7 | wpb 1954.1 | bsz 93.8 | num_updates 48520 | best_loss 0.083
|
| 186 |
+
epoch 010 | valid on 'valid' subset | loss 0.084 | ppl 1.06 | wps 21769.7 | wpb 1954.1 | bsz 93.8 | num_updates 48520 | best_loss 0.083
|
| 187 |
+
epoch 010 | valid on 'valid' subset | loss 0.084 | ppl 1.06 | wps 21769.7 | wpb 1954.1 | bsz 93.8 | num_updates 48520 | best_loss 0.083
|
| 188 |
+
epoch 010 | valid on 'valid' subset | loss 0.084 | ppl 1.06 | wps 21769.7 | wpb 1954.1 | bsz 93.8 | num_updates 48520 | best_loss 0.083
|
| 189 |
+
epoch 010 | valid on 'valid' subset | loss 0.084 | ppl 1.06 | wps 21769.7 | wpb 1954.1 | bsz 93.8 | num_updates 48520 | best_loss 0.083
|
| 190 |
+
epoch 010 | valid on 'valid' subset | loss 0.084 | ppl 1.06 | wps 21769.7 | wpb 1954.1 | bsz 93.8 | num_updates 48520 | best_loss 0.083
|
| 191 |
+
epoch 010 | valid on 'valid' subset | loss 0.084 | ppl 1.06 | wps 21769.7 | wpb 1954.1 | bsz 93.8 | num_updates 48520 | best_loss 0.083
|
| 192 |
+
epoch 010 | valid on 'valid' subset | loss 0.084 | ppl 1.06 | wps 21769.7 | wpb 1954.1 | bsz 93.8 | num_updates 48520 | best_loss 0.083
|
| 193 |
+
end of epoch 10 (average epoch stats below)
|
| 194 |
+
epoch 010 | loss 0.018 | ppl 1.01 | wps 6566 | ups 3.31 | wpb 1984.3 | bsz 95.9 | num_updates 48520 | lr 0.000143562 | gnorm 0.377 | train_wall 1443 | gb_free 12.8 | wall 14655
|
| 195 |
+
epoch 010 | loss 0.018 | ppl 1.01 | wps 6566 | ups 3.31 | wpb 1984.3 | bsz 95.9 | num_updates 48520 | lr 0.000143562 | gnorm 0.377 | train_wall 1443 | gb_free 12.8 | wall 14655
|
| 196 |
+
epoch 010 | loss 0.018 | ppl 1.01 | wps 6566 | ups 3.31 | wpb 1984.3 | bsz 95.9 | num_updates 48520 | lr 0.000143562 | gnorm 0.377 | train_wall 1443 | gb_free 12.8 | wall 14655
|
| 197 |
+
epoch 010 | loss 0.018 | ppl 1.01 | wps 6566 | ups 3.31 | wpb 1984.3 | bsz 95.9 | num_updates 48520 | lr 0.000143562 | gnorm 0.377 | train_wall 1443 | gb_free 12.8 | wall 14655
|
| 198 |
+
epoch 010 | loss 0.018 | ppl 1.01 | wps 6566 | ups 3.31 | wpb 1984.3 | bsz 95.9 | num_updates 48520 | lr 0.000143562 | gnorm 0.377 | train_wall 1443 | gb_free 12.8 | wall 14655
|
| 199 |
+
epoch 010 | loss 0.018 | ppl 1.01 | wps 6566 | ups 3.31 | wpb 1984.3 | bsz 95.9 | num_updates 48520 | lr 0.000143562 | gnorm 0.377 | train_wall 1443 | gb_free 12.8 | wall 14655
|
| 200 |
+
epoch 010 | loss 0.018 | ppl 1.01 | wps 6566 | ups 3.31 | wpb 1984.3 | bsz 95.9 | num_updates 48520 | lr 0.000143562 | gnorm 0.377 | train_wall 1443 | gb_free 12.8 | wall 14655
|
| 201 |
+
epoch 010 | loss 0.018 | ppl 1.01 | wps 6566 | ups 3.31 | wpb 1984.3 | bsz 95.9 | num_updates 48520 | lr 0.000143562 | gnorm 0.377 | train_wall 1443 | gb_free 12.8 | wall 14655
|
| 202 |
+
epoch 010 | loss 0.018 | ppl 1.01 | wps 6566 | ups 3.31 | wpb 1984.3 | bsz 95.9 | num_updates 48520 | lr 0.000143562 | gnorm 0.377 | train_wall 1443 | gb_free 12.8 | wall 14655
|
| 203 |
+
epoch 010 | loss 0.018 | ppl 1.01 | wps 6566 | ups 3.31 | wpb 1984.3 | bsz 95.9 | num_updates 48520 | lr 0.000143562 | gnorm 0.377 | train_wall 1443 | gb_free 12.8 | wall 14655
|
| 204 |
+
Start iterating over samples
|
| 205 |
+
begin validation on "valid" subset
|
| 206 |
+
epoch 011 | valid on 'valid' subset | loss 0.078 | ppl 1.06 | wps 21426.4 | wpb 1954.1 | bsz 93.8 | num_updates 53372 | best_loss 0.078
|
| 207 |
+
epoch 011 | valid on 'valid' subset | loss 0.078 | ppl 1.06 | wps 21426.4 | wpb 1954.1 | bsz 93.8 | num_updates 53372 | best_loss 0.078
|
| 208 |
+
epoch 011 | valid on 'valid' subset | loss 0.078 | ppl 1.06 | wps 21426.4 | wpb 1954.1 | bsz 93.8 | num_updates 53372 | best_loss 0.078
|
| 209 |
+
epoch 011 | valid on 'valid' subset | loss 0.078 | ppl 1.06 | wps 21426.4 | wpb 1954.1 | bsz 93.8 | num_updates 53372 | best_loss 0.078
|
| 210 |
+
epoch 011 | valid on 'valid' subset | loss 0.078 | ppl 1.06 | wps 21426.4 | wpb 1954.1 | bsz 93.8 | num_updates 53372 | best_loss 0.078
|
| 211 |
+
epoch 011 | valid on 'valid' subset | loss 0.078 | ppl 1.06 | wps 21426.4 | wpb 1954.1 | bsz 93.8 | num_updates 53372 | best_loss 0.078
|
| 212 |
+
epoch 011 | valid on 'valid' subset | loss 0.078 | ppl 1.06 | wps 21426.4 | wpb 1954.1 | bsz 93.8 | num_updates 53372 | best_loss 0.078
|
| 213 |
+
epoch 011 | valid on 'valid' subset | loss 0.078 | ppl 1.06 | wps 21426.4 | wpb 1954.1 | bsz 93.8 | num_updates 53372 | best_loss 0.078
|
| 214 |
+
epoch 011 | valid on 'valid' subset | loss 0.078 | ppl 1.06 | wps 21426.4 | wpb 1954.1 | bsz 93.8 | num_updates 53372 | best_loss 0.078
|
| 215 |
+
epoch 011 | valid on 'valid' subset | loss 0.078 | ppl 1.06 | wps 21426.4 | wpb 1954.1 | bsz 93.8 | num_updates 53372 | best_loss 0.078
|
| 216 |
+
epoch 011 | valid on 'valid' subset | loss 0.078 | ppl 1.06 | wps 21426.4 | wpb 1954.1 | bsz 93.8 | num_updates 53372 | best_loss 0.078
|
| 217 |
+
end of epoch 11 (average epoch stats below)
|
| 218 |
+
epoch 011 | loss 0.015 | ppl 1.01 | wps 6564.3 | ups 3.31 | wpb 1984.3 | bsz 95.9 | num_updates 53372 | lr 0.000136881 | gnorm 0.347 | train_wall 1443 | gb_free 12.8 | wall 16122
|
| 219 |
+
epoch 011 | loss 0.015 | ppl 1.01 | wps 6564.3 | ups 3.31 | wpb 1984.3 | bsz 95.9 | num_updates 53372 | lr 0.000136881 | gnorm 0.347 | train_wall 1443 | gb_free 12.8 | wall 16122
|
| 220 |
+
epoch 011 | loss 0.015 | ppl 1.01 | wps 6564.3 | ups 3.31 | wpb 1984.3 | bsz 95.9 | num_updates 53372 | lr 0.000136881 | gnorm 0.347 | train_wall 1443 | gb_free 12.8 | wall 16122
|
| 221 |
+
epoch 011 | loss 0.015 | ppl 1.01 | wps 6564.3 | ups 3.31 | wpb 1984.3 | bsz 95.9 | num_updates 53372 | lr 0.000136881 | gnorm 0.347 | train_wall 1443 | gb_free 12.8 | wall 16122
|
| 222 |
+
epoch 011 | loss 0.015 | ppl 1.01 | wps 6564.3 | ups 3.31 | wpb 1984.3 | bsz 95.9 | num_updates 53372 | lr 0.000136881 | gnorm 0.347 | train_wall 1443 | gb_free 12.8 | wall 16122
|
| 223 |
+
epoch 011 | loss 0.015 | ppl 1.01 | wps 6564.3 | ups 3.31 | wpb 1984.3 | bsz 95.9 | num_updates 53372 | lr 0.000136881 | gnorm 0.347 | train_wall 1443 | gb_free 12.8 | wall 16122
|
| 224 |
+
epoch 011 | loss 0.015 | ppl 1.01 | wps 6564.3 | ups 3.31 | wpb 1984.3 | bsz 95.9 | num_updates 53372 | lr 0.000136881 | gnorm 0.347 | train_wall 1443 | gb_free 12.8 | wall 16122
|
| 225 |
+
epoch 011 | loss 0.015 | ppl 1.01 | wps 6564.3 | ups 3.31 | wpb 1984.3 | bsz 95.9 | num_updates 53372 | lr 0.000136881 | gnorm 0.347 | train_wall 1443 | gb_free 12.8 | wall 16122
|
| 226 |
+
epoch 011 | loss 0.015 | ppl 1.01 | wps 6564.3 | ups 3.31 | wpb 1984.3 | bsz 95.9 | num_updates 53372 | lr 0.000136881 | gnorm 0.347 | train_wall 1443 | gb_free 12.8 | wall 16122
|
| 227 |
+
epoch 011 | loss 0.015 | ppl 1.01 | wps 6564.3 | ups 3.31 | wpb 1984.3 | bsz 95.9 | num_updates 53372 | lr 0.000136881 | gnorm 0.347 | train_wall 1443 | gb_free 12.8 | wall 16122
|
| 228 |
+
epoch 011 | loss 0.015 | ppl 1.01 | wps 6564.3 | ups 3.31 | wpb 1984.3 | bsz 95.9 | num_updates 53372 | lr 0.000136881 | gnorm 0.347 | train_wall 1443 | gb_free 12.8 | wall 16122
|
| 229 |
+
Start iterating over samples
|
| 230 |
+
begin validation on "valid" subset
|
| 231 |
+
epoch 012 | valid on 'valid' subset | loss 0.072 | ppl 1.05 | wps 21430.6 | wpb 1954.1 | bsz 93.8 | num_updates 58224 | best_loss 0.072
|
| 232 |
+
epoch 012 | valid on 'valid' subset | loss 0.072 | ppl 1.05 | wps 21430.6 | wpb 1954.1 | bsz 93.8 | num_updates 58224 | best_loss 0.072
|
| 233 |
+
epoch 012 | valid on 'valid' subset | loss 0.072 | ppl 1.05 | wps 21430.6 | wpb 1954.1 | bsz 93.8 | num_updates 58224 | best_loss 0.072
|
| 234 |
+
epoch 012 | valid on 'valid' subset | loss 0.072 | ppl 1.05 | wps 21430.6 | wpb 1954.1 | bsz 93.8 | num_updates 58224 | best_loss 0.072
|
| 235 |
+
epoch 012 | valid on 'valid' subset | loss 0.072 | ppl 1.05 | wps 21430.6 | wpb 1954.1 | bsz 93.8 | num_updates 58224 | best_loss 0.072
|
| 236 |
+
epoch 012 | valid on 'valid' subset | loss 0.072 | ppl 1.05 | wps 21430.6 | wpb 1954.1 | bsz 93.8 | num_updates 58224 | best_loss 0.072
|
| 237 |
+
epoch 012 | valid on 'valid' subset | loss 0.072 | ppl 1.05 | wps 21430.6 | wpb 1954.1 | bsz 93.8 | num_updates 58224 | best_loss 0.072
|
| 238 |
+
epoch 012 | valid on 'valid' subset | loss 0.072 | ppl 1.05 | wps 21430.6 | wpb 1954.1 | bsz 93.8 | num_updates 58224 | best_loss 0.072
|
| 239 |
+
epoch 012 | valid on 'valid' subset | loss 0.072 | ppl 1.05 | wps 21430.6 | wpb 1954.1 | bsz 93.8 | num_updates 58224 | best_loss 0.072
|
| 240 |
+
epoch 012 | valid on 'valid' subset | loss 0.072 | ppl 1.05 | wps 21430.6 | wpb 1954.1 | bsz 93.8 | num_updates 58224 | best_loss 0.072
|
| 241 |
+
epoch 012 | valid on 'valid' subset | loss 0.072 | ppl 1.05 | wps 21430.6 | wpb 1954.1 | bsz 93.8 | num_updates 58224 | best_loss 0.072
|
| 242 |
+
epoch 012 | valid on 'valid' subset | loss 0.072 | ppl 1.05 | wps 21430.6 | wpb 1954.1 | bsz 93.8 | num_updates 58224 | best_loss 0.072
|
| 243 |
+
end of epoch 12 (average epoch stats below)
|
| 244 |
+
epoch 012 | loss 0.012 | ppl 1.01 | wps 6490.7 | ups 3.27 | wpb 1984.3 | bsz 95.9 | num_updates 58224 | lr 0.000131054 | gnorm 0.316 | train_wall 1457 | gb_free 12.7 | wall 17605
|
| 245 |
+
epoch 012 | loss 0.012 | ppl 1.01 | wps 6490.7 | ups 3.27 | wpb 1984.3 | bsz 95.9 | num_updates 58224 | lr 0.000131054 | gnorm 0.316 | train_wall 1457 | gb_free 12.7 | wall 17605
|
| 246 |
+
epoch 012 | loss 0.012 | ppl 1.01 | wps 6490.7 | ups 3.27 | wpb 1984.3 | bsz 95.9 | num_updates 58224 | lr 0.000131054 | gnorm 0.316 | train_wall 1457 | gb_free 12.7 | wall 17605
|
| 247 |
+
epoch 012 | loss 0.012 | ppl 1.01 | wps 6490.7 | ups 3.27 | wpb 1984.3 | bsz 95.9 | num_updates 58224 | lr 0.000131054 | gnorm 0.316 | train_wall 1457 | gb_free 12.7 | wall 17605
|
| 248 |
+
epoch 012 | loss 0.012 | ppl 1.01 | wps 6490.7 | ups 3.27 | wpb 1984.3 | bsz 95.9 | num_updates 58224 | lr 0.000131054 | gnorm 0.316 | train_wall 1457 | gb_free 12.7 | wall 17605
|
| 249 |
+
epoch 012 | loss 0.012 | ppl 1.01 | wps 6490.7 | ups 3.27 | wpb 1984.3 | bsz 95.9 | num_updates 58224 | lr 0.000131054 | gnorm 0.316 | train_wall 1457 | gb_free 12.7 | wall 17605
|
| 250 |
+
epoch 012 | loss 0.012 | ppl 1.01 | wps 6490.7 | ups 3.27 | wpb 1984.3 | bsz 95.9 | num_updates 58224 | lr 0.000131054 | gnorm 0.316 | train_wall 1457 | gb_free 12.7 | wall 17605
|
| 251 |
+
epoch 012 | loss 0.012 | ppl 1.01 | wps 6490.7 | ups 3.27 | wpb 1984.3 | bsz 95.9 | num_updates 58224 | lr 0.000131054 | gnorm 0.316 | train_wall 1457 | gb_free 12.7 | wall 17605
|
| 252 |
+
epoch 012 | loss 0.012 | ppl 1.01 | wps 6490.7 | ups 3.27 | wpb 1984.3 | bsz 95.9 | num_updates 58224 | lr 0.000131054 | gnorm 0.316 | train_wall 1457 | gb_free 12.7 | wall 17605
|
| 253 |
+
epoch 012 | loss 0.012 | ppl 1.01 | wps 6490.7 | ups 3.27 | wpb 1984.3 | bsz 95.9 | num_updates 58224 | lr 0.000131054 | gnorm 0.316 | train_wall 1457 | gb_free 12.7 | wall 17605
|
| 254 |
+
epoch 012 | loss 0.012 | ppl 1.01 | wps 6490.7 | ups 3.27 | wpb 1984.3 | bsz 95.9 | num_updates 58224 | lr 0.000131054 | gnorm 0.316 | train_wall 1457 | gb_free 12.7 | wall 17605
|
| 255 |
+
epoch 012 | loss 0.012 | ppl 1.01 | wps 6490.7 | ups 3.27 | wpb 1984.3 | bsz 95.9 | num_updates 58224 | lr 0.000131054 | gnorm 0.316 | train_wall 1457 | gb_free 12.7 | wall 17605
|
| 256 |
+
Start iterating over samples
|
| 257 |
+
begin validation on "valid" subset
|
| 258 |
+
epoch 013 | valid on 'valid' subset | loss 0.076 | ppl 1.05 | wps 21465.9 | wpb 1954.1 | bsz 93.8 | num_updates 63076 | best_loss 0.072
|
| 259 |
+
epoch 013 | valid on 'valid' subset | loss 0.076 | ppl 1.05 | wps 21465.9 | wpb 1954.1 | bsz 93.8 | num_updates 63076 | best_loss 0.072
|
| 260 |
+
epoch 013 | valid on 'valid' subset | loss 0.076 | ppl 1.05 | wps 21465.9 | wpb 1954.1 | bsz 93.8 | num_updates 63076 | best_loss 0.072
|
| 261 |
+
epoch 013 | valid on 'valid' subset | loss 0.076 | ppl 1.05 | wps 21465.9 | wpb 1954.1 | bsz 93.8 | num_updates 63076 | best_loss 0.072
|
| 262 |
+
epoch 013 | valid on 'valid' subset | loss 0.076 | ppl 1.05 | wps 21465.9 | wpb 1954.1 | bsz 93.8 | num_updates 63076 | best_loss 0.072
|
| 263 |
+
epoch 013 | valid on 'valid' subset | loss 0.076 | ppl 1.05 | wps 21465.9 | wpb 1954.1 | bsz 93.8 | num_updates 63076 | best_loss 0.072
|
| 264 |
+
epoch 013 | valid on 'valid' subset | loss 0.076 | ppl 1.05 | wps 21465.9 | wpb 1954.1 | bsz 93.8 | num_updates 63076 | best_loss 0.072
|
| 265 |
+
epoch 013 | valid on 'valid' subset | loss 0.076 | ppl 1.05 | wps 21465.9 | wpb 1954.1 | bsz 93.8 | num_updates 63076 | best_loss 0.072
|
| 266 |
+
epoch 013 | valid on 'valid' subset | loss 0.076 | ppl 1.05 | wps 21465.9 | wpb 1954.1 | bsz 93.8 | num_updates 63076 | best_loss 0.072
|
| 267 |
+
epoch 013 | valid on 'valid' subset | loss 0.076 | ppl 1.05 | wps 21465.9 | wpb 1954.1 | bsz 93.8 | num_updates 63076 | best_loss 0.072
|
| 268 |
+
epoch 013 | valid on 'valid' subset | loss 0.076 | ppl 1.05 | wps 21465.9 | wpb 1954.1 | bsz 93.8 | num_updates 63076 | best_loss 0.072
|
| 269 |
+
epoch 013 | valid on 'valid' subset | loss 0.076 | ppl 1.05 | wps 21465.9 | wpb 1954.1 | bsz 93.8 | num_updates 63076 | best_loss 0.072
|
| 270 |
+
epoch 013 | valid on 'valid' subset | loss 0.076 | ppl 1.05 | wps 21465.9 | wpb 1954.1 | bsz 93.8 | num_updates 63076 | best_loss 0.072
|
| 271 |
+
end of epoch 13 (average epoch stats below)
|
| 272 |
+
epoch 013 | loss 0.011 | ppl 1.01 | wps 6491.7 | ups 3.27 | wpb 1984.3 | bsz 95.9 | num_updates 63076 | lr 0.000125912 | gnorm 0.297 | train_wall 1457 | gb_free 12.9 | wall 19088
|
| 273 |
+
epoch 013 | loss 0.011 | ppl 1.01 | wps 6491.7 | ups 3.27 | wpb 1984.3 | bsz 95.9 | num_updates 63076 | lr 0.000125912 | gnorm 0.297 | train_wall 1457 | gb_free 12.9 | wall 19088
|
| 274 |
+
epoch 013 | loss 0.011 | ppl 1.01 | wps 6491.7 | ups 3.27 | wpb 1984.3 | bsz 95.9 | num_updates 63076 | lr 0.000125912 | gnorm 0.297 | train_wall 1457 | gb_free 12.9 | wall 19088
|
| 275 |
+
epoch 013 | loss 0.011 | ppl 1.01 | wps 6491.7 | ups 3.27 | wpb 1984.3 | bsz 95.9 | num_updates 63076 | lr 0.000125912 | gnorm 0.297 | train_wall 1457 | gb_free 12.9 | wall 19088
|
| 276 |
+
epoch 013 | loss 0.011 | ppl 1.01 | wps 6491.7 | ups 3.27 | wpb 1984.3 | bsz 95.9 | num_updates 63076 | lr 0.000125912 | gnorm 0.297 | train_wall 1457 | gb_free 12.9 | wall 19088
|
| 277 |
+
epoch 013 | loss 0.011 | ppl 1.01 | wps 6491.7 | ups 3.27 | wpb 1984.3 | bsz 95.9 | num_updates 63076 | lr 0.000125912 | gnorm 0.297 | train_wall 1457 | gb_free 12.9 | wall 19088
|
| 278 |
+
epoch 013 | loss 0.011 | ppl 1.01 | wps 6491.7 | ups 3.27 | wpb 1984.3 | bsz 95.9 | num_updates 63076 | lr 0.000125912 | gnorm 0.297 | train_wall 1457 | gb_free 12.9 | wall 19088
|
| 279 |
+
epoch 013 | loss 0.011 | ppl 1.01 | wps 6491.7 | ups 3.27 | wpb 1984.3 | bsz 95.9 | num_updates 63076 | lr 0.000125912 | gnorm 0.297 | train_wall 1457 | gb_free 12.9 | wall 19088
|
| 280 |
+
epoch 013 | loss 0.011 | ppl 1.01 | wps 6491.7 | ups 3.27 | wpb 1984.3 | bsz 95.9 | num_updates 63076 | lr 0.000125912 | gnorm 0.297 | train_wall 1457 | gb_free 12.9 | wall 19088
|
| 281 |
+
epoch 013 | loss 0.011 | ppl 1.01 | wps 6491.7 | ups 3.27 | wpb 1984.3 | bsz 95.9 | num_updates 63076 | lr 0.000125912 | gnorm 0.297 | train_wall 1457 | gb_free 12.9 | wall 19088
|
| 282 |
+
epoch 013 | loss 0.011 | ppl 1.01 | wps 6491.7 | ups 3.27 | wpb 1984.3 | bsz 95.9 | num_updates 63076 | lr 0.000125912 | gnorm 0.297 | train_wall 1457 | gb_free 12.9 | wall 19088
|
| 283 |
+
epoch 013 | loss 0.011 | ppl 1.01 | wps 6491.7 | ups 3.27 | wpb 1984.3 | bsz 95.9 | num_updates 63076 | lr 0.000125912 | gnorm 0.297 | train_wall 1457 | gb_free 12.9 | wall 19088
|
| 284 |
+
epoch 013 | loss 0.011 | ppl 1.01 | wps 6491.7 | ups 3.27 | wpb 1984.3 | bsz 95.9 | num_updates 63076 | lr 0.000125912 | gnorm 0.297 | train_wall 1457 | gb_free 12.9 | wall 19088
|
| 285 |
+
Start iterating over samples
|
| 286 |
+
begin validation on "valid" subset
|
| 287 |
+
epoch 014 | valid on 'valid' subset | loss 0.074 | ppl 1.05 | wps 21415.4 | wpb 1954.1 | bsz 93.8 | num_updates 67928 | best_loss 0.072
|
| 288 |
+
epoch 014 | valid on 'valid' subset | loss 0.074 | ppl 1.05 | wps 21415.4 | wpb 1954.1 | bsz 93.8 | num_updates 67928 | best_loss 0.072
|
| 289 |
+
epoch 014 | valid on 'valid' subset | loss 0.074 | ppl 1.05 | wps 21415.4 | wpb 1954.1 | bsz 93.8 | num_updates 67928 | best_loss 0.072
|
| 290 |
+
epoch 014 | valid on 'valid' subset | loss 0.074 | ppl 1.05 | wps 21415.4 | wpb 1954.1 | bsz 93.8 | num_updates 67928 | best_loss 0.072
|
| 291 |
+
epoch 014 | valid on 'valid' subset | loss 0.074 | ppl 1.05 | wps 21415.4 | wpb 1954.1 | bsz 93.8 | num_updates 67928 | best_loss 0.072
|
| 292 |
+
epoch 014 | valid on 'valid' subset | loss 0.074 | ppl 1.05 | wps 21415.4 | wpb 1954.1 | bsz 93.8 | num_updates 67928 | best_loss 0.072
|
| 293 |
+
epoch 014 | valid on 'valid' subset | loss 0.074 | ppl 1.05 | wps 21415.4 | wpb 1954.1 | bsz 93.8 | num_updates 67928 | best_loss 0.072
|
| 294 |
+
epoch 014 | valid on 'valid' subset | loss 0.074 | ppl 1.05 | wps 21415.4 | wpb 1954.1 | bsz 93.8 | num_updates 67928 | best_loss 0.072
|
| 295 |
+
epoch 014 | valid on 'valid' subset | loss 0.074 | ppl 1.05 | wps 21415.4 | wpb 1954.1 | bsz 93.8 | num_updates 67928 | best_loss 0.072
|
| 296 |
+
epoch 014 | valid on 'valid' subset | loss 0.074 | ppl 1.05 | wps 21415.4 | wpb 1954.1 | bsz 93.8 | num_updates 67928 | best_loss 0.072
|
| 297 |
+
epoch 014 | valid on 'valid' subset | loss 0.074 | ppl 1.05 | wps 21415.4 | wpb 1954.1 | bsz 93.8 | num_updates 67928 | best_loss 0.072
|
| 298 |
+
epoch 014 | valid on 'valid' subset | loss 0.074 | ppl 1.05 | wps 21415.4 | wpb 1954.1 | bsz 93.8 | num_updates 67928 | best_loss 0.072
|
| 299 |
+
epoch 014 | valid on 'valid' subset | loss 0.074 | ppl 1.05 | wps 21415.4 | wpb 1954.1 | bsz 93.8 | num_updates 67928 | best_loss 0.072
|
| 300 |
+
epoch 014 | valid on 'valid' subset | loss 0.074 | ppl 1.05 | wps 21415.4 | wpb 1954.1 | bsz 93.8 | num_updates 67928 | best_loss 0.072
|
| 301 |
+
end of epoch 14 (average epoch stats below)
|
| 302 |
+
epoch 014 | loss 0.01 | ppl 1.01 | wps 6482.8 | ups 3.27 | wpb 1984.3 | bsz 95.9 | num_updates 67928 | lr 0.000121332 | gnorm 0.282 | train_wall 1458 | gb_free 13.1 | wall 20573
|
| 303 |
+
epoch 014 | loss 0.01 | ppl 1.01 | wps 6482.8 | ups 3.27 | wpb 1984.3 | bsz 95.9 | num_updates 67928 | lr 0.000121332 | gnorm 0.282 | train_wall 1458 | gb_free 13.1 | wall 20573
|
| 304 |
+
epoch 014 | loss 0.01 | ppl 1.01 | wps 6482.8 | ups 3.27 | wpb 1984.3 | bsz 95.9 | num_updates 67928 | lr 0.000121332 | gnorm 0.282 | train_wall 1458 | gb_free 13.1 | wall 20573
|
| 305 |
+
epoch 014 | loss 0.01 | ppl 1.01 | wps 6482.8 | ups 3.27 | wpb 1984.3 | bsz 95.9 | num_updates 67928 | lr 0.000121332 | gnorm 0.282 | train_wall 1458 | gb_free 13.1 | wall 20573
|
| 306 |
+
epoch 014 | loss 0.01 | ppl 1.01 | wps 6482.8 | ups 3.27 | wpb 1984.3 | bsz 95.9 | num_updates 67928 | lr 0.000121332 | gnorm 0.282 | train_wall 1458 | gb_free 13.1 | wall 20573
|
| 307 |
+
epoch 014 | loss 0.01 | ppl 1.01 | wps 6482.8 | ups 3.27 | wpb 1984.3 | bsz 95.9 | num_updates 67928 | lr 0.000121332 | gnorm 0.282 | train_wall 1458 | gb_free 13.1 | wall 20573
|
| 308 |
+
epoch 014 | loss 0.01 | ppl 1.01 | wps 6482.8 | ups 3.27 | wpb 1984.3 | bsz 95.9 | num_updates 67928 | lr 0.000121332 | gnorm 0.282 | train_wall 1458 | gb_free 13.1 | wall 20573
|
| 309 |
+
epoch 014 | loss 0.01 | ppl 1.01 | wps 6482.8 | ups 3.27 | wpb 1984.3 | bsz 95.9 | num_updates 67928 | lr 0.000121332 | gnorm 0.282 | train_wall 1458 | gb_free 13.1 | wall 20573
|
| 310 |
+
epoch 014 | loss 0.01 | ppl 1.01 | wps 6482.8 | ups 3.27 | wpb 1984.3 | bsz 95.9 | num_updates 67928 | lr 0.000121332 | gnorm 0.282 | train_wall 1458 | gb_free 13.1 | wall 20573
|
| 311 |
+
epoch 014 | loss 0.01 | ppl 1.01 | wps 6482.8 | ups 3.27 | wpb 1984.3 | bsz 95.9 | num_updates 67928 | lr 0.000121332 | gnorm 0.282 | train_wall 1458 | gb_free 13.1 | wall 20573
|
| 312 |
+
epoch 014 | loss 0.01 | ppl 1.01 | wps 6482.8 | ups 3.27 | wpb 1984.3 | bsz 95.9 | num_updates 67928 | lr 0.000121332 | gnorm 0.282 | train_wall 1458 | gb_free 13.1 | wall 20573
|
| 313 |
+
epoch 014 | loss 0.01 | ppl 1.01 | wps 6482.8 | ups 3.27 | wpb 1984.3 | bsz 95.9 | num_updates 67928 | lr 0.000121332 | gnorm 0.282 | train_wall 1458 | gb_free 13.1 | wall 20573
|
| 314 |
+
epoch 014 | loss 0.01 | ppl 1.01 | wps 6482.8 | ups 3.27 | wpb 1984.3 | bsz 95.9 | num_updates 67928 | lr 0.000121332 | gnorm 0.282 | train_wall 1458 | gb_free 13.1 | wall 20573
|
| 315 |
+
epoch 014 | loss 0.01 | ppl 1.01 | wps 6482.8 | ups 3.27 | wpb 1984.3 | bsz 95.9 | num_updates 67928 | lr 0.000121332 | gnorm 0.282 | train_wall 1458 | gb_free 13.1 | wall 20573
|
| 316 |
+
Start iterating over samples
|
| 317 |
+
begin validation on "valid" subset
|
| 318 |
+
epoch 015 | valid on 'valid' subset | loss 0.075 | ppl 1.05 | wps 21711.6 | wpb 1954.1 | bsz 93.8 | num_updates 72780 | best_loss 0.072
|
| 319 |
+
epoch 015 | valid on 'valid' subset | loss 0.075 | ppl 1.05 | wps 21711.6 | wpb 1954.1 | bsz 93.8 | num_updates 72780 | best_loss 0.072
|
| 320 |
+
epoch 015 | valid on 'valid' subset | loss 0.075 | ppl 1.05 | wps 21711.6 | wpb 1954.1 | bsz 93.8 | num_updates 72780 | best_loss 0.072
|
| 321 |
+
epoch 015 | valid on 'valid' subset | loss 0.075 | ppl 1.05 | wps 21711.6 | wpb 1954.1 | bsz 93.8 | num_updates 72780 | best_loss 0.072
|
| 322 |
+
epoch 015 | valid on 'valid' subset | loss 0.075 | ppl 1.05 | wps 21711.6 | wpb 1954.1 | bsz 93.8 | num_updates 72780 | best_loss 0.072
|
| 323 |
+
epoch 015 | valid on 'valid' subset | loss 0.075 | ppl 1.05 | wps 21711.6 | wpb 1954.1 | bsz 93.8 | num_updates 72780 | best_loss 0.072
|
| 324 |
+
epoch 015 | valid on 'valid' subset | loss 0.075 | ppl 1.05 | wps 21711.6 | wpb 1954.1 | bsz 93.8 | num_updates 72780 | best_loss 0.072
|
| 325 |
+
epoch 015 | valid on 'valid' subset | loss 0.075 | ppl 1.05 | wps 21711.6 | wpb 1954.1 | bsz 93.8 | num_updates 72780 | best_loss 0.072
|
| 326 |
+
epoch 015 | valid on 'valid' subset | loss 0.075 | ppl 1.05 | wps 21711.6 | wpb 1954.1 | bsz 93.8 | num_updates 72780 | best_loss 0.072
|
| 327 |
+
epoch 015 | valid on 'valid' subset | loss 0.075 | ppl 1.05 | wps 21711.6 | wpb 1954.1 | bsz 93.8 | num_updates 72780 | best_loss 0.072
|
| 328 |
+
epoch 015 | valid on 'valid' subset | loss 0.075 | ppl 1.05 | wps 21711.6 | wpb 1954.1 | bsz 93.8 | num_updates 72780 | best_loss 0.072
|
| 329 |
+
epoch 015 | valid on 'valid' subset | loss 0.075 | ppl 1.05 | wps 21711.6 | wpb 1954.1 | bsz 93.8 | num_updates 72780 | best_loss 0.072
|
| 330 |
+
epoch 015 | valid on 'valid' subset | loss 0.075 | ppl 1.05 | wps 21711.6 | wpb 1954.1 | bsz 93.8 | num_updates 72780 | best_loss 0.072
|
| 331 |
+
epoch 015 | valid on 'valid' subset | loss 0.075 | ppl 1.05 | wps 21711.6 | wpb 1954.1 | bsz 93.8 | num_updates 72780 | best_loss 0.072
|
| 332 |
+
epoch 015 | valid on 'valid' subset | loss 0.075 | ppl 1.05 | wps 21711.6 | wpb 1954.1 | bsz 93.8 | num_updates 72780 | best_loss 0.072
|
| 333 |
+
end of epoch 15 (average epoch stats below)
|
| 334 |
+
epoch 015 | loss 0.008 | ppl 1.01 | wps 6543.9 | ups 3.3 | wpb 1984.3 | bsz 95.9 | num_updates 72780 | lr 0.000117218 | gnorm 0.27 | train_wall 1448 | gb_free 12.8 | wall 22044
|
| 335 |
+
epoch 015 | loss 0.008 | ppl 1.01 | wps 6543.9 | ups 3.3 | wpb 1984.3 | bsz 95.9 | num_updates 72780 | lr 0.000117218 | gnorm 0.27 | train_wall 1448 | gb_free 12.8 | wall 22044
|
| 336 |
+
epoch 015 | loss 0.008 | ppl 1.01 | wps 6543.9 | ups 3.3 | wpb 1984.3 | bsz 95.9 | num_updates 72780 | lr 0.000117218 | gnorm 0.27 | train_wall 1448 | gb_free 12.8 | wall 22044
|
| 337 |
+
epoch 015 | loss 0.008 | ppl 1.01 | wps 6543.9 | ups 3.3 | wpb 1984.3 | bsz 95.9 | num_updates 72780 | lr 0.000117218 | gnorm 0.27 | train_wall 1448 | gb_free 12.8 | wall 22044
|
| 338 |
+
epoch 015 | loss 0.008 | ppl 1.01 | wps 6543.9 | ups 3.3 | wpb 1984.3 | bsz 95.9 | num_updates 72780 | lr 0.000117218 | gnorm 0.27 | train_wall 1448 | gb_free 12.8 | wall 22044
|
| 339 |
+
epoch 015 | loss 0.008 | ppl 1.01 | wps 6543.9 | ups 3.3 | wpb 1984.3 | bsz 95.9 | num_updates 72780 | lr 0.000117218 | gnorm 0.27 | train_wall 1448 | gb_free 12.8 | wall 22044
|
| 340 |
+
epoch 015 | loss 0.008 | ppl 1.01 | wps 6543.9 | ups 3.3 | wpb 1984.3 | bsz 95.9 | num_updates 72780 | lr 0.000117218 | gnorm 0.27 | train_wall 1448 | gb_free 12.8 | wall 22044
|
| 341 |
+
epoch 015 | loss 0.008 | ppl 1.01 | wps 6543.9 | ups 3.3 | wpb 1984.3 | bsz 95.9 | num_updates 72780 | lr 0.000117218 | gnorm 0.27 | train_wall 1448 | gb_free 12.8 | wall 22044
|
| 342 |
+
epoch 015 | loss 0.008 | ppl 1.01 | wps 6543.9 | ups 3.3 | wpb 1984.3 | bsz 95.9 | num_updates 72780 | lr 0.000117218 | gnorm 0.27 | train_wall 1448 | gb_free 12.8 | wall 22044
|
| 343 |
+
epoch 015 | loss 0.008 | ppl 1.01 | wps 6543.9 | ups 3.3 | wpb 1984.3 | bsz 95.9 | num_updates 72780 | lr 0.000117218 | gnorm 0.27 | train_wall 1448 | gb_free 12.8 | wall 22044
|
| 344 |
+
epoch 015 | loss 0.008 | ppl 1.01 | wps 6543.9 | ups 3.3 | wpb 1984.3 | bsz 95.9 | num_updates 72780 | lr 0.000117218 | gnorm 0.27 | train_wall 1448 | gb_free 12.8 | wall 22044
|
| 345 |
+
epoch 015 | loss 0.008 | ppl 1.01 | wps 6543.9 | ups 3.3 | wpb 1984.3 | bsz 95.9 | num_updates 72780 | lr 0.000117218 | gnorm 0.27 | train_wall 1448 | gb_free 12.8 | wall 22044
|
| 346 |
+
epoch 015 | loss 0.008 | ppl 1.01 | wps 6543.9 | ups 3.3 | wpb 1984.3 | bsz 95.9 | num_updates 72780 | lr 0.000117218 | gnorm 0.27 | train_wall 1448 | gb_free 12.8 | wall 22044
|
| 347 |
+
epoch 015 | loss 0.008 | ppl 1.01 | wps 6543.9 | ups 3.3 | wpb 1984.3 | bsz 95.9 | num_updates 72780 | lr 0.000117218 | gnorm 0.27 | train_wall 1448 | gb_free 12.8 | wall 22044
|
| 348 |
+
epoch 015 | loss 0.008 | ppl 1.01 | wps 6543.9 | ups 3.3 | wpb 1984.3 | bsz 95.9 | num_updates 72780 | lr 0.000117218 | gnorm 0.27 | train_wall 1448 | gb_free 12.8 | wall 22044
|
| 349 |
+
Start iterating over samples
|
| 350 |
+
begin validation on "valid" subset
|
| 351 |
+
epoch 016 | valid on 'valid' subset | loss 0.079 | ppl 1.06 | wps 21661.2 | wpb 1954.1 | bsz 93.8 | num_updates 77632 | best_loss 0.072
|
| 352 |
+
epoch 016 | valid on 'valid' subset | loss 0.079 | ppl 1.06 | wps 21661.2 | wpb 1954.1 | bsz 93.8 | num_updates 77632 | best_loss 0.072
|
| 353 |
+
epoch 016 | valid on 'valid' subset | loss 0.079 | ppl 1.06 | wps 21661.2 | wpb 1954.1 | bsz 93.8 | num_updates 77632 | best_loss 0.072
|
| 354 |
+
epoch 016 | valid on 'valid' subset | loss 0.079 | ppl 1.06 | wps 21661.2 | wpb 1954.1 | bsz 93.8 | num_updates 77632 | best_loss 0.072
|
| 355 |
+
epoch 016 | valid on 'valid' subset | loss 0.079 | ppl 1.06 | wps 21661.2 | wpb 1954.1 | bsz 93.8 | num_updates 77632 | best_loss 0.072
|
| 356 |
+
epoch 016 | valid on 'valid' subset | loss 0.079 | ppl 1.06 | wps 21661.2 | wpb 1954.1 | bsz 93.8 | num_updates 77632 | best_loss 0.072
|
| 357 |
+
epoch 016 | valid on 'valid' subset | loss 0.079 | ppl 1.06 | wps 21661.2 | wpb 1954.1 | bsz 93.8 | num_updates 77632 | best_loss 0.072
|
| 358 |
+
epoch 016 | valid on 'valid' subset | loss 0.079 | ppl 1.06 | wps 21661.2 | wpb 1954.1 | bsz 93.8 | num_updates 77632 | best_loss 0.072
|
| 359 |
+
epoch 016 | valid on 'valid' subset | loss 0.079 | ppl 1.06 | wps 21661.2 | wpb 1954.1 | bsz 93.8 | num_updates 77632 | best_loss 0.072
|
| 360 |
+
epoch 016 | valid on 'valid' subset | loss 0.079 | ppl 1.06 | wps 21661.2 | wpb 1954.1 | bsz 93.8 | num_updates 77632 | best_loss 0.072
|
| 361 |
+
epoch 016 | valid on 'valid' subset | loss 0.079 | ppl 1.06 | wps 21661.2 | wpb 1954.1 | bsz 93.8 | num_updates 77632 | best_loss 0.072
|
| 362 |
+
epoch 016 | valid on 'valid' subset | loss 0.079 | ppl 1.06 | wps 21661.2 | wpb 1954.1 | bsz 93.8 | num_updates 77632 | best_loss 0.072
|
| 363 |
+
epoch 016 | valid on 'valid' subset | loss 0.079 | ppl 1.06 | wps 21661.2 | wpb 1954.1 | bsz 93.8 | num_updates 77632 | best_loss 0.072
|
| 364 |
+
epoch 016 | valid on 'valid' subset | loss 0.079 | ppl 1.06 | wps 21661.2 | wpb 1954.1 | bsz 93.8 | num_updates 77632 | best_loss 0.072
|
| 365 |
+
epoch 016 | valid on 'valid' subset | loss 0.079 | ppl 1.06 | wps 21661.2 | wpb 1954.1 | bsz 93.8 | num_updates 77632 | best_loss 0.072
|
| 366 |
+
epoch 016 | valid on 'valid' subset | loss 0.079 | ppl 1.06 | wps 21661.2 | wpb 1954.1 | bsz 93.8 | num_updates 77632 | best_loss 0.072
|
| 367 |
+
early stop since valid performance hasn't improved for last 4 runs
|
| 368 |
+
end of epoch 16 (average epoch stats below)
|
| 369 |
+
epoch 016 | loss 0.007 | ppl 1.01 | wps 6535.8 | ups 3.29 | wpb 1984.3 | bsz 95.9 | num_updates 77632 | lr 0.000113496 | gnorm 0.252 | train_wall 1450 | gb_free 13 | wall 23517
|
| 370 |
+
epoch 016 | loss 0.007 | ppl 1.01 | wps 6535.8 | ups 3.29 | wpb 1984.3 | bsz 95.9 | num_updates 77632 | lr 0.000113496 | gnorm 0.252 | train_wall 1450 | gb_free 13 | wall 23517
|
| 371 |
+
epoch 016 | loss 0.007 | ppl 1.01 | wps 6535.8 | ups 3.29 | wpb 1984.3 | bsz 95.9 | num_updates 77632 | lr 0.000113496 | gnorm 0.252 | train_wall 1450 | gb_free 13 | wall 23517
|
| 372 |
+
epoch 016 | loss 0.007 | ppl 1.01 | wps 6535.8 | ups 3.29 | wpb 1984.3 | bsz 95.9 | num_updates 77632 | lr 0.000113496 | gnorm 0.252 | train_wall 1450 | gb_free 13 | wall 23517
|
| 373 |
+
epoch 016 | loss 0.007 | ppl 1.01 | wps 6535.8 | ups 3.29 | wpb 1984.3 | bsz 95.9 | num_updates 77632 | lr 0.000113496 | gnorm 0.252 | train_wall 1450 | gb_free 13 | wall 23517
|
| 374 |
+
epoch 016 | loss 0.007 | ppl 1.01 | wps 6535.8 | ups 3.29 | wpb 1984.3 | bsz 95.9 | num_updates 77632 | lr 0.000113496 | gnorm 0.252 | train_wall 1450 | gb_free 13 | wall 23517
|
| 375 |
+
epoch 016 | loss 0.007 | ppl 1.01 | wps 6535.8 | ups 3.29 | wpb 1984.3 | bsz 95.9 | num_updates 77632 | lr 0.000113496 | gnorm 0.252 | train_wall 1450 | gb_free 13 | wall 23517
|
| 376 |
+
epoch 016 | loss 0.007 | ppl 1.01 | wps 6535.8 | ups 3.29 | wpb 1984.3 | bsz 95.9 | num_updates 77632 | lr 0.000113496 | gnorm 0.252 | train_wall 1450 | gb_free 13 | wall 23517
|
| 377 |
+
epoch 016 | loss 0.007 | ppl 1.01 | wps 6535.8 | ups 3.29 | wpb 1984.3 | bsz 95.9 | num_updates 77632 | lr 0.000113496 | gnorm 0.252 | train_wall 1450 | gb_free 13 | wall 23517
|
| 378 |
+
epoch 016 | loss 0.007 | ppl 1.01 | wps 6535.8 | ups 3.29 | wpb 1984.3 | bsz 95.9 | num_updates 77632 | lr 0.000113496 | gnorm 0.252 | train_wall 1450 | gb_free 13 | wall 23517
|
| 379 |
+
epoch 016 | loss 0.007 | ppl 1.01 | wps 6535.8 | ups 3.29 | wpb 1984.3 | bsz 95.9 | num_updates 77632 | lr 0.000113496 | gnorm 0.252 | train_wall 1450 | gb_free 13 | wall 23517
|
| 380 |
+
epoch 016 | loss 0.007 | ppl 1.01 | wps 6535.8 | ups 3.29 | wpb 1984.3 | bsz 95.9 | num_updates 77632 | lr 0.000113496 | gnorm 0.252 | train_wall 1450 | gb_free 13 | wall 23517
|
| 381 |
+
epoch 016 | loss 0.007 | ppl 1.01 | wps 6535.8 | ups 3.29 | wpb 1984.3 | bsz 95.9 | num_updates 77632 | lr 0.000113496 | gnorm 0.252 | train_wall 1450 | gb_free 13 | wall 23517
|
| 382 |
+
epoch 016 | loss 0.007 | ppl 1.01 | wps 6535.8 | ups 3.29 | wpb 1984.3 | bsz 95.9 | num_updates 77632 | lr 0.000113496 | gnorm 0.252 | train_wall 1450 | gb_free 13 | wall 23517
|
| 383 |
+
epoch 016 | loss 0.007 | ppl 1.01 | wps 6535.8 | ups 3.29 | wpb 1984.3 | bsz 95.9 | num_updates 77632 | lr 0.000113496 | gnorm 0.252 | train_wall 1450 | gb_free 13 | wall 23517
|
| 384 |
+
epoch 016 | loss 0.007 | ppl 1.01 | wps 6535.8 | ups 3.29 | wpb 1984.3 | bsz 95.9 | num_updates 77632 | lr 0.000113496 | gnorm 0.252 | train_wall 1450 | gb_free 13 | wall 23517
|
| 385 |
+
done training in 23517.0 seconds
|