diff --git a/viscop_qwen2.5_7b_EgoExo4D-EGOvideos_train-VisCoP_projector-LLM_LoRA_CorrectedOptimizer/README.md b/viscop_qwen2.5_7b_EgoExo4D-EGOvideos_train-VisCoP_projector-LLM_LoRA_CorrectedOptimizer/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d4576fe074287232d3836bf69c21d3f2593290d9 --- /dev/null +++ b/viscop_qwen2.5_7b_EgoExo4D-EGOvideos_train-VisCoP_projector-LLM_LoRA_CorrectedOptimizer/README.md @@ -0,0 +1,9 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + + +- PEFT 0.4.0 diff --git a/viscop_qwen2.5_7b_EgoExo4D-EGOvideos_train-VisCoP_projector-LLM_LoRA_CorrectedOptimizer/adapter_config.json b/viscop_qwen2.5_7b_EgoExo4D-EGOvideos_train-VisCoP_projector-LLM_LoRA_CorrectedOptimizer/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c4599e0784cb415dba9abbbed59652eec36a099d --- /dev/null +++ b/viscop_qwen2.5_7b_EgoExo4D-EGOvideos_train-VisCoP_projector-LLM_LoRA_CorrectedOptimizer/adapter_config.json @@ -0,0 +1,18 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "/home/dreilly1/Projects/VisCoP/base_vlm/videollama3-image_7b_local", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 16, + "lora_dropout": 0.05, + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "revision": null, + "target_modules": ".*model\\.layers\\..*\\.(gate_proj|o_proj|q_proj|v_proj|up_proj|down_proj|k_proj)$", + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/viscop_qwen2.5_7b_EgoExo4D-EGOvideos_train-VisCoP_projector-LLM_LoRA_CorrectedOptimizer/adapter_model.bin b/viscop_qwen2.5_7b_EgoExo4D-EGOvideos_train-VisCoP_projector-LLM_LoRA_CorrectedOptimizer/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..e13a03f24af4c9c177f667113323c22a1bf5d569 --- /dev/null +++ b/viscop_qwen2.5_7b_EgoExo4D-EGOvideos_train-VisCoP_projector-LLM_LoRA_CorrectedOptimizer/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54b0d449306b48fc0193eeb2dec4280c5d91eb1180bc00faaf1e134edd727cbe +size 323097578 diff --git a/viscop_qwen2.5_7b_EgoExo4D-EGOvideos_train-VisCoP_projector-LLM_LoRA_CorrectedOptimizer/config.json b/viscop_qwen2.5_7b_EgoExo4D-EGOvideos_train-VisCoP_projector-LLM_LoRA_CorrectedOptimizer/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ea57043ff7eeb58ead1b29d4a689a8f2c7a364a6 --- /dev/null +++ b/viscop_qwen2.5_7b_EgoExo4D-EGOvideos_train-VisCoP_projector-LLM_LoRA_CorrectedOptimizer/config.json @@ -0,0 +1,132 @@ +{ + "_attn_implementation_autoset": true, + "_name_or_path": "/home/dreilly1/Projects/VisCoP/base_vlm/videollama3-image_7b_local", + "architectures": [ + "Videollama3Qwen2ForCausalLM" + ], + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "DAMO-NLP-SG/VideoLLaMA3-7B-Image--configuration_videollama3.Videollama3Qwen2Config", + "AutoModelForCausalLM": "DAMO-NLP-SG/VideoLLaMA3-7B-Image--modeling_videollama3.Videollama3Qwen2ForCausalLM" + }, + "bos_token_id": 151643, + "eos_token_id": 151645, + "hidden_act": "silu", + "hidden_size": 3584, + "image_aspect_ratio": "square", + "image_size": -1, + "image_token_index": 151665, + "image_token_length": 1, + "include_visual_probes": true, + "include_visual_tokens": true, + "initializer_range": 0.02, + "interaction_module": "cross_attention", + "interaction_module_layers": null, + "intermediate_size": 18944, + "is_alignment": false, + "llm_lr": 1e-05, + "max_frames": 180, + "max_position_embeddings": 32768, + "max_window_layers": 28, + "mm_hidden_size": 1152, + "mm_projector_lr": 1e-05, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_encoder": "DAMO-NLP-SG/SigLIP-NaViT", + "mm_vision_select_feature": "patch", + "mm_vision_select_layer": -1, + "model_type": "viscop_qwen2", + "num_attention_heads": 28, + "num_hidden_layers": 28, + "num_key_value_heads": 4, + "num_visual_probes": 16, + "probe_token_index": 151668, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000.0, + "sliding_window": null, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 16384, + "tokenizer_padding_side": "right", + "torch_dtype": "bfloat16", + "transformers_version": "4.46.3", + "use_cache": true, + "use_mm_proj": true, + "use_sliding_window": false, + "use_token_compression": false, + "vision_encoder": "DAMO-NLP-SG/SigLIP-NaViT", + "vision_encoder_config": { + "_attn_implementation_autoset": false, + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": null, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": null, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "hidden_act": "gelu_pytorch_tanh", + "hidden_size": 1152, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "intermediate_size": 4304, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "layer_norm_eps": 1e-06, + "length_penalty": 1.0, + "max_length": 20, + "min_length": 0, + "model_type": "videollama3_vision_encoder", + "no_repeat_ngram_size": 0, + "num_attention_heads": 16, + "num_beam_groups": 1, + "num_beams": 1, + "num_channels": 3, + "num_hidden_layers": 27, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": null, + "patch_size": 14, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": null, + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false + }, + "vision_encoder_lr": null, + "vocab_size": 152064 +} diff --git a/viscop_qwen2.5_7b_EgoExo4D-EGOvideos_train-VisCoP_projector-LLM_LoRA_CorrectedOptimizer/non_lora_trainables.bin b/viscop_qwen2.5_7b_EgoExo4D-EGOvideos_train-VisCoP_projector-LLM_LoRA_CorrectedOptimizer/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..3bbebf6931e8a5ce4268e467eefb42ce8d24f15b --- /dev/null +++ b/viscop_qwen2.5_7b_EgoExo4D-EGOvideos_train-VisCoP_projector-LLM_LoRA_CorrectedOptimizer/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf012cd5517edf0873ba621833c3bb639f70191e9f9d733ff69e0bcdd71628be +size 354950700 diff --git a/viscop_qwen2.5_7b_EgoExo4D-EGOvideos_train-VisCoP_projector-LLM_LoRA_CorrectedOptimizer/train_viscop.sh b/viscop_qwen2.5_7b_EgoExo4D-EGOvideos_train-VisCoP_projector-LLM_LoRA_CorrectedOptimizer/train_viscop.sh new file mode 100644 index 0000000000000000000000000000000000000000..0402df00f6404a2053cb81bf7d7d1b792b4a447d --- /dev/null +++ b/viscop_qwen2.5_7b_EgoExo4D-EGOvideos_train-VisCoP_projector-LLM_LoRA_CorrectedOptimizer/train_viscop.sh @@ -0,0 +1,124 @@ +#!/bin/bash +# Environment Variables +ARG_WORLD_SIZE=${1:-1} +ARG_NPROC_PER_NODE=${2:-8} + +if [[ -v MASTER_ADDR_PASSED ]]; then + ARG_MASTER_ADDR=$MASTER_ADDR_PASSED # passed via slurm submission script +else + ARG_MASTER_ADDR=127.0.0.1 # for dev environments +fi +ARG_MASTER_PORT=12355 +# ARG_RANK=$SLURM_NODEID +ARG_RANK=0 + +# Multiple conditions +if [ ! -n "$WORLD_SIZE" ] || [ ! -n "$NPROC_PER_NODE" ]; then + WORLD_SIZE=$ARG_WORLD_SIZE + NPROC_PER_NODE=$ARG_NPROC_PER_NODE +fi + +if [ ! -n "$MASTER_ADDR" ] || [ ! -n "$MASTER_PORT" ] || [ ! -n "$RANK" ]; then + MASTER_ADDR=$ARG_MASTER_ADDR + MASTER_PORT=$ARG_MASTER_PORT + RANK=$ARG_RANK +fi + +echo "MASTER_ADDR: $MASTER_ADDR. MASTER_PORT: $MASTER_PORT. RANK: $RANK" +echo "WORLD_SIZE: $WORLD_SIZE" +echo "NPROC_PER_NODE: $NPROC_PER_NODE" + +# Training Arguments +GLOBAL_BATCH_SIZE=128 # aka effective batch size +LOCAL_BATCH_SIZE=8 # batch size per GPU +GRADIENT_ACCUMULATION_STEPS=$[$GLOBAL_BATCH_SIZE/($WORLD_SIZE*$NPROC_PER_NODE*$LOCAL_BATCH_SIZE)] +echo $GRADIENT_ACCUMULATION_STEPS + +INIT_MODEL=/home/dreilly1/Projects/VisCoP/base_vlm/videollama3-image_7b_local # path to base VLM (for ViSCoP we use VideoLLaMA3 as the base VLM) + +NUM_DATA_WORKERS=8 +NUM_TRAIN_EPOCHS=3 +LORA_TRAINING=True + +# ViSCoP Arguments +NUM_VISUAL_PROBES=16 +INTERACTION_MODULE_POS=all +PASS_PROBES_TO_LLM=True +PASS_VIS_FEATURES_TO_LLM=True + +# Logging Arguments +export WANDB_PROJECT=sony26_mm_viscop +REPORT_TO=wandb +OUTP_DIR=work_dirs/egoexo +RUN_NAME=viscop_qwen2.5_7b_EgoExo4D-EGOvideos_train-VisCoP_projector-LLM_LoRA_CorrectedOptimizer + +# Data Arguments +# DATA_DIR=/home/dreilly1/Projects/paired_egoexo_dl/paired_videos_DATA/ +DATA_DIR=/data/dreilly1/EgoExo4D_symlink/ +TRAINING_JSON="/home/dreilly1/Projects/viscop_sony/training_jsons/train-instr_viscop_egoview_ALLDATA.json" + +# if [[ $TRAINING_JSON == *"egoview"* ]]; then +# MAX_FRAMES=40 # use 40 frames for training on ego +# else +# MAX_FRAMES=180 +# fi +MAX_FRAMES=180 + +# Optional Arguments. Set TESTING to 1 to quickly test the training script without logging or data workers, useful for debugging +TESTING=0 +if [ $TESTING -eq 1 ]; then + NUM_DATA_WORKERS=0 + REPORT_TO=none + RUN_NAME=TESTING +fi + +mkdir -p "${OUTP_DIR}/${RUN_NAME}/" +cp "$0" "${OUTP_DIR}/${RUN_NAME}/" + +torchrun --nnodes $WORLD_SIZE \ + --nproc_per_node $NPROC_PER_NODE \ + --master_addr=$MASTER_ADDR \ + --master_port=$MASTER_PORT \ + --node_rank $RANK \ + viscop/train_viscop.py \ + --interaction_module_layers $INTERACTION_MODULE_POS \ + --lora_enable $LORA_TRAINING \ + --num_train_epochs $NUM_TRAIN_EPOCHS \ + --deepspeed scripts/zero2.json \ + --model_type viscop_qwen2 \ + --model_path $INIT_MODEL \ + --vision_encoder DAMO-NLP-SG/SigLIP-NaViT \ + --mm_projector_type mlp2x_gelu \ + --data_path $TRAINING_JSON \ + --data_folder $DATA_DIR \ + --image_merge_size 2 \ + --video_merge_size 2 \ + --fps 1 \ + --max_frames $MAX_FRAMES \ + --model_max_length 16384 \ + --mm_max_length 10240 \ + --bf16 True \ + --tf32 True \ + --fp16 False \ + --output_dir ${OUTP_DIR}/${RUN_NAME} \ + --per_device_train_batch_size $LOCAL_BATCH_SIZE \ + --per_device_eval_batch_size 2 \ + --gradient_accumulation_steps $GRADIENT_ACCUMULATION_STEPS \ + --evaluation_strategy "no" \ + --save_strategy "no" \ + --save_steps 5000 \ + --save_total_limit 1 \ + --mm_projector_lr 1e-5 \ + --llm_lr 1e-5 \ + --weight_decay 0. \ + --warmup_ratio 0.03 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --gradient_checkpointing True \ + --dataloader_num_workers $NUM_DATA_WORKERS \ + --report_to $REPORT_TO \ + --run_name $RUN_NAME \ + --dataset_cache_dir /home/dreilly1/.cache/viscop_datasetcache \ + --include_visual_tokens $PASS_VIS_FEATURES_TO_LLM \ + --include_visual_probes $PASS_PROBES_TO_LLM \ + --num_visual_probes $NUM_VISUAL_PROBES \ No newline at end of file diff --git a/viscop_qwen2.5_7b_EgoExo4D-EGOvideos_train-VisCoP_projector-LLM_LoRA_CorrectedOptimizer/trainer_state.json b/viscop_qwen2.5_7b_EgoExo4D-EGOvideos_train-VisCoP_projector-LLM_LoRA_CorrectedOptimizer/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..9ec4bcded8ab1d99caa10de140e39fb26699383e --- /dev/null +++ b/viscop_qwen2.5_7b_EgoExo4D-EGOvideos_train-VisCoP_projector-LLM_LoRA_CorrectedOptimizer/trainer_state.json @@ -0,0 +1,8634 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.99581589958159, + "eval_steps": 500, + "global_step": 1074, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "autoregressive_loss": 2.375, + "epoch": 0.002789400278940028, + "grad_norm": 8.59101676940918, + "learning_rate": 3.0303030303030305e-07, + "loss": 9.5059, + "step": 1 + }, + { + "autoregressive_loss": 2.3594, + "epoch": 0.005578800557880056, + "grad_norm": 8.758501052856445, + "learning_rate": 6.060606060606061e-07, + "loss": 9.4395, + "step": 2 + }, + { + "autoregressive_loss": 2.375, + "epoch": 0.008368200836820083, + "grad_norm": 8.302231788635254, + "learning_rate": 9.090909090909091e-07, + "loss": 9.5273, + "step": 3 + }, + { + "autoregressive_loss": 2.3438, + "epoch": 0.011157601115760111, + "grad_norm": 8.197135925292969, + "learning_rate": 1.2121212121212122e-06, + "loss": 9.4043, + "step": 4 + }, + { + "autoregressive_loss": 2.375, + "epoch": 0.01394700139470014, + "grad_norm": 11.574838638305664, + "learning_rate": 1.5151515151515152e-06, + "loss": 9.5039, + "step": 5 + }, + { + "autoregressive_loss": 2.375, + "epoch": 0.016736401673640166, + "grad_norm": 8.141172409057617, + "learning_rate": 1.8181818181818183e-06, + "loss": 9.4922, + "step": 6 + }, + { + "autoregressive_loss": 2.375, + "epoch": 0.019525801952580194, + "grad_norm": 8.950714111328125, + "learning_rate": 2.1212121212121216e-06, + "loss": 9.498, + "step": 7 + }, + { + "autoregressive_loss": 2.375, + "epoch": 0.022315202231520222, + "grad_norm": 7.847761631011963, + "learning_rate": 2.4242424242424244e-06, + "loss": 9.4941, + "step": 8 + }, + { + "autoregressive_loss": 2.3438, + "epoch": 0.02510460251046025, + "grad_norm": 9.826142311096191, + "learning_rate": 2.7272727272727272e-06, + "loss": 9.3418, + "step": 9 + }, + { + "autoregressive_loss": 2.3438, + "epoch": 0.02789400278940028, + "grad_norm": 8.81420612335205, + "learning_rate": 3.0303030303030305e-06, + "loss": 9.3848, + "step": 10 + }, + { + "autoregressive_loss": 2.3281, + "epoch": 0.030683403068340307, + "grad_norm": 7.933913707733154, + "learning_rate": 3.3333333333333333e-06, + "loss": 9.3203, + "step": 11 + }, + { + "autoregressive_loss": 2.3438, + "epoch": 0.03347280334728033, + "grad_norm": 9.503207206726074, + "learning_rate": 3.6363636363636366e-06, + "loss": 9.3496, + "step": 12 + }, + { + "autoregressive_loss": 2.2812, + "epoch": 0.03626220362622036, + "grad_norm": 7.174030303955078, + "learning_rate": 3.93939393939394e-06, + "loss": 9.1641, + "step": 13 + }, + { + "autoregressive_loss": 2.2969, + "epoch": 0.03905160390516039, + "grad_norm": 8.553145408630371, + "learning_rate": 4.242424242424243e-06, + "loss": 9.1621, + "step": 14 + }, + { + "autoregressive_loss": 2.25, + "epoch": 0.04184100418410042, + "grad_norm": 8.736946105957031, + "learning_rate": 4.5454545454545455e-06, + "loss": 9.0332, + "step": 15 + }, + { + "autoregressive_loss": 2.1406, + "epoch": 0.044630404463040445, + "grad_norm": 10.106003761291504, + "learning_rate": 4.848484848484849e-06, + "loss": 8.5723, + "step": 16 + }, + { + "autoregressive_loss": 2.125, + "epoch": 0.04741980474198047, + "grad_norm": 12.034590721130371, + "learning_rate": 5.151515151515152e-06, + "loss": 8.4629, + "step": 17 + }, + { + "autoregressive_loss": 2.0625, + "epoch": 0.0502092050209205, + "grad_norm": 12.29151439666748, + "learning_rate": 5.4545454545454545e-06, + "loss": 8.2734, + "step": 18 + }, + { + "autoregressive_loss": 2.0312, + "epoch": 0.05299860529986053, + "grad_norm": 13.058691024780273, + "learning_rate": 5.7575757575757586e-06, + "loss": 8.1016, + "step": 19 + }, + { + "autoregressive_loss": 1.9531, + "epoch": 0.05578800557880056, + "grad_norm": 18.048789978027344, + "learning_rate": 6.060606060606061e-06, + "loss": 7.834, + "step": 20 + }, + { + "autoregressive_loss": 1.875, + "epoch": 0.058577405857740586, + "grad_norm": 17.723424911499023, + "learning_rate": 6.363636363636364e-06, + "loss": 7.4805, + "step": 21 + }, + { + "autoregressive_loss": 1.5234, + "epoch": 0.061366806136680614, + "grad_norm": 20.77867889404297, + "learning_rate": 6.666666666666667e-06, + "loss": 6.0918, + "step": 22 + }, + { + "autoregressive_loss": 1.3906, + "epoch": 0.06415620641562064, + "grad_norm": 18.87694549560547, + "learning_rate": 6.969696969696971e-06, + "loss": 5.5518, + "step": 23 + }, + { + "autoregressive_loss": 1.3203, + "epoch": 0.06694560669456066, + "grad_norm": 18.130693435668945, + "learning_rate": 7.272727272727273e-06, + "loss": 5.2842, + "step": 24 + }, + { + "autoregressive_loss": 1.2344, + "epoch": 0.0697350069735007, + "grad_norm": 18.04229736328125, + "learning_rate": 7.5757575757575764e-06, + "loss": 4.9385, + "step": 25 + }, + { + "autoregressive_loss": 1.1016, + "epoch": 0.07252440725244072, + "grad_norm": 19.869577407836914, + "learning_rate": 7.87878787878788e-06, + "loss": 4.4199, + "step": 26 + }, + { + "autoregressive_loss": 1.0547, + "epoch": 0.07531380753138076, + "grad_norm": 19.50640296936035, + "learning_rate": 8.181818181818183e-06, + "loss": 4.2119, + "step": 27 + }, + { + "autoregressive_loss": 0.9336, + "epoch": 0.07810320781032078, + "grad_norm": 19.852649688720703, + "learning_rate": 8.484848484848486e-06, + "loss": 3.7397, + "step": 28 + }, + { + "autoregressive_loss": 0.5859, + "epoch": 0.08089260808926081, + "grad_norm": 19.918880462646484, + "learning_rate": 8.787878787878788e-06, + "loss": 2.3418, + "step": 29 + }, + { + "autoregressive_loss": 0.5117, + "epoch": 0.08368200836820083, + "grad_norm": 15.370176315307617, + "learning_rate": 9.090909090909091e-06, + "loss": 2.0474, + "step": 30 + }, + { + "autoregressive_loss": 0.4492, + "epoch": 0.08647140864714087, + "grad_norm": 12.441726684570312, + "learning_rate": 9.393939393939396e-06, + "loss": 1.7891, + "step": 31 + }, + { + "autoregressive_loss": 0.377, + "epoch": 0.08926080892608089, + "grad_norm": 7.7080841064453125, + "learning_rate": 9.696969696969698e-06, + "loss": 1.5068, + "step": 32 + }, + { + "autoregressive_loss": 0.3359, + "epoch": 0.09205020920502092, + "grad_norm": 6.849257469177246, + "learning_rate": 1e-05, + "loss": 1.3438, + "step": 33 + }, + { + "autoregressive_loss": 0.293, + "epoch": 0.09483960948396095, + "grad_norm": 5.445654392242432, + "learning_rate": 9.999977231314128e-06, + "loss": 1.1726, + "step": 34 + }, + { + "autoregressive_loss": 0.252, + "epoch": 0.09762900976290098, + "grad_norm": 2.6761014461517334, + "learning_rate": 9.99990892546387e-06, + "loss": 1.0039, + "step": 35 + }, + { + "autoregressive_loss": 0.2393, + "epoch": 0.100418410041841, + "grad_norm": 2.084914445877075, + "learning_rate": 9.999795083071328e-06, + "loss": 0.9578, + "step": 36 + }, + { + "autoregressive_loss": 0.2412, + "epoch": 0.10320781032078104, + "grad_norm": 1.7051442861557007, + "learning_rate": 9.999635705173312e-06, + "loss": 0.9644, + "step": 37 + }, + { + "autoregressive_loss": 0.2285, + "epoch": 0.10599721059972106, + "grad_norm": 1.2426851987838745, + "learning_rate": 9.999430793221356e-06, + "loss": 0.915, + "step": 38 + }, + { + "autoregressive_loss": 0.2168, + "epoch": 0.1087866108786611, + "grad_norm": 1.0455615520477295, + "learning_rate": 9.999180349081688e-06, + "loss": 0.8663, + "step": 39 + }, + { + "autoregressive_loss": 0.2168, + "epoch": 0.11157601115760112, + "grad_norm": 0.9428278207778931, + "learning_rate": 9.998884375035221e-06, + "loss": 0.8666, + "step": 40 + }, + { + "autoregressive_loss": 0.2139, + "epoch": 0.11436541143654114, + "grad_norm": 0.8402647376060486, + "learning_rate": 9.998542873777534e-06, + "loss": 0.8557, + "step": 41 + }, + { + "autoregressive_loss": 0.2031, + "epoch": 0.11715481171548117, + "grad_norm": 0.6171095967292786, + "learning_rate": 9.99815584841884e-06, + "loss": 0.813, + "step": 42 + }, + { + "autoregressive_loss": 0.2188, + "epoch": 0.1199442119944212, + "grad_norm": 0.6756744980812073, + "learning_rate": 9.99772330248396e-06, + "loss": 0.8755, + "step": 43 + }, + { + "autoregressive_loss": 0.2129, + "epoch": 0.12273361227336123, + "grad_norm": 0.5978255867958069, + "learning_rate": 9.997245239912299e-06, + "loss": 0.8545, + "step": 44 + }, + { + "autoregressive_loss": 0.2041, + "epoch": 0.12552301255230125, + "grad_norm": 0.5240582227706909, + "learning_rate": 9.996721665057796e-06, + "loss": 0.8167, + "step": 45 + }, + { + "autoregressive_loss": 0.2012, + "epoch": 0.12831241283124128, + "grad_norm": 0.5132311582565308, + "learning_rate": 9.996152582688899e-06, + "loss": 0.8064, + "step": 46 + }, + { + "autoregressive_loss": 0.2119, + "epoch": 0.13110181311018132, + "grad_norm": 0.5135049223899841, + "learning_rate": 9.995537997988507e-06, + "loss": 0.8474, + "step": 47 + }, + { + "autoregressive_loss": 0.2041, + "epoch": 0.13389121338912133, + "grad_norm": 0.4354476034641266, + "learning_rate": 9.994877916553937e-06, + "loss": 0.8152, + "step": 48 + }, + { + "autoregressive_loss": 0.2051, + "epoch": 0.13668061366806136, + "grad_norm": 0.4159083366394043, + "learning_rate": 9.994172344396866e-06, + "loss": 0.8215, + "step": 49 + }, + { + "autoregressive_loss": 0.2051, + "epoch": 0.1394700139470014, + "grad_norm": 0.3896167576313019, + "learning_rate": 9.99342128794327e-06, + "loss": 0.8203, + "step": 50 + }, + { + "autoregressive_loss": 0.1992, + "epoch": 0.14225941422594143, + "grad_norm": 0.3958797752857208, + "learning_rate": 9.992624754033377e-06, + "loss": 0.7957, + "step": 51 + }, + { + "autoregressive_loss": 0.207, + "epoch": 0.14504881450488144, + "grad_norm": 0.34644994139671326, + "learning_rate": 9.991782749921601e-06, + "loss": 0.8257, + "step": 52 + }, + { + "autoregressive_loss": 0.2051, + "epoch": 0.14783821478382148, + "grad_norm": 0.3714761734008789, + "learning_rate": 9.990895283276472e-06, + "loss": 0.8186, + "step": 53 + }, + { + "autoregressive_loss": 0.2031, + "epoch": 0.1506276150627615, + "grad_norm": 0.35153210163116455, + "learning_rate": 9.98996236218057e-06, + "loss": 0.8142, + "step": 54 + }, + { + "autoregressive_loss": 0.2041, + "epoch": 0.15341701534170155, + "grad_norm": 0.348823606967926, + "learning_rate": 9.98898399513045e-06, + "loss": 0.8169, + "step": 55 + }, + { + "autoregressive_loss": 0.2002, + "epoch": 0.15620641562064155, + "grad_norm": 0.27591821551322937, + "learning_rate": 9.987960191036564e-06, + "loss": 0.8008, + "step": 56 + }, + { + "autoregressive_loss": 0.2031, + "epoch": 0.1589958158995816, + "grad_norm": 0.26520946621894836, + "learning_rate": 9.986890959223181e-06, + "loss": 0.8125, + "step": 57 + }, + { + "autoregressive_loss": 0.2012, + "epoch": 0.16178521617852162, + "grad_norm": 0.3240140974521637, + "learning_rate": 9.985776309428306e-06, + "loss": 0.8035, + "step": 58 + }, + { + "autoregressive_loss": 0.1855, + "epoch": 0.16457461645746166, + "grad_norm": 0.24813607335090637, + "learning_rate": 9.984616251803577e-06, + "loss": 0.7438, + "step": 59 + }, + { + "autoregressive_loss": 0.2051, + "epoch": 0.16736401673640167, + "grad_norm": 0.311318576335907, + "learning_rate": 9.983410796914197e-06, + "loss": 0.822, + "step": 60 + }, + { + "autoregressive_loss": 0.2061, + "epoch": 0.1701534170153417, + "grad_norm": 0.2562341094017029, + "learning_rate": 9.982159955738808e-06, + "loss": 0.824, + "step": 61 + }, + { + "autoregressive_loss": 0.1992, + "epoch": 0.17294281729428174, + "grad_norm": 0.25511008501052856, + "learning_rate": 9.980863739669419e-06, + "loss": 0.7991, + "step": 62 + }, + { + "autoregressive_loss": 0.1953, + "epoch": 0.17573221757322174, + "grad_norm": 0.2315155267715454, + "learning_rate": 9.979522160511282e-06, + "loss": 0.7781, + "step": 63 + }, + { + "autoregressive_loss": 0.1914, + "epoch": 0.17852161785216178, + "grad_norm": 0.2424907684326172, + "learning_rate": 9.978135230482797e-06, + "loss": 0.7661, + "step": 64 + }, + { + "autoregressive_loss": 0.1992, + "epoch": 0.18131101813110181, + "grad_norm": 0.2436961829662323, + "learning_rate": 9.97670296221539e-06, + "loss": 0.7942, + "step": 65 + }, + { + "autoregressive_loss": 0.1943, + "epoch": 0.18410041841004185, + "grad_norm": 0.2763199806213379, + "learning_rate": 9.975225368753412e-06, + "loss": 0.7755, + "step": 66 + }, + { + "autoregressive_loss": 0.1914, + "epoch": 0.18688981868898186, + "grad_norm": 0.19693495333194733, + "learning_rate": 9.973702463554004e-06, + "loss": 0.7632, + "step": 67 + }, + { + "autoregressive_loss": 0.1973, + "epoch": 0.1896792189679219, + "grad_norm": 0.2354680448770523, + "learning_rate": 9.972134260486989e-06, + "loss": 0.7896, + "step": 68 + }, + { + "autoregressive_loss": 0.2012, + "epoch": 0.19246861924686193, + "grad_norm": 0.21391351521015167, + "learning_rate": 9.970520773834734e-06, + "loss": 0.8057, + "step": 69 + }, + { + "autoregressive_loss": 0.2002, + "epoch": 0.19525801952580196, + "grad_norm": 0.2502017021179199, + "learning_rate": 9.968862018292025e-06, + "loss": 0.7998, + "step": 70 + }, + { + "autoregressive_loss": 0.1924, + "epoch": 0.19804741980474197, + "grad_norm": 0.18569432199001312, + "learning_rate": 9.967158008965942e-06, + "loss": 0.7695, + "step": 71 + }, + { + "autoregressive_loss": 0.1934, + "epoch": 0.200836820083682, + "grad_norm": 0.23255318403244019, + "learning_rate": 9.965408761375702e-06, + "loss": 0.7743, + "step": 72 + }, + { + "autoregressive_loss": 0.1846, + "epoch": 0.20362622036262204, + "grad_norm": 0.19455288350582123, + "learning_rate": 9.963614291452532e-06, + "loss": 0.7386, + "step": 73 + }, + { + "autoregressive_loss": 0.2012, + "epoch": 0.20641562064156208, + "grad_norm": 0.24719232320785522, + "learning_rate": 9.961774615539523e-06, + "loss": 0.8037, + "step": 74 + }, + { + "autoregressive_loss": 0.2012, + "epoch": 0.20920502092050208, + "grad_norm": 0.23883651196956635, + "learning_rate": 9.959889750391474e-06, + "loss": 0.8022, + "step": 75 + }, + { + "autoregressive_loss": 0.1719, + "epoch": 0.21199442119944212, + "grad_norm": 0.19742122292518616, + "learning_rate": 9.957959713174748e-06, + "loss": 0.686, + "step": 76 + }, + { + "autoregressive_loss": 0.2031, + "epoch": 0.21478382147838215, + "grad_norm": 0.20371419191360474, + "learning_rate": 9.955984521467108e-06, + "loss": 0.8125, + "step": 77 + }, + { + "autoregressive_loss": 0.1973, + "epoch": 0.2175732217573222, + "grad_norm": 0.2385178953409195, + "learning_rate": 9.953964193257563e-06, + "loss": 0.7881, + "step": 78 + }, + { + "autoregressive_loss": 0.1982, + "epoch": 0.2203626220362622, + "grad_norm": 0.19921010732650757, + "learning_rate": 9.951898746946201e-06, + "loss": 0.7915, + "step": 79 + }, + { + "autoregressive_loss": 0.1924, + "epoch": 0.22315202231520223, + "grad_norm": 0.13990746438503265, + "learning_rate": 9.949788201344019e-06, + "loss": 0.7684, + "step": 80 + }, + { + "autoregressive_loss": 0.1953, + "epoch": 0.22594142259414227, + "grad_norm": 0.16171158850193024, + "learning_rate": 9.947632575672758e-06, + "loss": 0.78, + "step": 81 + }, + { + "autoregressive_loss": 0.1953, + "epoch": 0.22873082287308227, + "grad_norm": 0.19189204275608063, + "learning_rate": 9.945431889564724e-06, + "loss": 0.7827, + "step": 82 + }, + { + "autoregressive_loss": 0.1865, + "epoch": 0.2315202231520223, + "grad_norm": 0.16905158758163452, + "learning_rate": 9.943186163062607e-06, + "loss": 0.7461, + "step": 83 + }, + { + "autoregressive_loss": 0.1855, + "epoch": 0.23430962343096234, + "grad_norm": 0.16456933319568634, + "learning_rate": 9.940895416619308e-06, + "loss": 0.7422, + "step": 84 + }, + { + "autoregressive_loss": 0.1963, + "epoch": 0.23709902370990238, + "grad_norm": 0.1956029236316681, + "learning_rate": 9.938559671097739e-06, + "loss": 0.7854, + "step": 85 + }, + { + "autoregressive_loss": 0.2051, + "epoch": 0.2398884239888424, + "grad_norm": 0.22732217609882355, + "learning_rate": 9.93617894777064e-06, + "loss": 0.8179, + "step": 86 + }, + { + "autoregressive_loss": 0.1807, + "epoch": 0.24267782426778242, + "grad_norm": 0.14600317180156708, + "learning_rate": 9.933753268320391e-06, + "loss": 0.7218, + "step": 87 + }, + { + "autoregressive_loss": 0.1963, + "epoch": 0.24546722454672246, + "grad_norm": 0.15046216547489166, + "learning_rate": 9.931282654838803e-06, + "loss": 0.7847, + "step": 88 + }, + { + "autoregressive_loss": 0.1797, + "epoch": 0.2482566248256625, + "grad_norm": 0.13484808802604675, + "learning_rate": 9.928767129826929e-06, + "loss": 0.719, + "step": 89 + }, + { + "autoregressive_loss": 0.1953, + "epoch": 0.2510460251046025, + "grad_norm": 0.15433381497859955, + "learning_rate": 9.926206716194842e-06, + "loss": 0.7805, + "step": 90 + }, + { + "autoregressive_loss": 0.1885, + "epoch": 0.25383542538354253, + "grad_norm": 0.11705081164836884, + "learning_rate": 9.92360143726145e-06, + "loss": 0.7518, + "step": 91 + }, + { + "autoregressive_loss": 0.2002, + "epoch": 0.25662482566248257, + "grad_norm": 0.19315330684185028, + "learning_rate": 9.920951316754259e-06, + "loss": 0.7996, + "step": 92 + }, + { + "autoregressive_loss": 0.1943, + "epoch": 0.2594142259414226, + "grad_norm": 0.1729285717010498, + "learning_rate": 9.918256378809178e-06, + "loss": 0.7756, + "step": 93 + }, + { + "autoregressive_loss": 0.1914, + "epoch": 0.26220362622036264, + "grad_norm": 0.2017911970615387, + "learning_rate": 9.915516647970283e-06, + "loss": 0.7649, + "step": 94 + }, + { + "autoregressive_loss": 0.1836, + "epoch": 0.2649930264993027, + "grad_norm": 0.14275409281253815, + "learning_rate": 9.9127321491896e-06, + "loss": 0.7344, + "step": 95 + }, + { + "autoregressive_loss": 0.1895, + "epoch": 0.26778242677824265, + "grad_norm": 0.11417101323604584, + "learning_rate": 9.909902907826884e-06, + "loss": 0.7568, + "step": 96 + }, + { + "autoregressive_loss": 0.1982, + "epoch": 0.2705718270571827, + "grad_norm": 0.16609111428260803, + "learning_rate": 9.907028949649376e-06, + "loss": 0.7927, + "step": 97 + }, + { + "autoregressive_loss": 0.1748, + "epoch": 0.2733612273361227, + "grad_norm": 0.1533835083246231, + "learning_rate": 9.904110300831577e-06, + "loss": 0.6996, + "step": 98 + }, + { + "autoregressive_loss": 0.1855, + "epoch": 0.27615062761506276, + "grad_norm": 0.13479143381118774, + "learning_rate": 9.901146987955008e-06, + "loss": 0.7439, + "step": 99 + }, + { + "autoregressive_loss": 0.1914, + "epoch": 0.2789400278940028, + "grad_norm": 0.13857507705688477, + "learning_rate": 9.898139038007962e-06, + "loss": 0.7617, + "step": 100 + }, + { + "autoregressive_loss": 0.1875, + "epoch": 0.28172942817294283, + "grad_norm": 0.11435526609420776, + "learning_rate": 9.895086478385267e-06, + "loss": 0.7498, + "step": 101 + }, + { + "autoregressive_loss": 0.1865, + "epoch": 0.28451882845188287, + "grad_norm": 0.11691369116306305, + "learning_rate": 9.891989336888033e-06, + "loss": 0.7479, + "step": 102 + }, + { + "autoregressive_loss": 0.1885, + "epoch": 0.28730822873082285, + "grad_norm": 0.1401517540216446, + "learning_rate": 9.888847641723394e-06, + "loss": 0.7534, + "step": 103 + }, + { + "autoregressive_loss": 0.1895, + "epoch": 0.2900976290097629, + "grad_norm": 0.1510058045387268, + "learning_rate": 9.88566142150426e-06, + "loss": 0.76, + "step": 104 + }, + { + "autoregressive_loss": 0.1875, + "epoch": 0.2928870292887029, + "grad_norm": 0.11542358249425888, + "learning_rate": 9.88243070524905e-06, + "loss": 0.749, + "step": 105 + }, + { + "autoregressive_loss": 0.1895, + "epoch": 0.29567642956764295, + "grad_norm": 0.1094258576631546, + "learning_rate": 9.87915552238143e-06, + "loss": 0.7571, + "step": 106 + }, + { + "autoregressive_loss": 0.1943, + "epoch": 0.298465829846583, + "grad_norm": 0.15701304376125336, + "learning_rate": 9.87583590273004e-06, + "loss": 0.7788, + "step": 107 + }, + { + "autoregressive_loss": 0.1953, + "epoch": 0.301255230125523, + "grad_norm": 0.13293620944023132, + "learning_rate": 9.872471876528235e-06, + "loss": 0.7791, + "step": 108 + }, + { + "autoregressive_loss": 0.1826, + "epoch": 0.30404463040446306, + "grad_norm": 0.10810933262109756, + "learning_rate": 9.869063474413798e-06, + "loss": 0.7306, + "step": 109 + }, + { + "autoregressive_loss": 0.1973, + "epoch": 0.3068340306834031, + "grad_norm": 0.12312405556440353, + "learning_rate": 9.865610727428661e-06, + "loss": 0.7876, + "step": 110 + }, + { + "autoregressive_loss": 0.1895, + "epoch": 0.30962343096234307, + "grad_norm": 0.17251476645469666, + "learning_rate": 9.862113667018628e-06, + "loss": 0.7576, + "step": 111 + }, + { + "autoregressive_loss": 0.1855, + "epoch": 0.3124128312412831, + "grad_norm": 0.1128096953034401, + "learning_rate": 9.858572325033089e-06, + "loss": 0.7441, + "step": 112 + }, + { + "autoregressive_loss": 0.1855, + "epoch": 0.31520223152022314, + "grad_norm": 0.10087057948112488, + "learning_rate": 9.854986733724724e-06, + "loss": 0.7427, + "step": 113 + }, + { + "autoregressive_loss": 0.1836, + "epoch": 0.3179916317991632, + "grad_norm": 0.10710697621107101, + "learning_rate": 9.851356925749218e-06, + "loss": 0.7368, + "step": 114 + }, + { + "autoregressive_loss": 0.1865, + "epoch": 0.3207810320781032, + "grad_norm": 0.11407770216464996, + "learning_rate": 9.847682934164948e-06, + "loss": 0.7435, + "step": 115 + }, + { + "autoregressive_loss": 0.1934, + "epoch": 0.32357043235704325, + "grad_norm": 0.12674707174301147, + "learning_rate": 9.843964792432701e-06, + "loss": 0.7743, + "step": 116 + }, + { + "autoregressive_loss": 0.1836, + "epoch": 0.3263598326359833, + "grad_norm": 0.09664250910282135, + "learning_rate": 9.840202534415358e-06, + "loss": 0.7341, + "step": 117 + }, + { + "autoregressive_loss": 0.1699, + "epoch": 0.3291492329149233, + "grad_norm": 0.0872848853468895, + "learning_rate": 9.836396194377587e-06, + "loss": 0.679, + "step": 118 + }, + { + "autoregressive_loss": 0.1865, + "epoch": 0.3319386331938633, + "grad_norm": 0.09484855085611343, + "learning_rate": 9.832545806985532e-06, + "loss": 0.7468, + "step": 119 + }, + { + "autoregressive_loss": 0.1875, + "epoch": 0.33472803347280333, + "grad_norm": 0.11564189195632935, + "learning_rate": 9.828651407306495e-06, + "loss": 0.7527, + "step": 120 + }, + { + "autoregressive_loss": 0.1807, + "epoch": 0.33751743375174337, + "grad_norm": 0.14329585433006287, + "learning_rate": 9.824713030808626e-06, + "loss": 0.7222, + "step": 121 + }, + { + "autoregressive_loss": 0.1865, + "epoch": 0.3403068340306834, + "grad_norm": 0.10036856681108475, + "learning_rate": 9.820730713360585e-06, + "loss": 0.7456, + "step": 122 + }, + { + "autoregressive_loss": 0.1816, + "epoch": 0.34309623430962344, + "grad_norm": 0.10172868520021439, + "learning_rate": 9.816704491231226e-06, + "loss": 0.729, + "step": 123 + }, + { + "autoregressive_loss": 0.1826, + "epoch": 0.3458856345885635, + "grad_norm": 0.10828670859336853, + "learning_rate": 9.812634401089265e-06, + "loss": 0.7303, + "step": 124 + }, + { + "autoregressive_loss": 0.1895, + "epoch": 0.3486750348675035, + "grad_norm": 0.13536593317985535, + "learning_rate": 9.808520480002942e-06, + "loss": 0.7595, + "step": 125 + }, + { + "autoregressive_loss": 0.1914, + "epoch": 0.3514644351464435, + "grad_norm": 0.16290919482707977, + "learning_rate": 9.804362765439688e-06, + "loss": 0.7656, + "step": 126 + }, + { + "autoregressive_loss": 0.1855, + "epoch": 0.3542538354253835, + "grad_norm": 0.10276439785957336, + "learning_rate": 9.800161295265782e-06, + "loss": 0.7432, + "step": 127 + }, + { + "autoregressive_loss": 0.1846, + "epoch": 0.35704323570432356, + "grad_norm": 0.1192091628909111, + "learning_rate": 9.795916107746009e-06, + "loss": 0.739, + "step": 128 + }, + { + "autoregressive_loss": 0.1846, + "epoch": 0.3598326359832636, + "grad_norm": 0.09253017604351044, + "learning_rate": 9.7916272415433e-06, + "loss": 0.7383, + "step": 129 + }, + { + "autoregressive_loss": 0.1836, + "epoch": 0.36262203626220363, + "grad_norm": 0.10979027301073074, + "learning_rate": 9.787294735718397e-06, + "loss": 0.7334, + "step": 130 + }, + { + "autoregressive_loss": 0.1729, + "epoch": 0.36541143654114366, + "grad_norm": 0.11798568069934845, + "learning_rate": 9.782918629729486e-06, + "loss": 0.6919, + "step": 131 + }, + { + "autoregressive_loss": 0.1865, + "epoch": 0.3682008368200837, + "grad_norm": 0.11536245048046112, + "learning_rate": 9.778498963431838e-06, + "loss": 0.7463, + "step": 132 + }, + { + "autoregressive_loss": 0.1816, + "epoch": 0.37099023709902373, + "grad_norm": 0.08010507375001907, + "learning_rate": 9.774035777077452e-06, + "loss": 0.7263, + "step": 133 + }, + { + "autoregressive_loss": 0.1895, + "epoch": 0.3737796373779637, + "grad_norm": 0.14403262734413147, + "learning_rate": 9.769529111314683e-06, + "loss": 0.7568, + "step": 134 + }, + { + "autoregressive_loss": 0.1836, + "epoch": 0.37656903765690375, + "grad_norm": 0.11617976427078247, + "learning_rate": 9.764979007187874e-06, + "loss": 0.7357, + "step": 135 + }, + { + "autoregressive_loss": 0.1777, + "epoch": 0.3793584379358438, + "grad_norm": 0.10515890270471573, + "learning_rate": 9.760385506136982e-06, + "loss": 0.7109, + "step": 136 + }, + { + "autoregressive_loss": 0.1797, + "epoch": 0.3821478382147838, + "grad_norm": 0.09763982146978378, + "learning_rate": 9.755748649997197e-06, + "loss": 0.7173, + "step": 137 + }, + { + "autoregressive_loss": 0.1787, + "epoch": 0.38493723849372385, + "grad_norm": 0.07105191797018051, + "learning_rate": 9.751068480998572e-06, + "loss": 0.7122, + "step": 138 + }, + { + "autoregressive_loss": 0.1797, + "epoch": 0.3877266387726639, + "grad_norm": 0.10583049803972244, + "learning_rate": 9.746345041765624e-06, + "loss": 0.7169, + "step": 139 + }, + { + "autoregressive_loss": 0.1865, + "epoch": 0.3905160390516039, + "grad_norm": 0.07271670550107956, + "learning_rate": 9.741578375316953e-06, + "loss": 0.7461, + "step": 140 + }, + { + "autoregressive_loss": 0.1836, + "epoch": 0.39330543933054396, + "grad_norm": 0.07726728916168213, + "learning_rate": 9.736768525064852e-06, + "loss": 0.7346, + "step": 141 + }, + { + "autoregressive_loss": 0.165, + "epoch": 0.39609483960948394, + "grad_norm": 0.1265808492898941, + "learning_rate": 9.731915534814912e-06, + "loss": 0.6605, + "step": 142 + }, + { + "autoregressive_loss": 0.1836, + "epoch": 0.398884239888424, + "grad_norm": 0.09845411032438278, + "learning_rate": 9.727019448765613e-06, + "loss": 0.7339, + "step": 143 + }, + { + "autoregressive_loss": 0.1748, + "epoch": 0.401673640167364, + "grad_norm": 0.087373286485672, + "learning_rate": 9.722080311507938e-06, + "loss": 0.698, + "step": 144 + }, + { + "autoregressive_loss": 0.1738, + "epoch": 0.40446304044630405, + "grad_norm": 0.10568520426750183, + "learning_rate": 9.717098168024948e-06, + "loss": 0.6963, + "step": 145 + }, + { + "autoregressive_loss": 0.1807, + "epoch": 0.4072524407252441, + "grad_norm": 0.12659965455532074, + "learning_rate": 9.712073063691388e-06, + "loss": 0.722, + "step": 146 + }, + { + "autoregressive_loss": 0.1758, + "epoch": 0.4100418410041841, + "grad_norm": 0.08432243019342422, + "learning_rate": 9.707005044273268e-06, + "loss": 0.7061, + "step": 147 + }, + { + "autoregressive_loss": 0.1729, + "epoch": 0.41283124128312415, + "grad_norm": 0.07705723494291306, + "learning_rate": 9.701894155927445e-06, + "loss": 0.693, + "step": 148 + }, + { + "autoregressive_loss": 0.168, + "epoch": 0.41562064156206413, + "grad_norm": 0.09215715527534485, + "learning_rate": 9.696740445201202e-06, + "loss": 0.6724, + "step": 149 + }, + { + "autoregressive_loss": 0.1777, + "epoch": 0.41841004184100417, + "grad_norm": 0.1282772272825241, + "learning_rate": 9.691543959031831e-06, + "loss": 0.7102, + "step": 150 + }, + { + "autoregressive_loss": 0.1797, + "epoch": 0.4211994421199442, + "grad_norm": 0.0899290144443512, + "learning_rate": 9.68630474474619e-06, + "loss": 0.7198, + "step": 151 + }, + { + "autoregressive_loss": 0.1816, + "epoch": 0.42398884239888424, + "grad_norm": 0.13625995814800262, + "learning_rate": 9.681022850060297e-06, + "loss": 0.7283, + "step": 152 + }, + { + "autoregressive_loss": 0.1807, + "epoch": 0.42677824267782427, + "grad_norm": 0.10605072975158691, + "learning_rate": 9.675698323078865e-06, + "loss": 0.7217, + "step": 153 + }, + { + "autoregressive_loss": 0.1758, + "epoch": 0.4295676429567643, + "grad_norm": 0.13614769279956818, + "learning_rate": 9.67033121229489e-06, + "loss": 0.7018, + "step": 154 + }, + { + "autoregressive_loss": 0.1787, + "epoch": 0.43235704323570434, + "grad_norm": 0.08153500407934189, + "learning_rate": 9.664921566589195e-06, + "loss": 0.7136, + "step": 155 + }, + { + "autoregressive_loss": 0.1777, + "epoch": 0.4351464435146444, + "grad_norm": 0.13264736533164978, + "learning_rate": 9.659469435229992e-06, + "loss": 0.7075, + "step": 156 + }, + { + "autoregressive_loss": 0.1807, + "epoch": 0.43793584379358436, + "grad_norm": 0.08948522061109543, + "learning_rate": 9.653974867872424e-06, + "loss": 0.7222, + "step": 157 + }, + { + "autoregressive_loss": 0.1719, + "epoch": 0.4407252440725244, + "grad_norm": 0.08192367851734161, + "learning_rate": 9.648437914558126e-06, + "loss": 0.6859, + "step": 158 + }, + { + "autoregressive_loss": 0.1748, + "epoch": 0.4435146443514644, + "grad_norm": 0.11999509483575821, + "learning_rate": 9.642858625714753e-06, + "loss": 0.6982, + "step": 159 + }, + { + "autoregressive_loss": 0.1738, + "epoch": 0.44630404463040446, + "grad_norm": 0.10731622576713562, + "learning_rate": 9.637237052155541e-06, + "loss": 0.6929, + "step": 160 + }, + { + "autoregressive_loss": 0.1719, + "epoch": 0.4490934449093445, + "grad_norm": 0.06534873694181442, + "learning_rate": 9.631573245078823e-06, + "loss": 0.6869, + "step": 161 + }, + { + "autoregressive_loss": 0.1729, + "epoch": 0.45188284518828453, + "grad_norm": 0.1699635237455368, + "learning_rate": 9.625867256067577e-06, + "loss": 0.689, + "step": 162 + }, + { + "autoregressive_loss": 0.1631, + "epoch": 0.45467224546722457, + "grad_norm": 0.0832730159163475, + "learning_rate": 9.620119137088954e-06, + "loss": 0.6498, + "step": 163 + }, + { + "autoregressive_loss": 0.1895, + "epoch": 0.45746164574616455, + "grad_norm": 0.11415934562683105, + "learning_rate": 9.614328940493797e-06, + "loss": 0.7568, + "step": 164 + }, + { + "autoregressive_loss": 0.1816, + "epoch": 0.4602510460251046, + "grad_norm": 0.1522754728794098, + "learning_rate": 9.608496719016176e-06, + "loss": 0.7252, + "step": 165 + }, + { + "autoregressive_loss": 0.1738, + "epoch": 0.4630404463040446, + "grad_norm": 0.05820317938923836, + "learning_rate": 9.602622525772895e-06, + "loss": 0.6943, + "step": 166 + }, + { + "autoregressive_loss": 0.1582, + "epoch": 0.46582984658298465, + "grad_norm": 0.10029690712690353, + "learning_rate": 9.596706414263022e-06, + "loss": 0.6337, + "step": 167 + }, + { + "autoregressive_loss": 0.1777, + "epoch": 0.4686192468619247, + "grad_norm": 0.08294606953859329, + "learning_rate": 9.59074843836739e-06, + "loss": 0.7131, + "step": 168 + }, + { + "autoregressive_loss": 0.168, + "epoch": 0.4714086471408647, + "grad_norm": 0.09536946564912796, + "learning_rate": 9.584748652348107e-06, + "loss": 0.6714, + "step": 169 + }, + { + "autoregressive_loss": 0.1768, + "epoch": 0.47419804741980476, + "grad_norm": 0.14618976414203644, + "learning_rate": 9.578707110848077e-06, + "loss": 0.707, + "step": 170 + }, + { + "autoregressive_loss": 0.1768, + "epoch": 0.4769874476987448, + "grad_norm": 0.1500958800315857, + "learning_rate": 9.572623868890482e-06, + "loss": 0.7065, + "step": 171 + }, + { + "autoregressive_loss": 0.1777, + "epoch": 0.4797768479776848, + "grad_norm": 0.13136450946331024, + "learning_rate": 9.566498981878289e-06, + "loss": 0.7112, + "step": 172 + }, + { + "autoregressive_loss": 0.1719, + "epoch": 0.4825662482566248, + "grad_norm": 0.09714750945568085, + "learning_rate": 9.560332505593754e-06, + "loss": 0.6857, + "step": 173 + }, + { + "autoregressive_loss": 0.1689, + "epoch": 0.48535564853556484, + "grad_norm": 0.12801992893218994, + "learning_rate": 9.554124496197899e-06, + "loss": 0.6759, + "step": 174 + }, + { + "autoregressive_loss": 0.1797, + "epoch": 0.4881450488145049, + "grad_norm": 0.11420992761850357, + "learning_rate": 9.547875010230009e-06, + "loss": 0.7164, + "step": 175 + }, + { + "autoregressive_loss": 0.1836, + "epoch": 0.4909344490934449, + "grad_norm": 0.13228189945220947, + "learning_rate": 9.54158410460712e-06, + "loss": 0.7363, + "step": 176 + }, + { + "autoregressive_loss": 0.1689, + "epoch": 0.49372384937238495, + "grad_norm": 0.14229175448417664, + "learning_rate": 9.535251836623491e-06, + "loss": 0.6746, + "step": 177 + }, + { + "autoregressive_loss": 0.1748, + "epoch": 0.496513249651325, + "grad_norm": 0.1196829155087471, + "learning_rate": 9.528878263950094e-06, + "loss": 0.6979, + "step": 178 + }, + { + "autoregressive_loss": 0.166, + "epoch": 0.499302649930265, + "grad_norm": 0.08301281929016113, + "learning_rate": 9.522463444634075e-06, + "loss": 0.663, + "step": 179 + }, + { + "autoregressive_loss": 0.1709, + "epoch": 0.502092050209205, + "grad_norm": 0.14559686183929443, + "learning_rate": 9.516007437098238e-06, + "loss": 0.6832, + "step": 180 + }, + { + "autoregressive_loss": 0.166, + "epoch": 0.504881450488145, + "grad_norm": 0.11781580001115799, + "learning_rate": 9.509510300140506e-06, + "loss": 0.6666, + "step": 181 + }, + { + "autoregressive_loss": 0.166, + "epoch": 0.5076708507670851, + "grad_norm": 0.10211572796106339, + "learning_rate": 9.502972092933384e-06, + "loss": 0.6628, + "step": 182 + }, + { + "autoregressive_loss": 0.1699, + "epoch": 0.5104602510460251, + "grad_norm": 0.09645526856184006, + "learning_rate": 9.496392875023433e-06, + "loss": 0.6787, + "step": 183 + }, + { + "autoregressive_loss": 0.1709, + "epoch": 0.5132496513249651, + "grad_norm": 0.09024081379175186, + "learning_rate": 9.489772706330707e-06, + "loss": 0.6836, + "step": 184 + }, + { + "autoregressive_loss": 0.1689, + "epoch": 0.5160390516039052, + "grad_norm": 0.2551220953464508, + "learning_rate": 9.483111647148223e-06, + "loss": 0.6752, + "step": 185 + }, + { + "autoregressive_loss": 0.1602, + "epoch": 0.5188284518828452, + "grad_norm": 0.06289787590503693, + "learning_rate": 9.476409758141404e-06, + "loss": 0.6421, + "step": 186 + }, + { + "autoregressive_loss": 0.1738, + "epoch": 0.5216178521617852, + "grad_norm": 0.1165032610297203, + "learning_rate": 9.469667100347539e-06, + "loss": 0.6938, + "step": 187 + }, + { + "autoregressive_loss": 0.1748, + "epoch": 0.5244072524407253, + "grad_norm": 0.1731247454881668, + "learning_rate": 9.462883735175205e-06, + "loss": 0.7002, + "step": 188 + }, + { + "autoregressive_loss": 0.1699, + "epoch": 0.5271966527196653, + "grad_norm": 0.13784192502498627, + "learning_rate": 9.45605972440373e-06, + "loss": 0.6786, + "step": 189 + }, + { + "autoregressive_loss": 0.1699, + "epoch": 0.5299860529986054, + "grad_norm": 0.08362855017185211, + "learning_rate": 9.449195130182614e-06, + "loss": 0.676, + "step": 190 + }, + { + "autoregressive_loss": 0.1738, + "epoch": 0.5327754532775453, + "grad_norm": 0.14091749489307404, + "learning_rate": 9.442290015030974e-06, + "loss": 0.6936, + "step": 191 + }, + { + "autoregressive_loss": 0.1738, + "epoch": 0.5355648535564853, + "grad_norm": 0.1334657520055771, + "learning_rate": 9.43534444183697e-06, + "loss": 0.697, + "step": 192 + }, + { + "autoregressive_loss": 0.1709, + "epoch": 0.5383542538354253, + "grad_norm": 0.10047294199466705, + "learning_rate": 9.42835847385723e-06, + "loss": 0.6836, + "step": 193 + }, + { + "autoregressive_loss": 0.1758, + "epoch": 0.5411436541143654, + "grad_norm": 0.07055521756410599, + "learning_rate": 9.42133217471628e-06, + "loss": 0.7015, + "step": 194 + }, + { + "autoregressive_loss": 0.168, + "epoch": 0.5439330543933054, + "grad_norm": 0.1407857984304428, + "learning_rate": 9.414265608405956e-06, + "loss": 0.6713, + "step": 195 + }, + { + "autoregressive_loss": 0.1621, + "epoch": 0.5467224546722455, + "grad_norm": 0.11281712353229523, + "learning_rate": 9.407158839284836e-06, + "loss": 0.6472, + "step": 196 + }, + { + "autoregressive_loss": 0.165, + "epoch": 0.5495118549511855, + "grad_norm": 0.06422466784715652, + "learning_rate": 9.40001193207763e-06, + "loss": 0.6599, + "step": 197 + }, + { + "autoregressive_loss": 0.1689, + "epoch": 0.5523012552301255, + "grad_norm": 0.06482964754104614, + "learning_rate": 9.392824951874618e-06, + "loss": 0.6748, + "step": 198 + }, + { + "autoregressive_loss": 0.166, + "epoch": 0.5550906555090656, + "grad_norm": 0.06380688399076462, + "learning_rate": 9.385597964131033e-06, + "loss": 0.6621, + "step": 199 + }, + { + "autoregressive_loss": 0.1709, + "epoch": 0.5578800557880056, + "grad_norm": 0.1602230817079544, + "learning_rate": 9.378331034666483e-06, + "loss": 0.6844, + "step": 200 + }, + { + "autoregressive_loss": 0.165, + "epoch": 0.5606694560669456, + "grad_norm": 0.08944359421730042, + "learning_rate": 9.371024229664342e-06, + "loss": 0.6595, + "step": 201 + }, + { + "autoregressive_loss": 0.1709, + "epoch": 0.5634588563458857, + "grad_norm": 0.14377856254577637, + "learning_rate": 9.363677615671148e-06, + "loss": 0.6841, + "step": 202 + }, + { + "autoregressive_loss": 0.166, + "epoch": 0.5662482566248257, + "grad_norm": 0.07669984549283981, + "learning_rate": 9.356291259596e-06, + "loss": 0.6633, + "step": 203 + }, + { + "autoregressive_loss": 0.167, + "epoch": 0.5690376569037657, + "grad_norm": 0.10646996647119522, + "learning_rate": 9.348865228709947e-06, + "loss": 0.6677, + "step": 204 + }, + { + "autoregressive_loss": 0.1631, + "epoch": 0.5718270571827058, + "grad_norm": 0.07175736129283905, + "learning_rate": 9.341399590645373e-06, + "loss": 0.651, + "step": 205 + }, + { + "autoregressive_loss": 0.1777, + "epoch": 0.5746164574616457, + "grad_norm": 0.17666535079479218, + "learning_rate": 9.333894413395388e-06, + "loss": 0.7136, + "step": 206 + }, + { + "autoregressive_loss": 0.1709, + "epoch": 0.5774058577405857, + "grad_norm": 0.10137853026390076, + "learning_rate": 9.326349765313199e-06, + "loss": 0.6862, + "step": 207 + }, + { + "autoregressive_loss": 0.166, + "epoch": 0.5801952580195258, + "grad_norm": 0.09380345791578293, + "learning_rate": 9.318765715111497e-06, + "loss": 0.6638, + "step": 208 + }, + { + "autoregressive_loss": 0.1543, + "epoch": 0.5829846582984658, + "grad_norm": 0.11779138445854187, + "learning_rate": 9.311142331861821e-06, + "loss": 0.616, + "step": 209 + }, + { + "autoregressive_loss": 0.1719, + "epoch": 0.5857740585774058, + "grad_norm": 0.14351823925971985, + "learning_rate": 9.303479684993943e-06, + "loss": 0.6858, + "step": 210 + }, + { + "autoregressive_loss": 0.168, + "epoch": 0.5885634588563459, + "grad_norm": 0.12170115858316422, + "learning_rate": 9.295777844295219e-06, + "loss": 0.672, + "step": 211 + }, + { + "autoregressive_loss": 0.1758, + "epoch": 0.5913528591352859, + "grad_norm": 0.0903005450963974, + "learning_rate": 9.288036879909967e-06, + "loss": 0.7036, + "step": 212 + }, + { + "autoregressive_loss": 0.168, + "epoch": 0.5941422594142259, + "grad_norm": 0.2040717452764511, + "learning_rate": 9.280256862338822e-06, + "loss": 0.6702, + "step": 213 + }, + { + "autoregressive_loss": 0.167, + "epoch": 0.596931659693166, + "grad_norm": 0.09943161904811859, + "learning_rate": 9.272437862438095e-06, + "loss": 0.67, + "step": 214 + }, + { + "autoregressive_loss": 0.1719, + "epoch": 0.599721059972106, + "grad_norm": 0.14410926401615143, + "learning_rate": 9.264579951419126e-06, + "loss": 0.6864, + "step": 215 + }, + { + "autoregressive_loss": 0.1729, + "epoch": 0.602510460251046, + "grad_norm": 0.15036122500896454, + "learning_rate": 9.256683200847638e-06, + "loss": 0.6918, + "step": 216 + }, + { + "autoregressive_loss": 0.1689, + "epoch": 0.6052998605299861, + "grad_norm": 0.1067957952618599, + "learning_rate": 9.248747682643085e-06, + "loss": 0.6755, + "step": 217 + }, + { + "autoregressive_loss": 0.1641, + "epoch": 0.6080892608089261, + "grad_norm": 0.1623372733592987, + "learning_rate": 9.240773469077994e-06, + "loss": 0.6553, + "step": 218 + }, + { + "autoregressive_loss": 0.1631, + "epoch": 0.6108786610878661, + "grad_norm": 0.057607490569353104, + "learning_rate": 9.232760632777311e-06, + "loss": 0.6517, + "step": 219 + }, + { + "autoregressive_loss": 0.1621, + "epoch": 0.6136680613668062, + "grad_norm": 0.06419090181589127, + "learning_rate": 9.22470924671774e-06, + "loss": 0.6483, + "step": 220 + }, + { + "autoregressive_loss": 0.1621, + "epoch": 0.6164574616457462, + "grad_norm": 0.10253150761127472, + "learning_rate": 9.216619384227068e-06, + "loss": 0.65, + "step": 221 + }, + { + "autoregressive_loss": 0.1641, + "epoch": 0.6192468619246861, + "grad_norm": 0.07451798766851425, + "learning_rate": 9.208491118983515e-06, + "loss": 0.6591, + "step": 222 + }, + { + "autoregressive_loss": 0.1602, + "epoch": 0.6220362622036262, + "grad_norm": 0.06085500493645668, + "learning_rate": 9.200324525015046e-06, + "loss": 0.6384, + "step": 223 + }, + { + "autoregressive_loss": 0.1709, + "epoch": 0.6248256624825662, + "grad_norm": 0.13162988424301147, + "learning_rate": 9.192119676698703e-06, + "loss": 0.6823, + "step": 224 + }, + { + "autoregressive_loss": 0.1641, + "epoch": 0.6276150627615062, + "grad_norm": 0.10592012852430344, + "learning_rate": 9.183876648759937e-06, + "loss": 0.656, + "step": 225 + }, + { + "autoregressive_loss": 0.1582, + "epoch": 0.6304044630404463, + "grad_norm": 0.07134976238012314, + "learning_rate": 9.175595516271911e-06, + "loss": 0.631, + "step": 226 + }, + { + "autoregressive_loss": 0.1641, + "epoch": 0.6331938633193863, + "grad_norm": 0.15556298196315765, + "learning_rate": 9.167276354654827e-06, + "loss": 0.6582, + "step": 227 + }, + { + "autoregressive_loss": 0.1621, + "epoch": 0.6359832635983264, + "grad_norm": 0.07725589722394943, + "learning_rate": 9.158919239675237e-06, + "loss": 0.647, + "step": 228 + }, + { + "autoregressive_loss": 0.1514, + "epoch": 0.6387726638772664, + "grad_norm": 0.0608401820063591, + "learning_rate": 9.150524247445346e-06, + "loss": 0.6041, + "step": 229 + }, + { + "autoregressive_loss": 0.1611, + "epoch": 0.6415620641562064, + "grad_norm": 0.0770239308476448, + "learning_rate": 9.14209145442234e-06, + "loss": 0.6455, + "step": 230 + }, + { + "autoregressive_loss": 0.1514, + "epoch": 0.6443514644351465, + "grad_norm": 0.10680093616247177, + "learning_rate": 9.133620937407656e-06, + "loss": 0.6068, + "step": 231 + }, + { + "autoregressive_loss": 0.1514, + "epoch": 0.6471408647140865, + "grad_norm": 0.09500687569379807, + "learning_rate": 9.125112773546315e-06, + "loss": 0.6053, + "step": 232 + }, + { + "autoregressive_loss": 0.1631, + "epoch": 0.6499302649930265, + "grad_norm": 0.09036453813314438, + "learning_rate": 9.1165670403262e-06, + "loss": 0.6503, + "step": 233 + }, + { + "autoregressive_loss": 0.1582, + "epoch": 0.6527196652719666, + "grad_norm": 0.08100596070289612, + "learning_rate": 9.107983815577359e-06, + "loss": 0.6334, + "step": 234 + }, + { + "autoregressive_loss": 0.1641, + "epoch": 0.6555090655509066, + "grad_norm": 0.10760966688394547, + "learning_rate": 9.09936317747129e-06, + "loss": 0.6564, + "step": 235 + }, + { + "autoregressive_loss": 0.1523, + "epoch": 0.6582984658298466, + "grad_norm": 0.06605391204357147, + "learning_rate": 9.090705204520231e-06, + "loss": 0.6085, + "step": 236 + }, + { + "autoregressive_loss": 0.1719, + "epoch": 0.6610878661087866, + "grad_norm": 0.09414174407720566, + "learning_rate": 9.082009975576452e-06, + "loss": 0.6901, + "step": 237 + }, + { + "autoregressive_loss": 0.1621, + "epoch": 0.6638772663877266, + "grad_norm": 0.19530493021011353, + "learning_rate": 9.073277569831526e-06, + "loss": 0.6464, + "step": 238 + }, + { + "autoregressive_loss": 0.1543, + "epoch": 0.6666666666666666, + "grad_norm": 0.09775504469871521, + "learning_rate": 9.064508066815614e-06, + "loss": 0.6161, + "step": 239 + }, + { + "autoregressive_loss": 0.1641, + "epoch": 0.6694560669456067, + "grad_norm": 0.1467278152704239, + "learning_rate": 9.05570154639674e-06, + "loss": 0.6552, + "step": 240 + }, + { + "autoregressive_loss": 0.1641, + "epoch": 0.6722454672245467, + "grad_norm": 0.13512744009494781, + "learning_rate": 9.046858088780064e-06, + "loss": 0.657, + "step": 241 + }, + { + "autoregressive_loss": 0.1562, + "epoch": 0.6750348675034867, + "grad_norm": 0.06827836483716965, + "learning_rate": 9.03797777450715e-06, + "loss": 0.6235, + "step": 242 + }, + { + "autoregressive_loss": 0.167, + "epoch": 0.6778242677824268, + "grad_norm": 0.05136014148592949, + "learning_rate": 9.02906068445523e-06, + "loss": 0.6689, + "step": 243 + }, + { + "autoregressive_loss": 0.165, + "epoch": 0.6806136680613668, + "grad_norm": 0.08220921456813812, + "learning_rate": 9.020106899836471e-06, + "loss": 0.6582, + "step": 244 + }, + { + "autoregressive_loss": 0.166, + "epoch": 0.6834030683403068, + "grad_norm": 0.1343540996313095, + "learning_rate": 9.011116502197243e-06, + "loss": 0.6637, + "step": 245 + }, + { + "autoregressive_loss": 0.1592, + "epoch": 0.6861924686192469, + "grad_norm": 0.14085440337657928, + "learning_rate": 9.002089573417356e-06, + "loss": 0.6388, + "step": 246 + }, + { + "autoregressive_loss": 0.166, + "epoch": 0.6889818688981869, + "grad_norm": 0.13726770877838135, + "learning_rate": 8.993026195709337e-06, + "loss": 0.6649, + "step": 247 + }, + { + "autoregressive_loss": 0.1602, + "epoch": 0.691771269177127, + "grad_norm": 0.12011515349149704, + "learning_rate": 8.983926451617664e-06, + "loss": 0.6396, + "step": 248 + }, + { + "autoregressive_loss": 0.1641, + "epoch": 0.694560669456067, + "grad_norm": 0.06411609798669815, + "learning_rate": 8.974790424018022e-06, + "loss": 0.6543, + "step": 249 + }, + { + "autoregressive_loss": 0.1602, + "epoch": 0.697350069735007, + "grad_norm": 0.05792788043618202, + "learning_rate": 8.96561819611655e-06, + "loss": 0.6394, + "step": 250 + }, + { + "autoregressive_loss": 0.1602, + "epoch": 0.700139470013947, + "grad_norm": 0.17067191004753113, + "learning_rate": 8.956409851449076e-06, + "loss": 0.6393, + "step": 251 + }, + { + "autoregressive_loss": 0.1523, + "epoch": 0.702928870292887, + "grad_norm": 0.05837100371718407, + "learning_rate": 8.947165473880364e-06, + "loss": 0.6108, + "step": 252 + }, + { + "autoregressive_loss": 0.1602, + "epoch": 0.705718270571827, + "grad_norm": 0.060978155583143234, + "learning_rate": 8.937885147603345e-06, + "loss": 0.6384, + "step": 253 + }, + { + "autoregressive_loss": 0.1592, + "epoch": 0.708507670850767, + "grad_norm": 0.11952283978462219, + "learning_rate": 8.928568957138356e-06, + "loss": 0.6351, + "step": 254 + }, + { + "autoregressive_loss": 0.1621, + "epoch": 0.7112970711297071, + "grad_norm": 0.12812203168869019, + "learning_rate": 8.919216987332357e-06, + "loss": 0.6484, + "step": 255 + }, + { + "autoregressive_loss": 0.1494, + "epoch": 0.7140864714086471, + "grad_norm": 0.048089317977428436, + "learning_rate": 8.909829323358177e-06, + "loss": 0.5983, + "step": 256 + }, + { + "autoregressive_loss": 0.1562, + "epoch": 0.7168758716875872, + "grad_norm": 0.12682293355464935, + "learning_rate": 8.900406050713723e-06, + "loss": 0.6251, + "step": 257 + }, + { + "autoregressive_loss": 0.1523, + "epoch": 0.7196652719665272, + "grad_norm": 0.0492214635014534, + "learning_rate": 8.89094725522121e-06, + "loss": 0.6111, + "step": 258 + }, + { + "autoregressive_loss": 0.1572, + "epoch": 0.7224546722454672, + "grad_norm": 0.08854146301746368, + "learning_rate": 8.881453023026373e-06, + "loss": 0.6307, + "step": 259 + }, + { + "autoregressive_loss": 0.1543, + "epoch": 0.7252440725244073, + "grad_norm": 0.06917643547058105, + "learning_rate": 8.871923440597694e-06, + "loss": 0.6158, + "step": 260 + }, + { + "autoregressive_loss": 0.1572, + "epoch": 0.7280334728033473, + "grad_norm": 0.10623205453157425, + "learning_rate": 8.862358594725595e-06, + "loss": 0.6288, + "step": 261 + }, + { + "autoregressive_loss": 0.1592, + "epoch": 0.7308228730822873, + "grad_norm": 0.10860224813222885, + "learning_rate": 8.852758572521666e-06, + "loss": 0.6373, + "step": 262 + }, + { + "autoregressive_loss": 0.1562, + "epoch": 0.7336122733612274, + "grad_norm": 0.11102679371833801, + "learning_rate": 8.843123461417864e-06, + "loss": 0.6249, + "step": 263 + }, + { + "autoregressive_loss": 0.1611, + "epoch": 0.7364016736401674, + "grad_norm": 0.15997837483882904, + "learning_rate": 8.833453349165713e-06, + "loss": 0.6465, + "step": 264 + }, + { + "autoregressive_loss": 0.1641, + "epoch": 0.7391910739191074, + "grad_norm": 0.14682376384735107, + "learning_rate": 8.823748323835517e-06, + "loss": 0.6528, + "step": 265 + }, + { + "autoregressive_loss": 0.166, + "epoch": 0.7419804741980475, + "grad_norm": 0.04337232932448387, + "learning_rate": 8.814008473815542e-06, + "loss": 0.6624, + "step": 266 + }, + { + "autoregressive_loss": 0.1562, + "epoch": 0.7447698744769874, + "grad_norm": 0.09644059091806412, + "learning_rate": 8.804233887811224e-06, + "loss": 0.6262, + "step": 267 + }, + { + "autoregressive_loss": 0.1602, + "epoch": 0.7475592747559274, + "grad_norm": 0.12444702535867691, + "learning_rate": 8.794424654844352e-06, + "loss": 0.6394, + "step": 268 + }, + { + "autoregressive_loss": 0.1641, + "epoch": 0.7503486750348675, + "grad_norm": 0.18358121812343597, + "learning_rate": 8.784580864252266e-06, + "loss": 0.656, + "step": 269 + }, + { + "autoregressive_loss": 0.1543, + "epoch": 0.7531380753138075, + "grad_norm": 0.057986415922641754, + "learning_rate": 8.774702605687036e-06, + "loss": 0.6149, + "step": 270 + }, + { + "autoregressive_loss": 0.1572, + "epoch": 0.7559274755927475, + "grad_norm": 0.06545285880565643, + "learning_rate": 8.764789969114647e-06, + "loss": 0.629, + "step": 271 + }, + { + "autoregressive_loss": 0.1562, + "epoch": 0.7587168758716876, + "grad_norm": 0.045518986880779266, + "learning_rate": 8.754843044814183e-06, + "loss": 0.6265, + "step": 272 + }, + { + "autoregressive_loss": 0.1543, + "epoch": 0.7615062761506276, + "grad_norm": 0.0992431566119194, + "learning_rate": 8.744861923377e-06, + "loss": 0.6204, + "step": 273 + }, + { + "autoregressive_loss": 0.1504, + "epoch": 0.7642956764295676, + "grad_norm": 0.06116944178938866, + "learning_rate": 8.734846695705912e-06, + "loss": 0.604, + "step": 274 + }, + { + "autoregressive_loss": 0.1562, + "epoch": 0.7670850767085077, + "grad_norm": 0.060240115970373154, + "learning_rate": 8.724797453014342e-06, + "loss": 0.6273, + "step": 275 + }, + { + "autoregressive_loss": 0.1504, + "epoch": 0.7698744769874477, + "grad_norm": 0.06454778462648392, + "learning_rate": 8.714714286825512e-06, + "loss": 0.6014, + "step": 276 + }, + { + "autoregressive_loss": 0.1494, + "epoch": 0.7726638772663877, + "grad_norm": 0.07875101268291473, + "learning_rate": 8.704597288971598e-06, + "loss": 0.5967, + "step": 277 + }, + { + "autoregressive_loss": 0.1377, + "epoch": 0.7754532775453278, + "grad_norm": 0.06594163179397583, + "learning_rate": 8.6944465515929e-06, + "loss": 0.5526, + "step": 278 + }, + { + "autoregressive_loss": 0.1602, + "epoch": 0.7782426778242678, + "grad_norm": 0.07465764880180359, + "learning_rate": 8.684262167136999e-06, + "loss": 0.6404, + "step": 279 + }, + { + "autoregressive_loss": 0.1631, + "epoch": 0.7810320781032078, + "grad_norm": 0.14250822365283966, + "learning_rate": 8.674044228357915e-06, + "loss": 0.6519, + "step": 280 + }, + { + "autoregressive_loss": 0.1602, + "epoch": 0.7838214783821479, + "grad_norm": 0.06395579129457474, + "learning_rate": 8.663792828315259e-06, + "loss": 0.6418, + "step": 281 + }, + { + "autoregressive_loss": 0.1494, + "epoch": 0.7866108786610879, + "grad_norm": 0.09353912621736526, + "learning_rate": 8.6535080603734e-06, + "loss": 0.5986, + "step": 282 + }, + { + "autoregressive_loss": 0.1582, + "epoch": 0.7894002789400278, + "grad_norm": 0.14470505714416504, + "learning_rate": 8.643190018200595e-06, + "loss": 0.6342, + "step": 283 + }, + { + "autoregressive_loss": 0.1523, + "epoch": 0.7921896792189679, + "grad_norm": 0.05637766420841217, + "learning_rate": 8.632838795768149e-06, + "loss": 0.6086, + "step": 284 + }, + { + "autoregressive_loss": 0.1572, + "epoch": 0.7949790794979079, + "grad_norm": 0.13812001049518585, + "learning_rate": 8.622454487349556e-06, + "loss": 0.6293, + "step": 285 + }, + { + "autoregressive_loss": 0.1582, + "epoch": 0.797768479776848, + "grad_norm": 0.1121237501502037, + "learning_rate": 8.612037187519635e-06, + "loss": 0.6329, + "step": 286 + }, + { + "autoregressive_loss": 0.1543, + "epoch": 0.800557880055788, + "grad_norm": 0.06742466241121292, + "learning_rate": 8.601586991153681e-06, + "loss": 0.6149, + "step": 287 + }, + { + "autoregressive_loss": 0.1543, + "epoch": 0.803347280334728, + "grad_norm": 0.08456873893737793, + "learning_rate": 8.591103993426588e-06, + "loss": 0.6187, + "step": 288 + }, + { + "autoregressive_loss": 0.1514, + "epoch": 0.806136680613668, + "grad_norm": 0.12323816865682602, + "learning_rate": 8.580588289811987e-06, + "loss": 0.605, + "step": 289 + }, + { + "autoregressive_loss": 0.1543, + "epoch": 0.8089260808926081, + "grad_norm": 0.10465039312839508, + "learning_rate": 8.570039976081382e-06, + "loss": 0.6177, + "step": 290 + }, + { + "autoregressive_loss": 0.1484, + "epoch": 0.8117154811715481, + "grad_norm": 0.0579032301902771, + "learning_rate": 8.559459148303268e-06, + "loss": 0.597, + "step": 291 + }, + { + "autoregressive_loss": 0.1504, + "epoch": 0.8145048814504882, + "grad_norm": 0.07441180944442749, + "learning_rate": 8.548845902842264e-06, + "loss": 0.6005, + "step": 292 + }, + { + "autoregressive_loss": 0.1445, + "epoch": 0.8172942817294282, + "grad_norm": 0.05027826875448227, + "learning_rate": 8.538200336358227e-06, + "loss": 0.5808, + "step": 293 + }, + { + "autoregressive_loss": 0.1523, + "epoch": 0.8200836820083682, + "grad_norm": 0.06367678940296173, + "learning_rate": 8.527522545805386e-06, + "loss": 0.6106, + "step": 294 + }, + { + "autoregressive_loss": 0.1465, + "epoch": 0.8228730822873083, + "grad_norm": 0.09541547298431396, + "learning_rate": 8.51681262843144e-06, + "loss": 0.5892, + "step": 295 + }, + { + "autoregressive_loss": 0.1582, + "epoch": 0.8256624825662483, + "grad_norm": 0.06780950725078583, + "learning_rate": 8.50607068177669e-06, + "loss": 0.6327, + "step": 296 + }, + { + "autoregressive_loss": 0.1455, + "epoch": 0.8284518828451883, + "grad_norm": 0.05408608913421631, + "learning_rate": 8.495296803673138e-06, + "loss": 0.5813, + "step": 297 + }, + { + "autoregressive_loss": 0.1562, + "epoch": 0.8312412831241283, + "grad_norm": 0.09197176992893219, + "learning_rate": 8.484491092243603e-06, + "loss": 0.6243, + "step": 298 + }, + { + "autoregressive_loss": 0.1523, + "epoch": 0.8340306834030683, + "grad_norm": 0.13312771916389465, + "learning_rate": 8.473653645900825e-06, + "loss": 0.6072, + "step": 299 + }, + { + "autoregressive_loss": 0.1602, + "epoch": 0.8368200836820083, + "grad_norm": 0.11839145421981812, + "learning_rate": 8.462784563346567e-06, + "loss": 0.6414, + "step": 300 + }, + { + "autoregressive_loss": 0.1553, + "epoch": 0.8396094839609484, + "grad_norm": 0.0908496230840683, + "learning_rate": 8.451883943570722e-06, + "loss": 0.6222, + "step": 301 + }, + { + "autoregressive_loss": 0.1504, + "epoch": 0.8423988842398884, + "grad_norm": 0.12035638839006424, + "learning_rate": 8.440951885850402e-06, + "loss": 0.6023, + "step": 302 + }, + { + "autoregressive_loss": 0.1494, + "epoch": 0.8451882845188284, + "grad_norm": 0.10368836671113968, + "learning_rate": 8.429988489749045e-06, + "loss": 0.5977, + "step": 303 + }, + { + "autoregressive_loss": 0.1562, + "epoch": 0.8479776847977685, + "grad_norm": 0.08453302830457687, + "learning_rate": 8.418993855115498e-06, + "loss": 0.6239, + "step": 304 + }, + { + "autoregressive_loss": 0.1592, + "epoch": 0.8507670850767085, + "grad_norm": 0.20027711987495422, + "learning_rate": 8.407968082083116e-06, + "loss": 0.6368, + "step": 305 + }, + { + "autoregressive_loss": 0.1602, + "epoch": 0.8535564853556485, + "grad_norm": 0.05296142399311066, + "learning_rate": 8.396911271068842e-06, + "loss": 0.6415, + "step": 306 + }, + { + "autoregressive_loss": 0.1396, + "epoch": 0.8563458856345886, + "grad_norm": 0.07334242016077042, + "learning_rate": 8.385823522772299e-06, + "loss": 0.558, + "step": 307 + }, + { + "autoregressive_loss": 0.1514, + "epoch": 0.8591352859135286, + "grad_norm": 0.17954812943935394, + "learning_rate": 8.37470493817487e-06, + "loss": 0.6041, + "step": 308 + }, + { + "autoregressive_loss": 0.1582, + "epoch": 0.8619246861924686, + "grad_norm": 0.11821388453245163, + "learning_rate": 8.36355561853878e-06, + "loss": 0.6307, + "step": 309 + }, + { + "autoregressive_loss": 0.1533, + "epoch": 0.8647140864714087, + "grad_norm": 0.16179586946964264, + "learning_rate": 8.352375665406171e-06, + "loss": 0.6149, + "step": 310 + }, + { + "autoregressive_loss": 0.1406, + "epoch": 0.8675034867503487, + "grad_norm": 0.14789487421512604, + "learning_rate": 8.341165180598182e-06, + "loss": 0.5625, + "step": 311 + }, + { + "autoregressive_loss": 0.1465, + "epoch": 0.8702928870292888, + "grad_norm": 0.056019753217697144, + "learning_rate": 8.32992426621401e-06, + "loss": 0.5876, + "step": 312 + }, + { + "autoregressive_loss": 0.1533, + "epoch": 0.8730822873082287, + "grad_norm": 0.08094482123851776, + "learning_rate": 8.318653024629999e-06, + "loss": 0.6128, + "step": 313 + }, + { + "autoregressive_loss": 0.1504, + "epoch": 0.8758716875871687, + "grad_norm": 0.055445753037929535, + "learning_rate": 8.307351558498692e-06, + "loss": 0.6025, + "step": 314 + }, + { + "autoregressive_loss": 0.1475, + "epoch": 0.8786610878661087, + "grad_norm": 0.06612367182970047, + "learning_rate": 8.296019970747901e-06, + "loss": 0.5922, + "step": 315 + }, + { + "autoregressive_loss": 0.1445, + "epoch": 0.8814504881450488, + "grad_norm": 0.04965893551707268, + "learning_rate": 8.284658364579771e-06, + "loss": 0.5787, + "step": 316 + }, + { + "autoregressive_loss": 0.1562, + "epoch": 0.8842398884239888, + "grad_norm": 0.09688432514667511, + "learning_rate": 8.27326684346984e-06, + "loss": 0.6239, + "step": 317 + }, + { + "autoregressive_loss": 0.1504, + "epoch": 0.8870292887029289, + "grad_norm": 0.18504363298416138, + "learning_rate": 8.261845511166092e-06, + "loss": 0.6036, + "step": 318 + }, + { + "autoregressive_loss": 0.1543, + "epoch": 0.8898186889818689, + "grad_norm": 0.08702141791582108, + "learning_rate": 8.250394471688018e-06, + "loss": 0.6185, + "step": 319 + }, + { + "autoregressive_loss": 0.1582, + "epoch": 0.8926080892608089, + "grad_norm": 0.11505435407161713, + "learning_rate": 8.23891382932567e-06, + "loss": 0.6333, + "step": 320 + }, + { + "autoregressive_loss": 0.1572, + "epoch": 0.895397489539749, + "grad_norm": 0.06825438141822815, + "learning_rate": 8.2274036886387e-06, + "loss": 0.6276, + "step": 321 + }, + { + "autoregressive_loss": 0.1475, + "epoch": 0.898186889818689, + "grad_norm": 0.09247445315122604, + "learning_rate": 8.215864154455421e-06, + "loss": 0.59, + "step": 322 + }, + { + "autoregressive_loss": 0.1504, + "epoch": 0.900976290097629, + "grad_norm": 0.0649319589138031, + "learning_rate": 8.204295331871844e-06, + "loss": 0.6045, + "step": 323 + }, + { + "autoregressive_loss": 0.1494, + "epoch": 0.9037656903765691, + "grad_norm": 0.05525384470820427, + "learning_rate": 8.192697326250722e-06, + "loss": 0.5992, + "step": 324 + }, + { + "autoregressive_loss": 0.1504, + "epoch": 0.9065550906555091, + "grad_norm": 0.08519770950078964, + "learning_rate": 8.1810702432206e-06, + "loss": 0.6013, + "step": 325 + }, + { + "autoregressive_loss": 0.1523, + "epoch": 0.9093444909344491, + "grad_norm": 0.12853498756885529, + "learning_rate": 8.169414188674829e-06, + "loss": 0.61, + "step": 326 + }, + { + "autoregressive_loss": 0.1475, + "epoch": 0.9121338912133892, + "grad_norm": 0.14043383300304413, + "learning_rate": 8.157729268770636e-06, + "loss": 0.5891, + "step": 327 + }, + { + "autoregressive_loss": 0.1523, + "epoch": 0.9149232914923291, + "grad_norm": 0.043932583183050156, + "learning_rate": 8.146015589928123e-06, + "loss": 0.6106, + "step": 328 + }, + { + "autoregressive_loss": 0.1445, + "epoch": 0.9177126917712691, + "grad_norm": 0.06435821950435638, + "learning_rate": 8.134273258829322e-06, + "loss": 0.5767, + "step": 329 + }, + { + "autoregressive_loss": 0.1582, + "epoch": 0.9205020920502092, + "grad_norm": 0.08327151834964752, + "learning_rate": 8.122502382417211e-06, + "loss": 0.6311, + "step": 330 + }, + { + "autoregressive_loss": 0.1602, + "epoch": 0.9232914923291492, + "grad_norm": 0.06483109295368195, + "learning_rate": 8.110703067894747e-06, + "loss": 0.6416, + "step": 331 + }, + { + "autoregressive_loss": 0.1494, + "epoch": 0.9260808926080892, + "grad_norm": 0.06019427999854088, + "learning_rate": 8.098875422723884e-06, + "loss": 0.5968, + "step": 332 + }, + { + "autoregressive_loss": 0.1504, + "epoch": 0.9288702928870293, + "grad_norm": 0.06858047097921371, + "learning_rate": 8.087019554624595e-06, + "loss": 0.6033, + "step": 333 + }, + { + "autoregressive_loss": 0.1602, + "epoch": 0.9316596931659693, + "grad_norm": 0.04706525057554245, + "learning_rate": 8.075135571573898e-06, + "loss": 0.6411, + "step": 334 + }, + { + "autoregressive_loss": 0.1523, + "epoch": 0.9344490934449093, + "grad_norm": 0.08539916574954987, + "learning_rate": 8.06322358180486e-06, + "loss": 0.6095, + "step": 335 + }, + { + "autoregressive_loss": 0.1543, + "epoch": 0.9372384937238494, + "grad_norm": 0.06480550765991211, + "learning_rate": 8.051283693805624e-06, + "loss": 0.6182, + "step": 336 + }, + { + "autoregressive_loss": 0.1426, + "epoch": 0.9400278940027894, + "grad_norm": 0.0677114948630333, + "learning_rate": 8.039316016318415e-06, + "loss": 0.5708, + "step": 337 + }, + { + "autoregressive_loss": 0.1533, + "epoch": 0.9428172942817294, + "grad_norm": 0.06868337094783783, + "learning_rate": 8.027320658338547e-06, + "loss": 0.6129, + "step": 338 + }, + { + "autoregressive_loss": 0.1562, + "epoch": 0.9456066945606695, + "grad_norm": 0.05630309134721756, + "learning_rate": 8.015297729113436e-06, + "loss": 0.627, + "step": 339 + }, + { + "autoregressive_loss": 0.1572, + "epoch": 0.9483960948396095, + "grad_norm": 0.08856380730867386, + "learning_rate": 8.0032473381416e-06, + "loss": 0.6274, + "step": 340 + }, + { + "autoregressive_loss": 0.1504, + "epoch": 0.9511854951185496, + "grad_norm": 0.07471572607755661, + "learning_rate": 7.991169595171669e-06, + "loss": 0.6011, + "step": 341 + }, + { + "autoregressive_loss": 0.1543, + "epoch": 0.9539748953974896, + "grad_norm": 0.08280623704195023, + "learning_rate": 7.979064610201372e-06, + "loss": 0.6188, + "step": 342 + }, + { + "autoregressive_loss": 0.1484, + "epoch": 0.9567642956764296, + "grad_norm": 0.06220320984721184, + "learning_rate": 7.966932493476554e-06, + "loss": 0.5979, + "step": 343 + }, + { + "autoregressive_loss": 0.1416, + "epoch": 0.9595536959553695, + "grad_norm": 0.04616249352693558, + "learning_rate": 7.954773355490155e-06, + "loss": 0.567, + "step": 344 + }, + { + "autoregressive_loss": 0.1484, + "epoch": 0.9623430962343096, + "grad_norm": 0.07515771687030792, + "learning_rate": 7.942587306981213e-06, + "loss": 0.5909, + "step": 345 + }, + { + "autoregressive_loss": 0.1484, + "epoch": 0.9651324965132496, + "grad_norm": 0.05773142725229263, + "learning_rate": 7.930374458933852e-06, + "loss": 0.5913, + "step": 346 + }, + { + "autoregressive_loss": 0.1484, + "epoch": 0.9679218967921897, + "grad_norm": 0.07905569672584534, + "learning_rate": 7.918134922576271e-06, + "loss": 0.5918, + "step": 347 + }, + { + "autoregressive_loss": 0.1504, + "epoch": 0.9707112970711297, + "grad_norm": 0.13967609405517578, + "learning_rate": 7.905868809379735e-06, + "loss": 0.6022, + "step": 348 + }, + { + "autoregressive_loss": 0.1504, + "epoch": 0.9735006973500697, + "grad_norm": 0.07748198509216309, + "learning_rate": 7.893576231057553e-06, + "loss": 0.6036, + "step": 349 + }, + { + "autoregressive_loss": 0.1426, + "epoch": 0.9762900976290098, + "grad_norm": 0.05534328147768974, + "learning_rate": 7.88125729956407e-06, + "loss": 0.5709, + "step": 350 + }, + { + "autoregressive_loss": 0.1416, + "epoch": 0.9790794979079498, + "grad_norm": 0.0482962541282177, + "learning_rate": 7.868912127093638e-06, + "loss": 0.5649, + "step": 351 + }, + { + "autoregressive_loss": 0.1465, + "epoch": 0.9818688981868898, + "grad_norm": 0.05185603350400925, + "learning_rate": 7.856540826079595e-06, + "loss": 0.5864, + "step": 352 + }, + { + "autoregressive_loss": 0.1445, + "epoch": 0.9846582984658299, + "grad_norm": 0.05621138587594032, + "learning_rate": 7.844143509193252e-06, + "loss": 0.5811, + "step": 353 + }, + { + "autoregressive_loss": 0.1465, + "epoch": 0.9874476987447699, + "grad_norm": 0.05897721275687218, + "learning_rate": 7.831720289342853e-06, + "loss": 0.5839, + "step": 354 + }, + { + "autoregressive_loss": 0.1455, + "epoch": 0.9902370990237099, + "grad_norm": 0.09645813703536987, + "learning_rate": 7.819271279672553e-06, + "loss": 0.5818, + "step": 355 + }, + { + "autoregressive_loss": 0.1465, + "epoch": 0.99302649930265, + "grad_norm": 0.06643400341272354, + "learning_rate": 7.806796593561389e-06, + "loss": 0.5885, + "step": 356 + }, + { + "autoregressive_loss": 0.1445, + "epoch": 0.99581589958159, + "grad_norm": 0.06707483530044556, + "learning_rate": 7.794296344622246e-06, + "loss": 0.5762, + "step": 357 + }, + { + "autoregressive_loss": 0.1387, + "epoch": 0.99860529986053, + "grad_norm": 0.05718193203210831, + "learning_rate": 7.78177064670082e-06, + "loss": 0.557, + "step": 358 + }, + { + "autoregressive_loss": 0.2285, + "epoch": 1.00139470013947, + "grad_norm": 0.10238175839185715, + "learning_rate": 7.769219613874581e-06, + "loss": 0.9125, + "step": 359 + }, + { + "autoregressive_loss": 0.1484, + "epoch": 1.00418410041841, + "grad_norm": 0.07616080343723297, + "learning_rate": 7.756643360451744e-06, + "loss": 0.5914, + "step": 360 + }, + { + "autoregressive_loss": 0.1621, + "epoch": 1.0069735006973501, + "grad_norm": 0.07913816720247269, + "learning_rate": 7.744042000970207e-06, + "loss": 0.6479, + "step": 361 + }, + { + "autoregressive_loss": 0.1465, + "epoch": 1.00976290097629, + "grad_norm": 0.06615255028009415, + "learning_rate": 7.731415650196535e-06, + "loss": 0.5835, + "step": 362 + }, + { + "autoregressive_loss": 0.1387, + "epoch": 1.0125523012552302, + "grad_norm": 0.05160265043377876, + "learning_rate": 7.718764423124892e-06, + "loss": 0.554, + "step": 363 + }, + { + "autoregressive_loss": 0.1523, + "epoch": 1.0153417015341701, + "grad_norm": 0.07198240607976913, + "learning_rate": 7.706088434976e-06, + "loss": 0.608, + "step": 364 + }, + { + "autoregressive_loss": 0.1543, + "epoch": 1.0181311018131103, + "grad_norm": 0.07180891185998917, + "learning_rate": 7.6933878011961e-06, + "loss": 0.6169, + "step": 365 + }, + { + "autoregressive_loss": 0.1484, + "epoch": 1.0209205020920502, + "grad_norm": 0.05821176618337631, + "learning_rate": 7.68066263745589e-06, + "loss": 0.5945, + "step": 366 + }, + { + "autoregressive_loss": 0.1572, + "epoch": 1.0237099023709901, + "grad_norm": 0.054356735199689865, + "learning_rate": 7.667913059649468e-06, + "loss": 0.6295, + "step": 367 + }, + { + "autoregressive_loss": 0.1523, + "epoch": 1.0264993026499303, + "grad_norm": 0.05546696111559868, + "learning_rate": 7.65513918389329e-06, + "loss": 0.6116, + "step": 368 + }, + { + "autoregressive_loss": 0.1445, + "epoch": 1.0292887029288702, + "grad_norm": 0.07030601799488068, + "learning_rate": 7.6423411265251e-06, + "loss": 0.5796, + "step": 369 + }, + { + "autoregressive_loss": 0.1465, + "epoch": 1.0320781032078103, + "grad_norm": 0.04872807860374451, + "learning_rate": 7.629519004102876e-06, + "loss": 0.5869, + "step": 370 + }, + { + "autoregressive_loss": 0.1523, + "epoch": 1.0348675034867503, + "grad_norm": 0.0648675188422203, + "learning_rate": 7.616672933403772e-06, + "loss": 0.6108, + "step": 371 + }, + { + "autoregressive_loss": 0.1426, + "epoch": 1.0376569037656904, + "grad_norm": 0.04379624128341675, + "learning_rate": 7.603803031423046e-06, + "loss": 0.5691, + "step": 372 + }, + { + "autoregressive_loss": 0.1533, + "epoch": 1.0404463040446303, + "grad_norm": 0.058346450328826904, + "learning_rate": 7.590909415373e-06, + "loss": 0.614, + "step": 373 + }, + { + "autoregressive_loss": 0.1484, + "epoch": 1.0432357043235705, + "grad_norm": 0.08056524395942688, + "learning_rate": 7.577992202681912e-06, + "loss": 0.5912, + "step": 374 + }, + { + "autoregressive_loss": 0.1562, + "epoch": 1.0460251046025104, + "grad_norm": 0.04107631742954254, + "learning_rate": 7.565051510992964e-06, + "loss": 0.623, + "step": 375 + }, + { + "autoregressive_loss": 0.1377, + "epoch": 1.0488145048814506, + "grad_norm": 0.03586035966873169, + "learning_rate": 7.552087458163177e-06, + "loss": 0.5509, + "step": 376 + }, + { + "autoregressive_loss": 0.1494, + "epoch": 1.0516039051603905, + "grad_norm": 0.06955567002296448, + "learning_rate": 7.539100162262325e-06, + "loss": 0.5989, + "step": 377 + }, + { + "autoregressive_loss": 0.1455, + "epoch": 1.0543933054393306, + "grad_norm": 0.04627755284309387, + "learning_rate": 7.526089741571876e-06, + "loss": 0.5834, + "step": 378 + }, + { + "autoregressive_loss": 0.1455, + "epoch": 1.0571827057182706, + "grad_norm": 0.04706581309437752, + "learning_rate": 7.5130563145838994e-06, + "loss": 0.5831, + "step": 379 + }, + { + "autoregressive_loss": 0.1543, + "epoch": 1.0599721059972107, + "grad_norm": 0.060845036059617996, + "learning_rate": 7.500000000000001e-06, + "loss": 0.6163, + "step": 380 + }, + { + "autoregressive_loss": 0.1396, + "epoch": 1.0627615062761506, + "grad_norm": 0.05809563770890236, + "learning_rate": 7.486920916730228e-06, + "loss": 0.5599, + "step": 381 + }, + { + "autoregressive_loss": 0.1445, + "epoch": 1.0655509065550905, + "grad_norm": 0.05717488005757332, + "learning_rate": 7.473819183891997e-06, + "loss": 0.5785, + "step": 382 + }, + { + "autoregressive_loss": 0.1494, + "epoch": 1.0683403068340307, + "grad_norm": 0.04571189731359482, + "learning_rate": 7.460694920809004e-06, + "loss": 0.5964, + "step": 383 + }, + { + "autoregressive_loss": 0.1406, + "epoch": 1.0711297071129706, + "grad_norm": 0.04351586475968361, + "learning_rate": 7.447548247010137e-06, + "loss": 0.5635, + "step": 384 + }, + { + "autoregressive_loss": 0.1484, + "epoch": 1.0739191073919108, + "grad_norm": 0.061158858239650726, + "learning_rate": 7.434379282228393e-06, + "loss": 0.5952, + "step": 385 + }, + { + "autoregressive_loss": 0.1436, + "epoch": 1.0767085076708507, + "grad_norm": 0.05735040456056595, + "learning_rate": 7.421188146399776e-06, + "loss": 0.5743, + "step": 386 + }, + { + "autoregressive_loss": 0.1504, + "epoch": 1.0794979079497908, + "grad_norm": 0.056770507246255875, + "learning_rate": 7.407974959662222e-06, + "loss": 0.5997, + "step": 387 + }, + { + "autoregressive_loss": 0.1465, + "epoch": 1.0822873082287308, + "grad_norm": 0.08442239463329315, + "learning_rate": 7.394739842354489e-06, + "loss": 0.5875, + "step": 388 + }, + { + "autoregressive_loss": 0.1406, + "epoch": 1.085076708507671, + "grad_norm": 0.054207246750593185, + "learning_rate": 7.381482915015068e-06, + "loss": 0.5632, + "step": 389 + }, + { + "autoregressive_loss": 0.1416, + "epoch": 1.0878661087866108, + "grad_norm": 0.04854436591267586, + "learning_rate": 7.368204298381085e-06, + "loss": 0.5651, + "step": 390 + }, + { + "autoregressive_loss": 0.1514, + "epoch": 1.090655509065551, + "grad_norm": 0.049479786306619644, + "learning_rate": 7.3549041133872004e-06, + "loss": 0.605, + "step": 391 + }, + { + "autoregressive_loss": 0.1396, + "epoch": 1.093444909344491, + "grad_norm": 0.10756554454565048, + "learning_rate": 7.341582481164508e-06, + "loss": 0.5604, + "step": 392 + }, + { + "autoregressive_loss": 0.1553, + "epoch": 1.096234309623431, + "grad_norm": 0.0788588598370552, + "learning_rate": 7.328239523039431e-06, + "loss": 0.6232, + "step": 393 + }, + { + "autoregressive_loss": 0.1465, + "epoch": 1.099023709902371, + "grad_norm": 0.05412541329860687, + "learning_rate": 7.314875360532618e-06, + "loss": 0.5854, + "step": 394 + }, + { + "autoregressive_loss": 0.1318, + "epoch": 1.1018131101813111, + "grad_norm": 0.0457867793738842, + "learning_rate": 7.301490115357837e-06, + "loss": 0.5269, + "step": 395 + }, + { + "autoregressive_loss": 0.1465, + "epoch": 1.104602510460251, + "grad_norm": 0.07691632956266403, + "learning_rate": 7.288083909420866e-06, + "loss": 0.5861, + "step": 396 + }, + { + "autoregressive_loss": 0.1465, + "epoch": 1.107391910739191, + "grad_norm": 0.10161058604717255, + "learning_rate": 7.274656864818379e-06, + "loss": 0.5859, + "step": 397 + }, + { + "autoregressive_loss": 0.1611, + "epoch": 1.1101813110181311, + "grad_norm": 0.053617097437381744, + "learning_rate": 7.261209103836843e-06, + "loss": 0.6433, + "step": 398 + }, + { + "autoregressive_loss": 0.1484, + "epoch": 1.112970711297071, + "grad_norm": 0.09791024774312973, + "learning_rate": 7.247740748951394e-06, + "loss": 0.5948, + "step": 399 + }, + { + "autoregressive_loss": 0.1406, + "epoch": 1.1157601115760112, + "grad_norm": 0.05660359933972359, + "learning_rate": 7.234251922824731e-06, + "loss": 0.5619, + "step": 400 + }, + { + "autoregressive_loss": 0.1504, + "epoch": 1.118549511854951, + "grad_norm": 0.05431594327092171, + "learning_rate": 7.220742748305989e-06, + "loss": 0.6022, + "step": 401 + }, + { + "autoregressive_loss": 0.1436, + "epoch": 1.1213389121338913, + "grad_norm": 0.044363100081682205, + "learning_rate": 7.20721334842963e-06, + "loss": 0.5723, + "step": 402 + }, + { + "autoregressive_loss": 0.1533, + "epoch": 1.1241283124128312, + "grad_norm": 0.10367250442504883, + "learning_rate": 7.193663846414318e-06, + "loss": 0.6113, + "step": 403 + }, + { + "autoregressive_loss": 0.1426, + "epoch": 1.1269177126917713, + "grad_norm": 0.05890325456857681, + "learning_rate": 7.180094365661793e-06, + "loss": 0.5691, + "step": 404 + }, + { + "autoregressive_loss": 0.1426, + "epoch": 1.1297071129707112, + "grad_norm": 0.0740562230348587, + "learning_rate": 7.166505029755753e-06, + "loss": 0.5709, + "step": 405 + }, + { + "autoregressive_loss": 0.1611, + "epoch": 1.1324965132496514, + "grad_norm": 0.06338857114315033, + "learning_rate": 7.152895962460727e-06, + "loss": 0.6433, + "step": 406 + }, + { + "autoregressive_loss": 0.1445, + "epoch": 1.1352859135285913, + "grad_norm": 0.06992591172456741, + "learning_rate": 7.139267287720945e-06, + "loss": 0.5811, + "step": 407 + }, + { + "autoregressive_loss": 0.1426, + "epoch": 1.1380753138075315, + "grad_norm": 0.03636496141552925, + "learning_rate": 7.125619129659215e-06, + "loss": 0.5677, + "step": 408 + }, + { + "autoregressive_loss": 0.1338, + "epoch": 1.1408647140864714, + "grad_norm": 0.043684881180524826, + "learning_rate": 7.111951612575783e-06, + "loss": 0.5344, + "step": 409 + }, + { + "autoregressive_loss": 0.1533, + "epoch": 1.1436541143654115, + "grad_norm": 0.041947294026613235, + "learning_rate": 7.0982648609472135e-06, + "loss": 0.6123, + "step": 410 + }, + { + "autoregressive_loss": 0.1406, + "epoch": 1.1464435146443515, + "grad_norm": 0.04358157515525818, + "learning_rate": 7.084558999425245e-06, + "loss": 0.5641, + "step": 411 + }, + { + "autoregressive_loss": 0.1562, + "epoch": 1.1492329149232914, + "grad_norm": 0.05170544236898422, + "learning_rate": 7.0708341528356585e-06, + "loss": 0.6254, + "step": 412 + }, + { + "autoregressive_loss": 0.1475, + "epoch": 1.1520223152022315, + "grad_norm": 0.05332844704389572, + "learning_rate": 7.0570904461771426e-06, + "loss": 0.5881, + "step": 413 + }, + { + "autoregressive_loss": 0.1455, + "epoch": 1.1548117154811715, + "grad_norm": 0.04379379749298096, + "learning_rate": 7.043328004620154e-06, + "loss": 0.582, + "step": 414 + }, + { + "autoregressive_loss": 0.1582, + "epoch": 1.1576011157601116, + "grad_norm": 0.05521988868713379, + "learning_rate": 7.029546953505776e-06, + "loss": 0.6307, + "step": 415 + }, + { + "autoregressive_loss": 0.1338, + "epoch": 1.1603905160390515, + "grad_norm": 0.09003956615924835, + "learning_rate": 7.015747418344578e-06, + "loss": 0.5342, + "step": 416 + }, + { + "autoregressive_loss": 0.1592, + "epoch": 1.1631799163179917, + "grad_norm": 0.07028835266828537, + "learning_rate": 7.0019295248154714e-06, + "loss": 0.6365, + "step": 417 + }, + { + "autoregressive_loss": 0.1406, + "epoch": 1.1659693165969316, + "grad_norm": 0.04405695199966431, + "learning_rate": 6.98809339876457e-06, + "loss": 0.5634, + "step": 418 + }, + { + "autoregressive_loss": 0.1357, + "epoch": 1.1687587168758717, + "grad_norm": 0.04779954254627228, + "learning_rate": 6.974239166204034e-06, + "loss": 0.543, + "step": 419 + }, + { + "autoregressive_loss": 0.1455, + "epoch": 1.1715481171548117, + "grad_norm": 0.05580804497003555, + "learning_rate": 6.960366953310931e-06, + "loss": 0.5813, + "step": 420 + }, + { + "autoregressive_loss": 0.1475, + "epoch": 1.1743375174337518, + "grad_norm": 0.041138842701911926, + "learning_rate": 6.946476886426087e-06, + "loss": 0.5901, + "step": 421 + }, + { + "autoregressive_loss": 0.1484, + "epoch": 1.1771269177126917, + "grad_norm": 0.08709113299846649, + "learning_rate": 6.932569092052927e-06, + "loss": 0.5951, + "step": 422 + }, + { + "autoregressive_loss": 0.1309, + "epoch": 1.1799163179916319, + "grad_norm": 0.05576131492853165, + "learning_rate": 6.918643696856333e-06, + "loss": 0.5239, + "step": 423 + }, + { + "autoregressive_loss": 0.1562, + "epoch": 1.1827057182705718, + "grad_norm": 0.06753681600093842, + "learning_rate": 6.904700827661484e-06, + "loss": 0.624, + "step": 424 + }, + { + "autoregressive_loss": 0.1465, + "epoch": 1.185495118549512, + "grad_norm": 0.07465287297964096, + "learning_rate": 6.890740611452705e-06, + "loss": 0.5857, + "step": 425 + }, + { + "autoregressive_loss": 0.1533, + "epoch": 1.1882845188284519, + "grad_norm": 0.07471378147602081, + "learning_rate": 6.876763175372306e-06, + "loss": 0.6145, + "step": 426 + }, + { + "autoregressive_loss": 0.1396, + "epoch": 1.1910739191073918, + "grad_norm": 0.07116798311471939, + "learning_rate": 6.862768646719425e-06, + "loss": 0.5596, + "step": 427 + }, + { + "autoregressive_loss": 0.1357, + "epoch": 1.193863319386332, + "grad_norm": 0.0814041942358017, + "learning_rate": 6.848757152948876e-06, + "loss": 0.5437, + "step": 428 + }, + { + "autoregressive_loss": 0.1523, + "epoch": 1.196652719665272, + "grad_norm": 0.08532565087080002, + "learning_rate": 6.834728821669978e-06, + "loss": 0.6079, + "step": 429 + }, + { + "autoregressive_loss": 0.1543, + "epoch": 1.199442119944212, + "grad_norm": 0.043264567852020264, + "learning_rate": 6.820683780645397e-06, + "loss": 0.6145, + "step": 430 + }, + { + "autoregressive_loss": 0.1426, + "epoch": 1.202231520223152, + "grad_norm": 0.06063782423734665, + "learning_rate": 6.806622157789989e-06, + "loss": 0.5704, + "step": 431 + }, + { + "autoregressive_loss": 0.1387, + "epoch": 1.205020920502092, + "grad_norm": 0.052947379648685455, + "learning_rate": 6.7925440811696165e-06, + "loss": 0.5554, + "step": 432 + }, + { + "autoregressive_loss": 0.1348, + "epoch": 1.207810320781032, + "grad_norm": 0.051718465983867645, + "learning_rate": 6.778449679000006e-06, + "loss": 0.5385, + "step": 433 + }, + { + "autoregressive_loss": 0.1406, + "epoch": 1.2105997210599722, + "grad_norm": 0.05708680301904678, + "learning_rate": 6.764339079645561e-06, + "loss": 0.5608, + "step": 434 + }, + { + "autoregressive_loss": 0.1533, + "epoch": 1.213389121338912, + "grad_norm": 0.048807889223098755, + "learning_rate": 6.7502124116182066e-06, + "loss": 0.6128, + "step": 435 + }, + { + "autoregressive_loss": 0.1455, + "epoch": 1.2161785216178522, + "grad_norm": 0.04752994328737259, + "learning_rate": 6.736069803576205e-06, + "loss": 0.5817, + "step": 436 + }, + { + "autoregressive_loss": 0.1484, + "epoch": 1.2189679218967922, + "grad_norm": 0.04288828372955322, + "learning_rate": 6.721911384323e-06, + "loss": 0.5953, + "step": 437 + }, + { + "autoregressive_loss": 0.1504, + "epoch": 1.2217573221757323, + "grad_norm": 0.06321804225444794, + "learning_rate": 6.7077372828060294e-06, + "loss": 0.6023, + "step": 438 + }, + { + "autoregressive_loss": 0.1348, + "epoch": 1.2245467224546722, + "grad_norm": 0.05869006738066673, + "learning_rate": 6.693547628115561e-06, + "loss": 0.5392, + "step": 439 + }, + { + "autoregressive_loss": 0.1299, + "epoch": 1.2273361227336124, + "grad_norm": 0.04987003281712532, + "learning_rate": 6.67934254948351e-06, + "loss": 0.5179, + "step": 440 + }, + { + "autoregressive_loss": 0.1514, + "epoch": 1.2301255230125523, + "grad_norm": 0.048468586057424545, + "learning_rate": 6.6651221762822635e-06, + "loss": 0.6063, + "step": 441 + }, + { + "autoregressive_loss": 0.1426, + "epoch": 1.2329149232914922, + "grad_norm": 0.04421556740999222, + "learning_rate": 6.650886638023508e-06, + "loss": 0.5681, + "step": 442 + }, + { + "autoregressive_loss": 0.1445, + "epoch": 1.2357043235704324, + "grad_norm": 0.04106782376766205, + "learning_rate": 6.636636064357045e-06, + "loss": 0.5753, + "step": 443 + }, + { + "autoregressive_loss": 0.1357, + "epoch": 1.2384937238493725, + "grad_norm": 0.043508779257535934, + "learning_rate": 6.622370585069605e-06, + "loss": 0.5437, + "step": 444 + }, + { + "autoregressive_loss": 0.1465, + "epoch": 1.2412831241283124, + "grad_norm": 0.04311037436127663, + "learning_rate": 6.608090330083677e-06, + "loss": 0.5845, + "step": 445 + }, + { + "autoregressive_loss": 0.1416, + "epoch": 1.2440725244072524, + "grad_norm": 0.09367021918296814, + "learning_rate": 6.593795429456317e-06, + "loss": 0.5657, + "step": 446 + }, + { + "autoregressive_loss": 0.1465, + "epoch": 1.2468619246861925, + "grad_norm": 0.08234765380620956, + "learning_rate": 6.579486013377963e-06, + "loss": 0.5863, + "step": 447 + }, + { + "autoregressive_loss": 0.1455, + "epoch": 1.2496513249651324, + "grad_norm": 0.06123524159193039, + "learning_rate": 6.565162212171257e-06, + "loss": 0.5837, + "step": 448 + }, + { + "autoregressive_loss": 0.1484, + "epoch": 1.2524407252440726, + "grad_norm": 0.06861612945795059, + "learning_rate": 6.550824156289852e-06, + "loss": 0.5924, + "step": 449 + }, + { + "autoregressive_loss": 0.1396, + "epoch": 1.2552301255230125, + "grad_norm": 0.05006430298089981, + "learning_rate": 6.536471976317223e-06, + "loss": 0.5597, + "step": 450 + }, + { + "autoregressive_loss": 0.1387, + "epoch": 1.2580195258019526, + "grad_norm": 0.0535871796309948, + "learning_rate": 6.5221058029654815e-06, + "loss": 0.5538, + "step": 451 + }, + { + "autoregressive_loss": 0.1299, + "epoch": 1.2608089260808926, + "grad_norm": 0.03717818856239319, + "learning_rate": 6.507725767074181e-06, + "loss": 0.5199, + "step": 452 + }, + { + "autoregressive_loss": 0.1543, + "epoch": 1.2635983263598327, + "grad_norm": 0.042731910943984985, + "learning_rate": 6.493331999609132e-06, + "loss": 0.6143, + "step": 453 + }, + { + "autoregressive_loss": 0.1445, + "epoch": 1.2663877266387726, + "grad_norm": 0.06432456523180008, + "learning_rate": 6.4789246316612e-06, + "loss": 0.579, + "step": 454 + }, + { + "autoregressive_loss": 0.1416, + "epoch": 1.2691771269177128, + "grad_norm": 0.04289945214986801, + "learning_rate": 6.464503794445121e-06, + "loss": 0.5656, + "step": 455 + }, + { + "autoregressive_loss": 0.1309, + "epoch": 1.2719665271966527, + "grad_norm": 0.04800882562994957, + "learning_rate": 6.450069619298299e-06, + "loss": 0.5222, + "step": 456 + }, + { + "autoregressive_loss": 0.1494, + "epoch": 1.2747559274755926, + "grad_norm": 0.03720393031835556, + "learning_rate": 6.435622237679615e-06, + "loss": 0.5989, + "step": 457 + }, + { + "autoregressive_loss": 0.1338, + "epoch": 1.2775453277545328, + "grad_norm": 0.06400813162326813, + "learning_rate": 6.421161781168226e-06, + "loss": 0.5344, + "step": 458 + }, + { + "autoregressive_loss": 0.1416, + "epoch": 1.280334728033473, + "grad_norm": 0.05536332726478577, + "learning_rate": 6.4066883814623674e-06, + "loss": 0.5657, + "step": 459 + }, + { + "autoregressive_loss": 0.1475, + "epoch": 1.2831241283124128, + "grad_norm": 0.08623214066028595, + "learning_rate": 6.3922021703781574e-06, + "loss": 0.5924, + "step": 460 + }, + { + "autoregressive_loss": 0.1475, + "epoch": 1.2859135285913528, + "grad_norm": 0.048994917422533035, + "learning_rate": 6.377703279848393e-06, + "loss": 0.5894, + "step": 461 + }, + { + "autoregressive_loss": 0.1445, + "epoch": 1.288702928870293, + "grad_norm": 0.0725463479757309, + "learning_rate": 6.363191841921345e-06, + "loss": 0.5802, + "step": 462 + }, + { + "autoregressive_loss": 0.1582, + "epoch": 1.2914923291492328, + "grad_norm": 0.047162577509880066, + "learning_rate": 6.3486679887595635e-06, + "loss": 0.6312, + "step": 463 + }, + { + "autoregressive_loss": 0.1406, + "epoch": 1.294281729428173, + "grad_norm": 0.07416835427284241, + "learning_rate": 6.334131852638669e-06, + "loss": 0.5653, + "step": 464 + }, + { + "autoregressive_loss": 0.1406, + "epoch": 1.297071129707113, + "grad_norm": 0.07332177460193634, + "learning_rate": 6.319583565946147e-06, + "loss": 0.5609, + "step": 465 + }, + { + "autoregressive_loss": 0.1318, + "epoch": 1.299860529986053, + "grad_norm": 0.07667059451341629, + "learning_rate": 6.305023261180146e-06, + "loss": 0.5284, + "step": 466 + }, + { + "autoregressive_loss": 0.1436, + "epoch": 1.302649930264993, + "grad_norm": 0.047988373786211014, + "learning_rate": 6.290451070948269e-06, + "loss": 0.5736, + "step": 467 + }, + { + "autoregressive_loss": 0.1602, + "epoch": 1.3054393305439331, + "grad_norm": 0.06584234535694122, + "learning_rate": 6.275867127966364e-06, + "loss": 0.6401, + "step": 468 + }, + { + "autoregressive_loss": 0.1348, + "epoch": 1.308228730822873, + "grad_norm": 0.06491407752037048, + "learning_rate": 6.261271565057318e-06, + "loss": 0.5394, + "step": 469 + }, + { + "autoregressive_loss": 0.1396, + "epoch": 1.3110181311018132, + "grad_norm": 0.0461663119494915, + "learning_rate": 6.246664515149845e-06, + "loss": 0.5597, + "step": 470 + }, + { + "autoregressive_loss": 0.1348, + "epoch": 1.3138075313807531, + "grad_norm": 0.10100945085287094, + "learning_rate": 6.232046111277277e-06, + "loss": 0.5402, + "step": 471 + }, + { + "autoregressive_loss": 0.1504, + "epoch": 1.316596931659693, + "grad_norm": 0.04224178567528725, + "learning_rate": 6.217416486576354e-06, + "loss": 0.5996, + "step": 472 + }, + { + "autoregressive_loss": 0.1436, + "epoch": 1.3193863319386332, + "grad_norm": 0.06223026290535927, + "learning_rate": 6.202775774286007e-06, + "loss": 0.5736, + "step": 473 + }, + { + "autoregressive_loss": 0.1455, + "epoch": 1.3221757322175733, + "grad_norm": 0.06966451555490494, + "learning_rate": 6.188124107746148e-06, + "loss": 0.5833, + "step": 474 + }, + { + "autoregressive_loss": 0.1426, + "epoch": 1.3249651324965133, + "grad_norm": 0.04242433235049248, + "learning_rate": 6.173461620396453e-06, + "loss": 0.5725, + "step": 475 + }, + { + "autoregressive_loss": 0.1445, + "epoch": 1.3277545327754532, + "grad_norm": 0.06927305459976196, + "learning_rate": 6.158788445775151e-06, + "loss": 0.5801, + "step": 476 + }, + { + "autoregressive_loss": 0.1494, + "epoch": 1.3305439330543933, + "grad_norm": 0.06829552352428436, + "learning_rate": 6.1441047175178025e-06, + "loss": 0.5988, + "step": 477 + }, + { + "autoregressive_loss": 0.1357, + "epoch": 1.3333333333333333, + "grad_norm": 0.08040744066238403, + "learning_rate": 6.129410569356086e-06, + "loss": 0.5432, + "step": 478 + }, + { + "autoregressive_loss": 0.1455, + "epoch": 1.3361227336122734, + "grad_norm": 0.04500404745340347, + "learning_rate": 6.11470613511658e-06, + "loss": 0.5817, + "step": 479 + }, + { + "autoregressive_loss": 0.1436, + "epoch": 1.3389121338912133, + "grad_norm": 0.04723746329545975, + "learning_rate": 6.0999915487195395e-06, + "loss": 0.5747, + "step": 480 + }, + { + "autoregressive_loss": 0.1504, + "epoch": 1.3417015341701535, + "grad_norm": 0.06364046037197113, + "learning_rate": 6.085266944177686e-06, + "loss": 0.6021, + "step": 481 + }, + { + "autoregressive_loss": 0.1396, + "epoch": 1.3444909344490934, + "grad_norm": 0.09549642354249954, + "learning_rate": 6.070532455594974e-06, + "loss": 0.5575, + "step": 482 + }, + { + "autoregressive_loss": 0.1377, + "epoch": 1.3472803347280335, + "grad_norm": 0.052497681230306625, + "learning_rate": 6.055788217165384e-06, + "loss": 0.5526, + "step": 483 + }, + { + "autoregressive_loss": 0.1416, + "epoch": 1.3500697350069735, + "grad_norm": 0.05494900792837143, + "learning_rate": 6.0410343631716865e-06, + "loss": 0.5663, + "step": 484 + }, + { + "autoregressive_loss": 0.1348, + "epoch": 1.3528591352859136, + "grad_norm": 0.06884355843067169, + "learning_rate": 6.0262710279842305e-06, + "loss": 0.54, + "step": 485 + }, + { + "autoregressive_loss": 0.1484, + "epoch": 1.3556485355648535, + "grad_norm": 0.04178055375814438, + "learning_rate": 6.011498346059712e-06, + "loss": 0.5923, + "step": 486 + }, + { + "autoregressive_loss": 0.1387, + "epoch": 1.3584379358437935, + "grad_norm": 0.057555124163627625, + "learning_rate": 5.99671645193995e-06, + "loss": 0.5553, + "step": 487 + }, + { + "autoregressive_loss": 0.1465, + "epoch": 1.3612273361227336, + "grad_norm": 0.09850379079580307, + "learning_rate": 5.98192548025067e-06, + "loss": 0.5857, + "step": 488 + }, + { + "autoregressive_loss": 0.1396, + "epoch": 1.3640167364016738, + "grad_norm": 0.04874347150325775, + "learning_rate": 5.967125565700266e-06, + "loss": 0.5577, + "step": 489 + }, + { + "autoregressive_loss": 0.1523, + "epoch": 1.3668061366806137, + "grad_norm": 0.0756797045469284, + "learning_rate": 5.952316843078579e-06, + "loss": 0.6082, + "step": 490 + }, + { + "autoregressive_loss": 0.1484, + "epoch": 1.3695955369595536, + "grad_norm": 0.05278534069657326, + "learning_rate": 5.9374994472556715e-06, + "loss": 0.5924, + "step": 491 + }, + { + "autoregressive_loss": 0.1309, + "epoch": 1.3723849372384938, + "grad_norm": 0.04171266779303551, + "learning_rate": 5.922673513180596e-06, + "loss": 0.5248, + "step": 492 + }, + { + "autoregressive_loss": 0.1309, + "epoch": 1.3751743375174337, + "grad_norm": 0.0482805073261261, + "learning_rate": 5.9078391758801646e-06, + "loss": 0.524, + "step": 493 + }, + { + "autoregressive_loss": 0.1582, + "epoch": 1.3779637377963738, + "grad_norm": 0.04665492847561836, + "learning_rate": 5.8929965704577275e-06, + "loss": 0.6332, + "step": 494 + }, + { + "autoregressive_loss": 0.1338, + "epoch": 1.3807531380753137, + "grad_norm": 0.05500170588493347, + "learning_rate": 5.878145832091929e-06, + "loss": 0.5353, + "step": 495 + }, + { + "autoregressive_loss": 0.1465, + "epoch": 1.383542538354254, + "grad_norm": 0.04270568862557411, + "learning_rate": 5.863287096035491e-06, + "loss": 0.5884, + "step": 496 + }, + { + "autoregressive_loss": 0.1348, + "epoch": 1.3863319386331938, + "grad_norm": 0.04342806339263916, + "learning_rate": 5.848420497613969e-06, + "loss": 0.5394, + "step": 497 + }, + { + "autoregressive_loss": 0.1543, + "epoch": 1.389121338912134, + "grad_norm": 0.06217074766755104, + "learning_rate": 5.833546172224527e-06, + "loss": 0.6151, + "step": 498 + }, + { + "autoregressive_loss": 0.1455, + "epoch": 1.3919107391910739, + "grad_norm": 0.05096966773271561, + "learning_rate": 5.818664255334702e-06, + "loss": 0.5837, + "step": 499 + }, + { + "autoregressive_loss": 0.1328, + "epoch": 1.394700139470014, + "grad_norm": 0.06020465865731239, + "learning_rate": 5.803774882481171e-06, + "loss": 0.5314, + "step": 500 + }, + { + "autoregressive_loss": 0.1357, + "epoch": 1.397489539748954, + "grad_norm": 0.057533930987119675, + "learning_rate": 5.788878189268516e-06, + "loss": 0.5443, + "step": 501 + }, + { + "autoregressive_loss": 0.1494, + "epoch": 1.4002789400278939, + "grad_norm": 0.06091269105672836, + "learning_rate": 5.773974311367987e-06, + "loss": 0.5958, + "step": 502 + }, + { + "autoregressive_loss": 0.1328, + "epoch": 1.403068340306834, + "grad_norm": 0.0814371332526207, + "learning_rate": 5.759063384516271e-06, + "loss": 0.5303, + "step": 503 + }, + { + "autoregressive_loss": 0.1543, + "epoch": 1.4058577405857742, + "grad_norm": 0.07866951078176498, + "learning_rate": 5.7441455445142505e-06, + "loss": 0.6179, + "step": 504 + }, + { + "autoregressive_loss": 0.1396, + "epoch": 1.408647140864714, + "grad_norm": 0.05549406260251999, + "learning_rate": 5.729220927225769e-06, + "loss": 0.5579, + "step": 505 + }, + { + "autoregressive_loss": 0.1328, + "epoch": 1.411436541143654, + "grad_norm": 0.041973553597927094, + "learning_rate": 5.714289668576401e-06, + "loss": 0.5322, + "step": 506 + }, + { + "autoregressive_loss": 0.1504, + "epoch": 1.4142259414225942, + "grad_norm": 0.04181048646569252, + "learning_rate": 5.699351904552196e-06, + "loss": 0.5996, + "step": 507 + }, + { + "autoregressive_loss": 0.1465, + "epoch": 1.417015341701534, + "grad_norm": 0.06205263361334801, + "learning_rate": 5.68440777119846e-06, + "loss": 0.5869, + "step": 508 + }, + { + "autoregressive_loss": 0.1484, + "epoch": 1.4198047419804742, + "grad_norm": 0.058726534247398376, + "learning_rate": 5.669457404618502e-06, + "loss": 0.5934, + "step": 509 + }, + { + "autoregressive_loss": 0.1289, + "epoch": 1.4225941422594142, + "grad_norm": 0.06739199906587601, + "learning_rate": 5.654500940972405e-06, + "loss": 0.5167, + "step": 510 + }, + { + "autoregressive_loss": 0.1426, + "epoch": 1.4253835425383543, + "grad_norm": 0.045175522565841675, + "learning_rate": 5.639538516475775e-06, + "loss": 0.5693, + "step": 511 + }, + { + "autoregressive_loss": 0.1572, + "epoch": 1.4281729428172942, + "grad_norm": 0.053537365049123764, + "learning_rate": 5.624570267398511e-06, + "loss": 0.6287, + "step": 512 + }, + { + "autoregressive_loss": 0.1348, + "epoch": 1.4309623430962344, + "grad_norm": 0.0531407929956913, + "learning_rate": 5.6095963300635585e-06, + "loss": 0.5391, + "step": 513 + }, + { + "autoregressive_loss": 0.1465, + "epoch": 1.4337517433751743, + "grad_norm": 0.06669306010007858, + "learning_rate": 5.594616840845666e-06, + "loss": 0.5868, + "step": 514 + }, + { + "autoregressive_loss": 0.1367, + "epoch": 1.4365411436541144, + "grad_norm": 0.05769766494631767, + "learning_rate": 5.579631936170147e-06, + "loss": 0.5459, + "step": 515 + }, + { + "autoregressive_loss": 0.1484, + "epoch": 1.4393305439330544, + "grad_norm": 0.10124175250530243, + "learning_rate": 5.564641752511637e-06, + "loss": 0.5931, + "step": 516 + }, + { + "autoregressive_loss": 0.1426, + "epoch": 1.4421199442119943, + "grad_norm": 0.07149350643157959, + "learning_rate": 5.54964642639285e-06, + "loss": 0.5692, + "step": 517 + }, + { + "autoregressive_loss": 0.1445, + "epoch": 1.4449093444909344, + "grad_norm": 0.060222964733839035, + "learning_rate": 5.534646094383333e-06, + "loss": 0.5808, + "step": 518 + }, + { + "autoregressive_loss": 0.1377, + "epoch": 1.4476987447698746, + "grad_norm": 0.03446779400110245, + "learning_rate": 5.519640893098227e-06, + "loss": 0.552, + "step": 519 + }, + { + "autoregressive_loss": 0.1396, + "epoch": 1.4504881450488145, + "grad_norm": 0.047577984631061554, + "learning_rate": 5.504630959197014e-06, + "loss": 0.5594, + "step": 520 + }, + { + "autoregressive_loss": 0.1484, + "epoch": 1.4532775453277544, + "grad_norm": 0.06725237518548965, + "learning_rate": 5.489616429382285e-06, + "loss": 0.5944, + "step": 521 + }, + { + "autoregressive_loss": 0.1299, + "epoch": 1.4560669456066946, + "grad_norm": 0.04710479453206062, + "learning_rate": 5.474597440398483e-06, + "loss": 0.521, + "step": 522 + }, + { + "autoregressive_loss": 0.1426, + "epoch": 1.4588563458856345, + "grad_norm": 0.04347934201359749, + "learning_rate": 5.459574129030669e-06, + "loss": 0.5685, + "step": 523 + }, + { + "autoregressive_loss": 0.1484, + "epoch": 1.4616457461645747, + "grad_norm": 0.06638723611831665, + "learning_rate": 5.444546632103262e-06, + "loss": 0.5948, + "step": 524 + }, + { + "autoregressive_loss": 0.1445, + "epoch": 1.4644351464435146, + "grad_norm": 0.04341611638665199, + "learning_rate": 5.429515086478805e-06, + "loss": 0.5798, + "step": 525 + }, + { + "autoregressive_loss": 0.1338, + "epoch": 1.4672245467224547, + "grad_norm": 0.045465774834156036, + "learning_rate": 5.414479629056717e-06, + "loss": 0.5355, + "step": 526 + }, + { + "autoregressive_loss": 0.1309, + "epoch": 1.4700139470013946, + "grad_norm": 0.05777398869395256, + "learning_rate": 5.3994403967720366e-06, + "loss": 0.525, + "step": 527 + }, + { + "autoregressive_loss": 0.1338, + "epoch": 1.4728033472803348, + "grad_norm": 0.039686255156993866, + "learning_rate": 5.3843975265941896e-06, + "loss": 0.5343, + "step": 528 + }, + { + "autoregressive_loss": 0.1582, + "epoch": 1.4755927475592747, + "grad_norm": 0.040324702858924866, + "learning_rate": 5.369351155525729e-06, + "loss": 0.6313, + "step": 529 + }, + { + "autoregressive_loss": 0.1387, + "epoch": 1.4783821478382149, + "grad_norm": 0.041002120822668076, + "learning_rate": 5.354301420601095e-06, + "loss": 0.5555, + "step": 530 + }, + { + "autoregressive_loss": 0.1338, + "epoch": 1.4811715481171548, + "grad_norm": 0.03543855994939804, + "learning_rate": 5.33924845888536e-06, + "loss": 0.5366, + "step": 531 + }, + { + "autoregressive_loss": 0.1426, + "epoch": 1.4839609483960947, + "grad_norm": 0.051153745502233505, + "learning_rate": 5.3241924074729865e-06, + "loss": 0.5699, + "step": 532 + }, + { + "autoregressive_loss": 0.1406, + "epoch": 1.4867503486750349, + "grad_norm": 0.047301653772592545, + "learning_rate": 5.30913340348658e-06, + "loss": 0.5648, + "step": 533 + }, + { + "autoregressive_loss": 0.1484, + "epoch": 1.489539748953975, + "grad_norm": 0.05254862830042839, + "learning_rate": 5.294071584075628e-06, + "loss": 0.5951, + "step": 534 + }, + { + "autoregressive_loss": 0.1367, + "epoch": 1.492329149232915, + "grad_norm": 0.06733259558677673, + "learning_rate": 5.279007086415268e-06, + "loss": 0.5452, + "step": 535 + }, + { + "autoregressive_loss": 0.1406, + "epoch": 1.4951185495118549, + "grad_norm": 0.05333971604704857, + "learning_rate": 5.263940047705026e-06, + "loss": 0.563, + "step": 536 + }, + { + "autoregressive_loss": 0.1426, + "epoch": 1.497907949790795, + "grad_norm": 0.06821227073669434, + "learning_rate": 5.24887060516757e-06, + "loss": 0.5724, + "step": 537 + }, + { + "autoregressive_loss": 0.1445, + "epoch": 1.5006973500697351, + "grad_norm": 0.0713428184390068, + "learning_rate": 5.233798896047461e-06, + "loss": 0.5786, + "step": 538 + }, + { + "autoregressive_loss": 0.1328, + "epoch": 1.503486750348675, + "grad_norm": 0.06519678235054016, + "learning_rate": 5.218725057609901e-06, + "loss": 0.5293, + "step": 539 + }, + { + "autoregressive_loss": 0.1494, + "epoch": 1.506276150627615, + "grad_norm": 0.046268217265605927, + "learning_rate": 5.2036492271394915e-06, + "loss": 0.5975, + "step": 540 + }, + { + "autoregressive_loss": 0.1338, + "epoch": 1.5090655509065551, + "grad_norm": 0.07670779526233673, + "learning_rate": 5.188571541938968e-06, + "loss": 0.5365, + "step": 541 + }, + { + "autoregressive_loss": 0.1484, + "epoch": 1.511854951185495, + "grad_norm": 0.05209178850054741, + "learning_rate": 5.1734921393279644e-06, + "loss": 0.5923, + "step": 542 + }, + { + "autoregressive_loss": 0.1445, + "epoch": 1.514644351464435, + "grad_norm": 0.1210055872797966, + "learning_rate": 5.158411156641752e-06, + "loss": 0.5785, + "step": 543 + }, + { + "autoregressive_loss": 0.1377, + "epoch": 1.5174337517433751, + "grad_norm": 0.047878194600343704, + "learning_rate": 5.143328731229994e-06, + "loss": 0.5514, + "step": 544 + }, + { + "autoregressive_loss": 0.1445, + "epoch": 1.5202231520223153, + "grad_norm": 0.07574264705181122, + "learning_rate": 5.128245000455493e-06, + "loss": 0.58, + "step": 545 + }, + { + "autoregressive_loss": 0.1348, + "epoch": 1.5230125523012552, + "grad_norm": 0.05088375508785248, + "learning_rate": 5.113160101692939e-06, + "loss": 0.5399, + "step": 546 + }, + { + "autoregressive_loss": 0.1279, + "epoch": 1.5258019525801951, + "grad_norm": 0.04199855029582977, + "learning_rate": 5.098074172327661e-06, + "loss": 0.5126, + "step": 547 + }, + { + "autoregressive_loss": 0.1523, + "epoch": 1.5285913528591353, + "grad_norm": 0.12200253456830978, + "learning_rate": 5.082987349754376e-06, + "loss": 0.6068, + "step": 548 + }, + { + "autoregressive_loss": 0.1396, + "epoch": 1.5313807531380754, + "grad_norm": 0.04040112718939781, + "learning_rate": 5.0678997713759305e-06, + "loss": 0.5591, + "step": 549 + }, + { + "autoregressive_loss": 0.1328, + "epoch": 1.5341701534170153, + "grad_norm": 0.0648726299405098, + "learning_rate": 5.052811574602059e-06, + "loss": 0.5325, + "step": 550 + }, + { + "autoregressive_loss": 0.1553, + "epoch": 1.5369595536959553, + "grad_norm": 0.07296468317508698, + "learning_rate": 5.0377228968481274e-06, + "loss": 0.6212, + "step": 551 + }, + { + "autoregressive_loss": 0.1377, + "epoch": 1.5397489539748954, + "grad_norm": 0.04140917956829071, + "learning_rate": 5.022633875533879e-06, + "loss": 0.5516, + "step": 552 + }, + { + "autoregressive_loss": 0.1436, + "epoch": 1.5425383542538356, + "grad_norm": 0.039590734988451004, + "learning_rate": 5.00754464808219e-06, + "loss": 0.5751, + "step": 553 + }, + { + "autoregressive_loss": 0.1377, + "epoch": 1.5453277545327755, + "grad_norm": 0.03286973014473915, + "learning_rate": 4.992455351917812e-06, + "loss": 0.5508, + "step": 554 + }, + { + "autoregressive_loss": 0.1279, + "epoch": 1.5481171548117154, + "grad_norm": 0.08781571686267853, + "learning_rate": 4.977366124466122e-06, + "loss": 0.5132, + "step": 555 + }, + { + "autoregressive_loss": 0.1426, + "epoch": 1.5509065550906556, + "grad_norm": 0.046460218727588654, + "learning_rate": 4.962277103151876e-06, + "loss": 0.5693, + "step": 556 + }, + { + "autoregressive_loss": 0.1357, + "epoch": 1.5536959553695955, + "grad_norm": 0.04157834127545357, + "learning_rate": 4.947188425397942e-06, + "loss": 0.5417, + "step": 557 + }, + { + "autoregressive_loss": 0.1387, + "epoch": 1.5564853556485354, + "grad_norm": 0.05682104453444481, + "learning_rate": 4.932100228624072e-06, + "loss": 0.5547, + "step": 558 + }, + { + "autoregressive_loss": 0.1348, + "epoch": 1.5592747559274756, + "grad_norm": 0.090944804251194, + "learning_rate": 4.917012650245626e-06, + "loss": 0.5364, + "step": 559 + }, + { + "autoregressive_loss": 0.1309, + "epoch": 1.5620641562064157, + "grad_norm": 0.059284768998622894, + "learning_rate": 4.901925827672341e-06, + "loss": 0.525, + "step": 560 + }, + { + "autoregressive_loss": 0.1377, + "epoch": 1.5648535564853556, + "grad_norm": 0.053847938776016235, + "learning_rate": 4.886839898307062e-06, + "loss": 0.5503, + "step": 561 + }, + { + "autoregressive_loss": 0.1465, + "epoch": 1.5676429567642955, + "grad_norm": 0.05444236472249031, + "learning_rate": 4.8717549995445105e-06, + "loss": 0.5851, + "step": 562 + }, + { + "autoregressive_loss": 0.1396, + "epoch": 1.5704323570432357, + "grad_norm": 0.052665553987026215, + "learning_rate": 4.856671268770007e-06, + "loss": 0.5602, + "step": 563 + }, + { + "autoregressive_loss": 0.1484, + "epoch": 1.5732217573221758, + "grad_norm": 0.04669469967484474, + "learning_rate": 4.841588843358251e-06, + "loss": 0.5918, + "step": 564 + }, + { + "autoregressive_loss": 0.1328, + "epoch": 1.5760111576011158, + "grad_norm": 0.050803057849407196, + "learning_rate": 4.826507860672036e-06, + "loss": 0.5319, + "step": 565 + }, + { + "autoregressive_loss": 0.1348, + "epoch": 1.5788005578800557, + "grad_norm": 0.040684860199689865, + "learning_rate": 4.811428458061033e-06, + "loss": 0.5394, + "step": 566 + }, + { + "autoregressive_loss": 0.1348, + "epoch": 1.5815899581589958, + "grad_norm": 0.05127113312482834, + "learning_rate": 4.796350772860511e-06, + "loss": 0.5371, + "step": 567 + }, + { + "autoregressive_loss": 0.1396, + "epoch": 1.584379358437936, + "grad_norm": 0.047691766172647476, + "learning_rate": 4.7812749423901e-06, + "loss": 0.5588, + "step": 568 + }, + { + "autoregressive_loss": 0.1416, + "epoch": 1.587168758716876, + "grad_norm": 0.052663449198007584, + "learning_rate": 4.7662011039525416e-06, + "loss": 0.5653, + "step": 569 + }, + { + "autoregressive_loss": 0.1426, + "epoch": 1.5899581589958158, + "grad_norm": 0.06001275032758713, + "learning_rate": 4.7511293948324325e-06, + "loss": 0.5692, + "step": 570 + }, + { + "autoregressive_loss": 0.1367, + "epoch": 1.592747559274756, + "grad_norm": 0.04391346126794815, + "learning_rate": 4.736059952294975e-06, + "loss": 0.5475, + "step": 571 + }, + { + "autoregressive_loss": 0.1387, + "epoch": 1.5955369595536961, + "grad_norm": 0.052683085203170776, + "learning_rate": 4.720992913584732e-06, + "loss": 0.5559, + "step": 572 + }, + { + "autoregressive_loss": 0.1475, + "epoch": 1.5983263598326358, + "grad_norm": 0.038459159433841705, + "learning_rate": 4.7059284159243725e-06, + "loss": 0.5913, + "step": 573 + }, + { + "autoregressive_loss": 0.1387, + "epoch": 1.601115760111576, + "grad_norm": 0.04457344114780426, + "learning_rate": 4.690866596513421e-06, + "loss": 0.5543, + "step": 574 + }, + { + "autoregressive_loss": 0.1396, + "epoch": 1.6039051603905161, + "grad_norm": 0.08527503162622452, + "learning_rate": 4.675807592527014e-06, + "loss": 0.5601, + "step": 575 + }, + { + "autoregressive_loss": 0.1338, + "epoch": 1.606694560669456, + "grad_norm": 0.04071794077754021, + "learning_rate": 4.660751541114641e-06, + "loss": 0.5366, + "step": 576 + }, + { + "autoregressive_loss": 0.1445, + "epoch": 1.609483960948396, + "grad_norm": 0.04945802316069603, + "learning_rate": 4.645698579398907e-06, + "loss": 0.5775, + "step": 577 + }, + { + "autoregressive_loss": 0.1465, + "epoch": 1.612273361227336, + "grad_norm": 0.04523865878582001, + "learning_rate": 4.630648844474271e-06, + "loss": 0.5861, + "step": 578 + }, + { + "autoregressive_loss": 0.1455, + "epoch": 1.6150627615062763, + "grad_norm": 0.06278450042009354, + "learning_rate": 4.615602473405812e-06, + "loss": 0.582, + "step": 579 + }, + { + "autoregressive_loss": 0.1416, + "epoch": 1.6178521617852162, + "grad_norm": 0.03390641137957573, + "learning_rate": 4.600559603227963e-06, + "loss": 0.5656, + "step": 580 + }, + { + "autoregressive_loss": 0.1348, + "epoch": 1.620641562064156, + "grad_norm": 0.043791670352220535, + "learning_rate": 4.585520370943285e-06, + "loss": 0.5363, + "step": 581 + }, + { + "autoregressive_loss": 0.1465, + "epoch": 1.6234309623430963, + "grad_norm": 0.04453590512275696, + "learning_rate": 4.570484913521196e-06, + "loss": 0.5885, + "step": 582 + }, + { + "autoregressive_loss": 0.1416, + "epoch": 1.6262203626220364, + "grad_norm": 0.06411642581224442, + "learning_rate": 4.55545336789674e-06, + "loss": 0.5677, + "step": 583 + }, + { + "autoregressive_loss": 0.1299, + "epoch": 1.6290097629009763, + "grad_norm": 0.054797135293483734, + "learning_rate": 4.540425870969332e-06, + "loss": 0.5217, + "step": 584 + }, + { + "autoregressive_loss": 0.1416, + "epoch": 1.6317991631799162, + "grad_norm": 0.050795480608940125, + "learning_rate": 4.5254025596015175e-06, + "loss": 0.5667, + "step": 585 + }, + { + "autoregressive_loss": 0.1494, + "epoch": 1.6345885634588564, + "grad_norm": 0.05036594346165657, + "learning_rate": 4.510383570617716e-06, + "loss": 0.5985, + "step": 586 + }, + { + "autoregressive_loss": 0.1387, + "epoch": 1.6373779637377965, + "grad_norm": 0.06015456095337868, + "learning_rate": 4.495369040802988e-06, + "loss": 0.5548, + "step": 587 + }, + { + "autoregressive_loss": 0.1406, + "epoch": 1.6401673640167362, + "grad_norm": 0.054865751415491104, + "learning_rate": 4.480359106901775e-06, + "loss": 0.5643, + "step": 588 + }, + { + "autoregressive_loss": 0.1445, + "epoch": 1.6429567642956764, + "grad_norm": 0.04047631472349167, + "learning_rate": 4.465353905616668e-06, + "loss": 0.578, + "step": 589 + }, + { + "autoregressive_loss": 0.1416, + "epoch": 1.6457461645746165, + "grad_norm": 0.057980358600616455, + "learning_rate": 4.4503535736071505e-06, + "loss": 0.5675, + "step": 590 + }, + { + "autoregressive_loss": 0.1377, + "epoch": 1.6485355648535565, + "grad_norm": 0.06562533974647522, + "learning_rate": 4.435358247488365e-06, + "loss": 0.549, + "step": 591 + }, + { + "autoregressive_loss": 0.1367, + "epoch": 1.6513249651324964, + "grad_norm": 0.036771781742572784, + "learning_rate": 4.420368063829854e-06, + "loss": 0.547, + "step": 592 + }, + { + "autoregressive_loss": 0.1396, + "epoch": 1.6541143654114365, + "grad_norm": 0.04067135602235794, + "learning_rate": 4.405383159154337e-06, + "loss": 0.5588, + "step": 593 + }, + { + "autoregressive_loss": 0.1348, + "epoch": 1.6569037656903767, + "grad_norm": 0.05226593464612961, + "learning_rate": 4.390403669936443e-06, + "loss": 0.5415, + "step": 594 + }, + { + "autoregressive_loss": 0.1328, + "epoch": 1.6596931659693166, + "grad_norm": 0.06906784325838089, + "learning_rate": 4.37542973260149e-06, + "loss": 0.5322, + "step": 595 + }, + { + "autoregressive_loss": 0.1562, + "epoch": 1.6624825662482565, + "grad_norm": 0.04818138852715492, + "learning_rate": 4.3604614835242255e-06, + "loss": 0.6238, + "step": 596 + }, + { + "autoregressive_loss": 0.1377, + "epoch": 1.6652719665271967, + "grad_norm": 0.03906245902180672, + "learning_rate": 4.3454990590275966e-06, + "loss": 0.5522, + "step": 597 + }, + { + "autoregressive_loss": 0.1465, + "epoch": 1.6680613668061368, + "grad_norm": 0.06899507343769073, + "learning_rate": 4.3305425953814985e-06, + "loss": 0.5859, + "step": 598 + }, + { + "autoregressive_loss": 0.1406, + "epoch": 1.6708507670850767, + "grad_norm": 0.06058574840426445, + "learning_rate": 4.315592228801543e-06, + "loss": 0.5613, + "step": 599 + }, + { + "autoregressive_loss": 0.1387, + "epoch": 1.6736401673640167, + "grad_norm": 0.05170174315571785, + "learning_rate": 4.300648095447806e-06, + "loss": 0.553, + "step": 600 + }, + { + "autoregressive_loss": 0.1465, + "epoch": 1.6764295676429568, + "grad_norm": 0.04078209400177002, + "learning_rate": 4.285710331423603e-06, + "loss": 0.5869, + "step": 601 + }, + { + "autoregressive_loss": 0.1465, + "epoch": 1.679218967921897, + "grad_norm": 0.05455109849572182, + "learning_rate": 4.2707790727742315e-06, + "loss": 0.585, + "step": 602 + }, + { + "autoregressive_loss": 0.1348, + "epoch": 1.6820083682008367, + "grad_norm": 0.047467466443777084, + "learning_rate": 4.255854455485753e-06, + "loss": 0.5389, + "step": 603 + }, + { + "autoregressive_loss": 0.1465, + "epoch": 1.6847977684797768, + "grad_norm": 0.07608460634946823, + "learning_rate": 4.24093661548373e-06, + "loss": 0.585, + "step": 604 + }, + { + "autoregressive_loss": 0.1465, + "epoch": 1.687587168758717, + "grad_norm": 0.03669717162847519, + "learning_rate": 4.226025688632013e-06, + "loss": 0.5836, + "step": 605 + }, + { + "autoregressive_loss": 0.127, + "epoch": 1.6903765690376569, + "grad_norm": 0.09109435975551605, + "learning_rate": 4.211121810731485e-06, + "loss": 0.509, + "step": 606 + }, + { + "autoregressive_loss": 0.1426, + "epoch": 1.6931659693165968, + "grad_norm": 0.04544074460864067, + "learning_rate": 4.196225117518828e-06, + "loss": 0.5692, + "step": 607 + }, + { + "autoregressive_loss": 0.1387, + "epoch": 1.695955369595537, + "grad_norm": 0.03568524494767189, + "learning_rate": 4.181335744665299e-06, + "loss": 0.5527, + "step": 608 + }, + { + "autoregressive_loss": 0.1406, + "epoch": 1.698744769874477, + "grad_norm": 0.05150272697210312, + "learning_rate": 4.166453827775474e-06, + "loss": 0.5619, + "step": 609 + }, + { + "autoregressive_loss": 0.1504, + "epoch": 1.701534170153417, + "grad_norm": 0.08645547926425934, + "learning_rate": 4.1515795023860325e-06, + "loss": 0.6044, + "step": 610 + }, + { + "autoregressive_loss": 0.1416, + "epoch": 1.704323570432357, + "grad_norm": 0.05908164009451866, + "learning_rate": 4.136712903964511e-06, + "loss": 0.5668, + "step": 611 + }, + { + "autoregressive_loss": 0.1426, + "epoch": 1.707112970711297, + "grad_norm": 0.045956894755363464, + "learning_rate": 4.121854167908072e-06, + "loss": 0.5702, + "step": 612 + }, + { + "autoregressive_loss": 0.1416, + "epoch": 1.7099023709902372, + "grad_norm": 0.04155174270272255, + "learning_rate": 4.107003429542273e-06, + "loss": 0.5668, + "step": 613 + }, + { + "autoregressive_loss": 0.1406, + "epoch": 1.7126917712691772, + "grad_norm": 0.036741919815540314, + "learning_rate": 4.092160824119836e-06, + "loss": 0.5637, + "step": 614 + }, + { + "autoregressive_loss": 0.1426, + "epoch": 1.715481171548117, + "grad_norm": 0.0489344447851181, + "learning_rate": 4.077326486819405e-06, + "loss": 0.5708, + "step": 615 + }, + { + "autoregressive_loss": 0.1465, + "epoch": 1.7182705718270572, + "grad_norm": 0.04210687056183815, + "learning_rate": 4.06250055274433e-06, + "loss": 0.5853, + "step": 616 + }, + { + "autoregressive_loss": 0.1367, + "epoch": 1.7210599721059974, + "grad_norm": 0.04543302208185196, + "learning_rate": 4.047683156921422e-06, + "loss": 0.5448, + "step": 617 + }, + { + "autoregressive_loss": 0.1523, + "epoch": 1.723849372384937, + "grad_norm": 0.04589065536856651, + "learning_rate": 4.0328744342997355e-06, + "loss": 0.6091, + "step": 618 + }, + { + "autoregressive_loss": 0.1445, + "epoch": 1.7266387726638772, + "grad_norm": 0.048579249531030655, + "learning_rate": 4.0180745197493295e-06, + "loss": 0.5789, + "step": 619 + }, + { + "autoregressive_loss": 0.1426, + "epoch": 1.7294281729428174, + "grad_norm": 0.06049260497093201, + "learning_rate": 4.0032835480600516e-06, + "loss": 0.5712, + "step": 620 + }, + { + "autoregressive_loss": 0.1445, + "epoch": 1.7322175732217573, + "grad_norm": 0.03540187329053879, + "learning_rate": 3.9885016539402896e-06, + "loss": 0.5804, + "step": 621 + }, + { + "autoregressive_loss": 0.1318, + "epoch": 1.7350069735006972, + "grad_norm": 0.05291564390063286, + "learning_rate": 3.973728972015771e-06, + "loss": 0.5282, + "step": 622 + }, + { + "autoregressive_loss": 0.1309, + "epoch": 1.7377963737796374, + "grad_norm": 0.04047594964504242, + "learning_rate": 3.958965636828314e-06, + "loss": 0.5247, + "step": 623 + }, + { + "autoregressive_loss": 0.1338, + "epoch": 1.7405857740585775, + "grad_norm": 0.03763250634074211, + "learning_rate": 3.944211782834618e-06, + "loss": 0.5345, + "step": 624 + }, + { + "autoregressive_loss": 0.1592, + "epoch": 1.7433751743375174, + "grad_norm": 0.04064092040061951, + "learning_rate": 3.929467544405027e-06, + "loss": 0.6364, + "step": 625 + }, + { + "autoregressive_loss": 0.1328, + "epoch": 1.7461645746164574, + "grad_norm": 0.04830470681190491, + "learning_rate": 3.9147330558223175e-06, + "loss": 0.5305, + "step": 626 + }, + { + "autoregressive_loss": 0.1387, + "epoch": 1.7489539748953975, + "grad_norm": 0.06140526011586189, + "learning_rate": 3.900008451280462e-06, + "loss": 0.5564, + "step": 627 + }, + { + "autoregressive_loss": 0.1406, + "epoch": 1.7517433751743376, + "grad_norm": 0.041505370289087296, + "learning_rate": 3.885293864883423e-06, + "loss": 0.5637, + "step": 628 + }, + { + "autoregressive_loss": 0.1396, + "epoch": 1.7545327754532776, + "grad_norm": 0.04944341257214546, + "learning_rate": 3.870589430643915e-06, + "loss": 0.5585, + "step": 629 + }, + { + "autoregressive_loss": 0.1465, + "epoch": 1.7573221757322175, + "grad_norm": 0.08489459753036499, + "learning_rate": 3.8558952824822e-06, + "loss": 0.5868, + "step": 630 + }, + { + "autoregressive_loss": 0.1338, + "epoch": 1.7601115760111576, + "grad_norm": 0.057782676070928574, + "learning_rate": 3.84121155422485e-06, + "loss": 0.5337, + "step": 631 + }, + { + "autoregressive_loss": 0.1406, + "epoch": 1.7629009762900978, + "grad_norm": 0.042774394154548645, + "learning_rate": 3.826538379603549e-06, + "loss": 0.5609, + "step": 632 + }, + { + "autoregressive_loss": 0.1367, + "epoch": 1.7656903765690377, + "grad_norm": 0.05189014971256256, + "learning_rate": 3.8118758922538533e-06, + "loss": 0.5464, + "step": 633 + }, + { + "autoregressive_loss": 0.1299, + "epoch": 1.7684797768479776, + "grad_norm": 0.047947391867637634, + "learning_rate": 3.7972242257139953e-06, + "loss": 0.5205, + "step": 634 + }, + { + "autoregressive_loss": 0.1357, + "epoch": 1.7712691771269178, + "grad_norm": 0.06772277504205704, + "learning_rate": 3.782583513423647e-06, + "loss": 0.5438, + "step": 635 + }, + { + "autoregressive_loss": 0.1328, + "epoch": 1.7740585774058577, + "grad_norm": 0.05353960394859314, + "learning_rate": 3.7679538887227247e-06, + "loss": 0.533, + "step": 636 + }, + { + "autoregressive_loss": 0.1543, + "epoch": 1.7768479776847976, + "grad_norm": 0.05707435682415962, + "learning_rate": 3.753335484850157e-06, + "loss": 0.6184, + "step": 637 + }, + { + "autoregressive_loss": 0.1309, + "epoch": 1.7796373779637378, + "grad_norm": 0.06148119270801544, + "learning_rate": 3.738728434942684e-06, + "loss": 0.5225, + "step": 638 + }, + { + "autoregressive_loss": 0.1182, + "epoch": 1.782426778242678, + "grad_norm": 0.038341064006090164, + "learning_rate": 3.7241328720336377e-06, + "loss": 0.4731, + "step": 639 + }, + { + "autoregressive_loss": 0.1523, + "epoch": 1.7852161785216178, + "grad_norm": 0.044643551111221313, + "learning_rate": 3.709548929051732e-06, + "loss": 0.6099, + "step": 640 + }, + { + "autoregressive_loss": 0.1338, + "epoch": 1.7880055788005578, + "grad_norm": 0.05767514184117317, + "learning_rate": 3.6949767388198554e-06, + "loss": 0.5345, + "step": 641 + }, + { + "autoregressive_loss": 0.1445, + "epoch": 1.790794979079498, + "grad_norm": 0.03940228745341301, + "learning_rate": 3.680416434053854e-06, + "loss": 0.576, + "step": 642 + }, + { + "autoregressive_loss": 0.1416, + "epoch": 1.793584379358438, + "grad_norm": 0.04918529838323593, + "learning_rate": 3.6658681473613333e-06, + "loss": 0.5648, + "step": 643 + }, + { + "autoregressive_loss": 0.1348, + "epoch": 1.796373779637378, + "grad_norm": 0.06549336761236191, + "learning_rate": 3.651332011240437e-06, + "loss": 0.5411, + "step": 644 + }, + { + "autoregressive_loss": 0.1328, + "epoch": 1.799163179916318, + "grad_norm": 0.06422046571969986, + "learning_rate": 3.636808158078656e-06, + "loss": 0.5288, + "step": 645 + }, + { + "autoregressive_loss": 0.1445, + "epoch": 1.801952580195258, + "grad_norm": 0.054812319576740265, + "learning_rate": 3.622296720151608e-06, + "loss": 0.5769, + "step": 646 + }, + { + "autoregressive_loss": 0.1465, + "epoch": 1.8047419804741982, + "grad_norm": 0.04640192165970802, + "learning_rate": 3.607797829621843e-06, + "loss": 0.5864, + "step": 647 + }, + { + "autoregressive_loss": 0.1289, + "epoch": 1.8075313807531381, + "grad_norm": 0.04267491400241852, + "learning_rate": 3.5933116185376325e-06, + "loss": 0.5155, + "step": 648 + }, + { + "autoregressive_loss": 0.1465, + "epoch": 1.810320781032078, + "grad_norm": 0.04816003143787384, + "learning_rate": 3.578838218831776e-06, + "loss": 0.5874, + "step": 649 + }, + { + "autoregressive_loss": 0.1445, + "epoch": 1.8131101813110182, + "grad_norm": 0.04113892838358879, + "learning_rate": 3.5643777623203857e-06, + "loss": 0.5802, + "step": 650 + }, + { + "autoregressive_loss": 0.1445, + "epoch": 1.8158995815899581, + "grad_norm": 0.07291996479034424, + "learning_rate": 3.5499303807017018e-06, + "loss": 0.5768, + "step": 651 + }, + { + "autoregressive_loss": 0.1426, + "epoch": 1.818688981868898, + "grad_norm": 0.04361705854535103, + "learning_rate": 3.5354962055548802e-06, + "loss": 0.5696, + "step": 652 + }, + { + "autoregressive_loss": 0.1367, + "epoch": 1.8214783821478382, + "grad_norm": 0.07118409126996994, + "learning_rate": 3.5210753683388014e-06, + "loss": 0.5474, + "step": 653 + }, + { + "autoregressive_loss": 0.1328, + "epoch": 1.8242677824267783, + "grad_norm": 0.06011803448200226, + "learning_rate": 3.5066680003908695e-06, + "loss": 0.53, + "step": 654 + }, + { + "autoregressive_loss": 0.1465, + "epoch": 1.8270571827057183, + "grad_norm": 0.05469872057437897, + "learning_rate": 3.4922742329258207e-06, + "loss": 0.5856, + "step": 655 + }, + { + "autoregressive_loss": 0.1514, + "epoch": 1.8298465829846582, + "grad_norm": 0.04213607683777809, + "learning_rate": 3.47789419703452e-06, + "loss": 0.604, + "step": 656 + }, + { + "autoregressive_loss": 0.125, + "epoch": 1.8326359832635983, + "grad_norm": 0.03854227066040039, + "learning_rate": 3.463528023682779e-06, + "loss": 0.4988, + "step": 657 + }, + { + "autoregressive_loss": 0.1504, + "epoch": 1.8354253835425385, + "grad_norm": 0.04623892530798912, + "learning_rate": 3.4491758437101487e-06, + "loss": 0.6014, + "step": 658 + }, + { + "autoregressive_loss": 0.1318, + "epoch": 1.8382147838214784, + "grad_norm": 0.0463765487074852, + "learning_rate": 3.4348377878287443e-06, + "loss": 0.5282, + "step": 659 + }, + { + "autoregressive_loss": 0.1406, + "epoch": 1.8410041841004183, + "grad_norm": 0.03241466358304024, + "learning_rate": 3.4205139866220384e-06, + "loss": 0.5642, + "step": 660 + }, + { + "autoregressive_loss": 0.1436, + "epoch": 1.8437935843793585, + "grad_norm": 0.041081734001636505, + "learning_rate": 3.4062045705436863e-06, + "loss": 0.5741, + "step": 661 + }, + { + "autoregressive_loss": 0.1445, + "epoch": 1.8465829846582986, + "grad_norm": 0.04077489674091339, + "learning_rate": 3.391909669916324e-06, + "loss": 0.5771, + "step": 662 + }, + { + "autoregressive_loss": 0.1367, + "epoch": 1.8493723849372385, + "grad_norm": 0.07662832736968994, + "learning_rate": 3.3776294149303956e-06, + "loss": 0.5465, + "step": 663 + }, + { + "autoregressive_loss": 0.1426, + "epoch": 1.8521617852161785, + "grad_norm": 0.04439672455191612, + "learning_rate": 3.3633639356429564e-06, + "loss": 0.5707, + "step": 664 + }, + { + "autoregressive_loss": 0.1406, + "epoch": 1.8549511854951186, + "grad_norm": 0.05089595913887024, + "learning_rate": 3.3491133619764925e-06, + "loss": 0.5627, + "step": 665 + }, + { + "autoregressive_loss": 0.1445, + "epoch": 1.8577405857740585, + "grad_norm": 0.03854743391275406, + "learning_rate": 3.334877823717737e-06, + "loss": 0.5771, + "step": 666 + }, + { + "autoregressive_loss": 0.1406, + "epoch": 1.8605299860529985, + "grad_norm": 0.08145076781511307, + "learning_rate": 3.3206574505164934e-06, + "loss": 0.5647, + "step": 667 + }, + { + "autoregressive_loss": 0.1416, + "epoch": 1.8633193863319386, + "grad_norm": 0.07853725552558899, + "learning_rate": 3.306452371884441e-06, + "loss": 0.5677, + "step": 668 + }, + { + "autoregressive_loss": 0.1387, + "epoch": 1.8661087866108788, + "grad_norm": 0.04233454540371895, + "learning_rate": 3.2922627171939726e-06, + "loss": 0.5549, + "step": 669 + }, + { + "autoregressive_loss": 0.1309, + "epoch": 1.8688981868898187, + "grad_norm": 0.042802345007658005, + "learning_rate": 3.2780886156770016e-06, + "loss": 0.5251, + "step": 670 + }, + { + "autoregressive_loss": 0.1426, + "epoch": 1.8716875871687586, + "grad_norm": 0.037434883415699005, + "learning_rate": 3.263930196423797e-06, + "loss": 0.5687, + "step": 671 + }, + { + "autoregressive_loss": 0.1387, + "epoch": 1.8744769874476988, + "grad_norm": 0.049615342170000076, + "learning_rate": 3.2497875883817955e-06, + "loss": 0.5564, + "step": 672 + }, + { + "autoregressive_loss": 0.1582, + "epoch": 1.877266387726639, + "grad_norm": 0.05447971075773239, + "learning_rate": 3.2356609203544387e-06, + "loss": 0.6364, + "step": 673 + }, + { + "autoregressive_loss": 0.1279, + "epoch": 1.8800557880055788, + "grad_norm": 0.05962167680263519, + "learning_rate": 3.2215503209999952e-06, + "loss": 0.5128, + "step": 674 + }, + { + "autoregressive_loss": 0.1475, + "epoch": 1.8828451882845187, + "grad_norm": 0.04092300310730934, + "learning_rate": 3.207455918830384e-06, + "loss": 0.5898, + "step": 675 + }, + { + "autoregressive_loss": 0.1211, + "epoch": 1.885634588563459, + "grad_norm": 0.045955296605825424, + "learning_rate": 3.193377842210014e-06, + "loss": 0.4857, + "step": 676 + }, + { + "autoregressive_loss": 0.1455, + "epoch": 1.888423988842399, + "grad_norm": 0.061723753809928894, + "learning_rate": 3.179316219354602e-06, + "loss": 0.5808, + "step": 677 + }, + { + "autoregressive_loss": 0.1465, + "epoch": 1.891213389121339, + "grad_norm": 0.04363270848989487, + "learning_rate": 3.1652711783300234e-06, + "loss": 0.5873, + "step": 678 + }, + { + "autoregressive_loss": 0.1289, + "epoch": 1.8940027894002789, + "grad_norm": 0.06208307296037674, + "learning_rate": 3.1512428470511257e-06, + "loss": 0.516, + "step": 679 + }, + { + "autoregressive_loss": 0.1445, + "epoch": 1.896792189679219, + "grad_norm": 0.07383999228477478, + "learning_rate": 3.1372313532805766e-06, + "loss": 0.5809, + "step": 680 + }, + { + "autoregressive_loss": 0.1348, + "epoch": 1.899581589958159, + "grad_norm": 0.05216548591852188, + "learning_rate": 3.1232368246276956e-06, + "loss": 0.5364, + "step": 681 + }, + { + "autoregressive_loss": 0.1318, + "epoch": 1.9023709902370989, + "grad_norm": 0.04593446105718613, + "learning_rate": 3.1092593885472965e-06, + "loss": 0.5264, + "step": 682 + }, + { + "autoregressive_loss": 0.1318, + "epoch": 1.905160390516039, + "grad_norm": 0.07607099413871765, + "learning_rate": 3.0952991723385152e-06, + "loss": 0.5289, + "step": 683 + }, + { + "autoregressive_loss": 0.1367, + "epoch": 1.9079497907949792, + "grad_norm": 0.03976249694824219, + "learning_rate": 3.0813563031436676e-06, + "loss": 0.5465, + "step": 684 + }, + { + "autoregressive_loss": 0.1426, + "epoch": 1.910739191073919, + "grad_norm": 0.06069750338792801, + "learning_rate": 3.067430907947073e-06, + "loss": 0.5706, + "step": 685 + }, + { + "autoregressive_loss": 0.1279, + "epoch": 1.913528591352859, + "grad_norm": 0.04926614835858345, + "learning_rate": 3.053523113573914e-06, + "loss": 0.5096, + "step": 686 + }, + { + "autoregressive_loss": 0.1387, + "epoch": 1.9163179916317992, + "grad_norm": 0.031872738152742386, + "learning_rate": 3.039633046689069e-06, + "loss": 0.5571, + "step": 687 + }, + { + "autoregressive_loss": 0.1416, + "epoch": 1.9191073919107393, + "grad_norm": 0.04783611744642258, + "learning_rate": 3.0257608337959683e-06, + "loss": 0.5663, + "step": 688 + }, + { + "autoregressive_loss": 0.1211, + "epoch": 1.9218967921896792, + "grad_norm": 0.042002540081739426, + "learning_rate": 3.0119066012354316e-06, + "loss": 0.4852, + "step": 689 + }, + { + "autoregressive_loss": 0.1465, + "epoch": 1.9246861924686192, + "grad_norm": 0.047062646597623825, + "learning_rate": 2.9980704751845302e-06, + "loss": 0.5862, + "step": 690 + }, + { + "autoregressive_loss": 0.1416, + "epoch": 1.9274755927475593, + "grad_norm": 0.08148302882909775, + "learning_rate": 2.9842525816554237e-06, + "loss": 0.5674, + "step": 691 + }, + { + "autoregressive_loss": 0.1504, + "epoch": 1.9302649930264995, + "grad_norm": 0.0473175086081028, + "learning_rate": 2.9704530464942254e-06, + "loss": 0.6, + "step": 692 + }, + { + "autoregressive_loss": 0.1318, + "epoch": 1.9330543933054394, + "grad_norm": 0.035288743674755096, + "learning_rate": 2.9566719953798474e-06, + "loss": 0.5283, + "step": 693 + }, + { + "autoregressive_loss": 0.1416, + "epoch": 1.9358437935843793, + "grad_norm": 0.03662148118019104, + "learning_rate": 2.942909553822859e-06, + "loss": 0.566, + "step": 694 + }, + { + "autoregressive_loss": 0.1426, + "epoch": 1.9386331938633194, + "grad_norm": 0.057863932102918625, + "learning_rate": 2.929165847164343e-06, + "loss": 0.568, + "step": 695 + }, + { + "autoregressive_loss": 0.1348, + "epoch": 1.9414225941422594, + "grad_norm": 0.03623604401946068, + "learning_rate": 2.9154410005747586e-06, + "loss": 0.5392, + "step": 696 + }, + { + "autoregressive_loss": 0.1289, + "epoch": 1.9442119944211993, + "grad_norm": 0.07199563831090927, + "learning_rate": 2.901735139052787e-06, + "loss": 0.5177, + "step": 697 + }, + { + "autoregressive_loss": 0.1416, + "epoch": 1.9470013947001394, + "grad_norm": 0.04358909651637077, + "learning_rate": 2.888048387424218e-06, + "loss": 0.5665, + "step": 698 + }, + { + "autoregressive_loss": 0.1348, + "epoch": 1.9497907949790796, + "grad_norm": 0.05477432161569595, + "learning_rate": 2.8743808703407866e-06, + "loss": 0.5372, + "step": 699 + }, + { + "autoregressive_loss": 0.1387, + "epoch": 1.9525801952580195, + "grad_norm": 0.04218233749270439, + "learning_rate": 2.8607327122790555e-06, + "loss": 0.5581, + "step": 700 + }, + { + "autoregressive_loss": 0.1279, + "epoch": 1.9553695955369594, + "grad_norm": 0.03505018725991249, + "learning_rate": 2.8471040375392745e-06, + "loss": 0.512, + "step": 701 + }, + { + "autoregressive_loss": 0.1367, + "epoch": 1.9581589958158996, + "grad_norm": 0.036944158375263214, + "learning_rate": 2.833494970244248e-06, + "loss": 0.5469, + "step": 702 + }, + { + "autoregressive_loss": 0.1406, + "epoch": 1.9609483960948397, + "grad_norm": 0.06648257374763489, + "learning_rate": 2.819905634338208e-06, + "loss": 0.5615, + "step": 703 + }, + { + "autoregressive_loss": 0.1299, + "epoch": 1.9637377963737797, + "grad_norm": 0.04316696897149086, + "learning_rate": 2.8063361535856838e-06, + "loss": 0.5192, + "step": 704 + }, + { + "autoregressive_loss": 0.1406, + "epoch": 1.9665271966527196, + "grad_norm": 0.03444093093276024, + "learning_rate": 2.7927866515703705e-06, + "loss": 0.5651, + "step": 705 + }, + { + "autoregressive_loss": 0.1367, + "epoch": 1.9693165969316597, + "grad_norm": 0.04827907308936119, + "learning_rate": 2.7792572516940108e-06, + "loss": 0.5477, + "step": 706 + }, + { + "autoregressive_loss": 0.1416, + "epoch": 1.9721059972105999, + "grad_norm": 0.061702705919742584, + "learning_rate": 2.765748077175272e-06, + "loss": 0.5671, + "step": 707 + }, + { + "autoregressive_loss": 0.1426, + "epoch": 1.9748953974895398, + "grad_norm": 0.05075180158019066, + "learning_rate": 2.752259251048606e-06, + "loss": 0.5712, + "step": 708 + }, + { + "autoregressive_loss": 0.1318, + "epoch": 1.9776847977684797, + "grad_norm": 0.03641447797417641, + "learning_rate": 2.7387908961631597e-06, + "loss": 0.5284, + "step": 709 + }, + { + "autoregressive_loss": 0.1348, + "epoch": 1.9804741980474199, + "grad_norm": 0.05783659964799881, + "learning_rate": 2.725343135181622e-06, + "loss": 0.5397, + "step": 710 + }, + { + "autoregressive_loss": 0.1328, + "epoch": 1.9832635983263598, + "grad_norm": 0.05253494158387184, + "learning_rate": 2.711916090579137e-06, + "loss": 0.5294, + "step": 711 + }, + { + "autoregressive_loss": 0.1465, + "epoch": 1.9860529986052997, + "grad_norm": 0.037178706377744675, + "learning_rate": 2.698509884642163e-06, + "loss": 0.5892, + "step": 712 + }, + { + "autoregressive_loss": 0.1348, + "epoch": 1.9888423988842399, + "grad_norm": 0.04687239229679108, + "learning_rate": 2.6851246394673822e-06, + "loss": 0.5396, + "step": 713 + }, + { + "autoregressive_loss": 0.1348, + "epoch": 1.99163179916318, + "grad_norm": 0.03485679626464844, + "learning_rate": 2.67176047696057e-06, + "loss": 0.537, + "step": 714 + }, + { + "autoregressive_loss": 0.1338, + "epoch": 1.99442119944212, + "grad_norm": 0.037754740566015244, + "learning_rate": 2.6584175188354934e-06, + "loss": 0.5353, + "step": 715 + }, + { + "autoregressive_loss": 0.1406, + "epoch": 1.9972105997210599, + "grad_norm": 0.061961978673934937, + "learning_rate": 2.6450958866128e-06, + "loss": 0.5636, + "step": 716 + }, + { + "autoregressive_loss": 0.2129, + "epoch": 2.0, + "grad_norm": 0.08348535746335983, + "learning_rate": 2.6317957016189155e-06, + "loss": 0.8507, + "step": 717 + }, + { + "autoregressive_loss": 0.1406, + "epoch": 2.00278940027894, + "grad_norm": 0.04222907871007919, + "learning_rate": 2.618517084984933e-06, + "loss": 0.5615, + "step": 718 + }, + { + "autoregressive_loss": 0.1426, + "epoch": 2.00557880055788, + "grad_norm": 0.06273231655359268, + "learning_rate": 2.6052601576455116e-06, + "loss": 0.5707, + "step": 719 + }, + { + "autoregressive_loss": 0.1445, + "epoch": 2.00836820083682, + "grad_norm": 0.03602023795247078, + "learning_rate": 2.592025040337779e-06, + "loss": 0.577, + "step": 720 + }, + { + "autoregressive_loss": 0.1348, + "epoch": 2.01115760111576, + "grad_norm": 0.04537821561098099, + "learning_rate": 2.578811853600226e-06, + "loss": 0.5391, + "step": 721 + }, + { + "autoregressive_loss": 0.1377, + "epoch": 2.0139470013947003, + "grad_norm": 0.044788941740989685, + "learning_rate": 2.5656207177716107e-06, + "loss": 0.5518, + "step": 722 + }, + { + "autoregressive_loss": 0.1357, + "epoch": 2.01673640167364, + "grad_norm": 0.04178337752819061, + "learning_rate": 2.552451752989865e-06, + "loss": 0.5439, + "step": 723 + }, + { + "autoregressive_loss": 0.1416, + "epoch": 2.01952580195258, + "grad_norm": 0.05339245870709419, + "learning_rate": 2.539305079190999e-06, + "loss": 0.566, + "step": 724 + }, + { + "autoregressive_loss": 0.1357, + "epoch": 2.0223152022315203, + "grad_norm": 0.055592361837625504, + "learning_rate": 2.5261808161080047e-06, + "loss": 0.5422, + "step": 725 + }, + { + "autoregressive_loss": 0.1416, + "epoch": 2.0251046025104604, + "grad_norm": 0.044874437153339386, + "learning_rate": 2.513079083269774e-06, + "loss": 0.5647, + "step": 726 + }, + { + "autoregressive_loss": 0.1406, + "epoch": 2.0278940027894, + "grad_norm": 0.08718574047088623, + "learning_rate": 2.5000000000000015e-06, + "loss": 0.5615, + "step": 727 + }, + { + "autoregressive_loss": 0.1416, + "epoch": 2.0306834030683403, + "grad_norm": 0.043546129018068314, + "learning_rate": 2.4869436854161e-06, + "loss": 0.5668, + "step": 728 + }, + { + "autoregressive_loss": 0.1309, + "epoch": 2.0334728033472804, + "grad_norm": 0.05846861004829407, + "learning_rate": 2.4739102584281268e-06, + "loss": 0.5222, + "step": 729 + }, + { + "autoregressive_loss": 0.1436, + "epoch": 2.0362622036262206, + "grad_norm": 0.046463411301374435, + "learning_rate": 2.4608998377376752e-06, + "loss": 0.5747, + "step": 730 + }, + { + "autoregressive_loss": 0.1367, + "epoch": 2.0390516039051603, + "grad_norm": 0.06893808394670486, + "learning_rate": 2.447912541836826e-06, + "loss": 0.5464, + "step": 731 + }, + { + "autoregressive_loss": 0.1357, + "epoch": 2.0418410041841004, + "grad_norm": 0.04511405527591705, + "learning_rate": 2.4349484890070357e-06, + "loss": 0.5422, + "step": 732 + }, + { + "autoregressive_loss": 0.127, + "epoch": 2.0446304044630406, + "grad_norm": 0.047475576400756836, + "learning_rate": 2.4220077973180906e-06, + "loss": 0.5061, + "step": 733 + }, + { + "autoregressive_loss": 0.1465, + "epoch": 2.0474198047419803, + "grad_norm": 0.055818572640419006, + "learning_rate": 2.4090905846270006e-06, + "loss": 0.5835, + "step": 734 + }, + { + "autoregressive_loss": 0.1338, + "epoch": 2.0502092050209204, + "grad_norm": 0.0584428608417511, + "learning_rate": 2.396196968576957e-06, + "loss": 0.5365, + "step": 735 + }, + { + "autoregressive_loss": 0.1338, + "epoch": 2.0529986052998606, + "grad_norm": 0.045579154044389725, + "learning_rate": 2.3833270665962293e-06, + "loss": 0.5348, + "step": 736 + }, + { + "autoregressive_loss": 0.1338, + "epoch": 2.0557880055788007, + "grad_norm": 0.04334115982055664, + "learning_rate": 2.370480995897127e-06, + "loss": 0.5338, + "step": 737 + }, + { + "autoregressive_loss": 0.1387, + "epoch": 2.0585774058577404, + "grad_norm": 0.0438629686832428, + "learning_rate": 2.3576588734749022e-06, + "loss": 0.553, + "step": 738 + }, + { + "autoregressive_loss": 0.1396, + "epoch": 2.0613668061366806, + "grad_norm": 0.06326558440923691, + "learning_rate": 2.3448608161067117e-06, + "loss": 0.5592, + "step": 739 + }, + { + "autoregressive_loss": 0.1426, + "epoch": 2.0641562064156207, + "grad_norm": 0.06907150894403458, + "learning_rate": 2.3320869403505324e-06, + "loss": 0.5677, + "step": 740 + }, + { + "autoregressive_loss": 0.1357, + "epoch": 2.066945606694561, + "grad_norm": 0.03730299323797226, + "learning_rate": 2.3193373625441113e-06, + "loss": 0.543, + "step": 741 + }, + { + "autoregressive_loss": 0.1406, + "epoch": 2.0697350069735005, + "grad_norm": 0.03848947957158089, + "learning_rate": 2.3066121988038996e-06, + "loss": 0.5615, + "step": 742 + }, + { + "autoregressive_loss": 0.1387, + "epoch": 2.0725244072524407, + "grad_norm": 0.09004633873701096, + "learning_rate": 2.2939115650240008e-06, + "loss": 0.5525, + "step": 743 + }, + { + "autoregressive_loss": 0.1377, + "epoch": 2.075313807531381, + "grad_norm": 0.0494709312915802, + "learning_rate": 2.2812355768751106e-06, + "loss": 0.5507, + "step": 744 + }, + { + "autoregressive_loss": 0.1377, + "epoch": 2.078103207810321, + "grad_norm": 0.0473937951028347, + "learning_rate": 2.268584349803464e-06, + "loss": 0.5505, + "step": 745 + }, + { + "autoregressive_loss": 0.1357, + "epoch": 2.0808926080892607, + "grad_norm": 0.04280379042029381, + "learning_rate": 2.2559579990297943e-06, + "loss": 0.5431, + "step": 746 + }, + { + "autoregressive_loss": 0.1387, + "epoch": 2.083682008368201, + "grad_norm": 0.04790711775422096, + "learning_rate": 2.2433566395482577e-06, + "loss": 0.5557, + "step": 747 + }, + { + "autoregressive_loss": 0.1328, + "epoch": 2.086471408647141, + "grad_norm": 0.04467952996492386, + "learning_rate": 2.2307803861254207e-06, + "loss": 0.5308, + "step": 748 + }, + { + "autoregressive_loss": 0.1406, + "epoch": 2.0892608089260807, + "grad_norm": 0.03914961218833923, + "learning_rate": 2.218229353299181e-06, + "loss": 0.5645, + "step": 749 + }, + { + "autoregressive_loss": 0.1309, + "epoch": 2.092050209205021, + "grad_norm": 0.06279167532920837, + "learning_rate": 2.2057036553777565e-06, + "loss": 0.5245, + "step": 750 + }, + { + "autoregressive_loss": 0.1328, + "epoch": 2.094839609483961, + "grad_norm": 0.0417286716401577, + "learning_rate": 2.1932034064386113e-06, + "loss": 0.5309, + "step": 751 + }, + { + "autoregressive_loss": 0.1367, + "epoch": 2.097629009762901, + "grad_norm": 0.04373554512858391, + "learning_rate": 2.1807287203274504e-06, + "loss": 0.5481, + "step": 752 + }, + { + "autoregressive_loss": 0.1367, + "epoch": 2.100418410041841, + "grad_norm": 0.05088900402188301, + "learning_rate": 2.168279710657149e-06, + "loss": 0.5459, + "step": 753 + }, + { + "autoregressive_loss": 0.1328, + "epoch": 2.103207810320781, + "grad_norm": 0.03536570072174072, + "learning_rate": 2.1558564908067497e-06, + "loss": 0.5294, + "step": 754 + }, + { + "autoregressive_loss": 0.1445, + "epoch": 2.105997210599721, + "grad_norm": 0.05729952082037926, + "learning_rate": 2.1434591739204062e-06, + "loss": 0.5764, + "step": 755 + }, + { + "autoregressive_loss": 0.1387, + "epoch": 2.1087866108786613, + "grad_norm": 0.06749259680509567, + "learning_rate": 2.1310878729063645e-06, + "loss": 0.5535, + "step": 756 + }, + { + "autoregressive_loss": 0.1348, + "epoch": 2.111576011157601, + "grad_norm": 0.04441975802183151, + "learning_rate": 2.118742700435931e-06, + "loss": 0.5396, + "step": 757 + }, + { + "autoregressive_loss": 0.1436, + "epoch": 2.114365411436541, + "grad_norm": 0.07300915569067001, + "learning_rate": 2.1064237689424483e-06, + "loss": 0.574, + "step": 758 + }, + { + "autoregressive_loss": 0.1406, + "epoch": 2.1171548117154813, + "grad_norm": 0.04833308979868889, + "learning_rate": 2.0941311906202672e-06, + "loss": 0.5634, + "step": 759 + }, + { + "autoregressive_loss": 0.1396, + "epoch": 2.1199442119944214, + "grad_norm": 0.03211741894483566, + "learning_rate": 2.081865077423731e-06, + "loss": 0.5594, + "step": 760 + }, + { + "autoregressive_loss": 0.1309, + "epoch": 2.122733612273361, + "grad_norm": 0.06226465851068497, + "learning_rate": 2.06962554106615e-06, + "loss": 0.5222, + "step": 761 + }, + { + "autoregressive_loss": 0.1406, + "epoch": 2.1255230125523012, + "grad_norm": 0.041222646832466125, + "learning_rate": 2.0574126930187882e-06, + "loss": 0.5618, + "step": 762 + }, + { + "autoregressive_loss": 0.1377, + "epoch": 2.1283124128312414, + "grad_norm": 0.039174895733594894, + "learning_rate": 2.0452266445098457e-06, + "loss": 0.5505, + "step": 763 + }, + { + "autoregressive_loss": 0.1357, + "epoch": 2.131101813110181, + "grad_norm": 0.04521974176168442, + "learning_rate": 2.0330675065234466e-06, + "loss": 0.5433, + "step": 764 + }, + { + "autoregressive_loss": 0.1387, + "epoch": 2.1338912133891212, + "grad_norm": 0.041523002088069916, + "learning_rate": 2.0209353897986288e-06, + "loss": 0.5543, + "step": 765 + }, + { + "autoregressive_loss": 0.1328, + "epoch": 2.1366806136680614, + "grad_norm": 0.041622865945100784, + "learning_rate": 2.0088304048283337e-06, + "loss": 0.532, + "step": 766 + }, + { + "autoregressive_loss": 0.1445, + "epoch": 2.1394700139470015, + "grad_norm": 0.03866825997829437, + "learning_rate": 1.9967526618584016e-06, + "loss": 0.5792, + "step": 767 + }, + { + "autoregressive_loss": 0.1367, + "epoch": 2.1422594142259412, + "grad_norm": 0.04294265806674957, + "learning_rate": 1.984702270886566e-06, + "loss": 0.5461, + "step": 768 + }, + { + "autoregressive_loss": 0.1406, + "epoch": 2.1450488145048814, + "grad_norm": 0.05038651451468468, + "learning_rate": 1.9726793416614532e-06, + "loss": 0.5649, + "step": 769 + }, + { + "autoregressive_loss": 0.1318, + "epoch": 2.1478382147838215, + "grad_norm": 0.060318537056446075, + "learning_rate": 1.9606839836815872e-06, + "loss": 0.5264, + "step": 770 + }, + { + "autoregressive_loss": 0.1387, + "epoch": 2.1506276150627617, + "grad_norm": 0.058659203350543976, + "learning_rate": 1.948716306194376e-06, + "loss": 0.5548, + "step": 771 + }, + { + "autoregressive_loss": 0.1396, + "epoch": 2.1534170153417014, + "grad_norm": 0.03577926382422447, + "learning_rate": 1.9367764181951403e-06, + "loss": 0.5575, + "step": 772 + }, + { + "autoregressive_loss": 0.1348, + "epoch": 2.1562064156206415, + "grad_norm": 0.08682086318731308, + "learning_rate": 1.924864428426103e-06, + "loss": 0.5369, + "step": 773 + }, + { + "autoregressive_loss": 0.1299, + "epoch": 2.1589958158995817, + "grad_norm": 0.057879719883203506, + "learning_rate": 1.9129804453754053e-06, + "loss": 0.5172, + "step": 774 + }, + { + "autoregressive_loss": 0.1455, + "epoch": 2.161785216178522, + "grad_norm": 0.05870634689927101, + "learning_rate": 1.9011245772761173e-06, + "loss": 0.5815, + "step": 775 + }, + { + "autoregressive_loss": 0.1377, + "epoch": 2.1645746164574615, + "grad_norm": 0.058728061616420746, + "learning_rate": 1.889296932105254e-06, + "loss": 0.5475, + "step": 776 + }, + { + "autoregressive_loss": 0.1396, + "epoch": 2.1673640167364017, + "grad_norm": 0.04313179478049278, + "learning_rate": 1.8774976175827898e-06, + "loss": 0.558, + "step": 777 + }, + { + "autoregressive_loss": 0.1416, + "epoch": 2.170153417015342, + "grad_norm": 0.036421120166778564, + "learning_rate": 1.8657267411706802e-06, + "loss": 0.5676, + "step": 778 + }, + { + "autoregressive_loss": 0.1455, + "epoch": 2.172942817294282, + "grad_norm": 0.048895541578531265, + "learning_rate": 1.853984410071879e-06, + "loss": 0.5826, + "step": 779 + }, + { + "autoregressive_loss": 0.1445, + "epoch": 2.1757322175732217, + "grad_norm": 0.03952264413237572, + "learning_rate": 1.8422707312293663e-06, + "loss": 0.576, + "step": 780 + }, + { + "autoregressive_loss": 0.1367, + "epoch": 2.178521617852162, + "grad_norm": 0.045816633850336075, + "learning_rate": 1.8305858113251717e-06, + "loss": 0.5479, + "step": 781 + }, + { + "autoregressive_loss": 0.1367, + "epoch": 2.181311018131102, + "grad_norm": 0.05922139063477516, + "learning_rate": 1.8189297567794029e-06, + "loss": 0.549, + "step": 782 + }, + { + "autoregressive_loss": 0.1377, + "epoch": 2.1841004184100417, + "grad_norm": 0.04462834447622299, + "learning_rate": 1.8073026737492783e-06, + "loss": 0.5529, + "step": 783 + }, + { + "autoregressive_loss": 0.1328, + "epoch": 2.186889818688982, + "grad_norm": 0.05726335570216179, + "learning_rate": 1.7957046681281582e-06, + "loss": 0.5327, + "step": 784 + }, + { + "autoregressive_loss": 0.1328, + "epoch": 2.189679218967922, + "grad_norm": 0.05668376758694649, + "learning_rate": 1.7841358455445807e-06, + "loss": 0.5321, + "step": 785 + }, + { + "autoregressive_loss": 0.1367, + "epoch": 2.192468619246862, + "grad_norm": 0.056753162294626236, + "learning_rate": 1.7725963113612998e-06, + "loss": 0.5453, + "step": 786 + }, + { + "autoregressive_loss": 0.1348, + "epoch": 2.195258019525802, + "grad_norm": 0.051855746656656265, + "learning_rate": 1.7610861706743316e-06, + "loss": 0.5374, + "step": 787 + }, + { + "autoregressive_loss": 0.1455, + "epoch": 2.198047419804742, + "grad_norm": 0.0365370437502861, + "learning_rate": 1.7496055283119812e-06, + "loss": 0.5807, + "step": 788 + }, + { + "autoregressive_loss": 0.1245, + "epoch": 2.200836820083682, + "grad_norm": 0.04801502078771591, + "learning_rate": 1.7381544888339103e-06, + "loss": 0.4989, + "step": 789 + }, + { + "autoregressive_loss": 0.1377, + "epoch": 2.2036262203626222, + "grad_norm": 0.06376361101865768, + "learning_rate": 1.726733156530161e-06, + "loss": 0.5511, + "step": 790 + }, + { + "autoregressive_loss": 0.1387, + "epoch": 2.206415620641562, + "grad_norm": 0.0734049454331398, + "learning_rate": 1.7153416354202307e-06, + "loss": 0.5531, + "step": 791 + }, + { + "autoregressive_loss": 0.1377, + "epoch": 2.209205020920502, + "grad_norm": 0.04886513575911522, + "learning_rate": 1.7039800292520997e-06, + "loss": 0.5509, + "step": 792 + }, + { + "autoregressive_loss": 0.1279, + "epoch": 2.2119944211994422, + "grad_norm": 0.043709028512239456, + "learning_rate": 1.69264844150131e-06, + "loss": 0.5112, + "step": 793 + }, + { + "autoregressive_loss": 0.1406, + "epoch": 2.214783821478382, + "grad_norm": 0.0725773423910141, + "learning_rate": 1.6813469753700013e-06, + "loss": 0.5624, + "step": 794 + }, + { + "autoregressive_loss": 0.1406, + "epoch": 2.217573221757322, + "grad_norm": 0.05580553412437439, + "learning_rate": 1.6700757337859907e-06, + "loss": 0.5594, + "step": 795 + }, + { + "autoregressive_loss": 0.1387, + "epoch": 2.2203626220362622, + "grad_norm": 0.03892694413661957, + "learning_rate": 1.6588348194018205e-06, + "loss": 0.5551, + "step": 796 + }, + { + "autoregressive_loss": 0.1514, + "epoch": 2.2231520223152024, + "grad_norm": 0.0665748119354248, + "learning_rate": 1.6476243345938293e-06, + "loss": 0.6039, + "step": 797 + }, + { + "autoregressive_loss": 0.1387, + "epoch": 2.225941422594142, + "grad_norm": 0.03626946732401848, + "learning_rate": 1.6364443814612207e-06, + "loss": 0.5536, + "step": 798 + }, + { + "autoregressive_loss": 0.1367, + "epoch": 2.228730822873082, + "grad_norm": 0.04426373913884163, + "learning_rate": 1.6252950618251311e-06, + "loss": 0.5465, + "step": 799 + }, + { + "autoregressive_loss": 0.1475, + "epoch": 2.2315202231520224, + "grad_norm": 0.04741590470075607, + "learning_rate": 1.614176477227703e-06, + "loss": 0.5894, + "step": 800 + }, + { + "autoregressive_loss": 0.1416, + "epoch": 2.2343096234309625, + "grad_norm": 0.03842854127287865, + "learning_rate": 1.6030887289311604e-06, + "loss": 0.5681, + "step": 801 + }, + { + "autoregressive_loss": 0.1406, + "epoch": 2.237099023709902, + "grad_norm": 0.04062565043568611, + "learning_rate": 1.5920319179168859e-06, + "loss": 0.5603, + "step": 802 + }, + { + "autoregressive_loss": 0.1445, + "epoch": 2.2398884239888424, + "grad_norm": 0.037748780101537704, + "learning_rate": 1.5810061448845028e-06, + "loss": 0.5797, + "step": 803 + }, + { + "autoregressive_loss": 0.1377, + "epoch": 2.2426778242677825, + "grad_norm": 0.0525917187333107, + "learning_rate": 1.5700115102509562e-06, + "loss": 0.5509, + "step": 804 + }, + { + "autoregressive_loss": 0.125, + "epoch": 2.2454672245467227, + "grad_norm": 0.048002853989601135, + "learning_rate": 1.5590481141495988e-06, + "loss": 0.5, + "step": 805 + }, + { + "autoregressive_loss": 0.1426, + "epoch": 2.2482566248256624, + "grad_norm": 0.04887459799647331, + "learning_rate": 1.5481160564292802e-06, + "loss": 0.5709, + "step": 806 + }, + { + "autoregressive_loss": 0.1348, + "epoch": 2.2510460251046025, + "grad_norm": 0.04123985767364502, + "learning_rate": 1.5372154366534325e-06, + "loss": 0.5417, + "step": 807 + }, + { + "autoregressive_loss": 0.1406, + "epoch": 2.2538354253835426, + "grad_norm": 0.039552632719278336, + "learning_rate": 1.5263463540991769e-06, + "loss": 0.5601, + "step": 808 + }, + { + "autoregressive_loss": 0.1406, + "epoch": 2.256624825662483, + "grad_norm": 0.04675934091210365, + "learning_rate": 1.5155089077563968e-06, + "loss": 0.5632, + "step": 809 + }, + { + "autoregressive_loss": 0.1367, + "epoch": 2.2594142259414225, + "grad_norm": 0.045213863253593445, + "learning_rate": 1.5047031963268617e-06, + "loss": 0.549, + "step": 810 + }, + { + "autoregressive_loss": 0.1455, + "epoch": 2.2622036262203626, + "grad_norm": 0.06601358950138092, + "learning_rate": 1.49392931822331e-06, + "loss": 0.5809, + "step": 811 + }, + { + "autoregressive_loss": 0.1406, + "epoch": 2.264993026499303, + "grad_norm": 0.046800632029771805, + "learning_rate": 1.4831873715685597e-06, + "loss": 0.5636, + "step": 812 + }, + { + "autoregressive_loss": 0.1416, + "epoch": 2.2677824267782425, + "grad_norm": 0.041625406593084335, + "learning_rate": 1.4724774541946145e-06, + "loss": 0.5674, + "step": 813 + }, + { + "autoregressive_loss": 0.1396, + "epoch": 2.2705718270571826, + "grad_norm": 0.040445588529109955, + "learning_rate": 1.461799663641773e-06, + "loss": 0.5596, + "step": 814 + }, + { + "autoregressive_loss": 0.1387, + "epoch": 2.273361227336123, + "grad_norm": 0.04801641032099724, + "learning_rate": 1.4511540971577377e-06, + "loss": 0.5533, + "step": 815 + }, + { + "autoregressive_loss": 0.1406, + "epoch": 2.276150627615063, + "grad_norm": 0.0419541671872139, + "learning_rate": 1.440540851696733e-06, + "loss": 0.5645, + "step": 816 + }, + { + "autoregressive_loss": 0.1309, + "epoch": 2.2789400278940026, + "grad_norm": 0.045688845217227936, + "learning_rate": 1.429960023918619e-06, + "loss": 0.5259, + "step": 817 + }, + { + "autoregressive_loss": 0.1387, + "epoch": 2.2817294281729428, + "grad_norm": 0.03887026757001877, + "learning_rate": 1.4194117101880134e-06, + "loss": 0.5547, + "step": 818 + }, + { + "autoregressive_loss": 0.1348, + "epoch": 2.284518828451883, + "grad_norm": 0.040075451135635376, + "learning_rate": 1.4088960065734137e-06, + "loss": 0.5367, + "step": 819 + }, + { + "autoregressive_loss": 0.1348, + "epoch": 2.287308228730823, + "grad_norm": 0.056806497275829315, + "learning_rate": 1.3984130088463204e-06, + "loss": 0.5406, + "step": 820 + }, + { + "autoregressive_loss": 0.1465, + "epoch": 2.2900976290097628, + "grad_norm": 0.04092267155647278, + "learning_rate": 1.3879628124803662e-06, + "loss": 0.5836, + "step": 821 + }, + { + "autoregressive_loss": 0.1406, + "epoch": 2.292887029288703, + "grad_norm": 0.07309859991073608, + "learning_rate": 1.3775455126504466e-06, + "loss": 0.563, + "step": 822 + }, + { + "autoregressive_loss": 0.1396, + "epoch": 2.295676429567643, + "grad_norm": 0.07060068845748901, + "learning_rate": 1.3671612042318527e-06, + "loss": 0.5597, + "step": 823 + }, + { + "autoregressive_loss": 0.1436, + "epoch": 2.2984658298465828, + "grad_norm": 0.042138878256082535, + "learning_rate": 1.3568099817994068e-06, + "loss": 0.574, + "step": 824 + }, + { + "autoregressive_loss": 0.1387, + "epoch": 2.301255230125523, + "grad_norm": 0.032517045736312866, + "learning_rate": 1.3464919396266018e-06, + "loss": 0.5521, + "step": 825 + }, + { + "autoregressive_loss": 0.1436, + "epoch": 2.304044630404463, + "grad_norm": 0.05858975648880005, + "learning_rate": 1.3362071716847424e-06, + "loss": 0.5739, + "step": 826 + }, + { + "autoregressive_loss": 0.1445, + "epoch": 2.306834030683403, + "grad_norm": 0.05994977802038193, + "learning_rate": 1.3259557716420868e-06, + "loss": 0.5759, + "step": 827 + }, + { + "autoregressive_loss": 0.1396, + "epoch": 2.309623430962343, + "grad_norm": 0.05566610395908356, + "learning_rate": 1.3157378328630027e-06, + "loss": 0.5569, + "step": 828 + }, + { + "autoregressive_loss": 0.1426, + "epoch": 2.312412831241283, + "grad_norm": 0.04783601313829422, + "learning_rate": 1.3055534484070997e-06, + "loss": 0.5681, + "step": 829 + }, + { + "autoregressive_loss": 0.1416, + "epoch": 2.315202231520223, + "grad_norm": 0.06399188190698624, + "learning_rate": 1.2954027110284035e-06, + "loss": 0.5648, + "step": 830 + }, + { + "autoregressive_loss": 0.1367, + "epoch": 2.3179916317991633, + "grad_norm": 0.06466829031705856, + "learning_rate": 1.285285713174489e-06, + "loss": 0.5477, + "step": 831 + }, + { + "autoregressive_loss": 0.1387, + "epoch": 2.320781032078103, + "grad_norm": 0.06253597885370255, + "learning_rate": 1.2752025469856598e-06, + "loss": 0.5565, + "step": 832 + }, + { + "autoregressive_loss": 0.1426, + "epoch": 2.323570432357043, + "grad_norm": 0.06909258663654327, + "learning_rate": 1.2651533042940883e-06, + "loss": 0.5701, + "step": 833 + }, + { + "autoregressive_loss": 0.1367, + "epoch": 2.3263598326359833, + "grad_norm": 0.0547388531267643, + "learning_rate": 1.2551380766230003e-06, + "loss": 0.5482, + "step": 834 + }, + { + "autoregressive_loss": 0.1406, + "epoch": 2.3291492329149235, + "grad_norm": 0.05019734054803848, + "learning_rate": 1.2451569551858183e-06, + "loss": 0.5624, + "step": 835 + }, + { + "autoregressive_loss": 0.1426, + "epoch": 2.331938633193863, + "grad_norm": 0.054106589406728745, + "learning_rate": 1.2352100308853548e-06, + "loss": 0.5701, + "step": 836 + }, + { + "autoregressive_loss": 0.1426, + "epoch": 2.3347280334728033, + "grad_norm": 0.054025404155254364, + "learning_rate": 1.225297394312966e-06, + "loss": 0.5707, + "step": 837 + }, + { + "autoregressive_loss": 0.1367, + "epoch": 2.3375174337517435, + "grad_norm": 0.04458696022629738, + "learning_rate": 1.2154191357477352e-06, + "loss": 0.5458, + "step": 838 + }, + { + "autoregressive_loss": 0.1416, + "epoch": 2.3403068340306836, + "grad_norm": 0.04139189049601555, + "learning_rate": 1.205575345155649e-06, + "loss": 0.5651, + "step": 839 + }, + { + "autoregressive_loss": 0.1396, + "epoch": 2.3430962343096233, + "grad_norm": 0.05216503143310547, + "learning_rate": 1.1957661121887782e-06, + "loss": 0.5582, + "step": 840 + }, + { + "autoregressive_loss": 0.1338, + "epoch": 2.3458856345885635, + "grad_norm": 0.04340316355228424, + "learning_rate": 1.1859915261844596e-06, + "loss": 0.5353, + "step": 841 + }, + { + "autoregressive_loss": 0.1377, + "epoch": 2.3486750348675036, + "grad_norm": 0.046154510229825974, + "learning_rate": 1.1762516761644831e-06, + "loss": 0.5508, + "step": 842 + }, + { + "autoregressive_loss": 0.1387, + "epoch": 2.3514644351464433, + "grad_norm": 0.04004541039466858, + "learning_rate": 1.1665466508342876e-06, + "loss": 0.5535, + "step": 843 + }, + { + "autoregressive_loss": 0.1279, + "epoch": 2.3542538354253835, + "grad_norm": 0.04298239201307297, + "learning_rate": 1.1568765385821373e-06, + "loss": 0.512, + "step": 844 + }, + { + "autoregressive_loss": 0.1475, + "epoch": 2.3570432357043236, + "grad_norm": 0.04745339974761009, + "learning_rate": 1.147241427478336e-06, + "loss": 0.5889, + "step": 845 + }, + { + "autoregressive_loss": 0.1357, + "epoch": 2.3598326359832638, + "grad_norm": 0.04377756267786026, + "learning_rate": 1.1376414052744055e-06, + "loss": 0.5424, + "step": 846 + }, + { + "autoregressive_loss": 0.1445, + "epoch": 2.3626220362622035, + "grad_norm": 0.039170995354652405, + "learning_rate": 1.128076559402308e-06, + "loss": 0.576, + "step": 847 + }, + { + "autoregressive_loss": 0.1338, + "epoch": 2.3654114365411436, + "grad_norm": 0.04267503321170807, + "learning_rate": 1.1185469769736262e-06, + "loss": 0.5345, + "step": 848 + }, + { + "autoregressive_loss": 0.1406, + "epoch": 2.3682008368200838, + "grad_norm": 0.04406587406992912, + "learning_rate": 1.1090527447787924e-06, + "loss": 0.5632, + "step": 849 + }, + { + "autoregressive_loss": 0.1367, + "epoch": 2.370990237099024, + "grad_norm": 0.041781019419431686, + "learning_rate": 1.0995939492862783e-06, + "loss": 0.5488, + "step": 850 + }, + { + "autoregressive_loss": 0.1387, + "epoch": 2.3737796373779636, + "grad_norm": 0.03950845077633858, + "learning_rate": 1.0901706766418247e-06, + "loss": 0.5535, + "step": 851 + }, + { + "autoregressive_loss": 0.1318, + "epoch": 2.3765690376569037, + "grad_norm": 0.040908582508563995, + "learning_rate": 1.0807830126676444e-06, + "loss": 0.5296, + "step": 852 + }, + { + "autoregressive_loss": 0.1187, + "epoch": 2.379358437935844, + "grad_norm": 0.03591518849134445, + "learning_rate": 1.0714310428616464e-06, + "loss": 0.4747, + "step": 853 + }, + { + "autoregressive_loss": 0.1387, + "epoch": 2.3821478382147836, + "grad_norm": 0.03663348779082298, + "learning_rate": 1.0621148523966552e-06, + "loss": 0.5542, + "step": 854 + }, + { + "autoregressive_loss": 0.1426, + "epoch": 2.3849372384937237, + "grad_norm": 0.0410367026925087, + "learning_rate": 1.052834526119637e-06, + "loss": 0.5707, + "step": 855 + }, + { + "autoregressive_loss": 0.1387, + "epoch": 2.387726638772664, + "grad_norm": 0.057585518807172775, + "learning_rate": 1.0435901485509254e-06, + "loss": 0.5559, + "step": 856 + }, + { + "autoregressive_loss": 0.1406, + "epoch": 2.390516039051604, + "grad_norm": 0.058736659586429596, + "learning_rate": 1.0343818038834513e-06, + "loss": 0.5619, + "step": 857 + }, + { + "autoregressive_loss": 0.1387, + "epoch": 2.393305439330544, + "grad_norm": 0.05812650918960571, + "learning_rate": 1.0252095759819785e-06, + "loss": 0.5562, + "step": 858 + }, + { + "autoregressive_loss": 0.1309, + "epoch": 2.396094839609484, + "grad_norm": 0.04882519319653511, + "learning_rate": 1.016073548382337e-06, + "loss": 0.5234, + "step": 859 + }, + { + "autoregressive_loss": 0.1455, + "epoch": 2.398884239888424, + "grad_norm": 0.07367956638336182, + "learning_rate": 1.0069738042906635e-06, + "loss": 0.5828, + "step": 860 + }, + { + "autoregressive_loss": 0.1387, + "epoch": 2.401673640167364, + "grad_norm": 0.0563926100730896, + "learning_rate": 9.979104265826438e-07, + "loss": 0.553, + "step": 861 + }, + { + "autoregressive_loss": 0.1299, + "epoch": 2.404463040446304, + "grad_norm": 0.0694463849067688, + "learning_rate": 9.888834978027589e-07, + "loss": 0.5186, + "step": 862 + }, + { + "autoregressive_loss": 0.1348, + "epoch": 2.407252440725244, + "grad_norm": 0.05170215666294098, + "learning_rate": 9.798931001635298e-07, + "loss": 0.5385, + "step": 863 + }, + { + "autoregressive_loss": 0.1348, + "epoch": 2.410041841004184, + "grad_norm": 0.06111734360456467, + "learning_rate": 9.709393155447734e-07, + "loss": 0.5406, + "step": 864 + }, + { + "autoregressive_loss": 0.1367, + "epoch": 2.4128312412831243, + "grad_norm": 0.046918660402297974, + "learning_rate": 9.62022225492853e-07, + "loss": 0.5469, + "step": 865 + }, + { + "autoregressive_loss": 0.1309, + "epoch": 2.415620641562064, + "grad_norm": 0.05116235464811325, + "learning_rate": 9.531419112199375e-07, + "loss": 0.5266, + "step": 866 + }, + { + "autoregressive_loss": 0.1367, + "epoch": 2.418410041841004, + "grad_norm": 0.040750402957201004, + "learning_rate": 9.442984536032612e-07, + "loss": 0.5453, + "step": 867 + }, + { + "autoregressive_loss": 0.1309, + "epoch": 2.4211994421199443, + "grad_norm": 0.056528490036726, + "learning_rate": 9.354919331843865e-07, + "loss": 0.5216, + "step": 868 + }, + { + "autoregressive_loss": 0.1328, + "epoch": 2.4239888423988845, + "grad_norm": 0.04475468397140503, + "learning_rate": 9.267224301684763e-07, + "loss": 0.5322, + "step": 869 + }, + { + "autoregressive_loss": 0.1494, + "epoch": 2.426778242677824, + "grad_norm": 0.04414912685751915, + "learning_rate": 9.17990024423549e-07, + "loss": 0.5957, + "step": 870 + }, + { + "autoregressive_loss": 0.1377, + "epoch": 2.4295676429567643, + "grad_norm": 0.05583546683192253, + "learning_rate": 9.09294795479771e-07, + "loss": 0.5521, + "step": 871 + }, + { + "autoregressive_loss": 0.1406, + "epoch": 2.4323570432357045, + "grad_norm": 0.06340214610099792, + "learning_rate": 9.006368225287116e-07, + "loss": 0.564, + "step": 872 + }, + { + "autoregressive_loss": 0.1348, + "epoch": 2.435146443514644, + "grad_norm": 0.04264232516288757, + "learning_rate": 8.920161844226416e-07, + "loss": 0.5409, + "step": 873 + }, + { + "autoregressive_loss": 0.1318, + "epoch": 2.4379358437935843, + "grad_norm": 0.07757823169231415, + "learning_rate": 8.834329596737995e-07, + "loss": 0.5275, + "step": 874 + }, + { + "autoregressive_loss": 0.1416, + "epoch": 2.4407252440725244, + "grad_norm": 0.05813584849238396, + "learning_rate": 8.748872264536856e-07, + "loss": 0.5659, + "step": 875 + }, + { + "autoregressive_loss": 0.1299, + "epoch": 2.4435146443514646, + "grad_norm": 0.03897373378276825, + "learning_rate": 8.663790625923451e-07, + "loss": 0.5219, + "step": 876 + }, + { + "autoregressive_loss": 0.1455, + "epoch": 2.4463040446304043, + "grad_norm": 0.048923391848802567, + "learning_rate": 8.57908545577662e-07, + "loss": 0.5826, + "step": 877 + }, + { + "autoregressive_loss": 0.1309, + "epoch": 2.4490934449093444, + "grad_norm": 0.0375228188931942, + "learning_rate": 8.494757525546538e-07, + "loss": 0.525, + "step": 878 + }, + { + "autoregressive_loss": 0.1377, + "epoch": 2.4518828451882846, + "grad_norm": 0.04614968225359917, + "learning_rate": 8.410807603247656e-07, + "loss": 0.5504, + "step": 879 + }, + { + "autoregressive_loss": 0.1367, + "epoch": 2.4546722454672247, + "grad_norm": 0.046223465353250504, + "learning_rate": 8.327236453451743e-07, + "loss": 0.5481, + "step": 880 + }, + { + "autoregressive_loss": 0.1377, + "epoch": 2.4574616457461644, + "grad_norm": 0.06220763921737671, + "learning_rate": 8.244044837280901e-07, + "loss": 0.5521, + "step": 881 + }, + { + "autoregressive_loss": 0.1338, + "epoch": 2.4602510460251046, + "grad_norm": 0.03261826932430267, + "learning_rate": 8.161233512400641e-07, + "loss": 0.5338, + "step": 882 + }, + { + "autoregressive_loss": 0.1328, + "epoch": 2.4630404463040447, + "grad_norm": 0.06636366248130798, + "learning_rate": 8.078803233012966e-07, + "loss": 0.5317, + "step": 883 + }, + { + "autoregressive_loss": 0.1367, + "epoch": 2.4658298465829844, + "grad_norm": 0.061175450682640076, + "learning_rate": 7.996754749849567e-07, + "loss": 0.5479, + "step": 884 + }, + { + "autoregressive_loss": 0.1367, + "epoch": 2.4686192468619246, + "grad_norm": 0.05717315897345543, + "learning_rate": 7.915088810164856e-07, + "loss": 0.5475, + "step": 885 + }, + { + "autoregressive_loss": 0.1445, + "epoch": 2.4714086471408647, + "grad_norm": 0.07552015036344528, + "learning_rate": 7.833806157729329e-07, + "loss": 0.578, + "step": 886 + }, + { + "autoregressive_loss": 0.1396, + "epoch": 2.474198047419805, + "grad_norm": 0.03606550395488739, + "learning_rate": 7.752907532822613e-07, + "loss": 0.558, + "step": 887 + }, + { + "autoregressive_loss": 0.1426, + "epoch": 2.476987447698745, + "grad_norm": 0.04468367621302605, + "learning_rate": 7.672393672226902e-07, + "loss": 0.5712, + "step": 888 + }, + { + "autoregressive_loss": 0.126, + "epoch": 2.4797768479776847, + "grad_norm": 0.11802934110164642, + "learning_rate": 7.592265309220071e-07, + "loss": 0.5049, + "step": 889 + }, + { + "autoregressive_loss": 0.1406, + "epoch": 2.482566248256625, + "grad_norm": 0.08075188100337982, + "learning_rate": 7.512523173569175e-07, + "loss": 0.5631, + "step": 890 + }, + { + "autoregressive_loss": 0.1445, + "epoch": 2.485355648535565, + "grad_norm": 0.04730691388249397, + "learning_rate": 7.433167991523632e-07, + "loss": 0.5748, + "step": 891 + }, + { + "autoregressive_loss": 0.1406, + "epoch": 2.4881450488145047, + "grad_norm": 0.03654900938272476, + "learning_rate": 7.354200485808749e-07, + "loss": 0.5604, + "step": 892 + }, + { + "autoregressive_loss": 0.1426, + "epoch": 2.490934449093445, + "grad_norm": 0.050974465906620026, + "learning_rate": 7.275621375619058e-07, + "loss": 0.5706, + "step": 893 + }, + { + "autoregressive_loss": 0.1357, + "epoch": 2.493723849372385, + "grad_norm": 0.04459230229258537, + "learning_rate": 7.197431376611785e-07, + "loss": 0.5436, + "step": 894 + }, + { + "autoregressive_loss": 0.1328, + "epoch": 2.496513249651325, + "grad_norm": 0.036724887788295746, + "learning_rate": 7.11963120090034e-07, + "loss": 0.5287, + "step": 895 + }, + { + "autoregressive_loss": 0.1426, + "epoch": 2.499302649930265, + "grad_norm": 0.06120232492685318, + "learning_rate": 7.042221557047823e-07, + "loss": 0.5729, + "step": 896 + }, + { + "autoregressive_loss": 0.1348, + "epoch": 2.502092050209205, + "grad_norm": 0.03728732466697693, + "learning_rate": 6.96520315006059e-07, + "loss": 0.5404, + "step": 897 + }, + { + "autoregressive_loss": 0.1367, + "epoch": 2.504881450488145, + "grad_norm": 0.10492336004972458, + "learning_rate": 6.888576681381798e-07, + "loss": 0.5493, + "step": 898 + }, + { + "autoregressive_loss": 0.1377, + "epoch": 2.5076708507670853, + "grad_norm": 0.05667152628302574, + "learning_rate": 6.81234284888505e-07, + "loss": 0.5505, + "step": 899 + }, + { + "autoregressive_loss": 0.1387, + "epoch": 2.510460251046025, + "grad_norm": 0.05117593705654144, + "learning_rate": 6.736502346868018e-07, + "loss": 0.5562, + "step": 900 + }, + { + "autoregressive_loss": 0.1318, + "epoch": 2.513249651324965, + "grad_norm": 0.04988648742437363, + "learning_rate": 6.661055866046134e-07, + "loss": 0.5288, + "step": 901 + }, + { + "autoregressive_loss": 0.1348, + "epoch": 2.5160390516039053, + "grad_norm": 0.06195830926299095, + "learning_rate": 6.586004093546277e-07, + "loss": 0.5406, + "step": 902 + }, + { + "autoregressive_loss": 0.1396, + "epoch": 2.518828451882845, + "grad_norm": 0.046834927052259445, + "learning_rate": 6.511347712900545e-07, + "loss": 0.5574, + "step": 903 + }, + { + "autoregressive_loss": 0.1387, + "epoch": 2.521617852161785, + "grad_norm": 0.049452099949121475, + "learning_rate": 6.437087404040016e-07, + "loss": 0.5548, + "step": 904 + }, + { + "autoregressive_loss": 0.1348, + "epoch": 2.5244072524407253, + "grad_norm": 0.03884280472993851, + "learning_rate": 6.363223843288535e-07, + "loss": 0.5389, + "step": 905 + }, + { + "autoregressive_loss": 0.1328, + "epoch": 2.5271966527196654, + "grad_norm": 0.058246903121471405, + "learning_rate": 6.289757703356597e-07, + "loss": 0.5326, + "step": 906 + }, + { + "autoregressive_loss": 0.1396, + "epoch": 2.5299860529986056, + "grad_norm": 0.033294159919023514, + "learning_rate": 6.216689653335184e-07, + "loss": 0.5585, + "step": 907 + }, + { + "autoregressive_loss": 0.1387, + "epoch": 2.5327754532775453, + "grad_norm": 0.051013752818107605, + "learning_rate": 6.144020358689679e-07, + "loss": 0.5536, + "step": 908 + }, + { + "autoregressive_loss": 0.1416, + "epoch": 2.5355648535564854, + "grad_norm": 0.04889146238565445, + "learning_rate": 6.071750481253835e-07, + "loss": 0.567, + "step": 909 + }, + { + "autoregressive_loss": 0.1357, + "epoch": 2.5383542538354256, + "grad_norm": 0.05315745994448662, + "learning_rate": 5.999880679223702e-07, + "loss": 0.543, + "step": 910 + }, + { + "autoregressive_loss": 0.1387, + "epoch": 2.5411436541143653, + "grad_norm": 0.04282662644982338, + "learning_rate": 5.928411607151651e-07, + "loss": 0.5569, + "step": 911 + }, + { + "autoregressive_loss": 0.1367, + "epoch": 2.5439330543933054, + "grad_norm": 0.04072648286819458, + "learning_rate": 5.857343915940434e-07, + "loss": 0.546, + "step": 912 + }, + { + "autoregressive_loss": 0.1318, + "epoch": 2.5467224546722456, + "grad_norm": 0.047470949590206146, + "learning_rate": 5.786678252837213e-07, + "loss": 0.527, + "step": 913 + }, + { + "autoregressive_loss": 0.1328, + "epoch": 2.5495118549511853, + "grad_norm": 0.051288094371557236, + "learning_rate": 5.71641526142771e-07, + "loss": 0.5328, + "step": 914 + }, + { + "autoregressive_loss": 0.1367, + "epoch": 2.5523012552301254, + "grad_norm": 0.04957340657711029, + "learning_rate": 5.646555581630319e-07, + "loss": 0.5459, + "step": 915 + }, + { + "autoregressive_loss": 0.1348, + "epoch": 2.5550906555090656, + "grad_norm": 0.03466577082872391, + "learning_rate": 5.577099849690276e-07, + "loss": 0.5402, + "step": 916 + }, + { + "autoregressive_loss": 0.1367, + "epoch": 2.5578800557880057, + "grad_norm": 0.04276926442980766, + "learning_rate": 5.508048698173879e-07, + "loss": 0.549, + "step": 917 + }, + { + "autoregressive_loss": 0.1338, + "epoch": 2.560669456066946, + "grad_norm": 0.06325326859951019, + "learning_rate": 5.439402755962719e-07, + "loss": 0.5333, + "step": 918 + }, + { + "autoregressive_loss": 0.1367, + "epoch": 2.5634588563458856, + "grad_norm": 0.04299675300717354, + "learning_rate": 5.371162648247957e-07, + "loss": 0.5493, + "step": 919 + }, + { + "autoregressive_loss": 0.1406, + "epoch": 2.5662482566248257, + "grad_norm": 0.03942457586526871, + "learning_rate": 5.303328996524626e-07, + "loss": 0.5614, + "step": 920 + }, + { + "autoregressive_loss": 0.1328, + "epoch": 2.569037656903766, + "grad_norm": 0.04302982985973358, + "learning_rate": 5.235902418585958e-07, + "loss": 0.5326, + "step": 921 + }, + { + "autoregressive_loss": 0.1377, + "epoch": 2.5718270571827055, + "grad_norm": 0.04436048865318298, + "learning_rate": 5.168883528517793e-07, + "loss": 0.5492, + "step": 922 + }, + { + "autoregressive_loss": 0.1348, + "epoch": 2.5746164574616457, + "grad_norm": 0.034793443977832794, + "learning_rate": 5.102272936692948e-07, + "loss": 0.5365, + "step": 923 + }, + { + "autoregressive_loss": 0.1426, + "epoch": 2.577405857740586, + "grad_norm": 0.08216363936662674, + "learning_rate": 5.036071249765673e-07, + "loss": 0.5704, + "step": 924 + }, + { + "autoregressive_loss": 0.1416, + "epoch": 2.5801952580195255, + "grad_norm": 0.041240446269512177, + "learning_rate": 4.970279070666162e-07, + "loss": 0.5657, + "step": 925 + }, + { + "autoregressive_loss": 0.1328, + "epoch": 2.5829846582984657, + "grad_norm": 0.04501833766698837, + "learning_rate": 4.904896998594955e-07, + "loss": 0.5297, + "step": 926 + }, + { + "autoregressive_loss": 0.1328, + "epoch": 2.585774058577406, + "grad_norm": 0.03940986096858978, + "learning_rate": 4.839925629017638e-07, + "loss": 0.5306, + "step": 927 + }, + { + "autoregressive_loss": 0.1348, + "epoch": 2.588563458856346, + "grad_norm": 0.06501064449548721, + "learning_rate": 4.775365553659256e-07, + "loss": 0.5389, + "step": 928 + }, + { + "autoregressive_loss": 0.1426, + "epoch": 2.591352859135286, + "grad_norm": 0.04714985936880112, + "learning_rate": 4.711217360499082e-07, + "loss": 0.5717, + "step": 929 + }, + { + "autoregressive_loss": 0.1406, + "epoch": 2.594142259414226, + "grad_norm": 0.03243463113903999, + "learning_rate": 4.6474816337650883e-07, + "loss": 0.564, + "step": 930 + }, + { + "autoregressive_loss": 0.1426, + "epoch": 2.596931659693166, + "grad_norm": 0.048983991146087646, + "learning_rate": 4.5841589539288187e-07, + "loss": 0.5706, + "step": 931 + }, + { + "autoregressive_loss": 0.1348, + "epoch": 2.599721059972106, + "grad_norm": 0.04724277928471565, + "learning_rate": 4.5212498976999196e-07, + "loss": 0.5396, + "step": 932 + }, + { + "autoregressive_loss": 0.1367, + "epoch": 2.602510460251046, + "grad_norm": 0.0572696290910244, + "learning_rate": 4.458755038021029e-07, + "loss": 0.5468, + "step": 933 + }, + { + "autoregressive_loss": 0.1416, + "epoch": 2.605299860529986, + "grad_norm": 0.03813404217362404, + "learning_rate": 4.3966749440624736e-07, + "loss": 0.5645, + "step": 934 + }, + { + "autoregressive_loss": 0.1357, + "epoch": 2.608089260808926, + "grad_norm": 0.0384247750043869, + "learning_rate": 4.3350101812171143e-07, + "loss": 0.5437, + "step": 935 + }, + { + "autoregressive_loss": 0.1377, + "epoch": 2.6108786610878663, + "grad_norm": 0.04985614866018295, + "learning_rate": 4.2737613110951924e-07, + "loss": 0.552, + "step": 936 + }, + { + "autoregressive_loss": 0.1426, + "epoch": 2.6136680613668064, + "grad_norm": 0.04539071395993233, + "learning_rate": 4.2129288915192355e-07, + "loss": 0.569, + "step": 937 + }, + { + "autoregressive_loss": 0.1338, + "epoch": 2.616457461645746, + "grad_norm": 0.037422724068164825, + "learning_rate": 4.152513476518927e-07, + "loss": 0.5337, + "step": 938 + }, + { + "autoregressive_loss": 0.1387, + "epoch": 2.6192468619246863, + "grad_norm": 0.0522053986787796, + "learning_rate": 4.092515616326126e-07, + "loss": 0.5543, + "step": 939 + }, + { + "autoregressive_loss": 0.1338, + "epoch": 2.6220362622036264, + "grad_norm": 0.07626638561487198, + "learning_rate": 4.0329358573697906e-07, + "loss": 0.5349, + "step": 940 + }, + { + "autoregressive_loss": 0.1406, + "epoch": 2.624825662482566, + "grad_norm": 0.05171051621437073, + "learning_rate": 3.973774742271047e-07, + "loss": 0.562, + "step": 941 + }, + { + "autoregressive_loss": 0.1406, + "epoch": 2.6276150627615062, + "grad_norm": 0.043028004467487335, + "learning_rate": 3.9150328098382593e-07, + "loss": 0.5625, + "step": 942 + }, + { + "autoregressive_loss": 0.1396, + "epoch": 2.6304044630404464, + "grad_norm": 0.04002896323800087, + "learning_rate": 3.8567105950620353e-07, + "loss": 0.5594, + "step": 943 + }, + { + "autoregressive_loss": 0.1387, + "epoch": 2.633193863319386, + "grad_norm": 0.04796924442052841, + "learning_rate": 3.798808629110479e-07, + "loss": 0.5532, + "step": 944 + }, + { + "autoregressive_loss": 0.1377, + "epoch": 2.6359832635983262, + "grad_norm": 0.041307199746370316, + "learning_rate": 3.7413274393242327e-07, + "loss": 0.5486, + "step": 945 + }, + { + "autoregressive_loss": 0.1387, + "epoch": 2.6387726638772664, + "grad_norm": 0.047942422330379486, + "learning_rate": 3.68426754921179e-07, + "loss": 0.5547, + "step": 946 + }, + { + "autoregressive_loss": 0.1357, + "epoch": 2.6415620641562065, + "grad_norm": 0.044221945106983185, + "learning_rate": 3.6276294784446e-07, + "loss": 0.5421, + "step": 947 + }, + { + "autoregressive_loss": 0.1338, + "epoch": 2.6443514644351467, + "grad_norm": 0.04576009511947632, + "learning_rate": 3.5714137428524754e-07, + "loss": 0.5345, + "step": 948 + }, + { + "autoregressive_loss": 0.1396, + "epoch": 2.6471408647140864, + "grad_norm": 0.04960264638066292, + "learning_rate": 3.5156208544187554e-07, + "loss": 0.557, + "step": 949 + }, + { + "autoregressive_loss": 0.1406, + "epoch": 2.6499302649930265, + "grad_norm": 0.0384841188788414, + "learning_rate": 3.460251321275759e-07, + "loss": 0.5626, + "step": 950 + }, + { + "autoregressive_loss": 0.1406, + "epoch": 2.6527196652719667, + "grad_norm": 0.05527389422059059, + "learning_rate": 3.4053056477000856e-07, + "loss": 0.5607, + "step": 951 + }, + { + "autoregressive_loss": 0.1348, + "epoch": 2.6555090655509064, + "grad_norm": 0.06486639380455017, + "learning_rate": 3.350784334108048e-07, + "loss": 0.5411, + "step": 952 + }, + { + "autoregressive_loss": 0.1377, + "epoch": 2.6582984658298465, + "grad_norm": 0.047621067613363266, + "learning_rate": 3.2966878770511025e-07, + "loss": 0.5514, + "step": 953 + }, + { + "autoregressive_loss": 0.1387, + "epoch": 2.6610878661087867, + "grad_norm": 0.0559660978615284, + "learning_rate": 3.24301676921136e-07, + "loss": 0.5535, + "step": 954 + }, + { + "autoregressive_loss": 0.1445, + "epoch": 2.6638772663877264, + "grad_norm": 0.06857193261384964, + "learning_rate": 3.189771499397043e-07, + "loss": 0.579, + "step": 955 + }, + { + "autoregressive_loss": 0.1436, + "epoch": 2.6666666666666665, + "grad_norm": 0.04918047785758972, + "learning_rate": 3.136952552538092e-07, + "loss": 0.5734, + "step": 956 + }, + { + "autoregressive_loss": 0.1338, + "epoch": 2.6694560669456067, + "grad_norm": 0.054500024765729904, + "learning_rate": 3.084560409681703e-07, + "loss": 0.5337, + "step": 957 + }, + { + "autoregressive_loss": 0.1426, + "epoch": 2.672245467224547, + "grad_norm": 0.04290428012609482, + "learning_rate": 3.0325955479879765e-07, + "loss": 0.5713, + "step": 958 + }, + { + "autoregressive_loss": 0.1387, + "epoch": 2.675034867503487, + "grad_norm": 0.05252581089735031, + "learning_rate": 2.981058440725559e-07, + "loss": 0.5536, + "step": 959 + }, + { + "autoregressive_loss": 0.1416, + "epoch": 2.6778242677824267, + "grad_norm": 0.05756058916449547, + "learning_rate": 2.929949557267331e-07, + "loss": 0.5674, + "step": 960 + }, + { + "autoregressive_loss": 0.1416, + "epoch": 2.680613668061367, + "grad_norm": 0.04172331839799881, + "learning_rate": 2.8792693630861345e-07, + "loss": 0.568, + "step": 961 + }, + { + "autoregressive_loss": 0.1367, + "epoch": 2.683403068340307, + "grad_norm": 0.0539584755897522, + "learning_rate": 2.829018319750543e-07, + "loss": 0.5499, + "step": 962 + }, + { + "autoregressive_loss": 0.1445, + "epoch": 2.6861924686192467, + "grad_norm": 0.05727997049689293, + "learning_rate": 2.779196884920643e-07, + "loss": 0.5782, + "step": 963 + }, + { + "autoregressive_loss": 0.1426, + "epoch": 2.688981868898187, + "grad_norm": 0.05714195966720581, + "learning_rate": 2.729805512343875e-07, + "loss": 0.5684, + "step": 964 + }, + { + "autoregressive_loss": 0.1387, + "epoch": 2.691771269177127, + "grad_norm": 0.05160605534911156, + "learning_rate": 2.6808446518508835e-07, + "loss": 0.5553, + "step": 965 + }, + { + "autoregressive_loss": 0.1416, + "epoch": 2.694560669456067, + "grad_norm": 0.051672399044036865, + "learning_rate": 2.632314749351483e-07, + "loss": 0.5645, + "step": 966 + }, + { + "autoregressive_loss": 0.1387, + "epoch": 2.6973500697350072, + "grad_norm": 0.04370841011404991, + "learning_rate": 2.5842162468304845e-07, + "loss": 0.5563, + "step": 967 + }, + { + "autoregressive_loss": 0.1377, + "epoch": 2.700139470013947, + "grad_norm": 0.04391193762421608, + "learning_rate": 2.5365495823437834e-07, + "loss": 0.551, + "step": 968 + }, + { + "autoregressive_loss": 0.1348, + "epoch": 2.702928870292887, + "grad_norm": 0.05152919143438339, + "learning_rate": 2.489315190014291e-07, + "loss": 0.5367, + "step": 969 + }, + { + "autoregressive_loss": 0.1348, + "epoch": 2.7057182705718272, + "grad_norm": 0.05620885267853737, + "learning_rate": 2.4425135000280374e-07, + "loss": 0.5375, + "step": 970 + }, + { + "autoregressive_loss": 0.1387, + "epoch": 2.708507670850767, + "grad_norm": 0.05687708780169487, + "learning_rate": 2.3961449386302017e-07, + "loss": 0.5552, + "step": 971 + }, + { + "autoregressive_loss": 0.1367, + "epoch": 2.711297071129707, + "grad_norm": 0.03745347633957863, + "learning_rate": 2.3502099281212775e-07, + "loss": 0.5444, + "step": 972 + }, + { + "autoregressive_loss": 0.1357, + "epoch": 2.7140864714086472, + "grad_norm": 0.0565638393163681, + "learning_rate": 2.3047088868531796e-07, + "loss": 0.54, + "step": 973 + }, + { + "autoregressive_loss": 0.1348, + "epoch": 2.716875871687587, + "grad_norm": 0.05238328129053116, + "learning_rate": 2.2596422292254893e-07, + "loss": 0.5382, + "step": 974 + }, + { + "autoregressive_loss": 0.1387, + "epoch": 2.719665271966527, + "grad_norm": 0.04350074753165245, + "learning_rate": 2.2150103656816357e-07, + "loss": 0.5547, + "step": 975 + }, + { + "autoregressive_loss": 0.1387, + "epoch": 2.7224546722454672, + "grad_norm": 0.04363437741994858, + "learning_rate": 2.1708137027051601e-07, + "loss": 0.5521, + "step": 976 + }, + { + "autoregressive_loss": 0.1348, + "epoch": 2.7252440725244074, + "grad_norm": 0.04332887753844261, + "learning_rate": 2.1270526428160466e-07, + "loss": 0.5398, + "step": 977 + }, + { + "autoregressive_loss": 0.1406, + "epoch": 2.7280334728033475, + "grad_norm": 0.043262653052806854, + "learning_rate": 2.0837275845670135e-07, + "loss": 0.5652, + "step": 978 + }, + { + "autoregressive_loss": 0.1367, + "epoch": 2.730822873082287, + "grad_norm": 0.07472798228263855, + "learning_rate": 2.0408389225399339e-07, + "loss": 0.5482, + "step": 979 + }, + { + "autoregressive_loss": 0.1426, + "epoch": 2.7336122733612274, + "grad_norm": 0.06956392526626587, + "learning_rate": 1.9983870473421761e-07, + "loss": 0.5687, + "step": 980 + }, + { + "autoregressive_loss": 0.1328, + "epoch": 2.7364016736401675, + "grad_norm": 0.03768423944711685, + "learning_rate": 1.9563723456031303e-07, + "loss": 0.5289, + "step": 981 + }, + { + "autoregressive_loss": 0.1445, + "epoch": 2.739191073919107, + "grad_norm": 0.04597004875540733, + "learning_rate": 1.9147951999705928e-07, + "loss": 0.5781, + "step": 982 + }, + { + "autoregressive_loss": 0.1309, + "epoch": 2.7419804741980474, + "grad_norm": 0.037546850740909576, + "learning_rate": 1.8736559891073703e-07, + "loss": 0.5256, + "step": 983 + }, + { + "autoregressive_loss": 0.1328, + "epoch": 2.7447698744769875, + "grad_norm": 0.03999371826648712, + "learning_rate": 1.8329550876877488e-07, + "loss": 0.533, + "step": 984 + }, + { + "autoregressive_loss": 0.1416, + "epoch": 2.747559274755927, + "grad_norm": 0.04794856160879135, + "learning_rate": 1.7926928663941635e-07, + "loss": 0.5691, + "step": 985 + }, + { + "autoregressive_loss": 0.1377, + "epoch": 2.7503486750348674, + "grad_norm": 0.09193428605794907, + "learning_rate": 1.7528696919137444e-07, + "loss": 0.5507, + "step": 986 + }, + { + "autoregressive_loss": 0.1387, + "epoch": 2.7531380753138075, + "grad_norm": 0.047861214727163315, + "learning_rate": 1.7134859269350546e-07, + "loss": 0.5562, + "step": 987 + }, + { + "autoregressive_loss": 0.1309, + "epoch": 2.7559274755927476, + "grad_norm": 0.04118425026535988, + "learning_rate": 1.6745419301446962e-07, + "loss": 0.5239, + "step": 988 + }, + { + "autoregressive_loss": 0.1387, + "epoch": 2.758716875871688, + "grad_norm": 0.040041618049144745, + "learning_rate": 1.6360380562241428e-07, + "loss": 0.5525, + "step": 989 + }, + { + "autoregressive_loss": 0.1426, + "epoch": 2.7615062761506275, + "grad_norm": 0.04993029683828354, + "learning_rate": 1.5979746558464237e-07, + "loss": 0.572, + "step": 990 + }, + { + "autoregressive_loss": 0.1416, + "epoch": 2.7642956764295676, + "grad_norm": 0.04825897142291069, + "learning_rate": 1.5603520756729885e-07, + "loss": 0.5656, + "step": 991 + }, + { + "autoregressive_loss": 0.127, + "epoch": 2.767085076708508, + "grad_norm": 0.04772040992975235, + "learning_rate": 1.5231706583505256e-07, + "loss": 0.5065, + "step": 992 + }, + { + "autoregressive_loss": 0.1377, + "epoch": 2.7698744769874475, + "grad_norm": 0.06290243566036224, + "learning_rate": 1.486430742507833e-07, + "loss": 0.5505, + "step": 993 + }, + { + "autoregressive_loss": 0.1387, + "epoch": 2.7726638772663876, + "grad_norm": 0.04308522492647171, + "learning_rate": 1.4501326627527513e-07, + "loss": 0.5554, + "step": 994 + }, + { + "autoregressive_loss": 0.1367, + "epoch": 2.775453277545328, + "grad_norm": 0.06266027688980103, + "learning_rate": 1.4142767496691135e-07, + "loss": 0.5465, + "step": 995 + }, + { + "autoregressive_loss": 0.1328, + "epoch": 2.778242677824268, + "grad_norm": 0.039447031915187836, + "learning_rate": 1.3788633298137288e-07, + "loss": 0.5336, + "step": 996 + }, + { + "autoregressive_loss": 0.1377, + "epoch": 2.781032078103208, + "grad_norm": 0.05073726549744606, + "learning_rate": 1.3438927257134083e-07, + "loss": 0.5514, + "step": 997 + }, + { + "autoregressive_loss": 0.1406, + "epoch": 2.7838214783821478, + "grad_norm": 0.04281417280435562, + "learning_rate": 1.3093652558620384e-07, + "loss": 0.5604, + "step": 998 + }, + { + "autoregressive_loss": 0.1309, + "epoch": 2.786610878661088, + "grad_norm": 0.0752372071146965, + "learning_rate": 1.2752812347176514e-07, + "loss": 0.5216, + "step": 999 + }, + { + "autoregressive_loss": 0.1357, + "epoch": 2.789400278940028, + "grad_norm": 0.041656434535980225, + "learning_rate": 1.2416409726996037e-07, + "loss": 0.5422, + "step": 1000 + }, + { + "autoregressive_loss": 0.1387, + "epoch": 2.7921896792189678, + "grad_norm": 0.050259776413440704, + "learning_rate": 1.2084447761857244e-07, + "loss": 0.5548, + "step": 1001 + }, + { + "autoregressive_loss": 0.1348, + "epoch": 2.794979079497908, + "grad_norm": 0.060818664729595184, + "learning_rate": 1.1756929475095103e-07, + "loss": 0.5371, + "step": 1002 + }, + { + "autoregressive_loss": 0.1484, + "epoch": 2.797768479776848, + "grad_norm": 0.058473262935876846, + "learning_rate": 1.143385784957407e-07, + "loss": 0.5924, + "step": 1003 + }, + { + "autoregressive_loss": 0.1396, + "epoch": 2.8005578800557878, + "grad_norm": 0.04789732024073601, + "learning_rate": 1.111523582766072e-07, + "loss": 0.5579, + "step": 1004 + }, + { + "autoregressive_loss": 0.1406, + "epoch": 2.803347280334728, + "grad_norm": 0.05591192469000816, + "learning_rate": 1.0801066311196872e-07, + "loss": 0.5632, + "step": 1005 + }, + { + "autoregressive_loss": 0.1328, + "epoch": 2.806136680613668, + "grad_norm": 0.037469569593667984, + "learning_rate": 1.0491352161473345e-07, + "loss": 0.5292, + "step": 1006 + }, + { + "autoregressive_loss": 0.1396, + "epoch": 2.808926080892608, + "grad_norm": 0.073211669921875, + "learning_rate": 1.018609619920391e-07, + "loss": 0.5599, + "step": 1007 + }, + { + "autoregressive_loss": 0.1406, + "epoch": 2.8117154811715483, + "grad_norm": 0.04752061143517494, + "learning_rate": 9.885301204499321e-08, + "loss": 0.564, + "step": 1008 + }, + { + "autoregressive_loss": 0.1396, + "epoch": 2.814504881450488, + "grad_norm": 0.0885554850101471, + "learning_rate": 9.588969916842272e-08, + "loss": 0.5614, + "step": 1009 + }, + { + "autoregressive_loss": 0.1387, + "epoch": 2.817294281729428, + "grad_norm": 0.03420235216617584, + "learning_rate": 9.297105035062426e-08, + "loss": 0.5551, + "step": 1010 + }, + { + "autoregressive_loss": 0.1328, + "epoch": 2.8200836820083683, + "grad_norm": 0.05352286621928215, + "learning_rate": 9.009709217311702e-08, + "loss": 0.532, + "step": 1011 + }, + { + "autoregressive_loss": 0.1309, + "epoch": 2.822873082287308, + "grad_norm": 0.06655533611774445, + "learning_rate": 8.72678508104008e-08, + "loss": 0.5223, + "step": 1012 + }, + { + "autoregressive_loss": 0.1396, + "epoch": 2.825662482566248, + "grad_norm": 0.0439903624355793, + "learning_rate": 8.448335202971891e-08, + "loss": 0.5579, + "step": 1013 + }, + { + "autoregressive_loss": 0.1309, + "epoch": 2.8284518828451883, + "grad_norm": 0.050511352717876434, + "learning_rate": 8.174362119082291e-08, + "loss": 0.5253, + "step": 1014 + }, + { + "autoregressive_loss": 0.1387, + "epoch": 2.831241283124128, + "grad_norm": 0.04805862531065941, + "learning_rate": 7.9048683245741e-08, + "loss": 0.5525, + "step": 1015 + }, + { + "autoregressive_loss": 0.1406, + "epoch": 2.834030683403068, + "grad_norm": 0.04510214552283287, + "learning_rate": 7.639856273855106e-08, + "loss": 0.5638, + "step": 1016 + }, + { + "autoregressive_loss": 0.1318, + "epoch": 2.8368200836820083, + "grad_norm": 0.05768810585141182, + "learning_rate": 7.379328380515805e-08, + "loss": 0.5302, + "step": 1017 + }, + { + "autoregressive_loss": 0.1387, + "epoch": 2.8396094839609485, + "grad_norm": 0.03900930657982826, + "learning_rate": 7.123287017307302e-08, + "loss": 0.5518, + "step": 1018 + }, + { + "autoregressive_loss": 0.1348, + "epoch": 2.8423988842398886, + "grad_norm": 0.0370975062251091, + "learning_rate": 6.871734516119721e-08, + "loss": 0.5374, + "step": 1019 + }, + { + "autoregressive_loss": 0.1328, + "epoch": 2.8451882845188283, + "grad_norm": 0.04506475478410721, + "learning_rate": 6.624673167961004e-08, + "loss": 0.5334, + "step": 1020 + }, + { + "autoregressive_loss": 0.1338, + "epoch": 2.8479776847977685, + "grad_norm": 0.047408945858478546, + "learning_rate": 6.382105222936085e-08, + "loss": 0.5347, + "step": 1021 + }, + { + "autoregressive_loss": 0.1406, + "epoch": 2.8507670850767086, + "grad_norm": 0.03986638784408569, + "learning_rate": 6.144032890226304e-08, + "loss": 0.5616, + "step": 1022 + }, + { + "autoregressive_loss": 0.1211, + "epoch": 2.8535564853556483, + "grad_norm": 0.04241688549518585, + "learning_rate": 5.910458338069192e-08, + "loss": 0.4833, + "step": 1023 + }, + { + "autoregressive_loss": 0.1416, + "epoch": 2.8563458856345885, + "grad_norm": 0.05851636826992035, + "learning_rate": 5.6813836937392175e-08, + "loss": 0.5669, + "step": 1024 + }, + { + "autoregressive_loss": 0.1338, + "epoch": 2.8591352859135286, + "grad_norm": 0.061409734189510345, + "learning_rate": 5.456811043527632e-08, + "loss": 0.536, + "step": 1025 + }, + { + "autoregressive_loss": 0.1416, + "epoch": 2.8619246861924688, + "grad_norm": 0.04442998766899109, + "learning_rate": 5.236742432724262e-08, + "loss": 0.5662, + "step": 1026 + }, + { + "autoregressive_loss": 0.1338, + "epoch": 2.864714086471409, + "grad_norm": 0.06895561516284943, + "learning_rate": 5.021179865598136e-08, + "loss": 0.5348, + "step": 1027 + }, + { + "autoregressive_loss": 0.1328, + "epoch": 2.8675034867503486, + "grad_norm": 0.03826350346207619, + "learning_rate": 4.810125305379998e-08, + "loss": 0.5339, + "step": 1028 + }, + { + "autoregressive_loss": 0.1299, + "epoch": 2.8702928870292888, + "grad_norm": 0.04373703896999359, + "learning_rate": 4.6035806742436575e-08, + "loss": 0.5182, + "step": 1029 + }, + { + "autoregressive_loss": 0.1387, + "epoch": 2.873082287308229, + "grad_norm": 0.05748748034238815, + "learning_rate": 4.4015478532891675e-08, + "loss": 0.556, + "step": 1030 + }, + { + "autoregressive_loss": 0.1426, + "epoch": 2.8758716875871686, + "grad_norm": 0.062418486922979355, + "learning_rate": 4.20402868252523e-08, + "loss": 0.5686, + "step": 1031 + }, + { + "autoregressive_loss": 0.1426, + "epoch": 2.8786610878661087, + "grad_norm": 0.04879818856716156, + "learning_rate": 4.01102496085265e-08, + "loss": 0.5702, + "step": 1032 + }, + { + "autoregressive_loss": 0.1348, + "epoch": 2.881450488145049, + "grad_norm": 0.04410684108734131, + "learning_rate": 3.822538446047852e-08, + "loss": 0.5372, + "step": 1033 + }, + { + "autoregressive_loss": 0.1289, + "epoch": 2.8842398884239886, + "grad_norm": 0.049608103930950165, + "learning_rate": 3.6385708547468925e-08, + "loss": 0.5149, + "step": 1034 + }, + { + "autoregressive_loss": 0.1406, + "epoch": 2.8870292887029287, + "grad_norm": 0.04671331122517586, + "learning_rate": 3.4591238624299696e-08, + "loss": 0.5607, + "step": 1035 + }, + { + "autoregressive_loss": 0.1318, + "epoch": 2.889818688981869, + "grad_norm": 0.06376811116933823, + "learning_rate": 3.284199103405883e-08, + "loss": 0.5283, + "step": 1036 + }, + { + "autoregressive_loss": 0.1357, + "epoch": 2.892608089260809, + "grad_norm": 0.057606492191553116, + "learning_rate": 3.113798170797489e-08, + "loss": 0.5438, + "step": 1037 + }, + { + "autoregressive_loss": 0.1367, + "epoch": 2.895397489539749, + "grad_norm": 0.04149043187499046, + "learning_rate": 2.9479226165268216e-08, + "loss": 0.5472, + "step": 1038 + }, + { + "autoregressive_loss": 0.1348, + "epoch": 2.898186889818689, + "grad_norm": 0.03625166416168213, + "learning_rate": 2.7865739513012746e-08, + "loss": 0.5416, + "step": 1039 + }, + { + "autoregressive_loss": 0.1426, + "epoch": 2.900976290097629, + "grad_norm": 0.05523681640625, + "learning_rate": 2.629753644599664e-08, + "loss": 0.5681, + "step": 1040 + }, + { + "autoregressive_loss": 0.1377, + "epoch": 2.903765690376569, + "grad_norm": 0.045702096074819565, + "learning_rate": 2.4774631246589075e-08, + "loss": 0.5516, + "step": 1041 + }, + { + "autoregressive_loss": 0.1377, + "epoch": 2.906555090655509, + "grad_norm": 0.03823622316122055, + "learning_rate": 2.3297037784609787e-08, + "loss": 0.5487, + "step": 1042 + }, + { + "autoregressive_loss": 0.1484, + "epoch": 2.909344490934449, + "grad_norm": 0.06298030912876129, + "learning_rate": 2.1864769517204177e-08, + "loss": 0.5944, + "step": 1043 + }, + { + "autoregressive_loss": 0.1396, + "epoch": 2.912133891213389, + "grad_norm": 0.08082084357738495, + "learning_rate": 2.0477839488718398e-08, + "loss": 0.5592, + "step": 1044 + }, + { + "autoregressive_loss": 0.1367, + "epoch": 2.914923291492329, + "grad_norm": 0.05376262217760086, + "learning_rate": 1.913626033058169e-08, + "loss": 0.5496, + "step": 1045 + }, + { + "autoregressive_loss": 0.1318, + "epoch": 2.917712691771269, + "grad_norm": 0.04540029168128967, + "learning_rate": 1.784004426119257e-08, + "loss": 0.5283, + "step": 1046 + }, + { + "autoregressive_loss": 0.1406, + "epoch": 2.920502092050209, + "grad_norm": 0.04353891685605049, + "learning_rate": 1.6589203085804473e-08, + "loss": 0.564, + "step": 1047 + }, + { + "autoregressive_loss": 0.1406, + "epoch": 2.9232914923291493, + "grad_norm": 0.04505160078406334, + "learning_rate": 1.538374819642252e-08, + "loss": 0.5638, + "step": 1048 + }, + { + "autoregressive_loss": 0.1357, + "epoch": 2.9260808926080895, + "grad_norm": 0.036994658410549164, + "learning_rate": 1.4223690571695815e-08, + "loss": 0.5438, + "step": 1049 + }, + { + "autoregressive_loss": 0.1357, + "epoch": 2.928870292887029, + "grad_norm": 0.037269510328769684, + "learning_rate": 1.3109040776819181e-08, + "loss": 0.5422, + "step": 1050 + }, + { + "autoregressive_loss": 0.1348, + "epoch": 2.9316596931659693, + "grad_norm": 0.04000972583889961, + "learning_rate": 1.2039808963437705e-08, + "loss": 0.5382, + "step": 1051 + }, + { + "autoregressive_loss": 0.1328, + "epoch": 2.9344490934449095, + "grad_norm": 0.054407402873039246, + "learning_rate": 1.1016004869551788e-08, + "loss": 0.5311, + "step": 1052 + }, + { + "autoregressive_loss": 0.1367, + "epoch": 2.937238493723849, + "grad_norm": 0.0554296113550663, + "learning_rate": 1.0037637819431123e-08, + "loss": 0.5458, + "step": 1053 + }, + { + "autoregressive_loss": 0.1406, + "epoch": 2.9400278940027893, + "grad_norm": 0.04119548574090004, + "learning_rate": 9.10471672352864e-09, + "loss": 0.5604, + "step": 1054 + }, + { + "autoregressive_loss": 0.1367, + "epoch": 2.9428172942817294, + "grad_norm": 0.041542235761880875, + "learning_rate": 8.217250078400018e-09, + "loss": 0.5502, + "step": 1055 + }, + { + "autoregressive_loss": 0.1445, + "epoch": 2.9456066945606696, + "grad_norm": 0.06054938957095146, + "learning_rate": 7.375245966623757e-09, + "loss": 0.5789, + "step": 1056 + }, + { + "autoregressive_loss": 0.1357, + "epoch": 2.9483960948396097, + "grad_norm": 0.05004243180155754, + "learning_rate": 6.5787120567317734e-09, + "loss": 0.5424, + "step": 1057 + }, + { + "autoregressive_loss": 0.1348, + "epoch": 2.9511854951185494, + "grad_norm": 0.04572566971182823, + "learning_rate": 5.827655603135585e-09, + "loss": 0.5366, + "step": 1058 + }, + { + "autoregressive_loss": 0.1406, + "epoch": 2.9539748953974896, + "grad_norm": 0.05564025044441223, + "learning_rate": 5.122083446062464e-09, + "loss": 0.563, + "step": 1059 + }, + { + "autoregressive_loss": 0.1436, + "epoch": 2.9567642956764297, + "grad_norm": 0.042186152189970016, + "learning_rate": 4.462002011493271e-09, + "loss": 0.5715, + "step": 1060 + }, + { + "autoregressive_loss": 0.1406, + "epoch": 2.9595536959553694, + "grad_norm": 0.08448714017868042, + "learning_rate": 3.847417311102497e-09, + "loss": 0.5642, + "step": 1061 + }, + { + "autoregressive_loss": 0.126, + "epoch": 2.9623430962343096, + "grad_norm": 0.031185420230031013, + "learning_rate": 3.2783349422044197e-09, + "loss": 0.5056, + "step": 1062 + }, + { + "autoregressive_loss": 0.1396, + "epoch": 2.9651324965132497, + "grad_norm": 0.06501400470733643, + "learning_rate": 2.7547600877020355e-09, + "loss": 0.5586, + "step": 1063 + }, + { + "autoregressive_loss": 0.1309, + "epoch": 2.9679218967921894, + "grad_norm": 0.05552037060260773, + "learning_rate": 2.276697516039872e-09, + "loss": 0.522, + "step": 1064 + }, + { + "autoregressive_loss": 0.1377, + "epoch": 2.9707112970711296, + "grad_norm": 0.05961514636874199, + "learning_rate": 1.8441515811612465e-09, + "loss": 0.5498, + "step": 1065 + }, + { + "autoregressive_loss": 0.1367, + "epoch": 2.9735006973500697, + "grad_norm": 0.0567060150206089, + "learning_rate": 1.4571262224666315e-09, + "loss": 0.5475, + "step": 1066 + }, + { + "autoregressive_loss": 0.1406, + "epoch": 2.97629009762901, + "grad_norm": 0.04679831117391586, + "learning_rate": 1.1156249647797934e-09, + "loss": 0.563, + "step": 1067 + }, + { + "autoregressive_loss": 0.1367, + "epoch": 2.97907949790795, + "grad_norm": 0.03514951467514038, + "learning_rate": 8.196509183139301e-10, + "loss": 0.5444, + "step": 1068 + }, + { + "autoregressive_loss": 0.1348, + "epoch": 2.9818688981868897, + "grad_norm": 0.04582400992512703, + "learning_rate": 5.692067786455813e-10, + "loss": 0.5369, + "step": 1069 + }, + { + "autoregressive_loss": 0.1426, + "epoch": 2.98465829846583, + "grad_norm": 0.040323395282030106, + "learning_rate": 3.6429482668853824e-10, + "loss": 0.5703, + "step": 1070 + }, + { + "autoregressive_loss": 0.1367, + "epoch": 2.98744769874477, + "grad_norm": 0.03774525225162506, + "learning_rate": 2.0491692867330438e-10, + "loss": 0.5472, + "step": 1071 + }, + { + "autoregressive_loss": 0.1367, + "epoch": 2.9902370990237097, + "grad_norm": 0.056675516068935394, + "learning_rate": 9.107453612933192e-11, + "loss": 0.5464, + "step": 1072 + }, + { + "autoregressive_loss": 0.1318, + "epoch": 2.99302649930265, + "grad_norm": 0.06555616110563278, + "learning_rate": 2.2768685873364448e-11, + "loss": 0.5272, + "step": 1073 + }, + { + "autoregressive_loss": 0.1406, + "epoch": 2.99581589958159, + "grad_norm": 0.052892349660396576, + "learning_rate": 0.0, + "loss": 0.5613, + "step": 1074 + }, + { + "epoch": 2.99581589958159, + "step": 1074, + "total_flos": 4.821241819091383e+19, + "train_loss": 0.79851999407152, + "train_runtime": 44889.3859, + "train_samples_per_second": 3.067, + "train_steps_per_second": 0.024 + } + ], + "logging_steps": 1.0, + "max_steps": 1074, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 5000.0, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.821241819091383e+19, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}