diff --git a/checkpoint-10624/config.json b/checkpoint-10624/config.json new file mode 100644 index 0000000000000000000000000000000000000000..875ba5d9d44b85d53592976d4c2b6425bc63516f --- /dev/null +++ b/checkpoint-10624/config.json @@ -0,0 +1,61 @@ +{ + "_name_or_path": "t5-small", + "architectures": [ + "T5ForConditionalGeneration" + ], + "classifier_dropout": 0.0, + "d_ff": 2048, + "d_kv": 64, + "d_model": 512, + "decoder_start_token_id": 0, + "dense_act_fn": "relu", + "dropout_rate": 0.1, + "eos_token_id": 1, + "feed_forward_proj": "relu", + "initializer_factor": 1.0, + "is_encoder_decoder": true, + "is_gated_act": false, + "layer_norm_epsilon": 1e-06, + "model_type": "t5", + "n_positions": 512, + "num_decoder_layers": 6, + "num_heads": 8, + "num_layers": 6, + "output_past": true, + "pad_token_id": 0, + "relative_attention_max_distance": 128, + "relative_attention_num_buckets": 32, + "task_specific_params": { + "summarization": { + "early_stopping": true, + "length_penalty": 2.0, + "max_length": 200, + "min_length": 30, + "no_repeat_ngram_size": 3, + "num_beams": 4, + "prefix": "summarize: " + }, + "translation_en_to_de": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to German: " + }, + "translation_en_to_fr": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to French: " + }, + "translation_en_to_ro": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to Romanian: " + } + }, + "torch_dtype": "float32", + "transformers_version": "4.49.0", + "use_cache": true, + "vocab_size": 32128 +} diff --git a/checkpoint-10624/generation_config.json b/checkpoint-10624/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..eba25c5db1745fe5324f4f7e8890c19853e21453 --- /dev/null +++ b/checkpoint-10624/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "decoder_start_token_id": 0, + "eos_token_id": 1, + "pad_token_id": 0, + "transformers_version": "4.49.0" +} diff --git a/checkpoint-10624/model.safetensors b/checkpoint-10624/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b5f613df10d258ac6dd82ca27fdc95aba2c57351 --- /dev/null +++ b/checkpoint-10624/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2a52a542fc460795d1eea30717f34069c2ac222ad447e5856a482d9f306f637 +size 242041896 diff --git a/checkpoint-10624/optimizer.pt b/checkpoint-10624/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..637759d9389805f0d6c2590d1fe9cec53626d32b --- /dev/null +++ b/checkpoint-10624/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:626a689b9399ca5bc9be0ba4001343c82189ae6b5f17c2f154f5f420be8c8efd +size 484163514 diff --git a/checkpoint-10624/rng_state.pth b/checkpoint-10624/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..80e9ce9dc2eed45c4ad0276ba2aea9f9d62d4822 --- /dev/null +++ b/checkpoint-10624/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:871dae08d4b0f588e7fb11dfff89a25046714f93a8b345b0a8564e6033959fb5 +size 14244 diff --git a/checkpoint-10624/scheduler.pt b/checkpoint-10624/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..2a0b2ee1e036e9fbdcb114f010b7a5bd9ebcaa47 --- /dev/null +++ b/checkpoint-10624/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff5e9fe88ecbc6317ecb28b185c99a3462fbdc58b45639f3e0d8c2dc79b5584f +size 1064 diff --git a/checkpoint-10624/trainer_state.json b/checkpoint-10624/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..4487304eda7caacf671c1a6ef8aaa5346569f873 --- /dev/null +++ b/checkpoint-10624/trainer_state.json @@ -0,0 +1,180 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 16.0, + "eval_steps": 500, + "global_step": 10624, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.7530120481927711, + "grad_norm": 0.2647170126438141, + "learning_rate": 0.0004811746987951807, + "loss": 0.3311, + "step": 500 + }, + { + "epoch": 1.5060240963855422, + "grad_norm": 0.22880347073078156, + "learning_rate": 0.00046234939759036143, + "loss": 0.0907, + "step": 1000 + }, + { + "epoch": 2.2590361445783134, + "grad_norm": 0.1677163541316986, + "learning_rate": 0.00044352409638554217, + "loss": 0.0568, + "step": 1500 + }, + { + "epoch": 3.0120481927710845, + "grad_norm": 0.12338300049304962, + "learning_rate": 0.0004246987951807229, + "loss": 0.0451, + "step": 2000 + }, + { + "epoch": 3.765060240963855, + "grad_norm": 0.08597979694604874, + "learning_rate": 0.0004058734939759036, + "loss": 0.0386, + "step": 2500 + }, + { + "epoch": 4.518072289156627, + "grad_norm": 0.0988745242357254, + "learning_rate": 0.00038704819277108433, + "loss": 0.0352, + "step": 3000 + }, + { + "epoch": 5.271084337349397, + "grad_norm": 0.11785969883203506, + "learning_rate": 0.00036822289156626507, + "loss": 0.0331, + "step": 3500 + }, + { + "epoch": 6.024096385542169, + "grad_norm": 0.09906379133462906, + "learning_rate": 0.0003493975903614458, + "loss": 0.0315, + "step": 4000 + }, + { + "epoch": 6.77710843373494, + "grad_norm": 0.1129639744758606, + "learning_rate": 0.0003305722891566265, + "loss": 0.0301, + "step": 4500 + }, + { + "epoch": 7.530120481927711, + "grad_norm": 0.07321502268314362, + "learning_rate": 0.00031174698795180723, + "loss": 0.0292, + "step": 5000 + }, + { + "epoch": 8.283132530120483, + "grad_norm": 0.05083702132105827, + "learning_rate": 0.0002929216867469879, + "loss": 0.028, + "step": 5500 + }, + { + "epoch": 9.036144578313253, + "grad_norm": 0.073179692029953, + "learning_rate": 0.0002740963855421687, + "loss": 0.0275, + "step": 6000 + }, + { + "epoch": 9.789156626506024, + "grad_norm": 0.060432616621255875, + "learning_rate": 0.0002552710843373494, + "loss": 0.0266, + "step": 6500 + }, + { + "epoch": 10.542168674698795, + "grad_norm": 0.05641400068998337, + "learning_rate": 0.00023644578313253013, + "loss": 0.0265, + "step": 7000 + }, + { + "epoch": 11.295180722891565, + "grad_norm": 0.055228352546691895, + "learning_rate": 0.00021762048192771087, + "loss": 0.0257, + "step": 7500 + }, + { + "epoch": 12.048192771084338, + "grad_norm": 0.055986884981393814, + "learning_rate": 0.00019879518072289158, + "loss": 0.0254, + "step": 8000 + }, + { + "epoch": 12.801204819277109, + "grad_norm": 0.06879087537527084, + "learning_rate": 0.0001799698795180723, + "loss": 0.025, + "step": 8500 + }, + { + "epoch": 13.55421686746988, + "grad_norm": 0.08162941783666611, + "learning_rate": 0.00016114457831325303, + "loss": 0.0248, + "step": 9000 + }, + { + "epoch": 14.30722891566265, + "grad_norm": 0.0502689927816391, + "learning_rate": 0.00014231927710843374, + "loss": 0.0242, + "step": 9500 + }, + { + "epoch": 15.060240963855422, + "grad_norm": 0.052483588457107544, + "learning_rate": 0.00012349397590361445, + "loss": 0.0245, + "step": 10000 + }, + { + "epoch": 15.813253012048193, + "grad_norm": 0.04214683175086975, + "learning_rate": 0.00010466867469879517, + "loss": 0.0238, + "step": 10500 + } + ], + "logging_steps": 500, + "max_steps": 13280, + "num_input_tokens_seen": 0, + "num_train_epochs": 20, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5744176738074624.0, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-10624/training_args.bin b/checkpoint-10624/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..c4458cc722a24e1abde581b8667a40b1674f5aea --- /dev/null +++ b/checkpoint-10624/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ec2ab2506605f8b233e63780e2c7aa14dc16da409c9e0e5c14641534e5ff016 +size 5304 diff --git a/checkpoint-11288/config.json b/checkpoint-11288/config.json new file mode 100644 index 0000000000000000000000000000000000000000..875ba5d9d44b85d53592976d4c2b6425bc63516f --- /dev/null +++ b/checkpoint-11288/config.json @@ -0,0 +1,61 @@ +{ + "_name_or_path": "t5-small", + "architectures": [ + "T5ForConditionalGeneration" + ], + "classifier_dropout": 0.0, + "d_ff": 2048, + "d_kv": 64, + "d_model": 512, + "decoder_start_token_id": 0, + "dense_act_fn": "relu", + "dropout_rate": 0.1, + "eos_token_id": 1, + "feed_forward_proj": "relu", + "initializer_factor": 1.0, + "is_encoder_decoder": true, + "is_gated_act": false, + "layer_norm_epsilon": 1e-06, + "model_type": "t5", + "n_positions": 512, + "num_decoder_layers": 6, + "num_heads": 8, + "num_layers": 6, + "output_past": true, + "pad_token_id": 0, + "relative_attention_max_distance": 128, + "relative_attention_num_buckets": 32, + "task_specific_params": { + "summarization": { + "early_stopping": true, + "length_penalty": 2.0, + "max_length": 200, + "min_length": 30, + "no_repeat_ngram_size": 3, + "num_beams": 4, + "prefix": "summarize: " + }, + "translation_en_to_de": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to German: " + }, + "translation_en_to_fr": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to French: " + }, + "translation_en_to_ro": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to Romanian: " + } + }, + "torch_dtype": "float32", + "transformers_version": "4.49.0", + "use_cache": true, + "vocab_size": 32128 +} diff --git a/checkpoint-11288/generation_config.json b/checkpoint-11288/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..eba25c5db1745fe5324f4f7e8890c19853e21453 --- /dev/null +++ b/checkpoint-11288/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "decoder_start_token_id": 0, + "eos_token_id": 1, + "pad_token_id": 0, + "transformers_version": "4.49.0" +} diff --git a/checkpoint-11288/model.safetensors b/checkpoint-11288/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..999ba4d5d35cfe6e8907002b37ef75cc39e3c0f6 --- /dev/null +++ b/checkpoint-11288/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d30b7966eb336d8ee47c0a75a7ba2a9369f2b9f5ff64ec647694b39217a6fe3 +size 242041896 diff --git a/checkpoint-11288/optimizer.pt b/checkpoint-11288/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..6296790a6b4555028724b89099ea4c5e6d441be0 --- /dev/null +++ b/checkpoint-11288/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:225d5d0cebb7017617e05571854333032b6acc41d71a97baa451858b61dc93e9 +size 484163514 diff --git a/checkpoint-11288/rng_state.pth b/checkpoint-11288/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..a5a878f5fb1255aa849f7f7458a72e804c6ad730 --- /dev/null +++ b/checkpoint-11288/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea982c40bf13ee6d59e20a0c92fb57845229dff0ba14bb916750b0adb8f60d26 +size 14244 diff --git a/checkpoint-11288/scheduler.pt b/checkpoint-11288/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..9a60738b15b74fb7a574c18563e7a777c4d6e33b --- /dev/null +++ b/checkpoint-11288/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f49e004a60a12580012411533599c9840ef90bb62ac0e44a6af9f00aa574415 +size 1064 diff --git a/checkpoint-11288/trainer_state.json b/checkpoint-11288/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..cac0194474ca437485e99478782ed879ca9b16aa --- /dev/null +++ b/checkpoint-11288/trainer_state.json @@ -0,0 +1,187 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 17.0, + "eval_steps": 500, + "global_step": 11288, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.7530120481927711, + "grad_norm": 0.2647170126438141, + "learning_rate": 0.0004811746987951807, + "loss": 0.3311, + "step": 500 + }, + { + "epoch": 1.5060240963855422, + "grad_norm": 0.22880347073078156, + "learning_rate": 0.00046234939759036143, + "loss": 0.0907, + "step": 1000 + }, + { + "epoch": 2.2590361445783134, + "grad_norm": 0.1677163541316986, + "learning_rate": 0.00044352409638554217, + "loss": 0.0568, + "step": 1500 + }, + { + "epoch": 3.0120481927710845, + "grad_norm": 0.12338300049304962, + "learning_rate": 0.0004246987951807229, + "loss": 0.0451, + "step": 2000 + }, + { + "epoch": 3.765060240963855, + "grad_norm": 0.08597979694604874, + "learning_rate": 0.0004058734939759036, + "loss": 0.0386, + "step": 2500 + }, + { + "epoch": 4.518072289156627, + "grad_norm": 0.0988745242357254, + "learning_rate": 0.00038704819277108433, + "loss": 0.0352, + "step": 3000 + }, + { + "epoch": 5.271084337349397, + "grad_norm": 0.11785969883203506, + "learning_rate": 0.00036822289156626507, + "loss": 0.0331, + "step": 3500 + }, + { + "epoch": 6.024096385542169, + "grad_norm": 0.09906379133462906, + "learning_rate": 0.0003493975903614458, + "loss": 0.0315, + "step": 4000 + }, + { + "epoch": 6.77710843373494, + "grad_norm": 0.1129639744758606, + "learning_rate": 0.0003305722891566265, + "loss": 0.0301, + "step": 4500 + }, + { + "epoch": 7.530120481927711, + "grad_norm": 0.07321502268314362, + "learning_rate": 0.00031174698795180723, + "loss": 0.0292, + "step": 5000 + }, + { + "epoch": 8.283132530120483, + "grad_norm": 0.05083702132105827, + "learning_rate": 0.0002929216867469879, + "loss": 0.028, + "step": 5500 + }, + { + "epoch": 9.036144578313253, + "grad_norm": 0.073179692029953, + "learning_rate": 0.0002740963855421687, + "loss": 0.0275, + "step": 6000 + }, + { + "epoch": 9.789156626506024, + "grad_norm": 0.060432616621255875, + "learning_rate": 0.0002552710843373494, + "loss": 0.0266, + "step": 6500 + }, + { + "epoch": 10.542168674698795, + "grad_norm": 0.05641400068998337, + "learning_rate": 0.00023644578313253013, + "loss": 0.0265, + "step": 7000 + }, + { + "epoch": 11.295180722891565, + "grad_norm": 0.055228352546691895, + "learning_rate": 0.00021762048192771087, + "loss": 0.0257, + "step": 7500 + }, + { + "epoch": 12.048192771084338, + "grad_norm": 0.055986884981393814, + "learning_rate": 0.00019879518072289158, + "loss": 0.0254, + "step": 8000 + }, + { + "epoch": 12.801204819277109, + "grad_norm": 0.06879087537527084, + "learning_rate": 0.0001799698795180723, + "loss": 0.025, + "step": 8500 + }, + { + "epoch": 13.55421686746988, + "grad_norm": 0.08162941783666611, + "learning_rate": 0.00016114457831325303, + "loss": 0.0248, + "step": 9000 + }, + { + "epoch": 14.30722891566265, + "grad_norm": 0.0502689927816391, + "learning_rate": 0.00014231927710843374, + "loss": 0.0242, + "step": 9500 + }, + { + "epoch": 15.060240963855422, + "grad_norm": 0.052483588457107544, + "learning_rate": 0.00012349397590361445, + "loss": 0.0245, + "step": 10000 + }, + { + "epoch": 15.813253012048193, + "grad_norm": 0.04214683175086975, + "learning_rate": 0.00010466867469879517, + "loss": 0.0238, + "step": 10500 + }, + { + "epoch": 16.566265060240966, + "grad_norm": 0.03767360374331474, + "learning_rate": 8.58433734939759e-05, + "loss": 0.0239, + "step": 11000 + } + ], + "logging_steps": 500, + "max_steps": 13280, + "num_input_tokens_seen": 0, + "num_train_epochs": 20, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6103187784204288.0, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-11288/training_args.bin b/checkpoint-11288/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..c4458cc722a24e1abde581b8667a40b1674f5aea --- /dev/null +++ b/checkpoint-11288/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ec2ab2506605f8b233e63780e2c7aa14dc16da409c9e0e5c14641534e5ff016 +size 5304 diff --git a/checkpoint-11952/config.json b/checkpoint-11952/config.json new file mode 100644 index 0000000000000000000000000000000000000000..875ba5d9d44b85d53592976d4c2b6425bc63516f --- /dev/null +++ b/checkpoint-11952/config.json @@ -0,0 +1,61 @@ +{ + "_name_or_path": "t5-small", + "architectures": [ + "T5ForConditionalGeneration" + ], + "classifier_dropout": 0.0, + "d_ff": 2048, + "d_kv": 64, + "d_model": 512, + "decoder_start_token_id": 0, + "dense_act_fn": "relu", + "dropout_rate": 0.1, + "eos_token_id": 1, + "feed_forward_proj": "relu", + "initializer_factor": 1.0, + "is_encoder_decoder": true, + "is_gated_act": false, + "layer_norm_epsilon": 1e-06, + "model_type": "t5", + "n_positions": 512, + "num_decoder_layers": 6, + "num_heads": 8, + "num_layers": 6, + "output_past": true, + "pad_token_id": 0, + "relative_attention_max_distance": 128, + "relative_attention_num_buckets": 32, + "task_specific_params": { + "summarization": { + "early_stopping": true, + "length_penalty": 2.0, + "max_length": 200, + "min_length": 30, + "no_repeat_ngram_size": 3, + "num_beams": 4, + "prefix": "summarize: " + }, + "translation_en_to_de": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to German: " + }, + "translation_en_to_fr": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to French: " + }, + "translation_en_to_ro": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to Romanian: " + } + }, + "torch_dtype": "float32", + "transformers_version": "4.49.0", + "use_cache": true, + "vocab_size": 32128 +} diff --git a/checkpoint-11952/generation_config.json b/checkpoint-11952/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..eba25c5db1745fe5324f4f7e8890c19853e21453 --- /dev/null +++ b/checkpoint-11952/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "decoder_start_token_id": 0, + "eos_token_id": 1, + "pad_token_id": 0, + "transformers_version": "4.49.0" +} diff --git a/checkpoint-11952/model.safetensors b/checkpoint-11952/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..62eeb003177d9a78dd4d5c8a420cf61c9bd8d468 --- /dev/null +++ b/checkpoint-11952/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2eb96a9f08127ec7592e050ea89cd120320db5b74966dc5656609bbd208511c2 +size 242041896 diff --git a/checkpoint-11952/optimizer.pt b/checkpoint-11952/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..b982611e2cadc740001eefe5114a8bdaef42804f --- /dev/null +++ b/checkpoint-11952/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9cce771a11a0f6043f4a77c724a7499306145b423778e42ee3413da80892d3d +size 484163514 diff --git a/checkpoint-11952/rng_state.pth b/checkpoint-11952/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..d8088e255e0e2a38088857ba32eb7a2168a175d9 --- /dev/null +++ b/checkpoint-11952/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c503ee05f88e202769d6afdb9591580059e32575c1447b22ecf57bac3a49734 +size 14244 diff --git a/checkpoint-11952/scheduler.pt b/checkpoint-11952/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..c73e4454f93b631ca00379de0f530371c0e6e2ab --- /dev/null +++ b/checkpoint-11952/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22cc1df3a420b1528700cb2c2255107d6d7b4d17b007299a04e9c786c3c472a4 +size 1064 diff --git a/checkpoint-11952/trainer_state.json b/checkpoint-11952/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d0eb48e71b44fd2e762f85d07b6602c7f684cc8f --- /dev/null +++ b/checkpoint-11952/trainer_state.json @@ -0,0 +1,194 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 18.0, + "eval_steps": 500, + "global_step": 11952, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.7530120481927711, + "grad_norm": 0.2647170126438141, + "learning_rate": 0.0004811746987951807, + "loss": 0.3311, + "step": 500 + }, + { + "epoch": 1.5060240963855422, + "grad_norm": 0.22880347073078156, + "learning_rate": 0.00046234939759036143, + "loss": 0.0907, + "step": 1000 + }, + { + "epoch": 2.2590361445783134, + "grad_norm": 0.1677163541316986, + "learning_rate": 0.00044352409638554217, + "loss": 0.0568, + "step": 1500 + }, + { + "epoch": 3.0120481927710845, + "grad_norm": 0.12338300049304962, + "learning_rate": 0.0004246987951807229, + "loss": 0.0451, + "step": 2000 + }, + { + "epoch": 3.765060240963855, + "grad_norm": 0.08597979694604874, + "learning_rate": 0.0004058734939759036, + "loss": 0.0386, + "step": 2500 + }, + { + "epoch": 4.518072289156627, + "grad_norm": 0.0988745242357254, + "learning_rate": 0.00038704819277108433, + "loss": 0.0352, + "step": 3000 + }, + { + "epoch": 5.271084337349397, + "grad_norm": 0.11785969883203506, + "learning_rate": 0.00036822289156626507, + "loss": 0.0331, + "step": 3500 + }, + { + "epoch": 6.024096385542169, + "grad_norm": 0.09906379133462906, + "learning_rate": 0.0003493975903614458, + "loss": 0.0315, + "step": 4000 + }, + { + "epoch": 6.77710843373494, + "grad_norm": 0.1129639744758606, + "learning_rate": 0.0003305722891566265, + "loss": 0.0301, + "step": 4500 + }, + { + "epoch": 7.530120481927711, + "grad_norm": 0.07321502268314362, + "learning_rate": 0.00031174698795180723, + "loss": 0.0292, + "step": 5000 + }, + { + "epoch": 8.283132530120483, + "grad_norm": 0.05083702132105827, + "learning_rate": 0.0002929216867469879, + "loss": 0.028, + "step": 5500 + }, + { + "epoch": 9.036144578313253, + "grad_norm": 0.073179692029953, + "learning_rate": 0.0002740963855421687, + "loss": 0.0275, + "step": 6000 + }, + { + "epoch": 9.789156626506024, + "grad_norm": 0.060432616621255875, + "learning_rate": 0.0002552710843373494, + "loss": 0.0266, + "step": 6500 + }, + { + "epoch": 10.542168674698795, + "grad_norm": 0.05641400068998337, + "learning_rate": 0.00023644578313253013, + "loss": 0.0265, + "step": 7000 + }, + { + "epoch": 11.295180722891565, + "grad_norm": 0.055228352546691895, + "learning_rate": 0.00021762048192771087, + "loss": 0.0257, + "step": 7500 + }, + { + "epoch": 12.048192771084338, + "grad_norm": 0.055986884981393814, + "learning_rate": 0.00019879518072289158, + "loss": 0.0254, + "step": 8000 + }, + { + "epoch": 12.801204819277109, + "grad_norm": 0.06879087537527084, + "learning_rate": 0.0001799698795180723, + "loss": 0.025, + "step": 8500 + }, + { + "epoch": 13.55421686746988, + "grad_norm": 0.08162941783666611, + "learning_rate": 0.00016114457831325303, + "loss": 0.0248, + "step": 9000 + }, + { + "epoch": 14.30722891566265, + "grad_norm": 0.0502689927816391, + "learning_rate": 0.00014231927710843374, + "loss": 0.0242, + "step": 9500 + }, + { + "epoch": 15.060240963855422, + "grad_norm": 0.052483588457107544, + "learning_rate": 0.00012349397590361445, + "loss": 0.0245, + "step": 10000 + }, + { + "epoch": 15.813253012048193, + "grad_norm": 0.04214683175086975, + "learning_rate": 0.00010466867469879517, + "loss": 0.0238, + "step": 10500 + }, + { + "epoch": 16.566265060240966, + "grad_norm": 0.03767360374331474, + "learning_rate": 8.58433734939759e-05, + "loss": 0.0239, + "step": 11000 + }, + { + "epoch": 17.319277108433734, + "grad_norm": 0.04902500659227371, + "learning_rate": 6.701807228915662e-05, + "loss": 0.0234, + "step": 11500 + } + ], + "logging_steps": 500, + "max_steps": 13280, + "num_input_tokens_seen": 0, + "num_train_epochs": 20, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6462198830333952.0, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-11952/training_args.bin b/checkpoint-11952/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..c4458cc722a24e1abde581b8667a40b1674f5aea --- /dev/null +++ b/checkpoint-11952/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ec2ab2506605f8b233e63780e2c7aa14dc16da409c9e0e5c14641534e5ff016 +size 5304 diff --git a/checkpoint-12616/config.json b/checkpoint-12616/config.json new file mode 100644 index 0000000000000000000000000000000000000000..875ba5d9d44b85d53592976d4c2b6425bc63516f --- /dev/null +++ b/checkpoint-12616/config.json @@ -0,0 +1,61 @@ +{ + "_name_or_path": "t5-small", + "architectures": [ + "T5ForConditionalGeneration" + ], + "classifier_dropout": 0.0, + "d_ff": 2048, + "d_kv": 64, + "d_model": 512, + "decoder_start_token_id": 0, + "dense_act_fn": "relu", + "dropout_rate": 0.1, + "eos_token_id": 1, + "feed_forward_proj": "relu", + "initializer_factor": 1.0, + "is_encoder_decoder": true, + "is_gated_act": false, + "layer_norm_epsilon": 1e-06, + "model_type": "t5", + "n_positions": 512, + "num_decoder_layers": 6, + "num_heads": 8, + "num_layers": 6, + "output_past": true, + "pad_token_id": 0, + "relative_attention_max_distance": 128, + "relative_attention_num_buckets": 32, + "task_specific_params": { + "summarization": { + "early_stopping": true, + "length_penalty": 2.0, + "max_length": 200, + "min_length": 30, + "no_repeat_ngram_size": 3, + "num_beams": 4, + "prefix": "summarize: " + }, + "translation_en_to_de": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to German: " + }, + "translation_en_to_fr": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to French: " + }, + "translation_en_to_ro": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to Romanian: " + } + }, + "torch_dtype": "float32", + "transformers_version": "4.49.0", + "use_cache": true, + "vocab_size": 32128 +} diff --git a/checkpoint-12616/generation_config.json b/checkpoint-12616/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..eba25c5db1745fe5324f4f7e8890c19853e21453 --- /dev/null +++ b/checkpoint-12616/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "decoder_start_token_id": 0, + "eos_token_id": 1, + "pad_token_id": 0, + "transformers_version": "4.49.0" +} diff --git a/checkpoint-12616/model.safetensors b/checkpoint-12616/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..72c97f274d8de5ddeff84f8fcac255df2538ca7b --- /dev/null +++ b/checkpoint-12616/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e6b7740a417775a39ca07bbcb3f19c41d1de00ef5d3c600eb612b81f032d12a +size 242041896 diff --git a/checkpoint-12616/optimizer.pt b/checkpoint-12616/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..fb1186611179290409a3e793d57b24b4c4038e6e --- /dev/null +++ b/checkpoint-12616/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:298fd0e26e0e50e6407d3e5b11ce54997833eae70441b852ffc0ce668a639ccc +size 484163514 diff --git a/checkpoint-12616/rng_state.pth b/checkpoint-12616/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..0862a9631ba1d610bab2e11aa1a7bef4b80c2f1a --- /dev/null +++ b/checkpoint-12616/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:21b5b2d0d4ef3a10347f33189f220aa44bc97dadc5d71a227e0fb3e22c380ff6 +size 14244 diff --git a/checkpoint-12616/scheduler.pt b/checkpoint-12616/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..83ee3334a46b18a95261753b2f3689451785b2eb --- /dev/null +++ b/checkpoint-12616/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:822b1fd7a3f099d382f34877c8a7ac2274ddcadf9bb3122afba35168bc2788e6 +size 1064 diff --git a/checkpoint-12616/trainer_state.json b/checkpoint-12616/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..de0ed48a1636858d1034dad4d77886834a53b9eb --- /dev/null +++ b/checkpoint-12616/trainer_state.json @@ -0,0 +1,208 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 19.0, + "eval_steps": 500, + "global_step": 12616, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.7530120481927711, + "grad_norm": 0.2647170126438141, + "learning_rate": 0.0004811746987951807, + "loss": 0.3311, + "step": 500 + }, + { + "epoch": 1.5060240963855422, + "grad_norm": 0.22880347073078156, + "learning_rate": 0.00046234939759036143, + "loss": 0.0907, + "step": 1000 + }, + { + "epoch": 2.2590361445783134, + "grad_norm": 0.1677163541316986, + "learning_rate": 0.00044352409638554217, + "loss": 0.0568, + "step": 1500 + }, + { + "epoch": 3.0120481927710845, + "grad_norm": 0.12338300049304962, + "learning_rate": 0.0004246987951807229, + "loss": 0.0451, + "step": 2000 + }, + { + "epoch": 3.765060240963855, + "grad_norm": 0.08597979694604874, + "learning_rate": 0.0004058734939759036, + "loss": 0.0386, + "step": 2500 + }, + { + "epoch": 4.518072289156627, + "grad_norm": 0.0988745242357254, + "learning_rate": 0.00038704819277108433, + "loss": 0.0352, + "step": 3000 + }, + { + "epoch": 5.271084337349397, + "grad_norm": 0.11785969883203506, + "learning_rate": 0.00036822289156626507, + "loss": 0.0331, + "step": 3500 + }, + { + "epoch": 6.024096385542169, + "grad_norm": 0.09906379133462906, + "learning_rate": 0.0003493975903614458, + "loss": 0.0315, + "step": 4000 + }, + { + "epoch": 6.77710843373494, + "grad_norm": 0.1129639744758606, + "learning_rate": 0.0003305722891566265, + "loss": 0.0301, + "step": 4500 + }, + { + "epoch": 7.530120481927711, + "grad_norm": 0.07321502268314362, + "learning_rate": 0.00031174698795180723, + "loss": 0.0292, + "step": 5000 + }, + { + "epoch": 8.283132530120483, + "grad_norm": 0.05083702132105827, + "learning_rate": 0.0002929216867469879, + "loss": 0.028, + "step": 5500 + }, + { + "epoch": 9.036144578313253, + "grad_norm": 0.073179692029953, + "learning_rate": 0.0002740963855421687, + "loss": 0.0275, + "step": 6000 + }, + { + "epoch": 9.789156626506024, + "grad_norm": 0.060432616621255875, + "learning_rate": 0.0002552710843373494, + "loss": 0.0266, + "step": 6500 + }, + { + "epoch": 10.542168674698795, + "grad_norm": 0.05641400068998337, + "learning_rate": 0.00023644578313253013, + "loss": 0.0265, + "step": 7000 + }, + { + "epoch": 11.295180722891565, + "grad_norm": 0.055228352546691895, + "learning_rate": 0.00021762048192771087, + "loss": 0.0257, + "step": 7500 + }, + { + "epoch": 12.048192771084338, + "grad_norm": 0.055986884981393814, + "learning_rate": 0.00019879518072289158, + "loss": 0.0254, + "step": 8000 + }, + { + "epoch": 12.801204819277109, + "grad_norm": 0.06879087537527084, + "learning_rate": 0.0001799698795180723, + "loss": 0.025, + "step": 8500 + }, + { + "epoch": 13.55421686746988, + "grad_norm": 0.08162941783666611, + "learning_rate": 0.00016114457831325303, + "loss": 0.0248, + "step": 9000 + }, + { + "epoch": 14.30722891566265, + "grad_norm": 0.0502689927816391, + "learning_rate": 0.00014231927710843374, + "loss": 0.0242, + "step": 9500 + }, + { + "epoch": 15.060240963855422, + "grad_norm": 0.052483588457107544, + "learning_rate": 0.00012349397590361445, + "loss": 0.0245, + "step": 10000 + }, + { + "epoch": 15.813253012048193, + "grad_norm": 0.04214683175086975, + "learning_rate": 0.00010466867469879517, + "loss": 0.0238, + "step": 10500 + }, + { + "epoch": 16.566265060240966, + "grad_norm": 0.03767360374331474, + "learning_rate": 8.58433734939759e-05, + "loss": 0.0239, + "step": 11000 + }, + { + "epoch": 17.319277108433734, + "grad_norm": 0.04902500659227371, + "learning_rate": 6.701807228915662e-05, + "loss": 0.0234, + "step": 11500 + }, + { + "epoch": 18.072289156626507, + "grad_norm": 0.058824148029088974, + "learning_rate": 4.8192771084337354e-05, + "loss": 0.0232, + "step": 12000 + }, + { + "epoch": 18.825301204819276, + "grad_norm": 0.06361762434244156, + "learning_rate": 2.9367469879518075e-05, + "loss": 0.0231, + "step": 12500 + } + ], + "logging_steps": 500, + "max_steps": 13280, + "num_input_tokens_seen": 0, + "num_train_epochs": 20, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6821209876463616.0, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-12616/training_args.bin b/checkpoint-12616/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..c4458cc722a24e1abde581b8667a40b1674f5aea --- /dev/null +++ b/checkpoint-12616/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ec2ab2506605f8b233e63780e2c7aa14dc16da409c9e0e5c14641534e5ff016 +size 5304 diff --git a/checkpoint-1328/config.json b/checkpoint-1328/config.json new file mode 100644 index 0000000000000000000000000000000000000000..875ba5d9d44b85d53592976d4c2b6425bc63516f --- /dev/null +++ b/checkpoint-1328/config.json @@ -0,0 +1,61 @@ +{ + "_name_or_path": "t5-small", + "architectures": [ + "T5ForConditionalGeneration" + ], + "classifier_dropout": 0.0, + "d_ff": 2048, + "d_kv": 64, + "d_model": 512, + "decoder_start_token_id": 0, + "dense_act_fn": "relu", + "dropout_rate": 0.1, + "eos_token_id": 1, + "feed_forward_proj": "relu", + "initializer_factor": 1.0, + "is_encoder_decoder": true, + "is_gated_act": false, + "layer_norm_epsilon": 1e-06, + "model_type": "t5", + "n_positions": 512, + "num_decoder_layers": 6, + "num_heads": 8, + "num_layers": 6, + "output_past": true, + "pad_token_id": 0, + "relative_attention_max_distance": 128, + "relative_attention_num_buckets": 32, + "task_specific_params": { + "summarization": { + "early_stopping": true, + "length_penalty": 2.0, + "max_length": 200, + "min_length": 30, + "no_repeat_ngram_size": 3, + "num_beams": 4, + "prefix": "summarize: " + }, + "translation_en_to_de": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to German: " + }, + "translation_en_to_fr": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to French: " + }, + "translation_en_to_ro": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to Romanian: " + } + }, + "torch_dtype": "float32", + "transformers_version": "4.49.0", + "use_cache": true, + "vocab_size": 32128 +} diff --git a/checkpoint-1328/generation_config.json b/checkpoint-1328/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..eba25c5db1745fe5324f4f7e8890c19853e21453 --- /dev/null +++ b/checkpoint-1328/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "decoder_start_token_id": 0, + "eos_token_id": 1, + "pad_token_id": 0, + "transformers_version": "4.49.0" +} diff --git a/checkpoint-1328/model.safetensors b/checkpoint-1328/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..5199b8eef8710b41a0f8805807204836d91faed4 --- /dev/null +++ b/checkpoint-1328/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3cb90960a87bb689d468344aeff8b4e9d5590b017945b0a486ddb999a6542813 +size 242041896 diff --git a/checkpoint-1328/optimizer.pt b/checkpoint-1328/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..0d42a316334aec1d47f29992b951b97c311b05bc --- /dev/null +++ b/checkpoint-1328/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5100188e6dcc93a1184d247117fb0df100240535a266ec5f78e8b8bd3943062e +size 484163514 diff --git a/checkpoint-1328/rng_state.pth b/checkpoint-1328/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..e3ba157c04424f8d197eac69f4dc419170034322 --- /dev/null +++ b/checkpoint-1328/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8737df5603a32aae5e5e9c2da6de40078a49291ae4477cd8b2ae7c3f57890fbc +size 14244 diff --git a/checkpoint-1328/scheduler.pt b/checkpoint-1328/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..62518a06aaada65952059dc1dc9b4587f3bde683 --- /dev/null +++ b/checkpoint-1328/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b51d887a83d6c3823b34be1c27e0a48d7412ba51ea0a6f9ef63aa05d8faea20 +size 1064 diff --git a/checkpoint-1328/trainer_state.json b/checkpoint-1328/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..0e04ea27f6f756183b8dbdecfa9d850e34fda7b8 --- /dev/null +++ b/checkpoint-1328/trainer_state.json @@ -0,0 +1,47 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 500, + "global_step": 1328, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.7530120481927711, + "grad_norm": 0.2647170126438141, + "learning_rate": 0.0004811746987951807, + "loss": 0.3311, + "step": 500 + }, + { + "epoch": 1.5060240963855422, + "grad_norm": 0.22880347073078156, + "learning_rate": 0.00046234939759036143, + "loss": 0.0907, + "step": 1000 + } + ], + "logging_steps": 500, + "max_steps": 13280, + "num_input_tokens_seen": 0, + "num_train_epochs": 20, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 718022092259328.0, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1328/training_args.bin b/checkpoint-1328/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..c4458cc722a24e1abde581b8667a40b1674f5aea --- /dev/null +++ b/checkpoint-1328/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ec2ab2506605f8b233e63780e2c7aa14dc16da409c9e0e5c14641534e5ff016 +size 5304 diff --git a/checkpoint-13280/config.json b/checkpoint-13280/config.json new file mode 100644 index 0000000000000000000000000000000000000000..875ba5d9d44b85d53592976d4c2b6425bc63516f --- /dev/null +++ b/checkpoint-13280/config.json @@ -0,0 +1,61 @@ +{ + "_name_or_path": "t5-small", + "architectures": [ + "T5ForConditionalGeneration" + ], + "classifier_dropout": 0.0, + "d_ff": 2048, + "d_kv": 64, + "d_model": 512, + "decoder_start_token_id": 0, + "dense_act_fn": "relu", + "dropout_rate": 0.1, + "eos_token_id": 1, + "feed_forward_proj": "relu", + "initializer_factor": 1.0, + "is_encoder_decoder": true, + "is_gated_act": false, + "layer_norm_epsilon": 1e-06, + "model_type": "t5", + "n_positions": 512, + "num_decoder_layers": 6, + "num_heads": 8, + "num_layers": 6, + "output_past": true, + "pad_token_id": 0, + "relative_attention_max_distance": 128, + "relative_attention_num_buckets": 32, + "task_specific_params": { + "summarization": { + "early_stopping": true, + "length_penalty": 2.0, + "max_length": 200, + "min_length": 30, + "no_repeat_ngram_size": 3, + "num_beams": 4, + "prefix": "summarize: " + }, + "translation_en_to_de": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to German: " + }, + "translation_en_to_fr": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to French: " + }, + "translation_en_to_ro": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to Romanian: " + } + }, + "torch_dtype": "float32", + "transformers_version": "4.49.0", + "use_cache": true, + "vocab_size": 32128 +} diff --git a/checkpoint-13280/generation_config.json b/checkpoint-13280/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..eba25c5db1745fe5324f4f7e8890c19853e21453 --- /dev/null +++ b/checkpoint-13280/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "decoder_start_token_id": 0, + "eos_token_id": 1, + "pad_token_id": 0, + "transformers_version": "4.49.0" +} diff --git a/checkpoint-13280/model.safetensors b/checkpoint-13280/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b80690044e61d360fd0e7f2da5549a764a1fdac4 --- /dev/null +++ b/checkpoint-13280/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d8bd36f4c4292c3d390f593bc70a7f6421f9732c1adde462e862d0e1dd1f20c9 +size 242041896 diff --git a/checkpoint-13280/optimizer.pt b/checkpoint-13280/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..08968775aecea7b21468be71c46e9bf2b2c8d240 --- /dev/null +++ b/checkpoint-13280/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:301f1132d8ad5001f1b12cf4300d5b23653e59d8885b0c5005bf916df091417c +size 484163514 diff --git a/checkpoint-13280/rng_state.pth b/checkpoint-13280/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..8a2972c81a759bd6cc347465ee6885c690a43978 --- /dev/null +++ b/checkpoint-13280/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb1f5a9993254e34859ad003f7605d6f3577e096450d91bae1e372fe7a69711b +size 14244 diff --git a/checkpoint-13280/scheduler.pt b/checkpoint-13280/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..3c31f0c3d7bbfe5d9013f0c947a69f06f7bfe724 --- /dev/null +++ b/checkpoint-13280/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9cd06fe26347b0405fefe5ed1eea662b1479b2b2b4875e24a34f8e1b73b73ad7 +size 1064 diff --git a/checkpoint-13280/trainer_state.json b/checkpoint-13280/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f57e8c02d4727b8bf83abe190f348c99b39a51e8 --- /dev/null +++ b/checkpoint-13280/trainer_state.json @@ -0,0 +1,215 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 20.0, + "eval_steps": 500, + "global_step": 13280, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.7530120481927711, + "grad_norm": 0.2647170126438141, + "learning_rate": 0.0004811746987951807, + "loss": 0.3311, + "step": 500 + }, + { + "epoch": 1.5060240963855422, + "grad_norm": 0.22880347073078156, + "learning_rate": 0.00046234939759036143, + "loss": 0.0907, + "step": 1000 + }, + { + "epoch": 2.2590361445783134, + "grad_norm": 0.1677163541316986, + "learning_rate": 0.00044352409638554217, + "loss": 0.0568, + "step": 1500 + }, + { + "epoch": 3.0120481927710845, + "grad_norm": 0.12338300049304962, + "learning_rate": 0.0004246987951807229, + "loss": 0.0451, + "step": 2000 + }, + { + "epoch": 3.765060240963855, + "grad_norm": 0.08597979694604874, + "learning_rate": 0.0004058734939759036, + "loss": 0.0386, + "step": 2500 + }, + { + "epoch": 4.518072289156627, + "grad_norm": 0.0988745242357254, + "learning_rate": 0.00038704819277108433, + "loss": 0.0352, + "step": 3000 + }, + { + "epoch": 5.271084337349397, + "grad_norm": 0.11785969883203506, + "learning_rate": 0.00036822289156626507, + "loss": 0.0331, + "step": 3500 + }, + { + "epoch": 6.024096385542169, + "grad_norm": 0.09906379133462906, + "learning_rate": 0.0003493975903614458, + "loss": 0.0315, + "step": 4000 + }, + { + "epoch": 6.77710843373494, + "grad_norm": 0.1129639744758606, + "learning_rate": 0.0003305722891566265, + "loss": 0.0301, + "step": 4500 + }, + { + "epoch": 7.530120481927711, + "grad_norm": 0.07321502268314362, + "learning_rate": 0.00031174698795180723, + "loss": 0.0292, + "step": 5000 + }, + { + "epoch": 8.283132530120483, + "grad_norm": 0.05083702132105827, + "learning_rate": 0.0002929216867469879, + "loss": 0.028, + "step": 5500 + }, + { + "epoch": 9.036144578313253, + "grad_norm": 0.073179692029953, + "learning_rate": 0.0002740963855421687, + "loss": 0.0275, + "step": 6000 + }, + { + "epoch": 9.789156626506024, + "grad_norm": 0.060432616621255875, + "learning_rate": 0.0002552710843373494, + "loss": 0.0266, + "step": 6500 + }, + { + "epoch": 10.542168674698795, + "grad_norm": 0.05641400068998337, + "learning_rate": 0.00023644578313253013, + "loss": 0.0265, + "step": 7000 + }, + { + "epoch": 11.295180722891565, + "grad_norm": 0.055228352546691895, + "learning_rate": 0.00021762048192771087, + "loss": 0.0257, + "step": 7500 + }, + { + "epoch": 12.048192771084338, + "grad_norm": 0.055986884981393814, + "learning_rate": 0.00019879518072289158, + "loss": 0.0254, + "step": 8000 + }, + { + "epoch": 12.801204819277109, + "grad_norm": 0.06879087537527084, + "learning_rate": 0.0001799698795180723, + "loss": 0.025, + "step": 8500 + }, + { + "epoch": 13.55421686746988, + "grad_norm": 0.08162941783666611, + "learning_rate": 0.00016114457831325303, + "loss": 0.0248, + "step": 9000 + }, + { + "epoch": 14.30722891566265, + "grad_norm": 0.0502689927816391, + "learning_rate": 0.00014231927710843374, + "loss": 0.0242, + "step": 9500 + }, + { + "epoch": 15.060240963855422, + "grad_norm": 0.052483588457107544, + "learning_rate": 0.00012349397590361445, + "loss": 0.0245, + "step": 10000 + }, + { + "epoch": 15.813253012048193, + "grad_norm": 0.04214683175086975, + "learning_rate": 0.00010466867469879517, + "loss": 0.0238, + "step": 10500 + }, + { + "epoch": 16.566265060240966, + "grad_norm": 0.03767360374331474, + "learning_rate": 8.58433734939759e-05, + "loss": 0.0239, + "step": 11000 + }, + { + "epoch": 17.319277108433734, + "grad_norm": 0.04902500659227371, + "learning_rate": 6.701807228915662e-05, + "loss": 0.0234, + "step": 11500 + }, + { + "epoch": 18.072289156626507, + "grad_norm": 0.058824148029088974, + "learning_rate": 4.8192771084337354e-05, + "loss": 0.0232, + "step": 12000 + }, + { + "epoch": 18.825301204819276, + "grad_norm": 0.06361762434244156, + "learning_rate": 2.9367469879518075e-05, + "loss": 0.0231, + "step": 12500 + }, + { + "epoch": 19.57831325301205, + "grad_norm": 0.053078796714544296, + "learning_rate": 1.0542168674698795e-05, + "loss": 0.0232, + "step": 13000 + } + ], + "logging_steps": 500, + "max_steps": 13280, + "num_input_tokens_seen": 0, + "num_train_epochs": 20, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 7180220922593280.0, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-13280/training_args.bin b/checkpoint-13280/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..c4458cc722a24e1abde581b8667a40b1674f5aea --- /dev/null +++ b/checkpoint-13280/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ec2ab2506605f8b233e63780e2c7aa14dc16da409c9e0e5c14641534e5ff016 +size 5304 diff --git a/checkpoint-1992/config.json b/checkpoint-1992/config.json new file mode 100644 index 0000000000000000000000000000000000000000..875ba5d9d44b85d53592976d4c2b6425bc63516f --- /dev/null +++ b/checkpoint-1992/config.json @@ -0,0 +1,61 @@ +{ + "_name_or_path": "t5-small", + "architectures": [ + "T5ForConditionalGeneration" + ], + "classifier_dropout": 0.0, + "d_ff": 2048, + "d_kv": 64, + "d_model": 512, + "decoder_start_token_id": 0, + "dense_act_fn": "relu", + "dropout_rate": 0.1, + "eos_token_id": 1, + "feed_forward_proj": "relu", + "initializer_factor": 1.0, + "is_encoder_decoder": true, + "is_gated_act": false, + "layer_norm_epsilon": 1e-06, + "model_type": "t5", + "n_positions": 512, + "num_decoder_layers": 6, + "num_heads": 8, + "num_layers": 6, + "output_past": true, + "pad_token_id": 0, + "relative_attention_max_distance": 128, + "relative_attention_num_buckets": 32, + "task_specific_params": { + "summarization": { + "early_stopping": true, + "length_penalty": 2.0, + "max_length": 200, + "min_length": 30, + "no_repeat_ngram_size": 3, + "num_beams": 4, + "prefix": "summarize: " + }, + "translation_en_to_de": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to German: " + }, + "translation_en_to_fr": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to French: " + }, + "translation_en_to_ro": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to Romanian: " + } + }, + "torch_dtype": "float32", + "transformers_version": "4.49.0", + "use_cache": true, + "vocab_size": 32128 +} diff --git a/checkpoint-1992/generation_config.json b/checkpoint-1992/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..eba25c5db1745fe5324f4f7e8890c19853e21453 --- /dev/null +++ b/checkpoint-1992/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "decoder_start_token_id": 0, + "eos_token_id": 1, + "pad_token_id": 0, + "transformers_version": "4.49.0" +} diff --git a/checkpoint-1992/model.safetensors b/checkpoint-1992/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..319405e4e3719406d7e6a9684fd0549d5d73a0fb --- /dev/null +++ b/checkpoint-1992/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4654c9cf89a81b68bd35bdc6ee00aaf996858cbfd524ab9a2f899ba3c710dd9b +size 242041896 diff --git a/checkpoint-1992/optimizer.pt b/checkpoint-1992/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..ab0daa0f04a8bbb18ab7b3298f1dd4fa35799458 --- /dev/null +++ b/checkpoint-1992/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1926465b24f3299cef677e55de777f74ed47bceb1d2f671026fc7252dbe2335 +size 484163514 diff --git a/checkpoint-1992/rng_state.pth b/checkpoint-1992/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..781d63b0f677dcbb17abc3b3c80e53f6c5d686d0 --- /dev/null +++ b/checkpoint-1992/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28b3210c6546f5c024fe5468a4c5cddf81630f8b46f6f1c93edbffea14133b6b +size 14244 diff --git a/checkpoint-1992/scheduler.pt b/checkpoint-1992/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..490af9dd36e6e9f50ef3fac87482eea3974c47ca --- /dev/null +++ b/checkpoint-1992/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a73930a28f6f17579b171324c0dc859f039babe921c42761a59cde8ea4ef9213 +size 1064 diff --git a/checkpoint-1992/trainer_state.json b/checkpoint-1992/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c46ded255c5bde1c6e2d6c379bee1968e1c9ca21 --- /dev/null +++ b/checkpoint-1992/trainer_state.json @@ -0,0 +1,54 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 1992, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.7530120481927711, + "grad_norm": 0.2647170126438141, + "learning_rate": 0.0004811746987951807, + "loss": 0.3311, + "step": 500 + }, + { + "epoch": 1.5060240963855422, + "grad_norm": 0.22880347073078156, + "learning_rate": 0.00046234939759036143, + "loss": 0.0907, + "step": 1000 + }, + { + "epoch": 2.2590361445783134, + "grad_norm": 0.1677163541316986, + "learning_rate": 0.00044352409638554217, + "loss": 0.0568, + "step": 1500 + } + ], + "logging_steps": 500, + "max_steps": 13280, + "num_input_tokens_seen": 0, + "num_train_epochs": 20, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1077033138388992.0, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1992/training_args.bin b/checkpoint-1992/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..c4458cc722a24e1abde581b8667a40b1674f5aea --- /dev/null +++ b/checkpoint-1992/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ec2ab2506605f8b233e63780e2c7aa14dc16da409c9e0e5c14641534e5ff016 +size 5304 diff --git a/checkpoint-2656/config.json b/checkpoint-2656/config.json new file mode 100644 index 0000000000000000000000000000000000000000..875ba5d9d44b85d53592976d4c2b6425bc63516f --- /dev/null +++ b/checkpoint-2656/config.json @@ -0,0 +1,61 @@ +{ + "_name_or_path": "t5-small", + "architectures": [ + "T5ForConditionalGeneration" + ], + "classifier_dropout": 0.0, + "d_ff": 2048, + "d_kv": 64, + "d_model": 512, + "decoder_start_token_id": 0, + "dense_act_fn": "relu", + "dropout_rate": 0.1, + "eos_token_id": 1, + "feed_forward_proj": "relu", + "initializer_factor": 1.0, + "is_encoder_decoder": true, + "is_gated_act": false, + "layer_norm_epsilon": 1e-06, + "model_type": "t5", + "n_positions": 512, + "num_decoder_layers": 6, + "num_heads": 8, + "num_layers": 6, + "output_past": true, + "pad_token_id": 0, + "relative_attention_max_distance": 128, + "relative_attention_num_buckets": 32, + "task_specific_params": { + "summarization": { + "early_stopping": true, + "length_penalty": 2.0, + "max_length": 200, + "min_length": 30, + "no_repeat_ngram_size": 3, + "num_beams": 4, + "prefix": "summarize: " + }, + "translation_en_to_de": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to German: " + }, + "translation_en_to_fr": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to French: " + }, + "translation_en_to_ro": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to Romanian: " + } + }, + "torch_dtype": "float32", + "transformers_version": "4.49.0", + "use_cache": true, + "vocab_size": 32128 +} diff --git a/checkpoint-2656/generation_config.json b/checkpoint-2656/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..eba25c5db1745fe5324f4f7e8890c19853e21453 --- /dev/null +++ b/checkpoint-2656/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "decoder_start_token_id": 0, + "eos_token_id": 1, + "pad_token_id": 0, + "transformers_version": "4.49.0" +} diff --git a/checkpoint-2656/model.safetensors b/checkpoint-2656/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..084ba2c29275f6618c0a9d6cb78be0e678360702 --- /dev/null +++ b/checkpoint-2656/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c4ec6681d5e4655771228f516ef82ae9953b4bd8a03023f89c01ef653ebd3c65 +size 242041896 diff --git a/checkpoint-2656/optimizer.pt b/checkpoint-2656/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..57d3ff2dbc91efde266de56e886c329e67552c07 --- /dev/null +++ b/checkpoint-2656/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b307f8e49034866c5c015630f8fa654a331a9e803649ccf615b1b56d0516f90 +size 484163514 diff --git a/checkpoint-2656/rng_state.pth b/checkpoint-2656/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..2d077571a7a87e97984796dbc7a5adceaf7a94b2 --- /dev/null +++ b/checkpoint-2656/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae0f1d3440823ebea1f70cff469eda177d4a0550eadf34c3a2c8b334329912b3 +size 14244 diff --git a/checkpoint-2656/scheduler.pt b/checkpoint-2656/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..725bd0bd1e8e2bacca254cb8ec535d5d0f29e80d --- /dev/null +++ b/checkpoint-2656/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee111b2e58fd87c7cff54e4b9df7ecced7c7c69c404c5b2b50093b27271a3d80 +size 1064 diff --git a/checkpoint-2656/trainer_state.json b/checkpoint-2656/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..7e457a4b2c127dfdea5d21780a1a791250bf38fe --- /dev/null +++ b/checkpoint-2656/trainer_state.json @@ -0,0 +1,68 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 4.0, + "eval_steps": 500, + "global_step": 2656, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.7530120481927711, + "grad_norm": 0.2647170126438141, + "learning_rate": 0.0004811746987951807, + "loss": 0.3311, + "step": 500 + }, + { + "epoch": 1.5060240963855422, + "grad_norm": 0.22880347073078156, + "learning_rate": 0.00046234939759036143, + "loss": 0.0907, + "step": 1000 + }, + { + "epoch": 2.2590361445783134, + "grad_norm": 0.1677163541316986, + "learning_rate": 0.00044352409638554217, + "loss": 0.0568, + "step": 1500 + }, + { + "epoch": 3.0120481927710845, + "grad_norm": 0.12338300049304962, + "learning_rate": 0.0004246987951807229, + "loss": 0.0451, + "step": 2000 + }, + { + "epoch": 3.765060240963855, + "grad_norm": 0.08597979694604874, + "learning_rate": 0.0004058734939759036, + "loss": 0.0386, + "step": 2500 + } + ], + "logging_steps": 500, + "max_steps": 13280, + "num_input_tokens_seen": 0, + "num_train_epochs": 20, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1436044184518656.0, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-2656/training_args.bin b/checkpoint-2656/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..c4458cc722a24e1abde581b8667a40b1674f5aea --- /dev/null +++ b/checkpoint-2656/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ec2ab2506605f8b233e63780e2c7aa14dc16da409c9e0e5c14641534e5ff016 +size 5304 diff --git a/checkpoint-3320/config.json b/checkpoint-3320/config.json new file mode 100644 index 0000000000000000000000000000000000000000..875ba5d9d44b85d53592976d4c2b6425bc63516f --- /dev/null +++ b/checkpoint-3320/config.json @@ -0,0 +1,61 @@ +{ + "_name_or_path": "t5-small", + "architectures": [ + "T5ForConditionalGeneration" + ], + "classifier_dropout": 0.0, + "d_ff": 2048, + "d_kv": 64, + "d_model": 512, + "decoder_start_token_id": 0, + "dense_act_fn": "relu", + "dropout_rate": 0.1, + "eos_token_id": 1, + "feed_forward_proj": "relu", + "initializer_factor": 1.0, + "is_encoder_decoder": true, + "is_gated_act": false, + "layer_norm_epsilon": 1e-06, + "model_type": "t5", + "n_positions": 512, + "num_decoder_layers": 6, + "num_heads": 8, + "num_layers": 6, + "output_past": true, + "pad_token_id": 0, + "relative_attention_max_distance": 128, + "relative_attention_num_buckets": 32, + "task_specific_params": { + "summarization": { + "early_stopping": true, + "length_penalty": 2.0, + "max_length": 200, + "min_length": 30, + "no_repeat_ngram_size": 3, + "num_beams": 4, + "prefix": "summarize: " + }, + "translation_en_to_de": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to German: " + }, + "translation_en_to_fr": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to French: " + }, + "translation_en_to_ro": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to Romanian: " + } + }, + "torch_dtype": "float32", + "transformers_version": "4.49.0", + "use_cache": true, + "vocab_size": 32128 +} diff --git a/checkpoint-3320/generation_config.json b/checkpoint-3320/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..eba25c5db1745fe5324f4f7e8890c19853e21453 --- /dev/null +++ b/checkpoint-3320/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "decoder_start_token_id": 0, + "eos_token_id": 1, + "pad_token_id": 0, + "transformers_version": "4.49.0" +} diff --git a/checkpoint-3320/model.safetensors b/checkpoint-3320/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..945b6f3f4509cda778eef4d48e0b4dd0c4fd555f --- /dev/null +++ b/checkpoint-3320/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e7399036d6d5e0ff1e62004f221407f1fcbea3065f4fca268c8343ebd2d4a9a +size 242041896 diff --git a/checkpoint-3320/optimizer.pt b/checkpoint-3320/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..6af9b0da4e2c35e99f6d8f304b5169295a2d1fb9 --- /dev/null +++ b/checkpoint-3320/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:caa347079a7baa18cc423c737e9185fe57aaf20106ce7a5b3afd2f5b1c501d8f +size 484163514 diff --git a/checkpoint-3320/rng_state.pth b/checkpoint-3320/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..e6404db70c5ad1c3d0ae2bdabd947c93d0af3271 --- /dev/null +++ b/checkpoint-3320/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:529b97068a3d90cb9fb6c7d90e901db3252ceebae3b8338675debc65f0a7d72d +size 14244 diff --git a/checkpoint-3320/scheduler.pt b/checkpoint-3320/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..0e6a0c462212ecc793cde8a2c265bdff33b76613 --- /dev/null +++ b/checkpoint-3320/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b758b7d414e163d0c48d4e99ae5665ecea0166d393eeca607ecdd2a896818721 +size 1064 diff --git a/checkpoint-3320/trainer_state.json b/checkpoint-3320/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..ff7a6876cab5995ef2d8595440bf023ae0408c29 --- /dev/null +++ b/checkpoint-3320/trainer_state.json @@ -0,0 +1,75 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 5.0, + "eval_steps": 500, + "global_step": 3320, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.7530120481927711, + "grad_norm": 0.2647170126438141, + "learning_rate": 0.0004811746987951807, + "loss": 0.3311, + "step": 500 + }, + { + "epoch": 1.5060240963855422, + "grad_norm": 0.22880347073078156, + "learning_rate": 0.00046234939759036143, + "loss": 0.0907, + "step": 1000 + }, + { + "epoch": 2.2590361445783134, + "grad_norm": 0.1677163541316986, + "learning_rate": 0.00044352409638554217, + "loss": 0.0568, + "step": 1500 + }, + { + "epoch": 3.0120481927710845, + "grad_norm": 0.12338300049304962, + "learning_rate": 0.0004246987951807229, + "loss": 0.0451, + "step": 2000 + }, + { + "epoch": 3.765060240963855, + "grad_norm": 0.08597979694604874, + "learning_rate": 0.0004058734939759036, + "loss": 0.0386, + "step": 2500 + }, + { + "epoch": 4.518072289156627, + "grad_norm": 0.0988745242357254, + "learning_rate": 0.00038704819277108433, + "loss": 0.0352, + "step": 3000 + } + ], + "logging_steps": 500, + "max_steps": 13280, + "num_input_tokens_seen": 0, + "num_train_epochs": 20, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1795055230648320.0, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-3320/training_args.bin b/checkpoint-3320/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..c4458cc722a24e1abde581b8667a40b1674f5aea --- /dev/null +++ b/checkpoint-3320/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ec2ab2506605f8b233e63780e2c7aa14dc16da409c9e0e5c14641534e5ff016 +size 5304 diff --git a/checkpoint-3984/config.json b/checkpoint-3984/config.json new file mode 100644 index 0000000000000000000000000000000000000000..875ba5d9d44b85d53592976d4c2b6425bc63516f --- /dev/null +++ b/checkpoint-3984/config.json @@ -0,0 +1,61 @@ +{ + "_name_or_path": "t5-small", + "architectures": [ + "T5ForConditionalGeneration" + ], + "classifier_dropout": 0.0, + "d_ff": 2048, + "d_kv": 64, + "d_model": 512, + "decoder_start_token_id": 0, + "dense_act_fn": "relu", + "dropout_rate": 0.1, + "eos_token_id": 1, + "feed_forward_proj": "relu", + "initializer_factor": 1.0, + "is_encoder_decoder": true, + "is_gated_act": false, + "layer_norm_epsilon": 1e-06, + "model_type": "t5", + "n_positions": 512, + "num_decoder_layers": 6, + "num_heads": 8, + "num_layers": 6, + "output_past": true, + "pad_token_id": 0, + "relative_attention_max_distance": 128, + "relative_attention_num_buckets": 32, + "task_specific_params": { + "summarization": { + "early_stopping": true, + "length_penalty": 2.0, + "max_length": 200, + "min_length": 30, + "no_repeat_ngram_size": 3, + "num_beams": 4, + "prefix": "summarize: " + }, + "translation_en_to_de": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to German: " + }, + "translation_en_to_fr": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to French: " + }, + "translation_en_to_ro": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to Romanian: " + } + }, + "torch_dtype": "float32", + "transformers_version": "4.49.0", + "use_cache": true, + "vocab_size": 32128 +} diff --git a/checkpoint-3984/generation_config.json b/checkpoint-3984/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..eba25c5db1745fe5324f4f7e8890c19853e21453 --- /dev/null +++ b/checkpoint-3984/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "decoder_start_token_id": 0, + "eos_token_id": 1, + "pad_token_id": 0, + "transformers_version": "4.49.0" +} diff --git a/checkpoint-3984/model.safetensors b/checkpoint-3984/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..30f7d99c37e674931b45c7e7dff4c20984d5344a --- /dev/null +++ b/checkpoint-3984/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08db43789d7b94431ebff25043dfb90f12efaceba68e72907f16f9e4cbb9b9b3 +size 242041896 diff --git a/checkpoint-3984/optimizer.pt b/checkpoint-3984/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..a41b132e8c9887130752ceb3ed6599ff77b79c65 --- /dev/null +++ b/checkpoint-3984/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e12588ec100a97e446d7a448bac840d4bd2ce689b5391cae9fd7706c164d88c +size 484163514 diff --git a/checkpoint-3984/rng_state.pth b/checkpoint-3984/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..eb31094de8b73410ce44a3b8ea6902e33b005795 --- /dev/null +++ b/checkpoint-3984/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f8dc8a7a89a3faf3bb85cdca2961459049b8b93a23c61b6d875073636018d37 +size 14244 diff --git a/checkpoint-3984/scheduler.pt b/checkpoint-3984/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..7c9e610d77d7c8a192e66a871fd3cab8420fa63c --- /dev/null +++ b/checkpoint-3984/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55a38be13b64a4268dec8b12c25db249c63b5cb35f03790de5dba401337d59a7 +size 1064 diff --git a/checkpoint-3984/trainer_state.json b/checkpoint-3984/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..697ec9be4315a83dc20bb2110a83c719b8ee5114 --- /dev/null +++ b/checkpoint-3984/trainer_state.json @@ -0,0 +1,82 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 6.0, + "eval_steps": 500, + "global_step": 3984, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.7530120481927711, + "grad_norm": 0.2647170126438141, + "learning_rate": 0.0004811746987951807, + "loss": 0.3311, + "step": 500 + }, + { + "epoch": 1.5060240963855422, + "grad_norm": 0.22880347073078156, + "learning_rate": 0.00046234939759036143, + "loss": 0.0907, + "step": 1000 + }, + { + "epoch": 2.2590361445783134, + "grad_norm": 0.1677163541316986, + "learning_rate": 0.00044352409638554217, + "loss": 0.0568, + "step": 1500 + }, + { + "epoch": 3.0120481927710845, + "grad_norm": 0.12338300049304962, + "learning_rate": 0.0004246987951807229, + "loss": 0.0451, + "step": 2000 + }, + { + "epoch": 3.765060240963855, + "grad_norm": 0.08597979694604874, + "learning_rate": 0.0004058734939759036, + "loss": 0.0386, + "step": 2500 + }, + { + "epoch": 4.518072289156627, + "grad_norm": 0.0988745242357254, + "learning_rate": 0.00038704819277108433, + "loss": 0.0352, + "step": 3000 + }, + { + "epoch": 5.271084337349397, + "grad_norm": 0.11785969883203506, + "learning_rate": 0.00036822289156626507, + "loss": 0.0331, + "step": 3500 + } + ], + "logging_steps": 500, + "max_steps": 13280, + "num_input_tokens_seen": 0, + "num_train_epochs": 20, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2154066276777984.0, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-3984/training_args.bin b/checkpoint-3984/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..c4458cc722a24e1abde581b8667a40b1674f5aea --- /dev/null +++ b/checkpoint-3984/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ec2ab2506605f8b233e63780e2c7aa14dc16da409c9e0e5c14641534e5ff016 +size 5304 diff --git a/checkpoint-4648/config.json b/checkpoint-4648/config.json new file mode 100644 index 0000000000000000000000000000000000000000..875ba5d9d44b85d53592976d4c2b6425bc63516f --- /dev/null +++ b/checkpoint-4648/config.json @@ -0,0 +1,61 @@ +{ + "_name_or_path": "t5-small", + "architectures": [ + "T5ForConditionalGeneration" + ], + "classifier_dropout": 0.0, + "d_ff": 2048, + "d_kv": 64, + "d_model": 512, + "decoder_start_token_id": 0, + "dense_act_fn": "relu", + "dropout_rate": 0.1, + "eos_token_id": 1, + "feed_forward_proj": "relu", + "initializer_factor": 1.0, + "is_encoder_decoder": true, + "is_gated_act": false, + "layer_norm_epsilon": 1e-06, + "model_type": "t5", + "n_positions": 512, + "num_decoder_layers": 6, + "num_heads": 8, + "num_layers": 6, + "output_past": true, + "pad_token_id": 0, + "relative_attention_max_distance": 128, + "relative_attention_num_buckets": 32, + "task_specific_params": { + "summarization": { + "early_stopping": true, + "length_penalty": 2.0, + "max_length": 200, + "min_length": 30, + "no_repeat_ngram_size": 3, + "num_beams": 4, + "prefix": "summarize: " + }, + "translation_en_to_de": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to German: " + }, + "translation_en_to_fr": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to French: " + }, + "translation_en_to_ro": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to Romanian: " + } + }, + "torch_dtype": "float32", + "transformers_version": "4.49.0", + "use_cache": true, + "vocab_size": 32128 +} diff --git a/checkpoint-4648/generation_config.json b/checkpoint-4648/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..eba25c5db1745fe5324f4f7e8890c19853e21453 --- /dev/null +++ b/checkpoint-4648/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "decoder_start_token_id": 0, + "eos_token_id": 1, + "pad_token_id": 0, + "transformers_version": "4.49.0" +} diff --git a/checkpoint-4648/model.safetensors b/checkpoint-4648/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3636ed8559ceffa02f455161ac5c43b1a52a77ba --- /dev/null +++ b/checkpoint-4648/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:448ef945fa76cd4e82bb178701e8ae578edde09c47d80f194c7746380fa6a609 +size 242041896 diff --git a/checkpoint-4648/optimizer.pt b/checkpoint-4648/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..6c994b658ccf6b1e2cfacc574dcc3c524a2d54ac --- /dev/null +++ b/checkpoint-4648/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d1dec9842ed2d0bfbaa1b2b86332c91693557caa9658897c7b33fab6a538be4 +size 484163514 diff --git a/checkpoint-4648/rng_state.pth b/checkpoint-4648/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..65fb5db9edaf0180f96886d7effedef259c96945 --- /dev/null +++ b/checkpoint-4648/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c17df5c2080c1049dea480106929daa9a55517c204856b809e27f5fcc16cebaa +size 14244 diff --git a/checkpoint-4648/scheduler.pt b/checkpoint-4648/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..1058f05f541befc59fb016b0f0b06fa83aa6babe --- /dev/null +++ b/checkpoint-4648/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5152b17c98368152e474b87d9c64ba6d97c8428f85637733e237f8b69b5bd937 +size 1064 diff --git a/checkpoint-4648/trainer_state.json b/checkpoint-4648/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c50dcc1da7697c8072a8c86dbe73e4c21d42bda3 --- /dev/null +++ b/checkpoint-4648/trainer_state.json @@ -0,0 +1,96 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 7.0, + "eval_steps": 500, + "global_step": 4648, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.7530120481927711, + "grad_norm": 0.2647170126438141, + "learning_rate": 0.0004811746987951807, + "loss": 0.3311, + "step": 500 + }, + { + "epoch": 1.5060240963855422, + "grad_norm": 0.22880347073078156, + "learning_rate": 0.00046234939759036143, + "loss": 0.0907, + "step": 1000 + }, + { + "epoch": 2.2590361445783134, + "grad_norm": 0.1677163541316986, + "learning_rate": 0.00044352409638554217, + "loss": 0.0568, + "step": 1500 + }, + { + "epoch": 3.0120481927710845, + "grad_norm": 0.12338300049304962, + "learning_rate": 0.0004246987951807229, + "loss": 0.0451, + "step": 2000 + }, + { + "epoch": 3.765060240963855, + "grad_norm": 0.08597979694604874, + "learning_rate": 0.0004058734939759036, + "loss": 0.0386, + "step": 2500 + }, + { + "epoch": 4.518072289156627, + "grad_norm": 0.0988745242357254, + "learning_rate": 0.00038704819277108433, + "loss": 0.0352, + "step": 3000 + }, + { + "epoch": 5.271084337349397, + "grad_norm": 0.11785969883203506, + "learning_rate": 0.00036822289156626507, + "loss": 0.0331, + "step": 3500 + }, + { + "epoch": 6.024096385542169, + "grad_norm": 0.09906379133462906, + "learning_rate": 0.0003493975903614458, + "loss": 0.0315, + "step": 4000 + }, + { + "epoch": 6.77710843373494, + "grad_norm": 0.1129639744758606, + "learning_rate": 0.0003305722891566265, + "loss": 0.0301, + "step": 4500 + } + ], + "logging_steps": 500, + "max_steps": 13280, + "num_input_tokens_seen": 0, + "num_train_epochs": 20, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2513077322907648.0, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-4648/training_args.bin b/checkpoint-4648/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..c4458cc722a24e1abde581b8667a40b1674f5aea --- /dev/null +++ b/checkpoint-4648/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ec2ab2506605f8b233e63780e2c7aa14dc16da409c9e0e5c14641534e5ff016 +size 5304 diff --git a/checkpoint-5312/config.json b/checkpoint-5312/config.json new file mode 100644 index 0000000000000000000000000000000000000000..875ba5d9d44b85d53592976d4c2b6425bc63516f --- /dev/null +++ b/checkpoint-5312/config.json @@ -0,0 +1,61 @@ +{ + "_name_or_path": "t5-small", + "architectures": [ + "T5ForConditionalGeneration" + ], + "classifier_dropout": 0.0, + "d_ff": 2048, + "d_kv": 64, + "d_model": 512, + "decoder_start_token_id": 0, + "dense_act_fn": "relu", + "dropout_rate": 0.1, + "eos_token_id": 1, + "feed_forward_proj": "relu", + "initializer_factor": 1.0, + "is_encoder_decoder": true, + "is_gated_act": false, + "layer_norm_epsilon": 1e-06, + "model_type": "t5", + "n_positions": 512, + "num_decoder_layers": 6, + "num_heads": 8, + "num_layers": 6, + "output_past": true, + "pad_token_id": 0, + "relative_attention_max_distance": 128, + "relative_attention_num_buckets": 32, + "task_specific_params": { + "summarization": { + "early_stopping": true, + "length_penalty": 2.0, + "max_length": 200, + "min_length": 30, + "no_repeat_ngram_size": 3, + "num_beams": 4, + "prefix": "summarize: " + }, + "translation_en_to_de": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to German: " + }, + "translation_en_to_fr": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to French: " + }, + "translation_en_to_ro": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to Romanian: " + } + }, + "torch_dtype": "float32", + "transformers_version": "4.49.0", + "use_cache": true, + "vocab_size": 32128 +} diff --git a/checkpoint-5312/generation_config.json b/checkpoint-5312/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..eba25c5db1745fe5324f4f7e8890c19853e21453 --- /dev/null +++ b/checkpoint-5312/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "decoder_start_token_id": 0, + "eos_token_id": 1, + "pad_token_id": 0, + "transformers_version": "4.49.0" +} diff --git a/checkpoint-5312/model.safetensors b/checkpoint-5312/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..930e7d1942ca313062a9f932fa660fadbbd18578 --- /dev/null +++ b/checkpoint-5312/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97885c22d9eb94e533d5173e0636885717a8db079549ca29089a0c10818e46d5 +size 242041896 diff --git a/checkpoint-5312/optimizer.pt b/checkpoint-5312/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..73d0750946142b371cf06969e6dd4a8dc3b56b54 --- /dev/null +++ b/checkpoint-5312/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6634dd9a4e7c4067d256a6e4da06a62dd827d1a796294a721d79d66e157cc993 +size 484163514 diff --git a/checkpoint-5312/rng_state.pth b/checkpoint-5312/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..31ee14ad4e013587e6348ff4ed1d931d8779518f --- /dev/null +++ b/checkpoint-5312/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:77c239e7c3f387e4f5cbfdc069a340b4dc4816ee410605f4c761fdd1968cba1e +size 14244 diff --git a/checkpoint-5312/scheduler.pt b/checkpoint-5312/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..d723c517c6ddf090d224ad81c543d0b1d41b9d4d --- /dev/null +++ b/checkpoint-5312/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2df5eec69e36f13fe010830741b5864241e4d5169d6ce46963c1274f11e51521 +size 1064 diff --git a/checkpoint-5312/trainer_state.json b/checkpoint-5312/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..46b6a7daf487fb2dfb7d6b25ad9cc669dfefcecc --- /dev/null +++ b/checkpoint-5312/trainer_state.json @@ -0,0 +1,103 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 8.0, + "eval_steps": 500, + "global_step": 5312, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.7530120481927711, + "grad_norm": 0.2647170126438141, + "learning_rate": 0.0004811746987951807, + "loss": 0.3311, + "step": 500 + }, + { + "epoch": 1.5060240963855422, + "grad_norm": 0.22880347073078156, + "learning_rate": 0.00046234939759036143, + "loss": 0.0907, + "step": 1000 + }, + { + "epoch": 2.2590361445783134, + "grad_norm": 0.1677163541316986, + "learning_rate": 0.00044352409638554217, + "loss": 0.0568, + "step": 1500 + }, + { + "epoch": 3.0120481927710845, + "grad_norm": 0.12338300049304962, + "learning_rate": 0.0004246987951807229, + "loss": 0.0451, + "step": 2000 + }, + { + "epoch": 3.765060240963855, + "grad_norm": 0.08597979694604874, + "learning_rate": 0.0004058734939759036, + "loss": 0.0386, + "step": 2500 + }, + { + "epoch": 4.518072289156627, + "grad_norm": 0.0988745242357254, + "learning_rate": 0.00038704819277108433, + "loss": 0.0352, + "step": 3000 + }, + { + "epoch": 5.271084337349397, + "grad_norm": 0.11785969883203506, + "learning_rate": 0.00036822289156626507, + "loss": 0.0331, + "step": 3500 + }, + { + "epoch": 6.024096385542169, + "grad_norm": 0.09906379133462906, + "learning_rate": 0.0003493975903614458, + "loss": 0.0315, + "step": 4000 + }, + { + "epoch": 6.77710843373494, + "grad_norm": 0.1129639744758606, + "learning_rate": 0.0003305722891566265, + "loss": 0.0301, + "step": 4500 + }, + { + "epoch": 7.530120481927711, + "grad_norm": 0.07321502268314362, + "learning_rate": 0.00031174698795180723, + "loss": 0.0292, + "step": 5000 + } + ], + "logging_steps": 500, + "max_steps": 13280, + "num_input_tokens_seen": 0, + "num_train_epochs": 20, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2872088369037312.0, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-5312/training_args.bin b/checkpoint-5312/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..c4458cc722a24e1abde581b8667a40b1674f5aea --- /dev/null +++ b/checkpoint-5312/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ec2ab2506605f8b233e63780e2c7aa14dc16da409c9e0e5c14641534e5ff016 +size 5304 diff --git a/checkpoint-5976/config.json b/checkpoint-5976/config.json new file mode 100644 index 0000000000000000000000000000000000000000..875ba5d9d44b85d53592976d4c2b6425bc63516f --- /dev/null +++ b/checkpoint-5976/config.json @@ -0,0 +1,61 @@ +{ + "_name_or_path": "t5-small", + "architectures": [ + "T5ForConditionalGeneration" + ], + "classifier_dropout": 0.0, + "d_ff": 2048, + "d_kv": 64, + "d_model": 512, + "decoder_start_token_id": 0, + "dense_act_fn": "relu", + "dropout_rate": 0.1, + "eos_token_id": 1, + "feed_forward_proj": "relu", + "initializer_factor": 1.0, + "is_encoder_decoder": true, + "is_gated_act": false, + "layer_norm_epsilon": 1e-06, + "model_type": "t5", + "n_positions": 512, + "num_decoder_layers": 6, + "num_heads": 8, + "num_layers": 6, + "output_past": true, + "pad_token_id": 0, + "relative_attention_max_distance": 128, + "relative_attention_num_buckets": 32, + "task_specific_params": { + "summarization": { + "early_stopping": true, + "length_penalty": 2.0, + "max_length": 200, + "min_length": 30, + "no_repeat_ngram_size": 3, + "num_beams": 4, + "prefix": "summarize: " + }, + "translation_en_to_de": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to German: " + }, + "translation_en_to_fr": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to French: " + }, + "translation_en_to_ro": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to Romanian: " + } + }, + "torch_dtype": "float32", + "transformers_version": "4.49.0", + "use_cache": true, + "vocab_size": 32128 +} diff --git a/checkpoint-5976/generation_config.json b/checkpoint-5976/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..eba25c5db1745fe5324f4f7e8890c19853e21453 --- /dev/null +++ b/checkpoint-5976/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "decoder_start_token_id": 0, + "eos_token_id": 1, + "pad_token_id": 0, + "transformers_version": "4.49.0" +} diff --git a/checkpoint-5976/model.safetensors b/checkpoint-5976/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..628f01ba6e3b9876af051d0c08ea803a2ec6d3c0 --- /dev/null +++ b/checkpoint-5976/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:311cbd6d4cd82234eb76ff9c9ce6fd953e257c6ae5f7dcbb286977d0b335de54 +size 242041896 diff --git a/checkpoint-5976/optimizer.pt b/checkpoint-5976/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..8354df8c03d621c778ecff9e3f7f22a4fb54e344 --- /dev/null +++ b/checkpoint-5976/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:198916391e1273de56278390d5acf7c8d1a34a75617147c870c0d8316b1c0139 +size 484163514 diff --git a/checkpoint-5976/rng_state.pth b/checkpoint-5976/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..890b864a7d06688a6633f54f2661ab99eb956388 --- /dev/null +++ b/checkpoint-5976/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae9e66325b1927e9d04818200f5bc6cf72234ba0e00a7c2866209e02546bc09b +size 14244 diff --git a/checkpoint-5976/scheduler.pt b/checkpoint-5976/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..6b36338911aeeb839c0859868b91ed783ca7371c --- /dev/null +++ b/checkpoint-5976/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:917a41ff3e07e604221fa6f89f369743864ab7c91a1a30a583d4e4de1beb4787 +size 1064 diff --git a/checkpoint-5976/trainer_state.json b/checkpoint-5976/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..643d274c0f7d1dbe18967c86f37d85447e38004b --- /dev/null +++ b/checkpoint-5976/trainer_state.json @@ -0,0 +1,110 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 9.0, + "eval_steps": 500, + "global_step": 5976, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.7530120481927711, + "grad_norm": 0.2647170126438141, + "learning_rate": 0.0004811746987951807, + "loss": 0.3311, + "step": 500 + }, + { + "epoch": 1.5060240963855422, + "grad_norm": 0.22880347073078156, + "learning_rate": 0.00046234939759036143, + "loss": 0.0907, + "step": 1000 + }, + { + "epoch": 2.2590361445783134, + "grad_norm": 0.1677163541316986, + "learning_rate": 0.00044352409638554217, + "loss": 0.0568, + "step": 1500 + }, + { + "epoch": 3.0120481927710845, + "grad_norm": 0.12338300049304962, + "learning_rate": 0.0004246987951807229, + "loss": 0.0451, + "step": 2000 + }, + { + "epoch": 3.765060240963855, + "grad_norm": 0.08597979694604874, + "learning_rate": 0.0004058734939759036, + "loss": 0.0386, + "step": 2500 + }, + { + "epoch": 4.518072289156627, + "grad_norm": 0.0988745242357254, + "learning_rate": 0.00038704819277108433, + "loss": 0.0352, + "step": 3000 + }, + { + "epoch": 5.271084337349397, + "grad_norm": 0.11785969883203506, + "learning_rate": 0.00036822289156626507, + "loss": 0.0331, + "step": 3500 + }, + { + "epoch": 6.024096385542169, + "grad_norm": 0.09906379133462906, + "learning_rate": 0.0003493975903614458, + "loss": 0.0315, + "step": 4000 + }, + { + "epoch": 6.77710843373494, + "grad_norm": 0.1129639744758606, + "learning_rate": 0.0003305722891566265, + "loss": 0.0301, + "step": 4500 + }, + { + "epoch": 7.530120481927711, + "grad_norm": 0.07321502268314362, + "learning_rate": 0.00031174698795180723, + "loss": 0.0292, + "step": 5000 + }, + { + "epoch": 8.283132530120483, + "grad_norm": 0.05083702132105827, + "learning_rate": 0.0002929216867469879, + "loss": 0.028, + "step": 5500 + } + ], + "logging_steps": 500, + "max_steps": 13280, + "num_input_tokens_seen": 0, + "num_train_epochs": 20, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3231099415166976.0, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-5976/training_args.bin b/checkpoint-5976/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..c4458cc722a24e1abde581b8667a40b1674f5aea --- /dev/null +++ b/checkpoint-5976/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ec2ab2506605f8b233e63780e2c7aa14dc16da409c9e0e5c14641534e5ff016 +size 5304 diff --git a/checkpoint-664/config.json b/checkpoint-664/config.json new file mode 100644 index 0000000000000000000000000000000000000000..875ba5d9d44b85d53592976d4c2b6425bc63516f --- /dev/null +++ b/checkpoint-664/config.json @@ -0,0 +1,61 @@ +{ + "_name_or_path": "t5-small", + "architectures": [ + "T5ForConditionalGeneration" + ], + "classifier_dropout": 0.0, + "d_ff": 2048, + "d_kv": 64, + "d_model": 512, + "decoder_start_token_id": 0, + "dense_act_fn": "relu", + "dropout_rate": 0.1, + "eos_token_id": 1, + "feed_forward_proj": "relu", + "initializer_factor": 1.0, + "is_encoder_decoder": true, + "is_gated_act": false, + "layer_norm_epsilon": 1e-06, + "model_type": "t5", + "n_positions": 512, + "num_decoder_layers": 6, + "num_heads": 8, + "num_layers": 6, + "output_past": true, + "pad_token_id": 0, + "relative_attention_max_distance": 128, + "relative_attention_num_buckets": 32, + "task_specific_params": { + "summarization": { + "early_stopping": true, + "length_penalty": 2.0, + "max_length": 200, + "min_length": 30, + "no_repeat_ngram_size": 3, + "num_beams": 4, + "prefix": "summarize: " + }, + "translation_en_to_de": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to German: " + }, + "translation_en_to_fr": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to French: " + }, + "translation_en_to_ro": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to Romanian: " + } + }, + "torch_dtype": "float32", + "transformers_version": "4.49.0", + "use_cache": true, + "vocab_size": 32128 +} diff --git a/checkpoint-664/generation_config.json b/checkpoint-664/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..eba25c5db1745fe5324f4f7e8890c19853e21453 --- /dev/null +++ b/checkpoint-664/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "decoder_start_token_id": 0, + "eos_token_id": 1, + "pad_token_id": 0, + "transformers_version": "4.49.0" +} diff --git a/checkpoint-664/model.safetensors b/checkpoint-664/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..bb406f540db284f35bff6bc0300a3d1215ceb35f --- /dev/null +++ b/checkpoint-664/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5326698f51b49d4338344b480ff4e7e8c672f556c44750b3a50c5d1b162f97a2 +size 242041896 diff --git a/checkpoint-664/optimizer.pt b/checkpoint-664/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..c3e08cfc50d5ef01f0648420673a4eb7570230cb --- /dev/null +++ b/checkpoint-664/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab5d4848a96aed8a1fff784a146e81e46205935859ca1ecbb7603ddeb35b3e84 +size 484163514 diff --git a/checkpoint-664/rng_state.pth b/checkpoint-664/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..256830d8e6ffd79234fc150e5427c5a89677b4a4 --- /dev/null +++ b/checkpoint-664/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:759c31a50d8add4657e65be041371e769aefb366f904d90ba4259345c29f9990 +size 14244 diff --git a/checkpoint-664/scheduler.pt b/checkpoint-664/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..338b16e86bfda1388fb16c465ac25c64ea03038e --- /dev/null +++ b/checkpoint-664/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41cecb99d54993a8c486c753142cc0a78f52ecd733a3208a065fc8d71c0abf0b +size 1064 diff --git a/checkpoint-664/trainer_state.json b/checkpoint-664/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..44f5ce7cfd67089e8be2980fabdc08876489d16f --- /dev/null +++ b/checkpoint-664/trainer_state.json @@ -0,0 +1,40 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 664, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.7530120481927711, + "grad_norm": 0.2647170126438141, + "learning_rate": 0.0004811746987951807, + "loss": 0.3311, + "step": 500 + } + ], + "logging_steps": 500, + "max_steps": 13280, + "num_input_tokens_seen": 0, + "num_train_epochs": 20, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 359011046129664.0, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-664/training_args.bin b/checkpoint-664/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..c4458cc722a24e1abde581b8667a40b1674f5aea --- /dev/null +++ b/checkpoint-664/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ec2ab2506605f8b233e63780e2c7aa14dc16da409c9e0e5c14641534e5ff016 +size 5304 diff --git a/checkpoint-6640/config.json b/checkpoint-6640/config.json new file mode 100644 index 0000000000000000000000000000000000000000..875ba5d9d44b85d53592976d4c2b6425bc63516f --- /dev/null +++ b/checkpoint-6640/config.json @@ -0,0 +1,61 @@ +{ + "_name_or_path": "t5-small", + "architectures": [ + "T5ForConditionalGeneration" + ], + "classifier_dropout": 0.0, + "d_ff": 2048, + "d_kv": 64, + "d_model": 512, + "decoder_start_token_id": 0, + "dense_act_fn": "relu", + "dropout_rate": 0.1, + "eos_token_id": 1, + "feed_forward_proj": "relu", + "initializer_factor": 1.0, + "is_encoder_decoder": true, + "is_gated_act": false, + "layer_norm_epsilon": 1e-06, + "model_type": "t5", + "n_positions": 512, + "num_decoder_layers": 6, + "num_heads": 8, + "num_layers": 6, + "output_past": true, + "pad_token_id": 0, + "relative_attention_max_distance": 128, + "relative_attention_num_buckets": 32, + "task_specific_params": { + "summarization": { + "early_stopping": true, + "length_penalty": 2.0, + "max_length": 200, + "min_length": 30, + "no_repeat_ngram_size": 3, + "num_beams": 4, + "prefix": "summarize: " + }, + "translation_en_to_de": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to German: " + }, + "translation_en_to_fr": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to French: " + }, + "translation_en_to_ro": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to Romanian: " + } + }, + "torch_dtype": "float32", + "transformers_version": "4.49.0", + "use_cache": true, + "vocab_size": 32128 +} diff --git a/checkpoint-6640/generation_config.json b/checkpoint-6640/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..eba25c5db1745fe5324f4f7e8890c19853e21453 --- /dev/null +++ b/checkpoint-6640/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "decoder_start_token_id": 0, + "eos_token_id": 1, + "pad_token_id": 0, + "transformers_version": "4.49.0" +} diff --git a/checkpoint-6640/model.safetensors b/checkpoint-6640/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ae2b656c100f3ba628b0613e37b5bd26f44105f3 --- /dev/null +++ b/checkpoint-6640/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92259b98bc9b79af81d13747e953252c6a06be36482dfd40b06c76b6e040185b +size 242041896 diff --git a/checkpoint-6640/optimizer.pt b/checkpoint-6640/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..d568c779d71952512ecca8dcebd2ca4d12711303 --- /dev/null +++ b/checkpoint-6640/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0cabf1a2e7283a2b1d2b32a4d1456a0064eb8205847773c9ea6a061369c96000 +size 484163514 diff --git a/checkpoint-6640/rng_state.pth b/checkpoint-6640/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..6c5a252837d45edbb41ea64e1cf8ecaa5af625ad --- /dev/null +++ b/checkpoint-6640/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:120f67d248858b7906f6721742710d32974b84e1a9b4111d6feda98d03fd33ab +size 14244 diff --git a/checkpoint-6640/scheduler.pt b/checkpoint-6640/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..d679820ab769ca1efedbcf357d2f84b37b840c4f --- /dev/null +++ b/checkpoint-6640/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:555dff0a6f6c5d04e4a4657553dd850a6f3cad38c29a9822aa9df4a6bff3beea +size 1064 diff --git a/checkpoint-6640/trainer_state.json b/checkpoint-6640/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..6e42fe7e76d11f76f73a315b743a895e36ba5294 --- /dev/null +++ b/checkpoint-6640/trainer_state.json @@ -0,0 +1,124 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 10.0, + "eval_steps": 500, + "global_step": 6640, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.7530120481927711, + "grad_norm": 0.2647170126438141, + "learning_rate": 0.0004811746987951807, + "loss": 0.3311, + "step": 500 + }, + { + "epoch": 1.5060240963855422, + "grad_norm": 0.22880347073078156, + "learning_rate": 0.00046234939759036143, + "loss": 0.0907, + "step": 1000 + }, + { + "epoch": 2.2590361445783134, + "grad_norm": 0.1677163541316986, + "learning_rate": 0.00044352409638554217, + "loss": 0.0568, + "step": 1500 + }, + { + "epoch": 3.0120481927710845, + "grad_norm": 0.12338300049304962, + "learning_rate": 0.0004246987951807229, + "loss": 0.0451, + "step": 2000 + }, + { + "epoch": 3.765060240963855, + "grad_norm": 0.08597979694604874, + "learning_rate": 0.0004058734939759036, + "loss": 0.0386, + "step": 2500 + }, + { + "epoch": 4.518072289156627, + "grad_norm": 0.0988745242357254, + "learning_rate": 0.00038704819277108433, + "loss": 0.0352, + "step": 3000 + }, + { + "epoch": 5.271084337349397, + "grad_norm": 0.11785969883203506, + "learning_rate": 0.00036822289156626507, + "loss": 0.0331, + "step": 3500 + }, + { + "epoch": 6.024096385542169, + "grad_norm": 0.09906379133462906, + "learning_rate": 0.0003493975903614458, + "loss": 0.0315, + "step": 4000 + }, + { + "epoch": 6.77710843373494, + "grad_norm": 0.1129639744758606, + "learning_rate": 0.0003305722891566265, + "loss": 0.0301, + "step": 4500 + }, + { + "epoch": 7.530120481927711, + "grad_norm": 0.07321502268314362, + "learning_rate": 0.00031174698795180723, + "loss": 0.0292, + "step": 5000 + }, + { + "epoch": 8.283132530120483, + "grad_norm": 0.05083702132105827, + "learning_rate": 0.0002929216867469879, + "loss": 0.028, + "step": 5500 + }, + { + "epoch": 9.036144578313253, + "grad_norm": 0.073179692029953, + "learning_rate": 0.0002740963855421687, + "loss": 0.0275, + "step": 6000 + }, + { + "epoch": 9.789156626506024, + "grad_norm": 0.060432616621255875, + "learning_rate": 0.0002552710843373494, + "loss": 0.0266, + "step": 6500 + } + ], + "logging_steps": 500, + "max_steps": 13280, + "num_input_tokens_seen": 0, + "num_train_epochs": 20, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3590110461296640.0, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-6640/training_args.bin b/checkpoint-6640/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..c4458cc722a24e1abde581b8667a40b1674f5aea --- /dev/null +++ b/checkpoint-6640/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ec2ab2506605f8b233e63780e2c7aa14dc16da409c9e0e5c14641534e5ff016 +size 5304 diff --git a/checkpoint-7304/config.json b/checkpoint-7304/config.json new file mode 100644 index 0000000000000000000000000000000000000000..875ba5d9d44b85d53592976d4c2b6425bc63516f --- /dev/null +++ b/checkpoint-7304/config.json @@ -0,0 +1,61 @@ +{ + "_name_or_path": "t5-small", + "architectures": [ + "T5ForConditionalGeneration" + ], + "classifier_dropout": 0.0, + "d_ff": 2048, + "d_kv": 64, + "d_model": 512, + "decoder_start_token_id": 0, + "dense_act_fn": "relu", + "dropout_rate": 0.1, + "eos_token_id": 1, + "feed_forward_proj": "relu", + "initializer_factor": 1.0, + "is_encoder_decoder": true, + "is_gated_act": false, + "layer_norm_epsilon": 1e-06, + "model_type": "t5", + "n_positions": 512, + "num_decoder_layers": 6, + "num_heads": 8, + "num_layers": 6, + "output_past": true, + "pad_token_id": 0, + "relative_attention_max_distance": 128, + "relative_attention_num_buckets": 32, + "task_specific_params": { + "summarization": { + "early_stopping": true, + "length_penalty": 2.0, + "max_length": 200, + "min_length": 30, + "no_repeat_ngram_size": 3, + "num_beams": 4, + "prefix": "summarize: " + }, + "translation_en_to_de": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to German: " + }, + "translation_en_to_fr": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to French: " + }, + "translation_en_to_ro": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to Romanian: " + } + }, + "torch_dtype": "float32", + "transformers_version": "4.49.0", + "use_cache": true, + "vocab_size": 32128 +} diff --git a/checkpoint-7304/generation_config.json b/checkpoint-7304/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..eba25c5db1745fe5324f4f7e8890c19853e21453 --- /dev/null +++ b/checkpoint-7304/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "decoder_start_token_id": 0, + "eos_token_id": 1, + "pad_token_id": 0, + "transformers_version": "4.49.0" +} diff --git a/checkpoint-7304/model.safetensors b/checkpoint-7304/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..16354591c3a98344dea586aa72bf65ba6dfd3cac --- /dev/null +++ b/checkpoint-7304/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97c41cfd1192c3344cd4cec5cebc61b1d8e5be2de2ed9729d86cbdac3f4cd073 +size 242041896 diff --git a/checkpoint-7304/optimizer.pt b/checkpoint-7304/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..997add471125d148918d0df318ab7bf478264c4f --- /dev/null +++ b/checkpoint-7304/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a7d52b02948c75110045598b29ee91cdefd925c73102050776caa706dec1454 +size 484163514 diff --git a/checkpoint-7304/rng_state.pth b/checkpoint-7304/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..3e05fae5846a4b2cea726a3eb442245381058ead --- /dev/null +++ b/checkpoint-7304/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8fab093e333ad3fd097582a8ce805c2b1762364df5f23e2da0e43c6f6ff6d0ae +size 14244 diff --git a/checkpoint-7304/scheduler.pt b/checkpoint-7304/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..c721ec4b35892a10ebe66746a13d3eb25661ebfa --- /dev/null +++ b/checkpoint-7304/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf23df0d8cedd91a793d43623f7046ed4e399742ab23cf64b70e5f4e12b10297 +size 1064 diff --git a/checkpoint-7304/trainer_state.json b/checkpoint-7304/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..1d0231fa5792e59274c9425b183d55e59ae87dd3 --- /dev/null +++ b/checkpoint-7304/trainer_state.json @@ -0,0 +1,131 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 11.0, + "eval_steps": 500, + "global_step": 7304, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.7530120481927711, + "grad_norm": 0.2647170126438141, + "learning_rate": 0.0004811746987951807, + "loss": 0.3311, + "step": 500 + }, + { + "epoch": 1.5060240963855422, + "grad_norm": 0.22880347073078156, + "learning_rate": 0.00046234939759036143, + "loss": 0.0907, + "step": 1000 + }, + { + "epoch": 2.2590361445783134, + "grad_norm": 0.1677163541316986, + "learning_rate": 0.00044352409638554217, + "loss": 0.0568, + "step": 1500 + }, + { + "epoch": 3.0120481927710845, + "grad_norm": 0.12338300049304962, + "learning_rate": 0.0004246987951807229, + "loss": 0.0451, + "step": 2000 + }, + { + "epoch": 3.765060240963855, + "grad_norm": 0.08597979694604874, + "learning_rate": 0.0004058734939759036, + "loss": 0.0386, + "step": 2500 + }, + { + "epoch": 4.518072289156627, + "grad_norm": 0.0988745242357254, + "learning_rate": 0.00038704819277108433, + "loss": 0.0352, + "step": 3000 + }, + { + "epoch": 5.271084337349397, + "grad_norm": 0.11785969883203506, + "learning_rate": 0.00036822289156626507, + "loss": 0.0331, + "step": 3500 + }, + { + "epoch": 6.024096385542169, + "grad_norm": 0.09906379133462906, + "learning_rate": 0.0003493975903614458, + "loss": 0.0315, + "step": 4000 + }, + { + "epoch": 6.77710843373494, + "grad_norm": 0.1129639744758606, + "learning_rate": 0.0003305722891566265, + "loss": 0.0301, + "step": 4500 + }, + { + "epoch": 7.530120481927711, + "grad_norm": 0.07321502268314362, + "learning_rate": 0.00031174698795180723, + "loss": 0.0292, + "step": 5000 + }, + { + "epoch": 8.283132530120483, + "grad_norm": 0.05083702132105827, + "learning_rate": 0.0002929216867469879, + "loss": 0.028, + "step": 5500 + }, + { + "epoch": 9.036144578313253, + "grad_norm": 0.073179692029953, + "learning_rate": 0.0002740963855421687, + "loss": 0.0275, + "step": 6000 + }, + { + "epoch": 9.789156626506024, + "grad_norm": 0.060432616621255875, + "learning_rate": 0.0002552710843373494, + "loss": 0.0266, + "step": 6500 + }, + { + "epoch": 10.542168674698795, + "grad_norm": 0.05641400068998337, + "learning_rate": 0.00023644578313253013, + "loss": 0.0265, + "step": 7000 + } + ], + "logging_steps": 500, + "max_steps": 13280, + "num_input_tokens_seen": 0, + "num_train_epochs": 20, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3949121507426304.0, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-7304/training_args.bin b/checkpoint-7304/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..c4458cc722a24e1abde581b8667a40b1674f5aea --- /dev/null +++ b/checkpoint-7304/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ec2ab2506605f8b233e63780e2c7aa14dc16da409c9e0e5c14641534e5ff016 +size 5304 diff --git a/checkpoint-7968/config.json b/checkpoint-7968/config.json new file mode 100644 index 0000000000000000000000000000000000000000..875ba5d9d44b85d53592976d4c2b6425bc63516f --- /dev/null +++ b/checkpoint-7968/config.json @@ -0,0 +1,61 @@ +{ + "_name_or_path": "t5-small", + "architectures": [ + "T5ForConditionalGeneration" + ], + "classifier_dropout": 0.0, + "d_ff": 2048, + "d_kv": 64, + "d_model": 512, + "decoder_start_token_id": 0, + "dense_act_fn": "relu", + "dropout_rate": 0.1, + "eos_token_id": 1, + "feed_forward_proj": "relu", + "initializer_factor": 1.0, + "is_encoder_decoder": true, + "is_gated_act": false, + "layer_norm_epsilon": 1e-06, + "model_type": "t5", + "n_positions": 512, + "num_decoder_layers": 6, + "num_heads": 8, + "num_layers": 6, + "output_past": true, + "pad_token_id": 0, + "relative_attention_max_distance": 128, + "relative_attention_num_buckets": 32, + "task_specific_params": { + "summarization": { + "early_stopping": true, + "length_penalty": 2.0, + "max_length": 200, + "min_length": 30, + "no_repeat_ngram_size": 3, + "num_beams": 4, + "prefix": "summarize: " + }, + "translation_en_to_de": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to German: " + }, + "translation_en_to_fr": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to French: " + }, + "translation_en_to_ro": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to Romanian: " + } + }, + "torch_dtype": "float32", + "transformers_version": "4.49.0", + "use_cache": true, + "vocab_size": 32128 +} diff --git a/checkpoint-7968/generation_config.json b/checkpoint-7968/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..eba25c5db1745fe5324f4f7e8890c19853e21453 --- /dev/null +++ b/checkpoint-7968/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "decoder_start_token_id": 0, + "eos_token_id": 1, + "pad_token_id": 0, + "transformers_version": "4.49.0" +} diff --git a/checkpoint-7968/model.safetensors b/checkpoint-7968/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..382e80579ae0705d8d27ef8ed0eb792bb0d3a28d --- /dev/null +++ b/checkpoint-7968/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a0fbb2ed6307aa958e65b192a0cdafa1bc9b8dfa0b79efb83044c0eeaba67d8f +size 242041896 diff --git a/checkpoint-7968/optimizer.pt b/checkpoint-7968/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..7204a6272329349b6b7a57b6bee598b3ed5a24ac --- /dev/null +++ b/checkpoint-7968/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ce84f9c60232c41edd5348b819a63874ff3df25cc00ba0d5b8325264ab3f56d +size 484163514 diff --git a/checkpoint-7968/rng_state.pth b/checkpoint-7968/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..5e919fd89a9980d80dd5381e8a89ea1e2196736a --- /dev/null +++ b/checkpoint-7968/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a28fd3207e4b154c13ff75e506377848dc740f461fb5bb9bd86078faa06f31c +size 14244 diff --git a/checkpoint-7968/scheduler.pt b/checkpoint-7968/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..4267e4c9be4bfcc1f773f39df7989bdc04e9970c --- /dev/null +++ b/checkpoint-7968/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a63f49a11ca4824d69d15383245b8a9d6993c3692826ceaa22f1653db67e142 +size 1064 diff --git a/checkpoint-7968/trainer_state.json b/checkpoint-7968/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f826720213039a6bd884a0b0fb3e1a93c4674cfc --- /dev/null +++ b/checkpoint-7968/trainer_state.json @@ -0,0 +1,138 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 12.0, + "eval_steps": 500, + "global_step": 7968, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.7530120481927711, + "grad_norm": 0.2647170126438141, + "learning_rate": 0.0004811746987951807, + "loss": 0.3311, + "step": 500 + }, + { + "epoch": 1.5060240963855422, + "grad_norm": 0.22880347073078156, + "learning_rate": 0.00046234939759036143, + "loss": 0.0907, + "step": 1000 + }, + { + "epoch": 2.2590361445783134, + "grad_norm": 0.1677163541316986, + "learning_rate": 0.00044352409638554217, + "loss": 0.0568, + "step": 1500 + }, + { + "epoch": 3.0120481927710845, + "grad_norm": 0.12338300049304962, + "learning_rate": 0.0004246987951807229, + "loss": 0.0451, + "step": 2000 + }, + { + "epoch": 3.765060240963855, + "grad_norm": 0.08597979694604874, + "learning_rate": 0.0004058734939759036, + "loss": 0.0386, + "step": 2500 + }, + { + "epoch": 4.518072289156627, + "grad_norm": 0.0988745242357254, + "learning_rate": 0.00038704819277108433, + "loss": 0.0352, + "step": 3000 + }, + { + "epoch": 5.271084337349397, + "grad_norm": 0.11785969883203506, + "learning_rate": 0.00036822289156626507, + "loss": 0.0331, + "step": 3500 + }, + { + "epoch": 6.024096385542169, + "grad_norm": 0.09906379133462906, + "learning_rate": 0.0003493975903614458, + "loss": 0.0315, + "step": 4000 + }, + { + "epoch": 6.77710843373494, + "grad_norm": 0.1129639744758606, + "learning_rate": 0.0003305722891566265, + "loss": 0.0301, + "step": 4500 + }, + { + "epoch": 7.530120481927711, + "grad_norm": 0.07321502268314362, + "learning_rate": 0.00031174698795180723, + "loss": 0.0292, + "step": 5000 + }, + { + "epoch": 8.283132530120483, + "grad_norm": 0.05083702132105827, + "learning_rate": 0.0002929216867469879, + "loss": 0.028, + "step": 5500 + }, + { + "epoch": 9.036144578313253, + "grad_norm": 0.073179692029953, + "learning_rate": 0.0002740963855421687, + "loss": 0.0275, + "step": 6000 + }, + { + "epoch": 9.789156626506024, + "grad_norm": 0.060432616621255875, + "learning_rate": 0.0002552710843373494, + "loss": 0.0266, + "step": 6500 + }, + { + "epoch": 10.542168674698795, + "grad_norm": 0.05641400068998337, + "learning_rate": 0.00023644578313253013, + "loss": 0.0265, + "step": 7000 + }, + { + "epoch": 11.295180722891565, + "grad_norm": 0.055228352546691895, + "learning_rate": 0.00021762048192771087, + "loss": 0.0257, + "step": 7500 + } + ], + "logging_steps": 500, + "max_steps": 13280, + "num_input_tokens_seen": 0, + "num_train_epochs": 20, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4308132553555968.0, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-7968/training_args.bin b/checkpoint-7968/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..c4458cc722a24e1abde581b8667a40b1674f5aea --- /dev/null +++ b/checkpoint-7968/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ec2ab2506605f8b233e63780e2c7aa14dc16da409c9e0e5c14641534e5ff016 +size 5304 diff --git a/checkpoint-8632/config.json b/checkpoint-8632/config.json new file mode 100644 index 0000000000000000000000000000000000000000..875ba5d9d44b85d53592976d4c2b6425bc63516f --- /dev/null +++ b/checkpoint-8632/config.json @@ -0,0 +1,61 @@ +{ + "_name_or_path": "t5-small", + "architectures": [ + "T5ForConditionalGeneration" + ], + "classifier_dropout": 0.0, + "d_ff": 2048, + "d_kv": 64, + "d_model": 512, + "decoder_start_token_id": 0, + "dense_act_fn": "relu", + "dropout_rate": 0.1, + "eos_token_id": 1, + "feed_forward_proj": "relu", + "initializer_factor": 1.0, + "is_encoder_decoder": true, + "is_gated_act": false, + "layer_norm_epsilon": 1e-06, + "model_type": "t5", + "n_positions": 512, + "num_decoder_layers": 6, + "num_heads": 8, + "num_layers": 6, + "output_past": true, + "pad_token_id": 0, + "relative_attention_max_distance": 128, + "relative_attention_num_buckets": 32, + "task_specific_params": { + "summarization": { + "early_stopping": true, + "length_penalty": 2.0, + "max_length": 200, + "min_length": 30, + "no_repeat_ngram_size": 3, + "num_beams": 4, + "prefix": "summarize: " + }, + "translation_en_to_de": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to German: " + }, + "translation_en_to_fr": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to French: " + }, + "translation_en_to_ro": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to Romanian: " + } + }, + "torch_dtype": "float32", + "transformers_version": "4.49.0", + "use_cache": true, + "vocab_size": 32128 +} diff --git a/checkpoint-8632/generation_config.json b/checkpoint-8632/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..eba25c5db1745fe5324f4f7e8890c19853e21453 --- /dev/null +++ b/checkpoint-8632/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "decoder_start_token_id": 0, + "eos_token_id": 1, + "pad_token_id": 0, + "transformers_version": "4.49.0" +} diff --git a/checkpoint-8632/model.safetensors b/checkpoint-8632/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..32ca7bd19f98e480c93918c44926e0ca178d43e4 --- /dev/null +++ b/checkpoint-8632/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:423127f85334723d8f0b13943974887b1f9dc65a4255ec5a752226cb7eacfe97 +size 242041896 diff --git a/checkpoint-8632/optimizer.pt b/checkpoint-8632/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..aad22bdf0c2d49d7540f00d158e804ae0648570e --- /dev/null +++ b/checkpoint-8632/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ade0ef0cd6af2cd90a44219869104ecf9f0b1dd2ea3a38f475914239f1e6749 +size 484163514 diff --git a/checkpoint-8632/rng_state.pth b/checkpoint-8632/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..33ffb7b9f445798c1b0b4982bb572f18920dfa4a --- /dev/null +++ b/checkpoint-8632/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:facdb018b684b4c3efb1e1ab6d16ad42a9832276c58db1afbda251d4a51b2a08 +size 14244 diff --git a/checkpoint-8632/scheduler.pt b/checkpoint-8632/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..09f4279b8b4f8dc5df6fbfd9d557298ef467125f --- /dev/null +++ b/checkpoint-8632/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:84024b040e732904477d5a9d2a12639148bccd02a323b6b5cbd1dc78e2402c0f +size 1064 diff --git a/checkpoint-8632/trainer_state.json b/checkpoint-8632/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..a1867eace4bb9097ff4585b2f8fdd5fa6a34dd04 --- /dev/null +++ b/checkpoint-8632/trainer_state.json @@ -0,0 +1,152 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 13.0, + "eval_steps": 500, + "global_step": 8632, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.7530120481927711, + "grad_norm": 0.2647170126438141, + "learning_rate": 0.0004811746987951807, + "loss": 0.3311, + "step": 500 + }, + { + "epoch": 1.5060240963855422, + "grad_norm": 0.22880347073078156, + "learning_rate": 0.00046234939759036143, + "loss": 0.0907, + "step": 1000 + }, + { + "epoch": 2.2590361445783134, + "grad_norm": 0.1677163541316986, + "learning_rate": 0.00044352409638554217, + "loss": 0.0568, + "step": 1500 + }, + { + "epoch": 3.0120481927710845, + "grad_norm": 0.12338300049304962, + "learning_rate": 0.0004246987951807229, + "loss": 0.0451, + "step": 2000 + }, + { + "epoch": 3.765060240963855, + "grad_norm": 0.08597979694604874, + "learning_rate": 0.0004058734939759036, + "loss": 0.0386, + "step": 2500 + }, + { + "epoch": 4.518072289156627, + "grad_norm": 0.0988745242357254, + "learning_rate": 0.00038704819277108433, + "loss": 0.0352, + "step": 3000 + }, + { + "epoch": 5.271084337349397, + "grad_norm": 0.11785969883203506, + "learning_rate": 0.00036822289156626507, + "loss": 0.0331, + "step": 3500 + }, + { + "epoch": 6.024096385542169, + "grad_norm": 0.09906379133462906, + "learning_rate": 0.0003493975903614458, + "loss": 0.0315, + "step": 4000 + }, + { + "epoch": 6.77710843373494, + "grad_norm": 0.1129639744758606, + "learning_rate": 0.0003305722891566265, + "loss": 0.0301, + "step": 4500 + }, + { + "epoch": 7.530120481927711, + "grad_norm": 0.07321502268314362, + "learning_rate": 0.00031174698795180723, + "loss": 0.0292, + "step": 5000 + }, + { + "epoch": 8.283132530120483, + "grad_norm": 0.05083702132105827, + "learning_rate": 0.0002929216867469879, + "loss": 0.028, + "step": 5500 + }, + { + "epoch": 9.036144578313253, + "grad_norm": 0.073179692029953, + "learning_rate": 0.0002740963855421687, + "loss": 0.0275, + "step": 6000 + }, + { + "epoch": 9.789156626506024, + "grad_norm": 0.060432616621255875, + "learning_rate": 0.0002552710843373494, + "loss": 0.0266, + "step": 6500 + }, + { + "epoch": 10.542168674698795, + "grad_norm": 0.05641400068998337, + "learning_rate": 0.00023644578313253013, + "loss": 0.0265, + "step": 7000 + }, + { + "epoch": 11.295180722891565, + "grad_norm": 0.055228352546691895, + "learning_rate": 0.00021762048192771087, + "loss": 0.0257, + "step": 7500 + }, + { + "epoch": 12.048192771084338, + "grad_norm": 0.055986884981393814, + "learning_rate": 0.00019879518072289158, + "loss": 0.0254, + "step": 8000 + }, + { + "epoch": 12.801204819277109, + "grad_norm": 0.06879087537527084, + "learning_rate": 0.0001799698795180723, + "loss": 0.025, + "step": 8500 + } + ], + "logging_steps": 500, + "max_steps": 13280, + "num_input_tokens_seen": 0, + "num_train_epochs": 20, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4667143599685632.0, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-8632/training_args.bin b/checkpoint-8632/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..c4458cc722a24e1abde581b8667a40b1674f5aea --- /dev/null +++ b/checkpoint-8632/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ec2ab2506605f8b233e63780e2c7aa14dc16da409c9e0e5c14641534e5ff016 +size 5304 diff --git a/checkpoint-9296/config.json b/checkpoint-9296/config.json new file mode 100644 index 0000000000000000000000000000000000000000..875ba5d9d44b85d53592976d4c2b6425bc63516f --- /dev/null +++ b/checkpoint-9296/config.json @@ -0,0 +1,61 @@ +{ + "_name_or_path": "t5-small", + "architectures": [ + "T5ForConditionalGeneration" + ], + "classifier_dropout": 0.0, + "d_ff": 2048, + "d_kv": 64, + "d_model": 512, + "decoder_start_token_id": 0, + "dense_act_fn": "relu", + "dropout_rate": 0.1, + "eos_token_id": 1, + "feed_forward_proj": "relu", + "initializer_factor": 1.0, + "is_encoder_decoder": true, + "is_gated_act": false, + "layer_norm_epsilon": 1e-06, + "model_type": "t5", + "n_positions": 512, + "num_decoder_layers": 6, + "num_heads": 8, + "num_layers": 6, + "output_past": true, + "pad_token_id": 0, + "relative_attention_max_distance": 128, + "relative_attention_num_buckets": 32, + "task_specific_params": { + "summarization": { + "early_stopping": true, + "length_penalty": 2.0, + "max_length": 200, + "min_length": 30, + "no_repeat_ngram_size": 3, + "num_beams": 4, + "prefix": "summarize: " + }, + "translation_en_to_de": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to German: " + }, + "translation_en_to_fr": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to French: " + }, + "translation_en_to_ro": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to Romanian: " + } + }, + "torch_dtype": "float32", + "transformers_version": "4.49.0", + "use_cache": true, + "vocab_size": 32128 +} diff --git a/checkpoint-9296/generation_config.json b/checkpoint-9296/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..eba25c5db1745fe5324f4f7e8890c19853e21453 --- /dev/null +++ b/checkpoint-9296/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "decoder_start_token_id": 0, + "eos_token_id": 1, + "pad_token_id": 0, + "transformers_version": "4.49.0" +} diff --git a/checkpoint-9296/model.safetensors b/checkpoint-9296/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a02f21926b55b84e961949b7f5249ab4a4585e3d --- /dev/null +++ b/checkpoint-9296/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9c43c63b6a53b98c89568399b9dba931903a3fcdc0278c25bc10f15eff73987 +size 242041896 diff --git a/checkpoint-9296/optimizer.pt b/checkpoint-9296/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..c0a1e4a1968cb1d9f4343dbc58d99a42a928aae7 --- /dev/null +++ b/checkpoint-9296/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bfc0a8e0aea09ecc4bd36162faff192c14d00107415b38c493dde44285779d73 +size 484163514 diff --git a/checkpoint-9296/rng_state.pth b/checkpoint-9296/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1393ccf749963fdd54ac2d13b62ac588c93e1970 --- /dev/null +++ b/checkpoint-9296/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81dbd91029efc92c25690930a99c8652810dc0b232afd868efb417494a80eb80 +size 14244 diff --git a/checkpoint-9296/scheduler.pt b/checkpoint-9296/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..3c19c86df771f93c1639437f1f82f0d87be89b40 --- /dev/null +++ b/checkpoint-9296/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e39d54119fd051b5fc229728fd2918ca57336e2b5ba6d0c996febf241d7d4c9 +size 1064 diff --git a/checkpoint-9296/trainer_state.json b/checkpoint-9296/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..6bc33986c7defe164e4efe77467f61c54ba0fe58 --- /dev/null +++ b/checkpoint-9296/trainer_state.json @@ -0,0 +1,159 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 14.0, + "eval_steps": 500, + "global_step": 9296, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.7530120481927711, + "grad_norm": 0.2647170126438141, + "learning_rate": 0.0004811746987951807, + "loss": 0.3311, + "step": 500 + }, + { + "epoch": 1.5060240963855422, + "grad_norm": 0.22880347073078156, + "learning_rate": 0.00046234939759036143, + "loss": 0.0907, + "step": 1000 + }, + { + "epoch": 2.2590361445783134, + "grad_norm": 0.1677163541316986, + "learning_rate": 0.00044352409638554217, + "loss": 0.0568, + "step": 1500 + }, + { + "epoch": 3.0120481927710845, + "grad_norm": 0.12338300049304962, + "learning_rate": 0.0004246987951807229, + "loss": 0.0451, + "step": 2000 + }, + { + "epoch": 3.765060240963855, + "grad_norm": 0.08597979694604874, + "learning_rate": 0.0004058734939759036, + "loss": 0.0386, + "step": 2500 + }, + { + "epoch": 4.518072289156627, + "grad_norm": 0.0988745242357254, + "learning_rate": 0.00038704819277108433, + "loss": 0.0352, + "step": 3000 + }, + { + "epoch": 5.271084337349397, + "grad_norm": 0.11785969883203506, + "learning_rate": 0.00036822289156626507, + "loss": 0.0331, + "step": 3500 + }, + { + "epoch": 6.024096385542169, + "grad_norm": 0.09906379133462906, + "learning_rate": 0.0003493975903614458, + "loss": 0.0315, + "step": 4000 + }, + { + "epoch": 6.77710843373494, + "grad_norm": 0.1129639744758606, + "learning_rate": 0.0003305722891566265, + "loss": 0.0301, + "step": 4500 + }, + { + "epoch": 7.530120481927711, + "grad_norm": 0.07321502268314362, + "learning_rate": 0.00031174698795180723, + "loss": 0.0292, + "step": 5000 + }, + { + "epoch": 8.283132530120483, + "grad_norm": 0.05083702132105827, + "learning_rate": 0.0002929216867469879, + "loss": 0.028, + "step": 5500 + }, + { + "epoch": 9.036144578313253, + "grad_norm": 0.073179692029953, + "learning_rate": 0.0002740963855421687, + "loss": 0.0275, + "step": 6000 + }, + { + "epoch": 9.789156626506024, + "grad_norm": 0.060432616621255875, + "learning_rate": 0.0002552710843373494, + "loss": 0.0266, + "step": 6500 + }, + { + "epoch": 10.542168674698795, + "grad_norm": 0.05641400068998337, + "learning_rate": 0.00023644578313253013, + "loss": 0.0265, + "step": 7000 + }, + { + "epoch": 11.295180722891565, + "grad_norm": 0.055228352546691895, + "learning_rate": 0.00021762048192771087, + "loss": 0.0257, + "step": 7500 + }, + { + "epoch": 12.048192771084338, + "grad_norm": 0.055986884981393814, + "learning_rate": 0.00019879518072289158, + "loss": 0.0254, + "step": 8000 + }, + { + "epoch": 12.801204819277109, + "grad_norm": 0.06879087537527084, + "learning_rate": 0.0001799698795180723, + "loss": 0.025, + "step": 8500 + }, + { + "epoch": 13.55421686746988, + "grad_norm": 0.08162941783666611, + "learning_rate": 0.00016114457831325303, + "loss": 0.0248, + "step": 9000 + } + ], + "logging_steps": 500, + "max_steps": 13280, + "num_input_tokens_seen": 0, + "num_train_epochs": 20, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5026154645815296.0, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-9296/training_args.bin b/checkpoint-9296/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..c4458cc722a24e1abde581b8667a40b1674f5aea --- /dev/null +++ b/checkpoint-9296/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ec2ab2506605f8b233e63780e2c7aa14dc16da409c9e0e5c14641534e5ff016 +size 5304 diff --git a/checkpoint-9960/config.json b/checkpoint-9960/config.json new file mode 100644 index 0000000000000000000000000000000000000000..875ba5d9d44b85d53592976d4c2b6425bc63516f --- /dev/null +++ b/checkpoint-9960/config.json @@ -0,0 +1,61 @@ +{ + "_name_or_path": "t5-small", + "architectures": [ + "T5ForConditionalGeneration" + ], + "classifier_dropout": 0.0, + "d_ff": 2048, + "d_kv": 64, + "d_model": 512, + "decoder_start_token_id": 0, + "dense_act_fn": "relu", + "dropout_rate": 0.1, + "eos_token_id": 1, + "feed_forward_proj": "relu", + "initializer_factor": 1.0, + "is_encoder_decoder": true, + "is_gated_act": false, + "layer_norm_epsilon": 1e-06, + "model_type": "t5", + "n_positions": 512, + "num_decoder_layers": 6, + "num_heads": 8, + "num_layers": 6, + "output_past": true, + "pad_token_id": 0, + "relative_attention_max_distance": 128, + "relative_attention_num_buckets": 32, + "task_specific_params": { + "summarization": { + "early_stopping": true, + "length_penalty": 2.0, + "max_length": 200, + "min_length": 30, + "no_repeat_ngram_size": 3, + "num_beams": 4, + "prefix": "summarize: " + }, + "translation_en_to_de": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to German: " + }, + "translation_en_to_fr": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to French: " + }, + "translation_en_to_ro": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to Romanian: " + } + }, + "torch_dtype": "float32", + "transformers_version": "4.49.0", + "use_cache": true, + "vocab_size": 32128 +} diff --git a/checkpoint-9960/generation_config.json b/checkpoint-9960/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..eba25c5db1745fe5324f4f7e8890c19853e21453 --- /dev/null +++ b/checkpoint-9960/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "decoder_start_token_id": 0, + "eos_token_id": 1, + "pad_token_id": 0, + "transformers_version": "4.49.0" +} diff --git a/checkpoint-9960/model.safetensors b/checkpoint-9960/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..dc4fb7d49fd5e62f671c411ec70a26f6d2cbaf12 --- /dev/null +++ b/checkpoint-9960/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1751f7e5e385ad51d87be4a1e8a3700dfd2961f3cdb846dc3c1ac6debd623a4 +size 242041896 diff --git a/checkpoint-9960/optimizer.pt b/checkpoint-9960/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..db98c0c4f0ec992af96930f2bf49e56c759fac15 --- /dev/null +++ b/checkpoint-9960/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e2eae6ac59120332d8809ee801e8bc0bd26ae8920c3ec9db6d1aeca9d0822d4 +size 484163514 diff --git a/checkpoint-9960/rng_state.pth b/checkpoint-9960/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..0a9b938dfcb34f678a705022a28e7f46bc6fa991 --- /dev/null +++ b/checkpoint-9960/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d29906570537c4494d8c94edc995d63f294bce306ddf5fd5675ba94aaa973eb5 +size 14244 diff --git a/checkpoint-9960/scheduler.pt b/checkpoint-9960/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..5285cac7332751f2cd4d5099af6d72740ecdada0 --- /dev/null +++ b/checkpoint-9960/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb98c87248ee12946b4022814d53bfb8bb097b76257109323c69d88d57c830ec +size 1064 diff --git a/checkpoint-9960/trainer_state.json b/checkpoint-9960/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..dc7a97650289fd772cbb6b27fe86b6088a1b5349 --- /dev/null +++ b/checkpoint-9960/trainer_state.json @@ -0,0 +1,166 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 15.0, + "eval_steps": 500, + "global_step": 9960, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.7530120481927711, + "grad_norm": 0.2647170126438141, + "learning_rate": 0.0004811746987951807, + "loss": 0.3311, + "step": 500 + }, + { + "epoch": 1.5060240963855422, + "grad_norm": 0.22880347073078156, + "learning_rate": 0.00046234939759036143, + "loss": 0.0907, + "step": 1000 + }, + { + "epoch": 2.2590361445783134, + "grad_norm": 0.1677163541316986, + "learning_rate": 0.00044352409638554217, + "loss": 0.0568, + "step": 1500 + }, + { + "epoch": 3.0120481927710845, + "grad_norm": 0.12338300049304962, + "learning_rate": 0.0004246987951807229, + "loss": 0.0451, + "step": 2000 + }, + { + "epoch": 3.765060240963855, + "grad_norm": 0.08597979694604874, + "learning_rate": 0.0004058734939759036, + "loss": 0.0386, + "step": 2500 + }, + { + "epoch": 4.518072289156627, + "grad_norm": 0.0988745242357254, + "learning_rate": 0.00038704819277108433, + "loss": 0.0352, + "step": 3000 + }, + { + "epoch": 5.271084337349397, + "grad_norm": 0.11785969883203506, + "learning_rate": 0.00036822289156626507, + "loss": 0.0331, + "step": 3500 + }, + { + "epoch": 6.024096385542169, + "grad_norm": 0.09906379133462906, + "learning_rate": 0.0003493975903614458, + "loss": 0.0315, + "step": 4000 + }, + { + "epoch": 6.77710843373494, + "grad_norm": 0.1129639744758606, + "learning_rate": 0.0003305722891566265, + "loss": 0.0301, + "step": 4500 + }, + { + "epoch": 7.530120481927711, + "grad_norm": 0.07321502268314362, + "learning_rate": 0.00031174698795180723, + "loss": 0.0292, + "step": 5000 + }, + { + "epoch": 8.283132530120483, + "grad_norm": 0.05083702132105827, + "learning_rate": 0.0002929216867469879, + "loss": 0.028, + "step": 5500 + }, + { + "epoch": 9.036144578313253, + "grad_norm": 0.073179692029953, + "learning_rate": 0.0002740963855421687, + "loss": 0.0275, + "step": 6000 + }, + { + "epoch": 9.789156626506024, + "grad_norm": 0.060432616621255875, + "learning_rate": 0.0002552710843373494, + "loss": 0.0266, + "step": 6500 + }, + { + "epoch": 10.542168674698795, + "grad_norm": 0.05641400068998337, + "learning_rate": 0.00023644578313253013, + "loss": 0.0265, + "step": 7000 + }, + { + "epoch": 11.295180722891565, + "grad_norm": 0.055228352546691895, + "learning_rate": 0.00021762048192771087, + "loss": 0.0257, + "step": 7500 + }, + { + "epoch": 12.048192771084338, + "grad_norm": 0.055986884981393814, + "learning_rate": 0.00019879518072289158, + "loss": 0.0254, + "step": 8000 + }, + { + "epoch": 12.801204819277109, + "grad_norm": 0.06879087537527084, + "learning_rate": 0.0001799698795180723, + "loss": 0.025, + "step": 8500 + }, + { + "epoch": 13.55421686746988, + "grad_norm": 0.08162941783666611, + "learning_rate": 0.00016114457831325303, + "loss": 0.0248, + "step": 9000 + }, + { + "epoch": 14.30722891566265, + "grad_norm": 0.0502689927816391, + "learning_rate": 0.00014231927710843374, + "loss": 0.0242, + "step": 9500 + } + ], + "logging_steps": 500, + "max_steps": 13280, + "num_input_tokens_seen": 0, + "num_train_epochs": 20, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5385165691944960.0, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-9960/training_args.bin b/checkpoint-9960/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..c4458cc722a24e1abde581b8667a40b1674f5aea --- /dev/null +++ b/checkpoint-9960/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ec2ab2506605f8b233e63780e2c7aa14dc16da409c9e0e5c14641534e5ff016 +size 5304