diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..718d4a36bdc8345b73a7c514ccd94a24018cdd29 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,363 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +checkpoint-100/optimizer_0/.metadata filter=lfs diff=lfs merge=lfs -text +checkpoint-100/optimizer_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-100/optimizer_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-100/optimizer_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-100/optimizer_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-100/optimizer_0/__4_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-100/optimizer_0/__5_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-100/optimizer_0/__6_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-100/optimizer_0/__7_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-100/pytorch_model_fsdp_0/.metadata filter=lfs diff=lfs merge=lfs -text +checkpoint-100/pytorch_model_fsdp_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-100/pytorch_model_fsdp_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-100/pytorch_model_fsdp_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-100/pytorch_model_fsdp_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-100/pytorch_model_fsdp_0/__4_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-100/pytorch_model_fsdp_0/__5_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-100/pytorch_model_fsdp_0/__6_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-100/pytorch_model_fsdp_0/__7_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1000/optimizer_0/.metadata filter=lfs diff=lfs merge=lfs -text +checkpoint-1000/optimizer_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1000/optimizer_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1000/optimizer_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1000/optimizer_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1000/optimizer_0/__4_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1000/optimizer_0/__5_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1000/optimizer_0/__6_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1000/optimizer_0/__7_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1000/pytorch_model_fsdp_0/.metadata filter=lfs diff=lfs merge=lfs -text +checkpoint-1000/pytorch_model_fsdp_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1000/pytorch_model_fsdp_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1000/pytorch_model_fsdp_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1000/pytorch_model_fsdp_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1000/pytorch_model_fsdp_0/__4_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1000/pytorch_model_fsdp_0/__5_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1000/pytorch_model_fsdp_0/__6_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1000/pytorch_model_fsdp_0/__7_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1100/optimizer_0/.metadata filter=lfs diff=lfs merge=lfs -text +checkpoint-1100/optimizer_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1100/optimizer_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1100/optimizer_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1100/optimizer_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1100/optimizer_0/__4_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1100/optimizer_0/__5_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1100/optimizer_0/__6_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1100/optimizer_0/__7_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1100/pytorch_model_fsdp_0/.metadata filter=lfs diff=lfs merge=lfs -text +checkpoint-1100/pytorch_model_fsdp_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1100/pytorch_model_fsdp_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1100/pytorch_model_fsdp_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1100/pytorch_model_fsdp_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1100/pytorch_model_fsdp_0/__4_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1100/pytorch_model_fsdp_0/__5_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1100/pytorch_model_fsdp_0/__6_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1100/pytorch_model_fsdp_0/__7_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1200/optimizer_0/.metadata filter=lfs diff=lfs merge=lfs -text +checkpoint-1200/optimizer_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1200/optimizer_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1200/optimizer_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1200/optimizer_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1200/optimizer_0/__4_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1200/optimizer_0/__5_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1200/optimizer_0/__6_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1200/optimizer_0/__7_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1200/pytorch_model_fsdp_0/.metadata filter=lfs diff=lfs merge=lfs -text +checkpoint-1200/pytorch_model_fsdp_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1200/pytorch_model_fsdp_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1200/pytorch_model_fsdp_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1200/pytorch_model_fsdp_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1200/pytorch_model_fsdp_0/__4_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1200/pytorch_model_fsdp_0/__5_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1200/pytorch_model_fsdp_0/__6_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1200/pytorch_model_fsdp_0/__7_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1300/optimizer_0/.metadata filter=lfs diff=lfs merge=lfs -text +checkpoint-1300/optimizer_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1300/optimizer_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1300/optimizer_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1300/optimizer_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1300/optimizer_0/__4_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1300/optimizer_0/__5_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1300/optimizer_0/__6_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1300/optimizer_0/__7_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1300/pytorch_model_fsdp_0/.metadata filter=lfs diff=lfs merge=lfs -text +checkpoint-1300/pytorch_model_fsdp_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1300/pytorch_model_fsdp_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1300/pytorch_model_fsdp_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1300/pytorch_model_fsdp_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1300/pytorch_model_fsdp_0/__4_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1300/pytorch_model_fsdp_0/__5_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1300/pytorch_model_fsdp_0/__6_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1300/pytorch_model_fsdp_0/__7_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1400/optimizer_0/.metadata filter=lfs diff=lfs merge=lfs -text +checkpoint-1400/optimizer_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1400/optimizer_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1400/optimizer_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1400/optimizer_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1400/optimizer_0/__4_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1400/optimizer_0/__5_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1400/optimizer_0/__6_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1400/optimizer_0/__7_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1400/pytorch_model_fsdp_0/.metadata filter=lfs diff=lfs merge=lfs -text +checkpoint-1400/pytorch_model_fsdp_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1400/pytorch_model_fsdp_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1400/pytorch_model_fsdp_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1400/pytorch_model_fsdp_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1400/pytorch_model_fsdp_0/__4_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1400/pytorch_model_fsdp_0/__5_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1400/pytorch_model_fsdp_0/__6_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1400/pytorch_model_fsdp_0/__7_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1500/optimizer_0/.metadata filter=lfs diff=lfs merge=lfs -text +checkpoint-1500/optimizer_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1500/optimizer_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1500/optimizer_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1500/optimizer_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1500/optimizer_0/__4_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1500/optimizer_0/__5_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1500/optimizer_0/__6_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1500/optimizer_0/__7_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1500/pytorch_model_fsdp_0/.metadata filter=lfs diff=lfs merge=lfs -text +checkpoint-1500/pytorch_model_fsdp_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1500/pytorch_model_fsdp_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1500/pytorch_model_fsdp_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1500/pytorch_model_fsdp_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1500/pytorch_model_fsdp_0/__4_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1500/pytorch_model_fsdp_0/__5_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1500/pytorch_model_fsdp_0/__6_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1500/pytorch_model_fsdp_0/__7_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1600/optimizer_0/.metadata filter=lfs diff=lfs merge=lfs -text +checkpoint-1600/optimizer_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1600/optimizer_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1600/optimizer_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1600/optimizer_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1600/optimizer_0/__4_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1600/optimizer_0/__5_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1600/optimizer_0/__6_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1600/optimizer_0/__7_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1600/pytorch_model_fsdp_0/.metadata filter=lfs diff=lfs merge=lfs -text +checkpoint-1600/pytorch_model_fsdp_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1600/pytorch_model_fsdp_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1600/pytorch_model_fsdp_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1600/pytorch_model_fsdp_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1600/pytorch_model_fsdp_0/__4_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1600/pytorch_model_fsdp_0/__5_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1600/pytorch_model_fsdp_0/__6_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1600/pytorch_model_fsdp_0/__7_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1700/optimizer_0/.metadata filter=lfs diff=lfs merge=lfs -text +checkpoint-1700/optimizer_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1700/optimizer_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1700/optimizer_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1700/optimizer_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1700/optimizer_0/__4_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1700/optimizer_0/__5_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1700/optimizer_0/__6_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1700/optimizer_0/__7_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1700/pytorch_model_fsdp_0/.metadata filter=lfs diff=lfs merge=lfs -text +checkpoint-1700/pytorch_model_fsdp_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1700/pytorch_model_fsdp_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1700/pytorch_model_fsdp_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1700/pytorch_model_fsdp_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1700/pytorch_model_fsdp_0/__4_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1700/pytorch_model_fsdp_0/__5_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1700/pytorch_model_fsdp_0/__6_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1700/pytorch_model_fsdp_0/__7_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1800/optimizer_0/.metadata filter=lfs diff=lfs merge=lfs -text +checkpoint-1800/optimizer_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1800/optimizer_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1800/optimizer_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1800/optimizer_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1800/optimizer_0/__4_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1800/optimizer_0/__5_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1800/optimizer_0/__6_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1800/optimizer_0/__7_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1800/pytorch_model_fsdp_0/.metadata filter=lfs diff=lfs merge=lfs -text +checkpoint-1800/pytorch_model_fsdp_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1800/pytorch_model_fsdp_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1800/pytorch_model_fsdp_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1800/pytorch_model_fsdp_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1800/pytorch_model_fsdp_0/__4_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1800/pytorch_model_fsdp_0/__5_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1800/pytorch_model_fsdp_0/__6_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1800/pytorch_model_fsdp_0/__7_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1900/optimizer_0/.metadata filter=lfs diff=lfs merge=lfs -text +checkpoint-1900/optimizer_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1900/optimizer_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1900/optimizer_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1900/optimizer_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1900/optimizer_0/__4_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1900/optimizer_0/__5_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1900/optimizer_0/__6_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1900/optimizer_0/__7_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1900/pytorch_model_fsdp_0/.metadata filter=lfs diff=lfs merge=lfs -text +checkpoint-1900/pytorch_model_fsdp_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1900/pytorch_model_fsdp_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1900/pytorch_model_fsdp_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1900/pytorch_model_fsdp_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1900/pytorch_model_fsdp_0/__4_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1900/pytorch_model_fsdp_0/__5_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1900/pytorch_model_fsdp_0/__6_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1900/pytorch_model_fsdp_0/__7_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-200/optimizer_0/.metadata filter=lfs diff=lfs merge=lfs -text +checkpoint-200/optimizer_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-200/optimizer_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-200/optimizer_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-200/optimizer_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-200/optimizer_0/__4_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-200/optimizer_0/__5_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-200/optimizer_0/__6_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-200/optimizer_0/__7_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-200/pytorch_model_fsdp_0/.metadata filter=lfs diff=lfs merge=lfs -text +checkpoint-200/pytorch_model_fsdp_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-200/pytorch_model_fsdp_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-200/pytorch_model_fsdp_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-200/pytorch_model_fsdp_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-200/pytorch_model_fsdp_0/__4_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-200/pytorch_model_fsdp_0/__5_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-200/pytorch_model_fsdp_0/__6_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-200/pytorch_model_fsdp_0/__7_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-2000/optimizer_0/.metadata filter=lfs diff=lfs merge=lfs -text +checkpoint-2000/optimizer_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-2000/optimizer_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-2000/optimizer_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-2000/optimizer_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-2000/optimizer_0/__4_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-2000/optimizer_0/__5_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-2000/optimizer_0/__6_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-2000/optimizer_0/__7_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-2000/pytorch_model_fsdp_0/.metadata filter=lfs diff=lfs merge=lfs -text +checkpoint-2000/pytorch_model_fsdp_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-2000/pytorch_model_fsdp_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-2000/pytorch_model_fsdp_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-2000/pytorch_model_fsdp_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-2000/pytorch_model_fsdp_0/__4_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-2000/pytorch_model_fsdp_0/__5_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-2000/pytorch_model_fsdp_0/__6_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-2000/pytorch_model_fsdp_0/__7_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-300/optimizer_0/.metadata filter=lfs diff=lfs merge=lfs -text +checkpoint-300/optimizer_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-300/optimizer_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-300/optimizer_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-300/optimizer_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-300/optimizer_0/__4_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-300/optimizer_0/__5_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-300/optimizer_0/__6_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-300/optimizer_0/__7_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-300/pytorch_model_fsdp_0/.metadata filter=lfs diff=lfs merge=lfs -text +checkpoint-300/pytorch_model_fsdp_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-300/pytorch_model_fsdp_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-300/pytorch_model_fsdp_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-300/pytorch_model_fsdp_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-300/pytorch_model_fsdp_0/__4_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-300/pytorch_model_fsdp_0/__5_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-300/pytorch_model_fsdp_0/__6_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-300/pytorch_model_fsdp_0/__7_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-400/optimizer_0/.metadata filter=lfs diff=lfs merge=lfs -text +checkpoint-400/optimizer_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-400/optimizer_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-400/optimizer_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-400/optimizer_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-400/optimizer_0/__4_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-400/optimizer_0/__5_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-400/optimizer_0/__6_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-400/optimizer_0/__7_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-400/pytorch_model_fsdp_0/.metadata filter=lfs diff=lfs merge=lfs -text +checkpoint-400/pytorch_model_fsdp_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-400/pytorch_model_fsdp_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-400/pytorch_model_fsdp_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-400/pytorch_model_fsdp_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-400/pytorch_model_fsdp_0/__4_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-400/pytorch_model_fsdp_0/__5_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-400/pytorch_model_fsdp_0/__6_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-400/pytorch_model_fsdp_0/__7_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-500/optimizer_0/.metadata filter=lfs diff=lfs merge=lfs -text +checkpoint-500/optimizer_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-500/optimizer_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-500/optimizer_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-500/optimizer_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-500/optimizer_0/__4_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-500/optimizer_0/__5_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-500/optimizer_0/__6_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-500/optimizer_0/__7_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-500/pytorch_model_fsdp_0/.metadata filter=lfs diff=lfs merge=lfs -text +checkpoint-500/pytorch_model_fsdp_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-500/pytorch_model_fsdp_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-500/pytorch_model_fsdp_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-500/pytorch_model_fsdp_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-500/pytorch_model_fsdp_0/__4_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-500/pytorch_model_fsdp_0/__5_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-500/pytorch_model_fsdp_0/__6_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-500/pytorch_model_fsdp_0/__7_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-600/optimizer_0/.metadata filter=lfs diff=lfs merge=lfs -text +checkpoint-600/optimizer_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-600/optimizer_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-600/optimizer_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-600/optimizer_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-600/optimizer_0/__4_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-600/optimizer_0/__5_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-600/optimizer_0/__6_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-600/optimizer_0/__7_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-600/pytorch_model_fsdp_0/.metadata filter=lfs diff=lfs merge=lfs -text +checkpoint-600/pytorch_model_fsdp_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-600/pytorch_model_fsdp_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-600/pytorch_model_fsdp_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-600/pytorch_model_fsdp_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-600/pytorch_model_fsdp_0/__4_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-600/pytorch_model_fsdp_0/__5_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-600/pytorch_model_fsdp_0/__6_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-600/pytorch_model_fsdp_0/__7_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-700/optimizer_0/.metadata filter=lfs diff=lfs merge=lfs -text +checkpoint-700/optimizer_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-700/optimizer_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-700/optimizer_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-700/optimizer_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-700/optimizer_0/__4_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-700/optimizer_0/__5_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-700/optimizer_0/__6_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-700/optimizer_0/__7_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-700/pytorch_model_fsdp_0/.metadata filter=lfs diff=lfs merge=lfs -text +checkpoint-700/pytorch_model_fsdp_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-700/pytorch_model_fsdp_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-700/pytorch_model_fsdp_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-700/pytorch_model_fsdp_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-700/pytorch_model_fsdp_0/__4_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-700/pytorch_model_fsdp_0/__5_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-700/pytorch_model_fsdp_0/__6_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-700/pytorch_model_fsdp_0/__7_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-800/optimizer_0/.metadata filter=lfs diff=lfs merge=lfs -text +checkpoint-800/optimizer_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-800/optimizer_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-800/optimizer_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-800/optimizer_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-800/optimizer_0/__4_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-800/optimizer_0/__5_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-800/optimizer_0/__6_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-800/optimizer_0/__7_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-800/pytorch_model_fsdp_0/.metadata filter=lfs diff=lfs merge=lfs -text +checkpoint-800/pytorch_model_fsdp_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-800/pytorch_model_fsdp_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-800/pytorch_model_fsdp_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-800/pytorch_model_fsdp_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-800/pytorch_model_fsdp_0/__4_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-800/pytorch_model_fsdp_0/__5_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-800/pytorch_model_fsdp_0/__6_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-800/pytorch_model_fsdp_0/__7_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-900/optimizer_0/.metadata filter=lfs diff=lfs merge=lfs -text +checkpoint-900/optimizer_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-900/optimizer_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-900/optimizer_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-900/optimizer_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-900/optimizer_0/__4_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-900/optimizer_0/__5_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-900/optimizer_0/__6_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-900/optimizer_0/__7_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-900/pytorch_model_fsdp_0/.metadata filter=lfs diff=lfs merge=lfs -text +checkpoint-900/pytorch_model_fsdp_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-900/pytorch_model_fsdp_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-900/pytorch_model_fsdp_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-900/pytorch_model_fsdp_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-900/pytorch_model_fsdp_0/__4_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-900/pytorch_model_fsdp_0/__5_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-900/pytorch_model_fsdp_0/__6_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-900/pytorch_model_fsdp_0/__7_0.distcp filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d04d3349cff0fb405ee978366b20bb721da5e75e --- /dev/null +++ b/README.md @@ -0,0 +1,202 @@ +--- +base_model: bigcode/starcoderbase-1b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.2 \ No newline at end of file diff --git a/adapter_config.json b/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..8830797ef244d7e4bf4b41da0c65a7c9ba9e8db7 --- /dev/null +++ b/adapter_config.json @@ -0,0 +1,36 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "bigcode/starcoderbase-1b", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_attn", + "c_proj", + "c_fc", + "c_attn" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/adapter_model.safetensors b/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a71e2190d61f004f6e37d93f9c7256d97df20a9b --- /dev/null +++ b/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:619eeb3ce00fa2451a888fc83ce28ba50cbbb9c0ebe24b76bb88b36565ea632c +size 22241240 diff --git a/checkpoint-100/optimizer_0/.metadata b/checkpoint-100/optimizer_0/.metadata new file mode 100644 index 0000000000000000000000000000000000000000..d55cbfdacba1ab5dfea6e381c4200ff832367eef --- /dev/null +++ b/checkpoint-100/optimizer_0/.metadata @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:64dae846ba7fb20e07cd1d076660c586147ab0156c74645966a8b65ae2664a1b +size 869361 diff --git a/checkpoint-100/optimizer_0/__0_0.distcp b/checkpoint-100/optimizer_0/__0_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..2195e8a9b1e5ba9104369164d00f1894cdce064d --- /dev/null +++ b/checkpoint-100/optimizer_0/__0_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58c5105cc1702bf3c00aeba22dd04785e56b18c1e4c226b520ec2c885c9a89e0 +size 6008476 diff --git a/checkpoint-100/optimizer_0/__1_0.distcp b/checkpoint-100/optimizer_0/__1_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..c552b9ed324715b9b4c8f8f5a97d69807ede99e1 --- /dev/null +++ b/checkpoint-100/optimizer_0/__1_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ad7704459dd20c3600510d2a3f3e2262b9cf201f9b39c8bbb56594e8323105a +size 6041200 diff --git a/checkpoint-100/optimizer_0/__2_0.distcp b/checkpoint-100/optimizer_0/__2_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..f84e8cb641109b047f1965f0a89745e21a57dc22 --- /dev/null +++ b/checkpoint-100/optimizer_0/__2_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf91022a5990365e63959c3c8d31090ec6de97e0e40f785028cd2edf5e8ebb11 +size 6041200 diff --git a/checkpoint-100/optimizer_0/__3_0.distcp b/checkpoint-100/optimizer_0/__3_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..05f96ef4213bf48af660edd16f3bf3a0eaaac123 --- /dev/null +++ b/checkpoint-100/optimizer_0/__3_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e379152275e4774566323e2ec970af7cfa502bdfd8a90da768f8367aa78e3563 +size 6043476 diff --git a/checkpoint-100/optimizer_0/__4_0.distcp b/checkpoint-100/optimizer_0/__4_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..5c6b4424b8bc5bf80e478dd2689dd00cda129fbb --- /dev/null +++ b/checkpoint-100/optimizer_0/__4_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79f37e30aec02a02c27e7289c98513c01bc510ffc57bf74982f835c5b1c0f683 +size 6057364 diff --git a/checkpoint-100/optimizer_0/__5_0.distcp b/checkpoint-100/optimizer_0/__5_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..a2795f206e7afb796fbae8b0738c1ff785a317f6 --- /dev/null +++ b/checkpoint-100/optimizer_0/__5_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:11729525c054a51a5107225a7d60b0313aae518730d08b2749238c346a1e906c +size 6042612 diff --git a/checkpoint-100/optimizer_0/__6_0.distcp b/checkpoint-100/optimizer_0/__6_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..b8bf7ba391eb4251d7174a2cb0b935148321497b --- /dev/null +++ b/checkpoint-100/optimizer_0/__6_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a9842cd8b1a5ec425a6a9e6050a067d2c5b08db5f487a4f91e6f3c238af6431 +size 6042612 diff --git a/checkpoint-100/optimizer_0/__7_0.distcp b/checkpoint-100/optimizer_0/__7_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..421b97a9d805befd7c6baa10aee16b7eeb6b0c22 --- /dev/null +++ b/checkpoint-100/optimizer_0/__7_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f0aba0269721eae1aec05e75b68b171e11b144f469cd6adfb1e19a2824b3791 +size 6042612 diff --git a/checkpoint-100/pytorch_model_fsdp_0/.metadata b/checkpoint-100/pytorch_model_fsdp_0/.metadata new file mode 100644 index 0000000000000000000000000000000000000000..158fb984e724d9d280ef4e4aca8188577e5e45f0 --- /dev/null +++ b/checkpoint-100/pytorch_model_fsdp_0/.metadata @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b95e896afde034e76a7b5fdba70015f5e507f24fb23d13a2eb125b6af6b822e +size 339851 diff --git a/checkpoint-100/pytorch_model_fsdp_0/__0_0.distcp b/checkpoint-100/pytorch_model_fsdp_0/__0_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..647a6ea25778de30cc546f2ce38ff51063c15644 --- /dev/null +++ b/checkpoint-100/pytorch_model_fsdp_0/__0_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:276c6102d25252a3058a7c14b3c6e037e6f40aad1e997e057bbd43c56bbcc029 +size 3003648 diff --git a/checkpoint-100/pytorch_model_fsdp_0/__1_0.distcp b/checkpoint-100/pytorch_model_fsdp_0/__1_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..09192d04d936c20233ae7dc135f9dae9cfe78d95 --- /dev/null +++ b/checkpoint-100/pytorch_model_fsdp_0/__1_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e0f5b8a430b58a8377337a11a1118be38e4e4065799f22312df73e9539edfb5 +size 3003648 diff --git a/checkpoint-100/pytorch_model_fsdp_0/__2_0.distcp b/checkpoint-100/pytorch_model_fsdp_0/__2_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..8004963d3221a0e6cc1e577ccf7a24ace0e011d1 --- /dev/null +++ b/checkpoint-100/pytorch_model_fsdp_0/__2_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:920db2946d599036845a8d8e58570c0a129df18a251ccf6054c13cd055056f35 +size 3003648 diff --git a/checkpoint-100/pytorch_model_fsdp_0/__3_0.distcp b/checkpoint-100/pytorch_model_fsdp_0/__3_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..bd2c0f3162b424b29583060db14415a4a66e8653 --- /dev/null +++ b/checkpoint-100/pytorch_model_fsdp_0/__3_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f62dad52a118474a5ac8feb1156d2bfde1a14979703f1ff312fcbfd30d8a3c1 +size 3003648 diff --git a/checkpoint-100/pytorch_model_fsdp_0/__4_0.distcp b/checkpoint-100/pytorch_model_fsdp_0/__4_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..836173f1890448c07d43856d9d5176dfd2ad58d4 --- /dev/null +++ b/checkpoint-100/pytorch_model_fsdp_0/__4_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3c1a0848cd7291dad379b68801f2e8a9d845989ada5fd42d00e5a23da6597ad +size 3003648 diff --git a/checkpoint-100/pytorch_model_fsdp_0/__5_0.distcp b/checkpoint-100/pytorch_model_fsdp_0/__5_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..2b288286c1203bb85fb13efa35fc8994f9705d22 --- /dev/null +++ b/checkpoint-100/pytorch_model_fsdp_0/__5_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:819bba510acfcb80bf63bce4c0a35be2f0af56ae2e49a72a05af94128b05fb5a +size 3003648 diff --git a/checkpoint-100/pytorch_model_fsdp_0/__6_0.distcp b/checkpoint-100/pytorch_model_fsdp_0/__6_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..96563ae0fed478d8e31720bc9d40191ac70ecb9c --- /dev/null +++ b/checkpoint-100/pytorch_model_fsdp_0/__6_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c591981b57d09088dd5dd3c2c4990f922ffe7df92389509d8b2703137ea6085 +size 3003648 diff --git a/checkpoint-100/pytorch_model_fsdp_0/__7_0.distcp b/checkpoint-100/pytorch_model_fsdp_0/__7_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..b6e7562aae52ed95239b3d7c6cde5f406dd0c020 --- /dev/null +++ b/checkpoint-100/pytorch_model_fsdp_0/__7_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9b5f0504de2c99266a04deca823b3673b779578680f0e82ddd604156fde6ffc +size 3003648 diff --git a/checkpoint-100/rng_state_0.pth b/checkpoint-100/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..f736bfe532d7e99bd98a6a32866d848a21a02d76 --- /dev/null +++ b/checkpoint-100/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e67ac3c4038beb665d2cc4bd735b6f05977897a2757187e8c7c8e6b89fa4ad3d +size 15920 diff --git a/checkpoint-100/rng_state_1.pth b/checkpoint-100/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..7b46f01f4effba938049f318258b3987bffd06aa --- /dev/null +++ b/checkpoint-100/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:219fc23b677e769ae4d4806c12e3df4ff2b78c28b311847bec2ecb99a35a51e6 +size 15984 diff --git a/checkpoint-100/rng_state_2.pth b/checkpoint-100/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..5e5963a1697f39b58d0a25802e72eac30bae3e81 --- /dev/null +++ b/checkpoint-100/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a352b51cb68b5d8818a7a28f74f32dffb1095d7e281ff4022f0e365fe98a8ee0 +size 15984 diff --git a/checkpoint-100/rng_state_3.pth b/checkpoint-100/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..a84335319e6e42b032d1d6da793e5377d412cf81 --- /dev/null +++ b/checkpoint-100/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df9e8f9785d0d17d62f1ba8a141384aa7a6438a53ecad5f21d877594c31b45d0 +size 15984 diff --git a/checkpoint-100/rng_state_4.pth b/checkpoint-100/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..1a0ed8248d472d9e71bbf034a061371cbb346355 --- /dev/null +++ b/checkpoint-100/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a770deea02d60eea9348a15bd08ac4f95c99d6b5b113eb31bc2fa7631dba1988 +size 15984 diff --git a/checkpoint-100/rng_state_5.pth b/checkpoint-100/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..74a809578c3cc5e6c60f1c6d9e0defa9447cc719 --- /dev/null +++ b/checkpoint-100/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:72b60fb34596e1922ede5c5a2479b5b98033b4b988dc300df633c12f5e255755 +size 15984 diff --git a/checkpoint-100/rng_state_6.pth b/checkpoint-100/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..84a106e46cc9752aacfad3281bd314777c83b985 --- /dev/null +++ b/checkpoint-100/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:84274411027a8eb72ed1179f8209a875b6f4101ac7c2790eaef04102df49af52 +size 15984 diff --git a/checkpoint-100/rng_state_7.pth b/checkpoint-100/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..10798d00a6ae647b8d8bdb74a97955808519e065 --- /dev/null +++ b/checkpoint-100/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9bbc89ce90c5e7e9f362eb48f3be0a6f39aee82e598c876d2d126ef971bbbfdc +size 15984 diff --git a/checkpoint-100/scheduler.pt b/checkpoint-100/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..18a25b44ce07bc51cbcafce5586c7593482826a5 --- /dev/null +++ b/checkpoint-100/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b84ab1237abc7bd4d31945126355c5b6d9e26cb338d88dae9fd60030b2e1fb3 +size 1064 diff --git a/checkpoint-100/trainer_state.json b/checkpoint-100/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..5a4f9c74d51f84c9dde07654d53aeeba2072f843 --- /dev/null +++ b/checkpoint-100/trainer_state.json @@ -0,0 +1,70 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.05, + "eval_steps": 100, + "global_step": 100, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0125, + "grad_norm": 0.14176353812217712, + "learning_rate": 0.0004, + "loss": 1.1525, + "step": 25 + }, + { + "epoch": 0.025, + "grad_norm": 0.1460508555173874, + "learning_rate": 0.0004998852503731983, + "loss": 1.047, + "step": 50 + }, + { + "epoch": 0.0375, + "grad_norm": 0.2368021011352539, + "learning_rate": 0.0004993848168027977, + "loss": 0.8529, + "step": 75 + }, + { + "epoch": 0.05, + "grad_norm": 0.14488168060779572, + "learning_rate": 0.0004984880506341147, + "loss": 0.9756, + "step": 100 + }, + { + "epoch": 0.05, + "eval_loss": 0.9470569491386414, + "eval_runtime": 845.9115, + "eval_samples_per_second": 1.297, + "eval_steps_per_second": 0.021, + "step": 100 + } + ], + "logging_steps": 25, + "max_steps": 2000, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.0079315433619456e+16, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1000/optimizer_0/.metadata b/checkpoint-1000/optimizer_0/.metadata new file mode 100644 index 0000000000000000000000000000000000000000..b15ad87b5b45f255a3110a8799f3cbad5babdf61 --- /dev/null +++ b/checkpoint-1000/optimizer_0/.metadata @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:32c4b3e6e77e6c4fe0e4ec5ffe86cd9420b22487406f98c30d94993a7cae6335 +size 869362 diff --git a/checkpoint-1000/optimizer_0/__0_0.distcp b/checkpoint-1000/optimizer_0/__0_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..4d67dce9ef535a81b9445e83d0a53f4fd66c02cc --- /dev/null +++ b/checkpoint-1000/optimizer_0/__0_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8fc4f6f9dc852ef0f917b009518f39d10d68a4536c1f51998cea4a5b83e892da +size 6008476 diff --git a/checkpoint-1000/optimizer_0/__1_0.distcp b/checkpoint-1000/optimizer_0/__1_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..86816776a4c2e39894bd54226f098e51206bbbf3 --- /dev/null +++ b/checkpoint-1000/optimizer_0/__1_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7312e1c4e946ab65367b2525624545d48b44d074f9e0ecba0290412be956f5e3 +size 6041200 diff --git a/checkpoint-1000/optimizer_0/__2_0.distcp b/checkpoint-1000/optimizer_0/__2_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..33e9991f01fce7740b205cc7a9c9a6191a46fa23 --- /dev/null +++ b/checkpoint-1000/optimizer_0/__2_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f002da732a464fc63c29885772852f44b142bb93c15442eb20c54fd05c05b90 +size 6041200 diff --git a/checkpoint-1000/optimizer_0/__3_0.distcp b/checkpoint-1000/optimizer_0/__3_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..5692ea20de13de7543f73323ee31307e87adcf05 --- /dev/null +++ b/checkpoint-1000/optimizer_0/__3_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f041b71230e388a6e048350c575718e2117303df15582bbd21841c79a30563f1 +size 6043476 diff --git a/checkpoint-1000/optimizer_0/__4_0.distcp b/checkpoint-1000/optimizer_0/__4_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..c0319a708f6c18d9694e7beaa5c1efd745fa9ebe --- /dev/null +++ b/checkpoint-1000/optimizer_0/__4_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:abff4649eea683c30203f7c7d0079317bf744e54003c8ae8d366f67c1bf3881a +size 6057364 diff --git a/checkpoint-1000/optimizer_0/__5_0.distcp b/checkpoint-1000/optimizer_0/__5_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..8282a530063eb52860b1c5b44289b60bbe6ffebc --- /dev/null +++ b/checkpoint-1000/optimizer_0/__5_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:488ddea956a0925e88414a20672caa84392b5dfdf67f9d99a1286fa6236f6112 +size 6042612 diff --git a/checkpoint-1000/optimizer_0/__6_0.distcp b/checkpoint-1000/optimizer_0/__6_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..6db1979744920b12fcc4fc6457f7b783ade16d83 --- /dev/null +++ b/checkpoint-1000/optimizer_0/__6_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6cebd75cf3f08edef15e48523d99f60d10d2e3d2fe33216bfde563ad390ca8b8 +size 6042612 diff --git a/checkpoint-1000/optimizer_0/__7_0.distcp b/checkpoint-1000/optimizer_0/__7_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..642051f4634918c37a54ebfd30d07fa79f6122e4 --- /dev/null +++ b/checkpoint-1000/optimizer_0/__7_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00fe62b7cdadb932c335a2a670e603f0ca4b8cad10a1a11c6af8f89b71c9e4d0 +size 6042612 diff --git a/checkpoint-1000/pytorch_model_fsdp_0/.metadata b/checkpoint-1000/pytorch_model_fsdp_0/.metadata new file mode 100644 index 0000000000000000000000000000000000000000..23e81c231cb5851a8aac4d1113501b23ec3ab37c --- /dev/null +++ b/checkpoint-1000/pytorch_model_fsdp_0/.metadata @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83582bb2c3364686161e3eda0fc44aa67a4e2029e1fc9605e333669c6f4a3215 +size 339852 diff --git a/checkpoint-1000/pytorch_model_fsdp_0/__0_0.distcp b/checkpoint-1000/pytorch_model_fsdp_0/__0_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..452c4de083151b9f6063a696142fc8da963879ec --- /dev/null +++ b/checkpoint-1000/pytorch_model_fsdp_0/__0_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa89fe800d11e2d111c108ff934abffb09954deec83c8c403892b023df67f064 +size 3003648 diff --git a/checkpoint-1000/pytorch_model_fsdp_0/__1_0.distcp b/checkpoint-1000/pytorch_model_fsdp_0/__1_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..7ce980439e37a737ab04ff0af9d32404160a28ff --- /dev/null +++ b/checkpoint-1000/pytorch_model_fsdp_0/__1_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6cf772693a9e7e6e2205ed5f483cfd7c3c577459292281ccac91680f3f19e64a +size 3003648 diff --git a/checkpoint-1000/pytorch_model_fsdp_0/__2_0.distcp b/checkpoint-1000/pytorch_model_fsdp_0/__2_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..2c8eda802bec776a044fea709ad1b09abdaa822a --- /dev/null +++ b/checkpoint-1000/pytorch_model_fsdp_0/__2_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f05dae3a3b9c4ac6c5ebed01990323e5f0321ea9b2a52e83653840798f0ceb7 +size 3003648 diff --git a/checkpoint-1000/pytorch_model_fsdp_0/__3_0.distcp b/checkpoint-1000/pytorch_model_fsdp_0/__3_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..07f96944358a1c95355b290185b0795bb5b4dd44 --- /dev/null +++ b/checkpoint-1000/pytorch_model_fsdp_0/__3_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02a8d4367facec4482692297aeaae3e91fc2dbc7f4924caa4d2f84d174cc6c98 +size 3003648 diff --git a/checkpoint-1000/pytorch_model_fsdp_0/__4_0.distcp b/checkpoint-1000/pytorch_model_fsdp_0/__4_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..4807b582146326ab754ac7c1e0c63c043e702e89 --- /dev/null +++ b/checkpoint-1000/pytorch_model_fsdp_0/__4_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:71e6479ff28ace5eeada37360c30b3d17db987b8c05d0ca6046bdcd2b40a65fc +size 3003648 diff --git a/checkpoint-1000/pytorch_model_fsdp_0/__5_0.distcp b/checkpoint-1000/pytorch_model_fsdp_0/__5_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..e3d9ef7d0d74202dc36c4eb5fac0cc5aaddb3756 --- /dev/null +++ b/checkpoint-1000/pytorch_model_fsdp_0/__5_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0861299d5e24a80f99564ac77a2acd79f83f89c1ebe9124cdd6bdbd80b3d505 +size 3003648 diff --git a/checkpoint-1000/pytorch_model_fsdp_0/__6_0.distcp b/checkpoint-1000/pytorch_model_fsdp_0/__6_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..9cfdd426fa9975608f3e404e7896683003066302 --- /dev/null +++ b/checkpoint-1000/pytorch_model_fsdp_0/__6_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6fed1b4f6d88ffc7b06186d89144870b25154a95fd66b9323e44bd46a3131737 +size 3003648 diff --git a/checkpoint-1000/pytorch_model_fsdp_0/__7_0.distcp b/checkpoint-1000/pytorch_model_fsdp_0/__7_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..92dd6ef9179daa60b526e2fae1cbd4f4a845a8c2 --- /dev/null +++ b/checkpoint-1000/pytorch_model_fsdp_0/__7_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c38c7c6f67489e926a46c26ad5fd48cee00118acf74bc9665f6b1793aa4910ea +size 3003648 diff --git a/checkpoint-1000/rng_state_0.pth b/checkpoint-1000/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..d5af0d362a3113d75dda7637fde01a29169fb8ef --- /dev/null +++ b/checkpoint-1000/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0cce6fd095e8164b6174af806d5b65f1592b912a16965a6ac33d77e523c8ae2a +size 15920 diff --git a/checkpoint-1000/rng_state_1.pth b/checkpoint-1000/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..3452b4c6342c5827692e58dda66dc3088e599489 --- /dev/null +++ b/checkpoint-1000/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c89ca7da8c752e198c07a80618c28fafea39abe5f5e38d625a1d96b586893f6e +size 15984 diff --git a/checkpoint-1000/rng_state_2.pth b/checkpoint-1000/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..404a7e06dec824b6e49e724be0a89e3a76291d21 --- /dev/null +++ b/checkpoint-1000/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26a64aa0a7bd0e9443e2c11a9e1b32b905f251349e940dd3776471dd51dc9441 +size 15984 diff --git a/checkpoint-1000/rng_state_3.pth b/checkpoint-1000/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..501b011b332818b8f0fa85551fc7e8679c367117 --- /dev/null +++ b/checkpoint-1000/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a594654588fc00a315de06ecb649724c8831626a965fe1794770a5720439d77 +size 15984 diff --git a/checkpoint-1000/rng_state_4.pth b/checkpoint-1000/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..cd7be1f793b16cca6404cb79e494ff102b8c2ffe --- /dev/null +++ b/checkpoint-1000/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f05c534760702d2b502b038f225706d0fc2398437c12ed59dc6afeacd0f91fdb +size 15984 diff --git a/checkpoint-1000/rng_state_5.pth b/checkpoint-1000/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..659abf7b203527bae8d883d7408581f8a26efaa1 --- /dev/null +++ b/checkpoint-1000/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5642030f3e712a115127de41444ed858a7a4cd47e591eb5d813c9053141d0ee8 +size 15984 diff --git a/checkpoint-1000/rng_state_6.pth b/checkpoint-1000/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..502636b8c422c55e279d34385c87e89e89b2c774 --- /dev/null +++ b/checkpoint-1000/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e95998bcb1c519354ca7d81b1fc52e904512e12dac25fcb5d083773e08e027ec +size 15984 diff --git a/checkpoint-1000/rng_state_7.pth b/checkpoint-1000/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..c1ec569e924cf8b598e112a5174e2f903d038527 --- /dev/null +++ b/checkpoint-1000/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ac0d9420ffc0691423064caa05e83bae45e9091902d0a644809c8e2535119b7 +size 15984 diff --git a/checkpoint-1000/scheduler.pt b/checkpoint-1000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..e37d0618b489fc19085acd1bbe069567c7c8447d --- /dev/null +++ b/checkpoint-1000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b23d844ad7ae2eb6c7cbba3f70be2436823b11da6591df71ddcc7059f5593c4 +size 1064 diff --git a/checkpoint-1000/trainer_state.json b/checkpoint-1000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..3f1abd7362372d170a3ce165f98e1dfda326d61b --- /dev/null +++ b/checkpoint-1000/trainer_state.json @@ -0,0 +1,394 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.5, + "eval_steps": 100, + "global_step": 1000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0125, + "grad_norm": 0.14176353812217712, + "learning_rate": 0.0004, + "loss": 1.1525, + "step": 25 + }, + { + "epoch": 0.025, + "grad_norm": 0.1460508555173874, + "learning_rate": 0.0004998852503731983, + "loss": 1.047, + "step": 50 + }, + { + "epoch": 0.0375, + "grad_norm": 0.2368021011352539, + "learning_rate": 0.0004993848168027977, + "loss": 0.8529, + "step": 75 + }, + { + "epoch": 0.05, + "grad_norm": 0.14488168060779572, + "learning_rate": 0.0004984880506341147, + "loss": 0.9756, + "step": 100 + }, + { + "epoch": 0.05, + "eval_loss": 0.9470569491386414, + "eval_runtime": 845.9115, + "eval_samples_per_second": 1.297, + "eval_steps_per_second": 0.021, + "step": 100 + }, + { + "epoch": 0.0625, + "grad_norm": 0.1415175348520279, + "learning_rate": 0.0004971963770447935, + "loss": 0.9564, + "step": 125 + }, + { + "epoch": 0.075, + "grad_norm": 0.1751064509153366, + "learning_rate": 0.0004955118488155782, + "loss": 0.711, + "step": 150 + }, + { + "epoch": 0.0875, + "grad_norm": 0.1439943015575409, + "learning_rate": 0.0004934371430679492, + "loss": 0.9409, + "step": 175 + }, + { + "epoch": 0.1, + "grad_norm": 0.15947112441062927, + "learning_rate": 0.0004909755570095319, + "loss": 0.8979, + "step": 200 + }, + { + "epoch": 0.1, + "eval_loss": 0.9711233973503113, + "eval_runtime": 845.6433, + "eval_samples_per_second": 1.297, + "eval_steps_per_second": 0.021, + "step": 200 + }, + { + "epoch": 0.1125, + "grad_norm": 0.14685837924480438, + "learning_rate": 0.0004881310026940389, + "loss": 0.6376, + "step": 225 + }, + { + "epoch": 0.125, + "grad_norm": 0.15040776133537292, + "learning_rate": 0.0004849080008040734, + "loss": 0.927, + "step": 250 + }, + { + "epoch": 0.1375, + "grad_norm": 0.16087745130062103, + "learning_rate": 0.00048131167346667446, + "loss": 0.8456, + "step": 275 + }, + { + "epoch": 0.15, + "grad_norm": 0.15025638043880463, + "learning_rate": 0.00047734773611302284, + "loss": 0.6029, + "step": 300 + }, + { + "epoch": 0.15, + "eval_loss": 1.0056413412094116, + "eval_runtime": 848.447, + "eval_samples_per_second": 1.293, + "eval_steps_per_second": 0.021, + "step": 300 + }, + { + "epoch": 0.1625, + "grad_norm": 0.15893957018852234, + "learning_rate": 0.0004730224883952422, + "loss": 0.9036, + "step": 325 + }, + { + "epoch": 0.175, + "grad_norm": 0.1535714715719223, + "learning_rate": 0.0004683428041747334, + "loss": 0.828, + "step": 350 + }, + { + "epoch": 0.1875, + "grad_norm": 0.1718970686197281, + "learning_rate": 0.0004633161205979517, + "loss": 0.5944, + "step": 375 + }, + { + "epoch": 0.2, + "grad_norm": 0.17664535343647003, + "learning_rate": 0.0004579504262769877, + "loss": 0.8654, + "step": 400 + }, + { + "epoch": 0.2, + "eval_loss": 1.0151112079620361, + "eval_runtime": 839.7482, + "eval_samples_per_second": 1.306, + "eval_steps_per_second": 0.021, + "step": 400 + }, + { + "epoch": 0.2125, + "grad_norm": 0.17122088372707367, + "learning_rate": 0.0004522542485937369, + "loss": 0.8078, + "step": 425 + }, + { + "epoch": 0.225, + "grad_norm": 0.19420970976352692, + "learning_rate": 0.00044623664014783386, + "loss": 0.5735, + "step": 450 + }, + { + "epoch": 0.2375, + "grad_norm": 0.18166953325271606, + "learning_rate": 0.00043990716436988924, + "loss": 0.8604, + "step": 475 + }, + { + "epoch": 0.25, + "grad_norm": 0.1502382755279541, + "learning_rate": 0.0004332758803228925, + "loss": 0.7673, + "step": 500 + }, + { + "epoch": 0.25, + "eval_loss": 1.0454745292663574, + "eval_runtime": 841.3951, + "eval_samples_per_second": 1.304, + "eval_steps_per_second": 0.021, + "step": 500 + }, + { + "epoch": 0.2625, + "grad_norm": 0.16875839233398438, + "learning_rate": 0.00042635332671593575, + "loss": 0.5882, + "step": 525 + }, + { + "epoch": 0.275, + "grad_norm": 0.16070039570331573, + "learning_rate": 0.00041915050515566445, + "loss": 0.8175, + "step": 550 + }, + { + "epoch": 0.2875, + "grad_norm": 0.16584184765815735, + "learning_rate": 0.00041167886266207167, + "loss": 0.7795, + "step": 575 + }, + { + "epoch": 0.3, + "grad_norm": 0.15345798432826996, + "learning_rate": 0.0004039502734764241, + "loss": 0.7331, + "step": 600 + }, + { + "epoch": 0.3, + "eval_loss": 1.0627943277359009, + "eval_runtime": 849.4944, + "eval_samples_per_second": 1.291, + "eval_steps_per_second": 0.021, + "step": 600 + }, + { + "epoch": 0.3125, + "grad_norm": 0.16241031885147095, + "learning_rate": 0.0003959770201902294, + "loss": 0.7436, + "step": 625 + }, + { + "epoch": 0.325, + "grad_norm": 0.1531112939119339, + "learning_rate": 0.0003877717742252371, + "loss": 0.6345, + "step": 650 + }, + { + "epoch": 0.3375, + "grad_norm": 0.15790140628814697, + "learning_rate": 0.00037934757569549495, + "loss": 0.7351, + "step": 675 + }, + { + "epoch": 0.35, + "grad_norm": 0.1822807490825653, + "learning_rate": 0.00037071781268346345, + "loss": 0.745, + "step": 700 + }, + { + "epoch": 0.35, + "eval_loss": 1.07315993309021, + "eval_runtime": 837.5276, + "eval_samples_per_second": 1.31, + "eval_steps_per_second": 0.021, + "step": 700 + }, + { + "epoch": 0.3625, + "grad_norm": 0.15983282029628754, + "learning_rate": 0.00036189619996312495, + "loss": 0.5972, + "step": 725 + }, + { + "epoch": 0.375, + "grad_norm": 0.18202389776706696, + "learning_rate": 0.00035289675720390174, + "loss": 0.759, + "step": 750 + }, + { + "epoch": 0.3875, + "grad_norm": 0.16057106852531433, + "learning_rate": 0.00034373378669002105, + "loss": 0.7358, + "step": 775 + }, + { + "epoch": 0.4, + "grad_norm": 0.1680625081062317, + "learning_rate": 0.00033442185059073706, + "loss": 0.5636, + "step": 800 + }, + { + "epoch": 0.4, + "eval_loss": 1.0932456254959106, + "eval_runtime": 840.1331, + "eval_samples_per_second": 1.306, + "eval_steps_per_second": 0.021, + "step": 800 + }, + { + "epoch": 0.4125, + "grad_norm": 0.15613198280334473, + "learning_rate": 0.00032497574781753367, + "loss": 0.7596, + "step": 825 + }, + { + "epoch": 0.425, + "grad_norm": 0.1628854125738144, + "learning_rate": 0.000315410490505086, + "loss": 0.7282, + "step": 850 + }, + { + "epoch": 0.4375, + "grad_norm": 0.16740979254245758, + "learning_rate": 0.0003057412801533589, + "loss": 0.5325, + "step": 875 + }, + { + "epoch": 0.45, + "grad_norm": 0.1634828895330429, + "learning_rate": 0.0002959834834687587, + "loss": 0.778, + "step": 900 + }, + { + "epoch": 0.45, + "eval_loss": 1.0952215194702148, + "eval_runtime": 836.473, + "eval_samples_per_second": 1.311, + "eval_steps_per_second": 0.022, + "step": 900 + }, + { + "epoch": 0.4625, + "grad_norm": 0.161700040102005, + "learning_rate": 0.00028615260794273236, + "loss": 0.7255, + "step": 925 + }, + { + "epoch": 0.475, + "grad_norm": 0.16280074417591095, + "learning_rate": 0.00027626427720662416, + "loss": 0.4993, + "step": 950 + }, + { + "epoch": 0.4875, + "grad_norm": 0.16114428639411926, + "learning_rate": 0.00026633420620195917, + "loss": 0.7765, + "step": 975 + }, + { + "epoch": 0.5, + "grad_norm": 0.16209788620471954, + "learning_rate": 0.00025637817620561263, + "loss": 0.7221, + "step": 1000 + }, + { + "epoch": 0.5, + "eval_loss": 1.1190329790115356, + "eval_runtime": 835.4164, + "eval_samples_per_second": 1.313, + "eval_steps_per_second": 0.022, + "step": 1000 + } + ], + "logging_steps": 25, + "max_steps": 2000, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.0079315433619456e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1100/optimizer_0/.metadata b/checkpoint-1100/optimizer_0/.metadata new file mode 100644 index 0000000000000000000000000000000000000000..93a23522dc073ae164fc7d8f8a9db2804f7d7509 --- /dev/null +++ b/checkpoint-1100/optimizer_0/.metadata @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:527e7126311cb82054cc4cb906cc4dd984ad5c8b2be939f5b79a6f312fcb316a +size 869362 diff --git a/checkpoint-1100/optimizer_0/__0_0.distcp b/checkpoint-1100/optimizer_0/__0_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..9fcb8b24c7afa98630c19a6c44c4983332ab5f6b --- /dev/null +++ b/checkpoint-1100/optimizer_0/__0_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ced761d194b9de05c0f1758600deee8bc5786000438491a4fab5a31c4ec205d +size 6008476 diff --git a/checkpoint-1100/optimizer_0/__1_0.distcp b/checkpoint-1100/optimizer_0/__1_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..564d44adc1934e34f72cfc2a1d12d8a1ba73165d --- /dev/null +++ b/checkpoint-1100/optimizer_0/__1_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c7a5a332de43724cb9ef1ffeeafaf18dbca12333c908ea64d777f11260c9129e +size 6041200 diff --git a/checkpoint-1100/optimizer_0/__2_0.distcp b/checkpoint-1100/optimizer_0/__2_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..b4a26075ab4348feea64eb4034d15b75f1c631b2 --- /dev/null +++ b/checkpoint-1100/optimizer_0/__2_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0cb4498471bf50d01e5cd9f3f8ea53812b762777d42bf99882830a3cfd82049f +size 6041200 diff --git a/checkpoint-1100/optimizer_0/__3_0.distcp b/checkpoint-1100/optimizer_0/__3_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..e7e3e9739d29d68a0d5a81a6aeb73f81652eaf2f --- /dev/null +++ b/checkpoint-1100/optimizer_0/__3_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c76e7ac55d4d9e598d590ad693cdc653609c3dba90cbdc143c2432705a7ae4a1 +size 6043476 diff --git a/checkpoint-1100/optimizer_0/__4_0.distcp b/checkpoint-1100/optimizer_0/__4_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..044c0d451f32878bf18785fea1aa93aaa316ff22 --- /dev/null +++ b/checkpoint-1100/optimizer_0/__4_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc90d9094fa5abbfb300c29c93aca9a60007f41e6d199650b87fc2a4b968a275 +size 6057364 diff --git a/checkpoint-1100/optimizer_0/__5_0.distcp b/checkpoint-1100/optimizer_0/__5_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..0719cd83533bbc25b45e22791bb9fe9f30459559 --- /dev/null +++ b/checkpoint-1100/optimizer_0/__5_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:67f9658e3ea184d63d0482de02c00da325cee0c199402b81e80905bbfce9f553 +size 6042612 diff --git a/checkpoint-1100/optimizer_0/__6_0.distcp b/checkpoint-1100/optimizer_0/__6_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..8cc23b367d692d7268d2c1c4842f3a4edc0ce7e2 --- /dev/null +++ b/checkpoint-1100/optimizer_0/__6_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:638a354e9890d394d7dc38506ab403df786e17ce9c322654b031c55c468c1af2 +size 6042612 diff --git a/checkpoint-1100/optimizer_0/__7_0.distcp b/checkpoint-1100/optimizer_0/__7_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..c1f945024c6b06e6b55eeeb8bac1071448d77874 --- /dev/null +++ b/checkpoint-1100/optimizer_0/__7_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:118500145adc1a05ea7f789fdaef0460aa2f4e0d57f94bfcd3afaa73fb45a489 +size 6042612 diff --git a/checkpoint-1100/pytorch_model_fsdp_0/.metadata b/checkpoint-1100/pytorch_model_fsdp_0/.metadata new file mode 100644 index 0000000000000000000000000000000000000000..17ca4331abb10f95078b3af152ff5c467768a6f0 --- /dev/null +++ b/checkpoint-1100/pytorch_model_fsdp_0/.metadata @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:05b6d48ed6552eab660d94389010610d38a3fb3adce2d3c26ba945b9b74b8e65 +size 339852 diff --git a/checkpoint-1100/pytorch_model_fsdp_0/__0_0.distcp b/checkpoint-1100/pytorch_model_fsdp_0/__0_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..fa77473703940e566f540b49324ad8e9d8abb0b1 --- /dev/null +++ b/checkpoint-1100/pytorch_model_fsdp_0/__0_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a997d9fefb4d80170781d280e9e9e1d69ba04e4d46b91e140e177426eeea09b +size 3003648 diff --git a/checkpoint-1100/pytorch_model_fsdp_0/__1_0.distcp b/checkpoint-1100/pytorch_model_fsdp_0/__1_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..5b65405db5d7c8ea0cf6e1d9d4011546ecb93316 --- /dev/null +++ b/checkpoint-1100/pytorch_model_fsdp_0/__1_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:05fecc1a16829080ed7ff47d9ed458fb08a6b9a065be6140a4dc17bd347c9953 +size 3003648 diff --git a/checkpoint-1100/pytorch_model_fsdp_0/__2_0.distcp b/checkpoint-1100/pytorch_model_fsdp_0/__2_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..a96a0ea3a220fdde6d1d2a2d5829c716a111d622 --- /dev/null +++ b/checkpoint-1100/pytorch_model_fsdp_0/__2_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45559bd57ec11621c7b29c3572e471430748bfb30bb66045cea2a5d259166900 +size 3003648 diff --git a/checkpoint-1100/pytorch_model_fsdp_0/__3_0.distcp b/checkpoint-1100/pytorch_model_fsdp_0/__3_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..1640c8b653bec2d3b69924fafc0fafe88d837a03 --- /dev/null +++ b/checkpoint-1100/pytorch_model_fsdp_0/__3_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:844a0479d625652f2fb479ee1dfac9ae01163eac74dae629244ef56605704f7e +size 3003648 diff --git a/checkpoint-1100/pytorch_model_fsdp_0/__4_0.distcp b/checkpoint-1100/pytorch_model_fsdp_0/__4_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..5c80f0781bd00511d625e5e88fe188e9ec45922c --- /dev/null +++ b/checkpoint-1100/pytorch_model_fsdp_0/__4_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d1735c6e960fabb5e8def87f9ae907f30b33a3fefa7d2e6a66f0a1ba8261e0a +size 3003648 diff --git a/checkpoint-1100/pytorch_model_fsdp_0/__5_0.distcp b/checkpoint-1100/pytorch_model_fsdp_0/__5_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..55cb564980a767d63dfa0aa137fbe323af84e0b6 --- /dev/null +++ b/checkpoint-1100/pytorch_model_fsdp_0/__5_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5d330cf65b1433ddcb44c6633d3f6faa6f69ce35d28da964148264cf3d14014 +size 3003648 diff --git a/checkpoint-1100/pytorch_model_fsdp_0/__6_0.distcp b/checkpoint-1100/pytorch_model_fsdp_0/__6_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..1fbd20cd8e8beded2561918469344e793bc87fbc --- /dev/null +++ b/checkpoint-1100/pytorch_model_fsdp_0/__6_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b73b732ec81cc064b2b35dd99e74525551e9ae3da2a2c4906302e8e52a78c473 +size 3003648 diff --git a/checkpoint-1100/pytorch_model_fsdp_0/__7_0.distcp b/checkpoint-1100/pytorch_model_fsdp_0/__7_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..9c1fbcb680668267c236a1d06c5ed0d5c55813f7 --- /dev/null +++ b/checkpoint-1100/pytorch_model_fsdp_0/__7_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa671e89dce7e3c48d341de5f98eda8684f9fb3b7dabe71163822f8d9cd3f9c5 +size 3003648 diff --git a/checkpoint-1100/rng_state_0.pth b/checkpoint-1100/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..228f1ad5a1b03d68979173cf35d1c2903949b4b0 --- /dev/null +++ b/checkpoint-1100/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:10951c05f9fb192e43c36f7be898ee80966c186349da1034b098ec9159a5ec9b +size 15920 diff --git a/checkpoint-1100/rng_state_1.pth b/checkpoint-1100/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..079ab70e3eb64f8877a09c067ba6f6841daf2e84 --- /dev/null +++ b/checkpoint-1100/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41b525dff0f684213798d62adb1acac1209a73873811052adb7c1ab57cebef53 +size 15984 diff --git a/checkpoint-1100/rng_state_2.pth b/checkpoint-1100/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..6987c37178156f21a931cf21098a254a4a2d339a --- /dev/null +++ b/checkpoint-1100/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af6ec805c024be58842b00ba146913cc39f31735ead84ce44b5bc8288671b8c3 +size 15984 diff --git a/checkpoint-1100/rng_state_3.pth b/checkpoint-1100/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..3a5fb7aa094abdadccec49f4e3a0cb7cf7671110 --- /dev/null +++ b/checkpoint-1100/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0bd93f2f6c415f784da2333f3cb1d21155a8fd7d1eec27a52408206334a2aee8 +size 15984 diff --git a/checkpoint-1100/rng_state_4.pth b/checkpoint-1100/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..d995c17a722803f0fe0facd7d913119c6d0440a1 --- /dev/null +++ b/checkpoint-1100/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5a6c06f83feb658a3a9dae7756b28bbc7f946e746dbbf609ae7b29aade1ab39 +size 15984 diff --git a/checkpoint-1100/rng_state_5.pth b/checkpoint-1100/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..fe2ed8c7907681ecbd60f9ad528dfc8fdcc0dfe8 --- /dev/null +++ b/checkpoint-1100/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9848dabf5d790c0b63a0c06050b93ff06a12f036d5a22192fa1c0ea1eea577cd +size 15984 diff --git a/checkpoint-1100/rng_state_6.pth b/checkpoint-1100/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..0e2b6906b013eae0f780865ac0113031b64f0d3d --- /dev/null +++ b/checkpoint-1100/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0cd49cf108d0c9d7f33194257dd81143851b474af0a01f8ac96dd71a1d515195 +size 15984 diff --git a/checkpoint-1100/rng_state_7.pth b/checkpoint-1100/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..bf68cc860287179a359ac7d204b102b9ea9be48b --- /dev/null +++ b/checkpoint-1100/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5dfed987246f4749dbec2c7d8e35618cdf9dc4d6bb56c4a6fc8b18b57228705b +size 15984 diff --git a/checkpoint-1100/scheduler.pt b/checkpoint-1100/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..784ad2399b5919ecf3eccc367c9fa04fbfab4619 --- /dev/null +++ b/checkpoint-1100/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2594e962e3980706571eb12f2ed27e8aed3b5e373484af50799e77cad68ebb48 +size 1064 diff --git a/checkpoint-1100/trainer_state.json b/checkpoint-1100/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d21317ce8cf43749843df07e027d655ce01bc735 --- /dev/null +++ b/checkpoint-1100/trainer_state.json @@ -0,0 +1,430 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.55, + "eval_steps": 100, + "global_step": 1100, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0125, + "grad_norm": 0.14176353812217712, + "learning_rate": 0.0004, + "loss": 1.1525, + "step": 25 + }, + { + "epoch": 0.025, + "grad_norm": 0.1460508555173874, + "learning_rate": 0.0004998852503731983, + "loss": 1.047, + "step": 50 + }, + { + "epoch": 0.0375, + "grad_norm": 0.2368021011352539, + "learning_rate": 0.0004993848168027977, + "loss": 0.8529, + "step": 75 + }, + { + "epoch": 0.05, + "grad_norm": 0.14488168060779572, + "learning_rate": 0.0004984880506341147, + "loss": 0.9756, + "step": 100 + }, + { + "epoch": 0.05, + "eval_loss": 0.9470569491386414, + "eval_runtime": 845.9115, + "eval_samples_per_second": 1.297, + "eval_steps_per_second": 0.021, + "step": 100 + }, + { + "epoch": 0.0625, + "grad_norm": 0.1415175348520279, + "learning_rate": 0.0004971963770447935, + "loss": 0.9564, + "step": 125 + }, + { + "epoch": 0.075, + "grad_norm": 0.1751064509153366, + "learning_rate": 0.0004955118488155782, + "loss": 0.711, + "step": 150 + }, + { + "epoch": 0.0875, + "grad_norm": 0.1439943015575409, + "learning_rate": 0.0004934371430679492, + "loss": 0.9409, + "step": 175 + }, + { + "epoch": 0.1, + "grad_norm": 0.15947112441062927, + "learning_rate": 0.0004909755570095319, + "loss": 0.8979, + "step": 200 + }, + { + "epoch": 0.1, + "eval_loss": 0.9711233973503113, + "eval_runtime": 845.6433, + "eval_samples_per_second": 1.297, + "eval_steps_per_second": 0.021, + "step": 200 + }, + { + "epoch": 0.1125, + "grad_norm": 0.14685837924480438, + "learning_rate": 0.0004881310026940389, + "loss": 0.6376, + "step": 225 + }, + { + "epoch": 0.125, + "grad_norm": 0.15040776133537292, + "learning_rate": 0.0004849080008040734, + "loss": 0.927, + "step": 250 + }, + { + "epoch": 0.1375, + "grad_norm": 0.16087745130062103, + "learning_rate": 0.00048131167346667446, + "loss": 0.8456, + "step": 275 + }, + { + "epoch": 0.15, + "grad_norm": 0.15025638043880463, + "learning_rate": 0.00047734773611302284, + "loss": 0.6029, + "step": 300 + }, + { + "epoch": 0.15, + "eval_loss": 1.0056413412094116, + "eval_runtime": 848.447, + "eval_samples_per_second": 1.293, + "eval_steps_per_second": 0.021, + "step": 300 + }, + { + "epoch": 0.1625, + "grad_norm": 0.15893957018852234, + "learning_rate": 0.0004730224883952422, + "loss": 0.9036, + "step": 325 + }, + { + "epoch": 0.175, + "grad_norm": 0.1535714715719223, + "learning_rate": 0.0004683428041747334, + "loss": 0.828, + "step": 350 + }, + { + "epoch": 0.1875, + "grad_norm": 0.1718970686197281, + "learning_rate": 0.0004633161205979517, + "loss": 0.5944, + "step": 375 + }, + { + "epoch": 0.2, + "grad_norm": 0.17664535343647003, + "learning_rate": 0.0004579504262769877, + "loss": 0.8654, + "step": 400 + }, + { + "epoch": 0.2, + "eval_loss": 1.0151112079620361, + "eval_runtime": 839.7482, + "eval_samples_per_second": 1.306, + "eval_steps_per_second": 0.021, + "step": 400 + }, + { + "epoch": 0.2125, + "grad_norm": 0.17122088372707367, + "learning_rate": 0.0004522542485937369, + "loss": 0.8078, + "step": 425 + }, + { + "epoch": 0.225, + "grad_norm": 0.19420970976352692, + "learning_rate": 0.00044623664014783386, + "loss": 0.5735, + "step": 450 + }, + { + "epoch": 0.2375, + "grad_norm": 0.18166953325271606, + "learning_rate": 0.00043990716436988924, + "loss": 0.8604, + "step": 475 + }, + { + "epoch": 0.25, + "grad_norm": 0.1502382755279541, + "learning_rate": 0.0004332758803228925, + "loss": 0.7673, + "step": 500 + }, + { + "epoch": 0.25, + "eval_loss": 1.0454745292663574, + "eval_runtime": 841.3951, + "eval_samples_per_second": 1.304, + "eval_steps_per_second": 0.021, + "step": 500 + }, + { + "epoch": 0.2625, + "grad_norm": 0.16875839233398438, + "learning_rate": 0.00042635332671593575, + "loss": 0.5882, + "step": 525 + }, + { + "epoch": 0.275, + "grad_norm": 0.16070039570331573, + "learning_rate": 0.00041915050515566445, + "loss": 0.8175, + "step": 550 + }, + { + "epoch": 0.2875, + "grad_norm": 0.16584184765815735, + "learning_rate": 0.00041167886266207167, + "loss": 0.7795, + "step": 575 + }, + { + "epoch": 0.3, + "grad_norm": 0.15345798432826996, + "learning_rate": 0.0004039502734764241, + "loss": 0.7331, + "step": 600 + }, + { + "epoch": 0.3, + "eval_loss": 1.0627943277359009, + "eval_runtime": 849.4944, + "eval_samples_per_second": 1.291, + "eval_steps_per_second": 0.021, + "step": 600 + }, + { + "epoch": 0.3125, + "grad_norm": 0.16241031885147095, + "learning_rate": 0.0003959770201902294, + "loss": 0.7436, + "step": 625 + }, + { + "epoch": 0.325, + "grad_norm": 0.1531112939119339, + "learning_rate": 0.0003877717742252371, + "loss": 0.6345, + "step": 650 + }, + { + "epoch": 0.3375, + "grad_norm": 0.15790140628814697, + "learning_rate": 0.00037934757569549495, + "loss": 0.7351, + "step": 675 + }, + { + "epoch": 0.35, + "grad_norm": 0.1822807490825653, + "learning_rate": 0.00037071781268346345, + "loss": 0.745, + "step": 700 + }, + { + "epoch": 0.35, + "eval_loss": 1.07315993309021, + "eval_runtime": 837.5276, + "eval_samples_per_second": 1.31, + "eval_steps_per_second": 0.021, + "step": 700 + }, + { + "epoch": 0.3625, + "grad_norm": 0.15983282029628754, + "learning_rate": 0.00036189619996312495, + "loss": 0.5972, + "step": 725 + }, + { + "epoch": 0.375, + "grad_norm": 0.18202389776706696, + "learning_rate": 0.00035289675720390174, + "loss": 0.759, + "step": 750 + }, + { + "epoch": 0.3875, + "grad_norm": 0.16057106852531433, + "learning_rate": 0.00034373378669002105, + "loss": 0.7358, + "step": 775 + }, + { + "epoch": 0.4, + "grad_norm": 0.1680625081062317, + "learning_rate": 0.00033442185059073706, + "loss": 0.5636, + "step": 800 + }, + { + "epoch": 0.4, + "eval_loss": 1.0932456254959106, + "eval_runtime": 840.1331, + "eval_samples_per_second": 1.306, + "eval_steps_per_second": 0.021, + "step": 800 + }, + { + "epoch": 0.4125, + "grad_norm": 0.15613198280334473, + "learning_rate": 0.00032497574781753367, + "loss": 0.7596, + "step": 825 + }, + { + "epoch": 0.425, + "grad_norm": 0.1628854125738144, + "learning_rate": 0.000315410490505086, + "loss": 0.7282, + "step": 850 + }, + { + "epoch": 0.4375, + "grad_norm": 0.16740979254245758, + "learning_rate": 0.0003057412801533589, + "loss": 0.5325, + "step": 875 + }, + { + "epoch": 0.45, + "grad_norm": 0.1634828895330429, + "learning_rate": 0.0002959834834687587, + "loss": 0.778, + "step": 900 + }, + { + "epoch": 0.45, + "eval_loss": 1.0952215194702148, + "eval_runtime": 836.473, + "eval_samples_per_second": 1.311, + "eval_steps_per_second": 0.022, + "step": 900 + }, + { + "epoch": 0.4625, + "grad_norm": 0.161700040102005, + "learning_rate": 0.00028615260794273236, + "loss": 0.7255, + "step": 925 + }, + { + "epoch": 0.475, + "grad_norm": 0.16280074417591095, + "learning_rate": 0.00027626427720662416, + "loss": 0.4993, + "step": 950 + }, + { + "epoch": 0.4875, + "grad_norm": 0.16114428639411926, + "learning_rate": 0.00026633420620195917, + "loss": 0.7765, + "step": 975 + }, + { + "epoch": 0.5, + "grad_norm": 0.16209788620471954, + "learning_rate": 0.00025637817620561263, + "loss": 0.7221, + "step": 1000 + }, + { + "epoch": 0.5, + "eval_loss": 1.1190329790115356, + "eval_runtime": 835.4164, + "eval_samples_per_second": 1.313, + "eval_steps_per_second": 0.022, + "step": 1000 + }, + { + "epoch": 0.5125, + "grad_norm": 0.15057620406150818, + "learning_rate": 0.0002464120097495559, + "loss": 0.4929, + "step": 1025 + }, + { + "epoch": 0.525, + "grad_norm": 0.16715486347675323, + "learning_rate": 0.00023645154547503855, + "loss": 0.7896, + "step": 1050 + }, + { + "epoch": 0.5375, + "grad_norm": 0.1591680645942688, + "learning_rate": 0.00022651261296116894, + "loss": 0.6999, + "step": 1075 + }, + { + "epoch": 0.55, + "grad_norm": 0.17062760889530182, + "learning_rate": 0.00021661100756789666, + "loss": 0.4708, + "step": 1100 + }, + { + "epoch": 0.55, + "eval_loss": 1.1381303071975708, + "eval_runtime": 839.2576, + "eval_samples_per_second": 1.307, + "eval_steps_per_second": 0.021, + "step": 1100 + } + ], + "logging_steps": 25, + "max_steps": 2000, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.1087246976981402e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1200/optimizer_0/.metadata b/checkpoint-1200/optimizer_0/.metadata new file mode 100644 index 0000000000000000000000000000000000000000..8591fad7f77ef2da66790b5d2a295481f5e7cfe6 --- /dev/null +++ b/checkpoint-1200/optimizer_0/.metadata @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:621d38e2b5f08c91c678b95c92ba03f99673d527c357532e42fed647ee7b0ad7 +size 869362 diff --git a/checkpoint-1200/optimizer_0/__0_0.distcp b/checkpoint-1200/optimizer_0/__0_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..5380029390ff81dc86e86c67462aaa94a674d59b --- /dev/null +++ b/checkpoint-1200/optimizer_0/__0_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc0f6b552ef891ce745f7a3eeeced1767a120ea0e9511fd54cba164d3fadcec8 +size 6008476 diff --git a/checkpoint-1200/optimizer_0/__1_0.distcp b/checkpoint-1200/optimizer_0/__1_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..36c158274f419f6e168f29c7d8fabdf1135af7eb --- /dev/null +++ b/checkpoint-1200/optimizer_0/__1_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8f9c1e9938042c6b3aab615ee88a590dc7a5c7064d7cde09d30a95d65a2541b +size 6041200 diff --git a/checkpoint-1200/optimizer_0/__2_0.distcp b/checkpoint-1200/optimizer_0/__2_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..413971d3c2fc14db62e3f20732f69d701256fc8d --- /dev/null +++ b/checkpoint-1200/optimizer_0/__2_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8b8a7a195cb05a27994eaea93d583902c8f31068994e71cf0e5d965b319bebe +size 6041200 diff --git a/checkpoint-1200/optimizer_0/__3_0.distcp b/checkpoint-1200/optimizer_0/__3_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..71f43eb0faabc9183900620c18eb592a10097260 --- /dev/null +++ b/checkpoint-1200/optimizer_0/__3_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:586384a235d706bbe918aabb3a1cfefbf31a735d7ec4cf77279c2f160e122627 +size 6043476 diff --git a/checkpoint-1200/optimizer_0/__4_0.distcp b/checkpoint-1200/optimizer_0/__4_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..bb6904220ddf23d7f1e9ae53648b6ddaba32fc2e --- /dev/null +++ b/checkpoint-1200/optimizer_0/__4_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ff766f4cdb9a2063187c73ce03ba45c65176bbbf9529e665f27804acdad1598 +size 6057364 diff --git a/checkpoint-1200/optimizer_0/__5_0.distcp b/checkpoint-1200/optimizer_0/__5_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..bd6d7bf759dddd8109ce67bf708e0423001e7f29 --- /dev/null +++ b/checkpoint-1200/optimizer_0/__5_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eabfb35b3172b31d2d4f3c563b15fc24231e9a5608978a62e7a55c86520701d3 +size 6042612 diff --git a/checkpoint-1200/optimizer_0/__6_0.distcp b/checkpoint-1200/optimizer_0/__6_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..9b4a1a1283720bd90acbeb5ac8fbe45c3d109493 --- /dev/null +++ b/checkpoint-1200/optimizer_0/__6_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:189477effe3709e0ab388e2cc5126f2036507d37f98a260bd5467d262edc1024 +size 6042612 diff --git a/checkpoint-1200/optimizer_0/__7_0.distcp b/checkpoint-1200/optimizer_0/__7_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..e92cf549e264090d5b115fc74471612d1f0a887d --- /dev/null +++ b/checkpoint-1200/optimizer_0/__7_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6bcce2d9c90ced077da6319929963daae6b80f09f019ceeb6ddbb903155a9e92 +size 6042612 diff --git a/checkpoint-1200/pytorch_model_fsdp_0/.metadata b/checkpoint-1200/pytorch_model_fsdp_0/.metadata new file mode 100644 index 0000000000000000000000000000000000000000..5aa21178568c84d04eba77685029304af0c61b8a --- /dev/null +++ b/checkpoint-1200/pytorch_model_fsdp_0/.metadata @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed3daf4ed3e3117a74864aec6c79370054944b1813609204a7b850aa310e7386 +size 339852 diff --git a/checkpoint-1200/pytorch_model_fsdp_0/__0_0.distcp b/checkpoint-1200/pytorch_model_fsdp_0/__0_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..cbbc8108e5d0100f62006a7e3843036547c8dd94 --- /dev/null +++ b/checkpoint-1200/pytorch_model_fsdp_0/__0_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:386b2163f1b5ee6a6ec1038828e8dcf41518207cb0798469379eb7f617b0ae5a +size 3003648 diff --git a/checkpoint-1200/pytorch_model_fsdp_0/__1_0.distcp b/checkpoint-1200/pytorch_model_fsdp_0/__1_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..2d81acf73fa46fac57c71d06bb2006c44d552f50 --- /dev/null +++ b/checkpoint-1200/pytorch_model_fsdp_0/__1_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:acec06e57959602aadc8fe189121bcc770f51cf8e51f0ff172bd8b679d1f9eaf +size 3003648 diff --git a/checkpoint-1200/pytorch_model_fsdp_0/__2_0.distcp b/checkpoint-1200/pytorch_model_fsdp_0/__2_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..1b638e6c7c55628c32ce4efbc388a01e98251de7 --- /dev/null +++ b/checkpoint-1200/pytorch_model_fsdp_0/__2_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba5d28604ab83e4e2735877348d8b6d1ef224c7cfe14b0e67f5e169517345dff +size 3003648 diff --git a/checkpoint-1200/pytorch_model_fsdp_0/__3_0.distcp b/checkpoint-1200/pytorch_model_fsdp_0/__3_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..8680420c2517979d009dc7b239ab20033e63a96e --- /dev/null +++ b/checkpoint-1200/pytorch_model_fsdp_0/__3_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bbb0547ded73d144dc3b22c332d31639b0577a73253f00da56b13e5def1a8af1 +size 3003648 diff --git a/checkpoint-1200/pytorch_model_fsdp_0/__4_0.distcp b/checkpoint-1200/pytorch_model_fsdp_0/__4_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..4d7061a713cf9e1394c24591d5adfdaaea0d0853 --- /dev/null +++ b/checkpoint-1200/pytorch_model_fsdp_0/__4_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee3245207a038cf54e6b7f431b744928661b4da46223d663241eeb7cadafe261 +size 3003648 diff --git a/checkpoint-1200/pytorch_model_fsdp_0/__5_0.distcp b/checkpoint-1200/pytorch_model_fsdp_0/__5_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..50e5e3cd8881cc4f7d26b9b170b6cffa6864a3c4 --- /dev/null +++ b/checkpoint-1200/pytorch_model_fsdp_0/__5_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25878b7be63c11c15903a31f0d8d276fbad4ce81e2dadf8aa77cc86c41ae66e6 +size 3003648 diff --git a/checkpoint-1200/pytorch_model_fsdp_0/__6_0.distcp b/checkpoint-1200/pytorch_model_fsdp_0/__6_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..53f76356e59dbef069d2245cdd42838446acb980 --- /dev/null +++ b/checkpoint-1200/pytorch_model_fsdp_0/__6_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:16af1e326dbe8777e65498cbe88acc541335a0757e2522dbb0c9c598b49161f7 +size 3003648 diff --git a/checkpoint-1200/pytorch_model_fsdp_0/__7_0.distcp b/checkpoint-1200/pytorch_model_fsdp_0/__7_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..c7f8d0d873b79d96370e3688a0fd584902696f6e --- /dev/null +++ b/checkpoint-1200/pytorch_model_fsdp_0/__7_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:861b31f574b06a93aab479d9b2c77d38c2c2f008137b3d60db2161d1565a5d1b +size 3003648 diff --git a/checkpoint-1200/rng_state_0.pth b/checkpoint-1200/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..5f189e74ec3c2b32bdf53731d15eeabdb45c473c --- /dev/null +++ b/checkpoint-1200/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:84c26b2a8acc7af904f88833cb6aa2007f56b758cc2bb09f4af6a136dcf2254e +size 15920 diff --git a/checkpoint-1200/rng_state_1.pth b/checkpoint-1200/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..e7569c9da2848265f26dc13884b4d74d5c78a6d1 --- /dev/null +++ b/checkpoint-1200/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:158efb2c453739224cd1a02c979b41052987fa4da6c1ca00610e5d806809e0b8 +size 15984 diff --git a/checkpoint-1200/rng_state_2.pth b/checkpoint-1200/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..96ddbce766334aeca24bedb3ad7523b5858cf5be --- /dev/null +++ b/checkpoint-1200/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e69bd054b5237b13d22eff3a3128acb49fbeda87aa873c3641f4221fa18abfb +size 15984 diff --git a/checkpoint-1200/rng_state_3.pth b/checkpoint-1200/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..bd11ab4ad1207e0e4d739bbf2e662fc3ae7ad75b --- /dev/null +++ b/checkpoint-1200/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01ca716fbae1882edf4fce6f1a9b9ee51f9bcdd08bceaec254a0906850aa5f3a +size 15984 diff --git a/checkpoint-1200/rng_state_4.pth b/checkpoint-1200/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..133fdff1da712ef7cbf2f6acd126fa45a65dfec5 --- /dev/null +++ b/checkpoint-1200/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:baca997feb34a76ea1c234c3cba01504c1ec11987d3b0b60d72a72245855c8b5 +size 15984 diff --git a/checkpoint-1200/rng_state_5.pth b/checkpoint-1200/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..c20e2ebff6f82adba59a56ee75a0f79d8f92695c --- /dev/null +++ b/checkpoint-1200/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1105b7213ffc67ccbe98e30fca965f1bea43cb45c136d5c8d7bdddc87ef6cde +size 15984 diff --git a/checkpoint-1200/rng_state_6.pth b/checkpoint-1200/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..30c01500e59bca4e78787572747633ce30a0ff44 --- /dev/null +++ b/checkpoint-1200/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4502eeb033ad80488d0ce1a43b8dcabb5e0552838a1afaa44f198f1fa8519580 +size 15984 diff --git a/checkpoint-1200/rng_state_7.pth b/checkpoint-1200/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..296cab46339e243450eba8853d7e69a5d544069d --- /dev/null +++ b/checkpoint-1200/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:364b0c128330c69d4648979aa306cf1cdc8e9a164f74b3a408e4fe68d4f6da7b +size 15984 diff --git a/checkpoint-1200/scheduler.pt b/checkpoint-1200/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..eb1b33cc62da996510524591d7252c6e3ea166f0 --- /dev/null +++ b/checkpoint-1200/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b015bc498e5b4ffb7ed88672aba64b1aba2e32e94c4926ce4107ea8baf36834c +size 1064 diff --git a/checkpoint-1200/trainer_state.json b/checkpoint-1200/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..63be29c1fc2c67a8af492954d60ac9adbf75f7db --- /dev/null +++ b/checkpoint-1200/trainer_state.json @@ -0,0 +1,466 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.6, + "eval_steps": 100, + "global_step": 1200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0125, + "grad_norm": 0.14176353812217712, + "learning_rate": 0.0004, + "loss": 1.1525, + "step": 25 + }, + { + "epoch": 0.025, + "grad_norm": 0.1460508555173874, + "learning_rate": 0.0004998852503731983, + "loss": 1.047, + "step": 50 + }, + { + "epoch": 0.0375, + "grad_norm": 0.2368021011352539, + "learning_rate": 0.0004993848168027977, + "loss": 0.8529, + "step": 75 + }, + { + "epoch": 0.05, + "grad_norm": 0.14488168060779572, + "learning_rate": 0.0004984880506341147, + "loss": 0.9756, + "step": 100 + }, + { + "epoch": 0.05, + "eval_loss": 0.9470569491386414, + "eval_runtime": 845.9115, + "eval_samples_per_second": 1.297, + "eval_steps_per_second": 0.021, + "step": 100 + }, + { + "epoch": 0.0625, + "grad_norm": 0.1415175348520279, + "learning_rate": 0.0004971963770447935, + "loss": 0.9564, + "step": 125 + }, + { + "epoch": 0.075, + "grad_norm": 0.1751064509153366, + "learning_rate": 0.0004955118488155782, + "loss": 0.711, + "step": 150 + }, + { + "epoch": 0.0875, + "grad_norm": 0.1439943015575409, + "learning_rate": 0.0004934371430679492, + "loss": 0.9409, + "step": 175 + }, + { + "epoch": 0.1, + "grad_norm": 0.15947112441062927, + "learning_rate": 0.0004909755570095319, + "loss": 0.8979, + "step": 200 + }, + { + "epoch": 0.1, + "eval_loss": 0.9711233973503113, + "eval_runtime": 845.6433, + "eval_samples_per_second": 1.297, + "eval_steps_per_second": 0.021, + "step": 200 + }, + { + "epoch": 0.1125, + "grad_norm": 0.14685837924480438, + "learning_rate": 0.0004881310026940389, + "loss": 0.6376, + "step": 225 + }, + { + "epoch": 0.125, + "grad_norm": 0.15040776133537292, + "learning_rate": 0.0004849080008040734, + "loss": 0.927, + "step": 250 + }, + { + "epoch": 0.1375, + "grad_norm": 0.16087745130062103, + "learning_rate": 0.00048131167346667446, + "loss": 0.8456, + "step": 275 + }, + { + "epoch": 0.15, + "grad_norm": 0.15025638043880463, + "learning_rate": 0.00047734773611302284, + "loss": 0.6029, + "step": 300 + }, + { + "epoch": 0.15, + "eval_loss": 1.0056413412094116, + "eval_runtime": 848.447, + "eval_samples_per_second": 1.293, + "eval_steps_per_second": 0.021, + "step": 300 + }, + { + "epoch": 0.1625, + "grad_norm": 0.15893957018852234, + "learning_rate": 0.0004730224883952422, + "loss": 0.9036, + "step": 325 + }, + { + "epoch": 0.175, + "grad_norm": 0.1535714715719223, + "learning_rate": 0.0004683428041747334, + "loss": 0.828, + "step": 350 + }, + { + "epoch": 0.1875, + "grad_norm": 0.1718970686197281, + "learning_rate": 0.0004633161205979517, + "loss": 0.5944, + "step": 375 + }, + { + "epoch": 0.2, + "grad_norm": 0.17664535343647003, + "learning_rate": 0.0004579504262769877, + "loss": 0.8654, + "step": 400 + }, + { + "epoch": 0.2, + "eval_loss": 1.0151112079620361, + "eval_runtime": 839.7482, + "eval_samples_per_second": 1.306, + "eval_steps_per_second": 0.021, + "step": 400 + }, + { + "epoch": 0.2125, + "grad_norm": 0.17122088372707367, + "learning_rate": 0.0004522542485937369, + "loss": 0.8078, + "step": 425 + }, + { + "epoch": 0.225, + "grad_norm": 0.19420970976352692, + "learning_rate": 0.00044623664014783386, + "loss": 0.5735, + "step": 450 + }, + { + "epoch": 0.2375, + "grad_norm": 0.18166953325271606, + "learning_rate": 0.00043990716436988924, + "loss": 0.8604, + "step": 475 + }, + { + "epoch": 0.25, + "grad_norm": 0.1502382755279541, + "learning_rate": 0.0004332758803228925, + "loss": 0.7673, + "step": 500 + }, + { + "epoch": 0.25, + "eval_loss": 1.0454745292663574, + "eval_runtime": 841.3951, + "eval_samples_per_second": 1.304, + "eval_steps_per_second": 0.021, + "step": 500 + }, + { + "epoch": 0.2625, + "grad_norm": 0.16875839233398438, + "learning_rate": 0.00042635332671593575, + "loss": 0.5882, + "step": 525 + }, + { + "epoch": 0.275, + "grad_norm": 0.16070039570331573, + "learning_rate": 0.00041915050515566445, + "loss": 0.8175, + "step": 550 + }, + { + "epoch": 0.2875, + "grad_norm": 0.16584184765815735, + "learning_rate": 0.00041167886266207167, + "loss": 0.7795, + "step": 575 + }, + { + "epoch": 0.3, + "grad_norm": 0.15345798432826996, + "learning_rate": 0.0004039502734764241, + "loss": 0.7331, + "step": 600 + }, + { + "epoch": 0.3, + "eval_loss": 1.0627943277359009, + "eval_runtime": 849.4944, + "eval_samples_per_second": 1.291, + "eval_steps_per_second": 0.021, + "step": 600 + }, + { + "epoch": 0.3125, + "grad_norm": 0.16241031885147095, + "learning_rate": 0.0003959770201902294, + "loss": 0.7436, + "step": 625 + }, + { + "epoch": 0.325, + "grad_norm": 0.1531112939119339, + "learning_rate": 0.0003877717742252371, + "loss": 0.6345, + "step": 650 + }, + { + "epoch": 0.3375, + "grad_norm": 0.15790140628814697, + "learning_rate": 0.00037934757569549495, + "loss": 0.7351, + "step": 675 + }, + { + "epoch": 0.35, + "grad_norm": 0.1822807490825653, + "learning_rate": 0.00037071781268346345, + "loss": 0.745, + "step": 700 + }, + { + "epoch": 0.35, + "eval_loss": 1.07315993309021, + "eval_runtime": 837.5276, + "eval_samples_per_second": 1.31, + "eval_steps_per_second": 0.021, + "step": 700 + }, + { + "epoch": 0.3625, + "grad_norm": 0.15983282029628754, + "learning_rate": 0.00036189619996312495, + "loss": 0.5972, + "step": 725 + }, + { + "epoch": 0.375, + "grad_norm": 0.18202389776706696, + "learning_rate": 0.00035289675720390174, + "loss": 0.759, + "step": 750 + }, + { + "epoch": 0.3875, + "grad_norm": 0.16057106852531433, + "learning_rate": 0.00034373378669002105, + "loss": 0.7358, + "step": 775 + }, + { + "epoch": 0.4, + "grad_norm": 0.1680625081062317, + "learning_rate": 0.00033442185059073706, + "loss": 0.5636, + "step": 800 + }, + { + "epoch": 0.4, + "eval_loss": 1.0932456254959106, + "eval_runtime": 840.1331, + "eval_samples_per_second": 1.306, + "eval_steps_per_second": 0.021, + "step": 800 + }, + { + "epoch": 0.4125, + "grad_norm": 0.15613198280334473, + "learning_rate": 0.00032497574781753367, + "loss": 0.7596, + "step": 825 + }, + { + "epoch": 0.425, + "grad_norm": 0.1628854125738144, + "learning_rate": 0.000315410490505086, + "loss": 0.7282, + "step": 850 + }, + { + "epoch": 0.4375, + "grad_norm": 0.16740979254245758, + "learning_rate": 0.0003057412801533589, + "loss": 0.5325, + "step": 875 + }, + { + "epoch": 0.45, + "grad_norm": 0.1634828895330429, + "learning_rate": 0.0002959834834687587, + "loss": 0.778, + "step": 900 + }, + { + "epoch": 0.45, + "eval_loss": 1.0952215194702148, + "eval_runtime": 836.473, + "eval_samples_per_second": 1.311, + "eval_steps_per_second": 0.022, + "step": 900 + }, + { + "epoch": 0.4625, + "grad_norm": 0.161700040102005, + "learning_rate": 0.00028615260794273236, + "loss": 0.7255, + "step": 925 + }, + { + "epoch": 0.475, + "grad_norm": 0.16280074417591095, + "learning_rate": 0.00027626427720662416, + "loss": 0.4993, + "step": 950 + }, + { + "epoch": 0.4875, + "grad_norm": 0.16114428639411926, + "learning_rate": 0.00026633420620195917, + "loss": 0.7765, + "step": 975 + }, + { + "epoch": 0.5, + "grad_norm": 0.16209788620471954, + "learning_rate": 0.00025637817620561263, + "loss": 0.7221, + "step": 1000 + }, + { + "epoch": 0.5, + "eval_loss": 1.1190329790115356, + "eval_runtime": 835.4164, + "eval_samples_per_second": 1.313, + "eval_steps_per_second": 0.022, + "step": 1000 + }, + { + "epoch": 0.5125, + "grad_norm": 0.15057620406150818, + "learning_rate": 0.0002464120097495559, + "loss": 0.4929, + "step": 1025 + }, + { + "epoch": 0.525, + "grad_norm": 0.16715486347675323, + "learning_rate": 0.00023645154547503855, + "loss": 0.7896, + "step": 1050 + }, + { + "epoch": 0.5375, + "grad_norm": 0.1591680645942688, + "learning_rate": 0.00022651261296116894, + "loss": 0.6999, + "step": 1075 + }, + { + "epoch": 0.55, + "grad_norm": 0.17062760889530182, + "learning_rate": 0.00021661100756789666, + "loss": 0.4708, + "step": 1100 + }, + { + "epoch": 0.55, + "eval_loss": 1.1381303071975708, + "eval_runtime": 839.2576, + "eval_samples_per_second": 1.307, + "eval_steps_per_second": 0.021, + "step": 1100 + }, + { + "epoch": 0.5625, + "grad_norm": 0.16746024787425995, + "learning_rate": 0.00020676246533337764, + "loss": 0.8073, + "step": 1125 + }, + { + "epoch": 0.575, + "grad_norm": 0.18011848628520966, + "learning_rate": 0.00019698263796561526, + "loss": 0.7153, + "step": 1150 + }, + { + "epoch": 0.5875, + "grad_norm": 0.16664239764213562, + "learning_rate": 0.00018728706796812333, + "loss": 0.6313, + "step": 1175 + }, + { + "epoch": 0.6, + "grad_norm": 0.1630394607782364, + "learning_rate": 0.00017769116393914037, + "loss": 0.6952, + "step": 1200 + }, + { + "epoch": 0.6, + "eval_loss": 1.1242510080337524, + "eval_runtime": 847.0203, + "eval_samples_per_second": 1.295, + "eval_steps_per_second": 0.021, + "step": 1200 + } + ], + "logging_steps": 25, + "max_steps": 2000, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.2095178520343347e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1300/optimizer_0/.metadata b/checkpoint-1300/optimizer_0/.metadata new file mode 100644 index 0000000000000000000000000000000000000000..7885b107640c3313d28bc6f5b11f24546172c3f6 --- /dev/null +++ b/checkpoint-1300/optimizer_0/.metadata @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:385a78caa5f0eb7bbf10ed4ddc6a0f0505e8231eae9f04f10021db54c5d8dab4 +size 869362 diff --git a/checkpoint-1300/optimizer_0/__0_0.distcp b/checkpoint-1300/optimizer_0/__0_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..2a9c3f2e235c53dd671e3cb3a4b096900fb04afa --- /dev/null +++ b/checkpoint-1300/optimizer_0/__0_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58819bc5c85b2e66dd875389f6a1a73ea4d1901a420a897d0171d8137f1494d5 +size 6008476 diff --git a/checkpoint-1300/optimizer_0/__1_0.distcp b/checkpoint-1300/optimizer_0/__1_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..90fc917e29def6296398583e9d5a311fdb386182 --- /dev/null +++ b/checkpoint-1300/optimizer_0/__1_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:05e4cc575a699ea54569278a91693c5832f97078ec63bc32a52c08e94a673e2c +size 6041200 diff --git a/checkpoint-1300/optimizer_0/__2_0.distcp b/checkpoint-1300/optimizer_0/__2_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..bd46ce92baba17b44ef70b7ae0459aca16cd79b2 --- /dev/null +++ b/checkpoint-1300/optimizer_0/__2_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8423e8f8312eabc7563290ba4c1123fad7650115615be8a54ba4329ed936a29a +size 6041200 diff --git a/checkpoint-1300/optimizer_0/__3_0.distcp b/checkpoint-1300/optimizer_0/__3_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..ec36049bb4ceb4a26216dbaae6843e1453a519b9 --- /dev/null +++ b/checkpoint-1300/optimizer_0/__3_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:05519194b03280f9aad670049c4e94875dcae4041d14596056eedd46d03b71e0 +size 6043476 diff --git a/checkpoint-1300/optimizer_0/__4_0.distcp b/checkpoint-1300/optimizer_0/__4_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..333c7d90739a8728fcd623bc50d7a384e395a9ae --- /dev/null +++ b/checkpoint-1300/optimizer_0/__4_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5cb2fa1ae171a1827ae02430d855170431aa312209c490b49d5748d2a987a32 +size 6057364 diff --git a/checkpoint-1300/optimizer_0/__5_0.distcp b/checkpoint-1300/optimizer_0/__5_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..3e6602a913f424ac587e49dba41e939358cdfe43 --- /dev/null +++ b/checkpoint-1300/optimizer_0/__5_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:882c371ffde7ac58ce382dbd9bb487bde800a6357caa5379a055a5ddcfbd3f16 +size 6042612 diff --git a/checkpoint-1300/optimizer_0/__6_0.distcp b/checkpoint-1300/optimizer_0/__6_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..2d05c02b7f88f376bcfeee464985ca9ce1e57c03 --- /dev/null +++ b/checkpoint-1300/optimizer_0/__6_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eaad5c5a9e43c2e49277b8e33e2fc055e8e3e1c438fd86cf80f5d6dc55a6ad21 +size 6042612 diff --git a/checkpoint-1300/optimizer_0/__7_0.distcp b/checkpoint-1300/optimizer_0/__7_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..90808d511997b926aca1a5f0a22bf337c7cf0661 --- /dev/null +++ b/checkpoint-1300/optimizer_0/__7_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb54b7ed9b6b80cc321e0a03029b4269e5f22864ecba2371f88c24883611be42 +size 6042612 diff --git a/checkpoint-1300/pytorch_model_fsdp_0/.metadata b/checkpoint-1300/pytorch_model_fsdp_0/.metadata new file mode 100644 index 0000000000000000000000000000000000000000..1cbc7512d4faf28b110e83395c6b138b966c7962 --- /dev/null +++ b/checkpoint-1300/pytorch_model_fsdp_0/.metadata @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b84a4083b299731f53c4e293bd49ab1d5198236129ad3f806e6332dab8cb4997 +size 339852 diff --git a/checkpoint-1300/pytorch_model_fsdp_0/__0_0.distcp b/checkpoint-1300/pytorch_model_fsdp_0/__0_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..9e34089367ab16f0798a11bb94d82e3ee5ece5f5 --- /dev/null +++ b/checkpoint-1300/pytorch_model_fsdp_0/__0_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d385037c05817e72e9eda895da3fac74cd1a74936bf934a09df65834efc75e8d +size 3003648 diff --git a/checkpoint-1300/pytorch_model_fsdp_0/__1_0.distcp b/checkpoint-1300/pytorch_model_fsdp_0/__1_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..96616a814dccc4fbd340de74c4a0a9351f2a6a22 --- /dev/null +++ b/checkpoint-1300/pytorch_model_fsdp_0/__1_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:802ae77d540fe2975e8fe0c590012c42c87ac05b1c8d0762b59acf4ad1110ce4 +size 3003648 diff --git a/checkpoint-1300/pytorch_model_fsdp_0/__2_0.distcp b/checkpoint-1300/pytorch_model_fsdp_0/__2_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..820c8ac3f0a7470319346c265b3ea78e30478ed8 --- /dev/null +++ b/checkpoint-1300/pytorch_model_fsdp_0/__2_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:274685b98e68a0ae04d5a1101f8b20ef844390b21a7f5c830ca70522c4ceb69f +size 3003648 diff --git a/checkpoint-1300/pytorch_model_fsdp_0/__3_0.distcp b/checkpoint-1300/pytorch_model_fsdp_0/__3_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..9ee58002112e7b9290913210a9e7dc09d9f9dc64 --- /dev/null +++ b/checkpoint-1300/pytorch_model_fsdp_0/__3_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bac547f443ba36645eee5f9f72e585a499ed89eb6252de1fa27142f4d433ccb0 +size 3003648 diff --git a/checkpoint-1300/pytorch_model_fsdp_0/__4_0.distcp b/checkpoint-1300/pytorch_model_fsdp_0/__4_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..670d84f955da4430f4c38cbf13b651ab0f8b4e2e --- /dev/null +++ b/checkpoint-1300/pytorch_model_fsdp_0/__4_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69a76fe508e19add0a411c404b387895c67bc7dc876000a0cabbf576eab3a79e +size 3003648 diff --git a/checkpoint-1300/pytorch_model_fsdp_0/__5_0.distcp b/checkpoint-1300/pytorch_model_fsdp_0/__5_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..f5f9f11413aff3d2a6164c815740ec60a458b6ac --- /dev/null +++ b/checkpoint-1300/pytorch_model_fsdp_0/__5_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf778f9fd1065ddb802113254e4194bb1dae99fd9ee43be576ccd2cd2c4da896 +size 3003648 diff --git a/checkpoint-1300/pytorch_model_fsdp_0/__6_0.distcp b/checkpoint-1300/pytorch_model_fsdp_0/__6_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..f6d13c4824e8f5651843b8cf46e0c2c1669f90dc --- /dev/null +++ b/checkpoint-1300/pytorch_model_fsdp_0/__6_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b08f646edd67437f0130f7165369dfba793c932d381896a1570a57db7783c85 +size 3003648 diff --git a/checkpoint-1300/pytorch_model_fsdp_0/__7_0.distcp b/checkpoint-1300/pytorch_model_fsdp_0/__7_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..ab157596d68f43838bc3eaa3d29225d40f8a7e99 --- /dev/null +++ b/checkpoint-1300/pytorch_model_fsdp_0/__7_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81c6594f339d04fa79dfa65fceba62ea7321bcf4c125c34eec0c95120f8f3a1a +size 3003648 diff --git a/checkpoint-1300/rng_state_0.pth b/checkpoint-1300/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..02683b94819a040ad40ce62df07cbf9f1df7ac07 --- /dev/null +++ b/checkpoint-1300/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7fd89795021dd0160bb820ad9e658cd1b0d80e3405b507e1c81edf6001bc8ca +size 15920 diff --git a/checkpoint-1300/rng_state_1.pth b/checkpoint-1300/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..2f2d417794539025b8ef9394635570b28a840a69 --- /dev/null +++ b/checkpoint-1300/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59c14448ee646c5307c2110c2dbd86e370f0560e1b3ea0772e54e8789b3405d6 +size 15984 diff --git a/checkpoint-1300/rng_state_2.pth b/checkpoint-1300/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..83a36ac16625c9b64cfdbc6cad05b7f4c7fa3422 --- /dev/null +++ b/checkpoint-1300/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5be62bd202108de11264c1be2eb9abd7dc33d1b2edb627ba15cd11e3cb6250d +size 15984 diff --git a/checkpoint-1300/rng_state_3.pth b/checkpoint-1300/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..efed9169b167ba01c0f8245575a19958f3143771 --- /dev/null +++ b/checkpoint-1300/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d1f4f8d1e91666e894d50d7e2886591715ba36ed4a759c2ea2acd4a2145bc0a1 +size 15984 diff --git a/checkpoint-1300/rng_state_4.pth b/checkpoint-1300/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..f6924c1ba4ca473f751b2d2c970bb6a03fba8a6e --- /dev/null +++ b/checkpoint-1300/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41e84b003216499d66cd69ab0951adaa3bcbda1e67fd3962cc82600206da2c25 +size 15984 diff --git a/checkpoint-1300/rng_state_5.pth b/checkpoint-1300/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..50fedbed6bae3628716cf7e83cf39732524b200b --- /dev/null +++ b/checkpoint-1300/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d07b16fc7185bf60589c6dc98ebb7edb5b3e9a7ecf3a0cb1a83bfbc60ed674c2 +size 15984 diff --git a/checkpoint-1300/rng_state_6.pth b/checkpoint-1300/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..f76d98ab45c53e6a766641c614b055d3fd180f40 --- /dev/null +++ b/checkpoint-1300/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45b6bda4561656d04ad5a1edb957a0fd798a9af4aac1623510b9407bc589f070 +size 15984 diff --git a/checkpoint-1300/rng_state_7.pth b/checkpoint-1300/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..6a446cf921419ddb5cd7b23d29c622635cedce4a --- /dev/null +++ b/checkpoint-1300/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:088bf86c6b6f9482925a6c46f7a5976920adeb27963828cdc042e3e4328e7bff +size 15984 diff --git a/checkpoint-1300/scheduler.pt b/checkpoint-1300/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..72652d1ca1d20c565375320a634597caa64d6264 --- /dev/null +++ b/checkpoint-1300/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da7bd64925acbd9bc3dbd1a44a27b1aa523daf766a871a6ffb2ba33b7fc1ea02 +size 1064 diff --git a/checkpoint-1300/trainer_state.json b/checkpoint-1300/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..ad936c9ef7ce4b8609ba244c29f22d9258d14c10 --- /dev/null +++ b/checkpoint-1300/trainer_state.json @@ -0,0 +1,502 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.65, + "eval_steps": 100, + "global_step": 1300, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0125, + "grad_norm": 0.14176353812217712, + "learning_rate": 0.0004, + "loss": 1.1525, + "step": 25 + }, + { + "epoch": 0.025, + "grad_norm": 0.1460508555173874, + "learning_rate": 0.0004998852503731983, + "loss": 1.047, + "step": 50 + }, + { + "epoch": 0.0375, + "grad_norm": 0.2368021011352539, + "learning_rate": 0.0004993848168027977, + "loss": 0.8529, + "step": 75 + }, + { + "epoch": 0.05, + "grad_norm": 0.14488168060779572, + "learning_rate": 0.0004984880506341147, + "loss": 0.9756, + "step": 100 + }, + { + "epoch": 0.05, + "eval_loss": 0.9470569491386414, + "eval_runtime": 845.9115, + "eval_samples_per_second": 1.297, + "eval_steps_per_second": 0.021, + "step": 100 + }, + { + "epoch": 0.0625, + "grad_norm": 0.1415175348520279, + "learning_rate": 0.0004971963770447935, + "loss": 0.9564, + "step": 125 + }, + { + "epoch": 0.075, + "grad_norm": 0.1751064509153366, + "learning_rate": 0.0004955118488155782, + "loss": 0.711, + "step": 150 + }, + { + "epoch": 0.0875, + "grad_norm": 0.1439943015575409, + "learning_rate": 0.0004934371430679492, + "loss": 0.9409, + "step": 175 + }, + { + "epoch": 0.1, + "grad_norm": 0.15947112441062927, + "learning_rate": 0.0004909755570095319, + "loss": 0.8979, + "step": 200 + }, + { + "epoch": 0.1, + "eval_loss": 0.9711233973503113, + "eval_runtime": 845.6433, + "eval_samples_per_second": 1.297, + "eval_steps_per_second": 0.021, + "step": 200 + }, + { + "epoch": 0.1125, + "grad_norm": 0.14685837924480438, + "learning_rate": 0.0004881310026940389, + "loss": 0.6376, + "step": 225 + }, + { + "epoch": 0.125, + "grad_norm": 0.15040776133537292, + "learning_rate": 0.0004849080008040734, + "loss": 0.927, + "step": 250 + }, + { + "epoch": 0.1375, + "grad_norm": 0.16087745130062103, + "learning_rate": 0.00048131167346667446, + "loss": 0.8456, + "step": 275 + }, + { + "epoch": 0.15, + "grad_norm": 0.15025638043880463, + "learning_rate": 0.00047734773611302284, + "loss": 0.6029, + "step": 300 + }, + { + "epoch": 0.15, + "eval_loss": 1.0056413412094116, + "eval_runtime": 848.447, + "eval_samples_per_second": 1.293, + "eval_steps_per_second": 0.021, + "step": 300 + }, + { + "epoch": 0.1625, + "grad_norm": 0.15893957018852234, + "learning_rate": 0.0004730224883952422, + "loss": 0.9036, + "step": 325 + }, + { + "epoch": 0.175, + "grad_norm": 0.1535714715719223, + "learning_rate": 0.0004683428041747334, + "loss": 0.828, + "step": 350 + }, + { + "epoch": 0.1875, + "grad_norm": 0.1718970686197281, + "learning_rate": 0.0004633161205979517, + "loss": 0.5944, + "step": 375 + }, + { + "epoch": 0.2, + "grad_norm": 0.17664535343647003, + "learning_rate": 0.0004579504262769877, + "loss": 0.8654, + "step": 400 + }, + { + "epoch": 0.2, + "eval_loss": 1.0151112079620361, + "eval_runtime": 839.7482, + "eval_samples_per_second": 1.306, + "eval_steps_per_second": 0.021, + "step": 400 + }, + { + "epoch": 0.2125, + "grad_norm": 0.17122088372707367, + "learning_rate": 0.0004522542485937369, + "loss": 0.8078, + "step": 425 + }, + { + "epoch": 0.225, + "grad_norm": 0.19420970976352692, + "learning_rate": 0.00044623664014783386, + "loss": 0.5735, + "step": 450 + }, + { + "epoch": 0.2375, + "grad_norm": 0.18166953325271606, + "learning_rate": 0.00043990716436988924, + "loss": 0.8604, + "step": 475 + }, + { + "epoch": 0.25, + "grad_norm": 0.1502382755279541, + "learning_rate": 0.0004332758803228925, + "loss": 0.7673, + "step": 500 + }, + { + "epoch": 0.25, + "eval_loss": 1.0454745292663574, + "eval_runtime": 841.3951, + "eval_samples_per_second": 1.304, + "eval_steps_per_second": 0.021, + "step": 500 + }, + { + "epoch": 0.2625, + "grad_norm": 0.16875839233398438, + "learning_rate": 0.00042635332671593575, + "loss": 0.5882, + "step": 525 + }, + { + "epoch": 0.275, + "grad_norm": 0.16070039570331573, + "learning_rate": 0.00041915050515566445, + "loss": 0.8175, + "step": 550 + }, + { + "epoch": 0.2875, + "grad_norm": 0.16584184765815735, + "learning_rate": 0.00041167886266207167, + "loss": 0.7795, + "step": 575 + }, + { + "epoch": 0.3, + "grad_norm": 0.15345798432826996, + "learning_rate": 0.0004039502734764241, + "loss": 0.7331, + "step": 600 + }, + { + "epoch": 0.3, + "eval_loss": 1.0627943277359009, + "eval_runtime": 849.4944, + "eval_samples_per_second": 1.291, + "eval_steps_per_second": 0.021, + "step": 600 + }, + { + "epoch": 0.3125, + "grad_norm": 0.16241031885147095, + "learning_rate": 0.0003959770201902294, + "loss": 0.7436, + "step": 625 + }, + { + "epoch": 0.325, + "grad_norm": 0.1531112939119339, + "learning_rate": 0.0003877717742252371, + "loss": 0.6345, + "step": 650 + }, + { + "epoch": 0.3375, + "grad_norm": 0.15790140628814697, + "learning_rate": 0.00037934757569549495, + "loss": 0.7351, + "step": 675 + }, + { + "epoch": 0.35, + "grad_norm": 0.1822807490825653, + "learning_rate": 0.00037071781268346345, + "loss": 0.745, + "step": 700 + }, + { + "epoch": 0.35, + "eval_loss": 1.07315993309021, + "eval_runtime": 837.5276, + "eval_samples_per_second": 1.31, + "eval_steps_per_second": 0.021, + "step": 700 + }, + { + "epoch": 0.3625, + "grad_norm": 0.15983282029628754, + "learning_rate": 0.00036189619996312495, + "loss": 0.5972, + "step": 725 + }, + { + "epoch": 0.375, + "grad_norm": 0.18202389776706696, + "learning_rate": 0.00035289675720390174, + "loss": 0.759, + "step": 750 + }, + { + "epoch": 0.3875, + "grad_norm": 0.16057106852531433, + "learning_rate": 0.00034373378669002105, + "loss": 0.7358, + "step": 775 + }, + { + "epoch": 0.4, + "grad_norm": 0.1680625081062317, + "learning_rate": 0.00033442185059073706, + "loss": 0.5636, + "step": 800 + }, + { + "epoch": 0.4, + "eval_loss": 1.0932456254959106, + "eval_runtime": 840.1331, + "eval_samples_per_second": 1.306, + "eval_steps_per_second": 0.021, + "step": 800 + }, + { + "epoch": 0.4125, + "grad_norm": 0.15613198280334473, + "learning_rate": 0.00032497574781753367, + "loss": 0.7596, + "step": 825 + }, + { + "epoch": 0.425, + "grad_norm": 0.1628854125738144, + "learning_rate": 0.000315410490505086, + "loss": 0.7282, + "step": 850 + }, + { + "epoch": 0.4375, + "grad_norm": 0.16740979254245758, + "learning_rate": 0.0003057412801533589, + "loss": 0.5325, + "step": 875 + }, + { + "epoch": 0.45, + "grad_norm": 0.1634828895330429, + "learning_rate": 0.0002959834834687587, + "loss": 0.778, + "step": 900 + }, + { + "epoch": 0.45, + "eval_loss": 1.0952215194702148, + "eval_runtime": 836.473, + "eval_samples_per_second": 1.311, + "eval_steps_per_second": 0.022, + "step": 900 + }, + { + "epoch": 0.4625, + "grad_norm": 0.161700040102005, + "learning_rate": 0.00028615260794273236, + "loss": 0.7255, + "step": 925 + }, + { + "epoch": 0.475, + "grad_norm": 0.16280074417591095, + "learning_rate": 0.00027626427720662416, + "loss": 0.4993, + "step": 950 + }, + { + "epoch": 0.4875, + "grad_norm": 0.16114428639411926, + "learning_rate": 0.00026633420620195917, + "loss": 0.7765, + "step": 975 + }, + { + "epoch": 0.5, + "grad_norm": 0.16209788620471954, + "learning_rate": 0.00025637817620561263, + "loss": 0.7221, + "step": 1000 + }, + { + "epoch": 0.5, + "eval_loss": 1.1190329790115356, + "eval_runtime": 835.4164, + "eval_samples_per_second": 1.313, + "eval_steps_per_second": 0.022, + "step": 1000 + }, + { + "epoch": 0.5125, + "grad_norm": 0.15057620406150818, + "learning_rate": 0.0002464120097495559, + "loss": 0.4929, + "step": 1025 + }, + { + "epoch": 0.525, + "grad_norm": 0.16715486347675323, + "learning_rate": 0.00023645154547503855, + "loss": 0.7896, + "step": 1050 + }, + { + "epoch": 0.5375, + "grad_norm": 0.1591680645942688, + "learning_rate": 0.00022651261296116894, + "loss": 0.6999, + "step": 1075 + }, + { + "epoch": 0.55, + "grad_norm": 0.17062760889530182, + "learning_rate": 0.00021661100756789666, + "loss": 0.4708, + "step": 1100 + }, + { + "epoch": 0.55, + "eval_loss": 1.1381303071975708, + "eval_runtime": 839.2576, + "eval_samples_per_second": 1.307, + "eval_steps_per_second": 0.021, + "step": 1100 + }, + { + "epoch": 0.5625, + "grad_norm": 0.16746024787425995, + "learning_rate": 0.00020676246533337764, + "loss": 0.8073, + "step": 1125 + }, + { + "epoch": 0.575, + "grad_norm": 0.18011848628520966, + "learning_rate": 0.00019698263796561526, + "loss": 0.7153, + "step": 1150 + }, + { + "epoch": 0.5875, + "grad_norm": 0.16664239764213562, + "learning_rate": 0.00018728706796812333, + "loss": 0.6313, + "step": 1175 + }, + { + "epoch": 0.6, + "grad_norm": 0.1630394607782364, + "learning_rate": 0.00017769116393914037, + "loss": 0.6952, + "step": 1200 + }, + { + "epoch": 0.6, + "eval_loss": 1.1242510080337524, + "eval_runtime": 847.0203, + "eval_samples_per_second": 1.295, + "eval_steps_per_second": 0.021, + "step": 1200 + }, + { + "epoch": 0.6125, + "grad_norm": 0.16016128659248352, + "learning_rate": 0.00016821017608365264, + "loss": 0.6161, + "step": 1225 + }, + { + "epoch": 0.625, + "grad_norm": 0.17186138033866882, + "learning_rate": 0.00015885917197714112, + "loss": 0.623, + "step": 1250 + }, + { + "epoch": 0.6375, + "grad_norm": 0.1764240562915802, + "learning_rate": 0.00014965301261957238, + "loss": 0.6988, + "step": 1275 + }, + { + "epoch": 0.65, + "grad_norm": 0.16019247472286224, + "learning_rate": 0.00014060632881768558, + "loss": 0.599, + "step": 1300 + }, + { + "epoch": 0.65, + "eval_loss": 1.1349693536758423, + "eval_runtime": 842.5056, + "eval_samples_per_second": 1.302, + "eval_steps_per_second": 0.021, + "step": 1300 + } + ], + "logging_steps": 25, + "max_steps": 2000, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.3103110063705293e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1400/optimizer_0/.metadata b/checkpoint-1400/optimizer_0/.metadata new file mode 100644 index 0000000000000000000000000000000000000000..593a3431ce9d36250ca401323813011b842d4148 --- /dev/null +++ b/checkpoint-1400/optimizer_0/.metadata @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54aeeeaace554884acc45eb83839e6329380003e419284572de07fc5fefe94af +size 869362 diff --git a/checkpoint-1400/optimizer_0/__0_0.distcp b/checkpoint-1400/optimizer_0/__0_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..f41082d58d7dcd4de859b3164f796e69c78c223d --- /dev/null +++ b/checkpoint-1400/optimizer_0/__0_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a5ad48ce5fbb9da3559e7110f0ed73b1a65dd698dcefa5a745b12842f2b0503 +size 6008476 diff --git a/checkpoint-1400/optimizer_0/__1_0.distcp b/checkpoint-1400/optimizer_0/__1_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..680e815ac029b27fa58aa367277ec0ce15dd73fc --- /dev/null +++ b/checkpoint-1400/optimizer_0/__1_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3db8cbbfa67ee1395efdb6f1ca874014bf588c86c817d01b8c9c95b660a8002 +size 6041200 diff --git a/checkpoint-1400/optimizer_0/__2_0.distcp b/checkpoint-1400/optimizer_0/__2_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..90f7175e3819764e6e2775c2e86cd1c5bfd64af8 --- /dev/null +++ b/checkpoint-1400/optimizer_0/__2_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa535eadea96cd3956185cee0a43ec5088af71e5b783bf8e5cc632a73676192c +size 6041200 diff --git a/checkpoint-1400/optimizer_0/__3_0.distcp b/checkpoint-1400/optimizer_0/__3_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..bc7933036fcb22f3a4dd7e1e3543221545ab7b0e --- /dev/null +++ b/checkpoint-1400/optimizer_0/__3_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3692e1deff0675170a371b96aea040714467a16b16fdbdd03251bbe1f6394902 +size 6043476 diff --git a/checkpoint-1400/optimizer_0/__4_0.distcp b/checkpoint-1400/optimizer_0/__4_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..92a92ee51263d7313e942e48ca30173bdb01ca47 --- /dev/null +++ b/checkpoint-1400/optimizer_0/__4_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:264a2dd55c9aeae4d65efc0f8313211607bcc1a571a8e69bd1d21d29ef1dd4ae +size 6057364 diff --git a/checkpoint-1400/optimizer_0/__5_0.distcp b/checkpoint-1400/optimizer_0/__5_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..3d9123dc2c5b0ef7b6106037efa654df1c5427a7 --- /dev/null +++ b/checkpoint-1400/optimizer_0/__5_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8737380dbdd61565df37fdc1b4be8c0a8f526f3dcfce7c5a033ed562d7dca4b0 +size 6042612 diff --git a/checkpoint-1400/optimizer_0/__6_0.distcp b/checkpoint-1400/optimizer_0/__6_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..2802637a472cd42856c4ae922082ed197c07f7ab --- /dev/null +++ b/checkpoint-1400/optimizer_0/__6_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3ecc4261271ba2fac0b56ea0b4767e353e016ba1f1307ba11909969c40fcde1 +size 6042612 diff --git a/checkpoint-1400/optimizer_0/__7_0.distcp b/checkpoint-1400/optimizer_0/__7_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..ca65c2de08218a8e5a97ec0f63b226af1a94922c --- /dev/null +++ b/checkpoint-1400/optimizer_0/__7_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c29d207cc79bf16921867f8e121ff795969e8b60011f66bd3d82e14a8ebf57da +size 6042612 diff --git a/checkpoint-1400/pytorch_model_fsdp_0/.metadata b/checkpoint-1400/pytorch_model_fsdp_0/.metadata new file mode 100644 index 0000000000000000000000000000000000000000..30b08c1b490b718726fec8c16fa0a7a5fa56c988 --- /dev/null +++ b/checkpoint-1400/pytorch_model_fsdp_0/.metadata @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e07a944b47100ae789073825d24a757c6dc55719cc1bdee845c8bbb495494d38 +size 339852 diff --git a/checkpoint-1400/pytorch_model_fsdp_0/__0_0.distcp b/checkpoint-1400/pytorch_model_fsdp_0/__0_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..57123fba99c0f673c93a4497265600d7c170bfe8 --- /dev/null +++ b/checkpoint-1400/pytorch_model_fsdp_0/__0_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f2a2df6bed94120d4abded7f2e28470c39425aff2810bec091962f28aa9fea7d +size 3003648 diff --git a/checkpoint-1400/pytorch_model_fsdp_0/__1_0.distcp b/checkpoint-1400/pytorch_model_fsdp_0/__1_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..2f3d1ec479d1b8ab410ed30660cf5d5f16b8b6c6 --- /dev/null +++ b/checkpoint-1400/pytorch_model_fsdp_0/__1_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e25c5af233777e7156fae8c858c4948e04b53ef3703adc95278d784d7e985b0 +size 3003648 diff --git a/checkpoint-1400/pytorch_model_fsdp_0/__2_0.distcp b/checkpoint-1400/pytorch_model_fsdp_0/__2_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..f23ca59035bedfbb94d9bb1b85414b01646de5a8 --- /dev/null +++ b/checkpoint-1400/pytorch_model_fsdp_0/__2_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22efe7eeccd71f2b0de4462cf9b5e58b696c5e166c1867960e8b9748567a0971 +size 3003648 diff --git a/checkpoint-1400/pytorch_model_fsdp_0/__3_0.distcp b/checkpoint-1400/pytorch_model_fsdp_0/__3_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..691ebd42a4074a0f5335e3b441e360c93cc43212 --- /dev/null +++ b/checkpoint-1400/pytorch_model_fsdp_0/__3_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f04a3d9ea98ba51c57498742048d239469e74c00dd96526cd1ba6bbd08387f1c +size 3003648 diff --git a/checkpoint-1400/pytorch_model_fsdp_0/__4_0.distcp b/checkpoint-1400/pytorch_model_fsdp_0/__4_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..ce3c4dda832c38e98e34de91287b9d09b0703a12 --- /dev/null +++ b/checkpoint-1400/pytorch_model_fsdp_0/__4_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8eb5456ee9987f0234aa72fa3f6b584d262f36e295d8980570022a6f8e08cacb +size 3003648 diff --git a/checkpoint-1400/pytorch_model_fsdp_0/__5_0.distcp b/checkpoint-1400/pytorch_model_fsdp_0/__5_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..49f82b50182c2d7463a85c60416645775020d5eb --- /dev/null +++ b/checkpoint-1400/pytorch_model_fsdp_0/__5_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf22d778d2183158e215019ccc749595acc48b121121acaaa02f8b4fff2965b4 +size 3003648 diff --git a/checkpoint-1400/pytorch_model_fsdp_0/__6_0.distcp b/checkpoint-1400/pytorch_model_fsdp_0/__6_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..b0197cf6ba657f39fca79965d17b51a8bb85e498 --- /dev/null +++ b/checkpoint-1400/pytorch_model_fsdp_0/__6_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4921174f7580f7c0ff3bc7b6f5a8957d85bcdf00790f75b418763e03f6ae203 +size 3003648 diff --git a/checkpoint-1400/pytorch_model_fsdp_0/__7_0.distcp b/checkpoint-1400/pytorch_model_fsdp_0/__7_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..df917dfc9450c61535bd58d3086ba5f6ff20de27 --- /dev/null +++ b/checkpoint-1400/pytorch_model_fsdp_0/__7_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf5117932849aee38377497ea6ca50c9ae718f3c9488e848b421275bb3120b40 +size 3003648 diff --git a/checkpoint-1400/rng_state_0.pth b/checkpoint-1400/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..227ec93c3e8ef56f4b4c2cca828cef131af39a71 --- /dev/null +++ b/checkpoint-1400/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d42a2eedba7a4091055a96ccd3dd4da2eaca6fbbe25f1c2e80ab817e8886a5de +size 15920 diff --git a/checkpoint-1400/rng_state_1.pth b/checkpoint-1400/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..de560c5e245753e21ecc7f31871114a4704ce933 --- /dev/null +++ b/checkpoint-1400/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9179322cd15281e69c1b0d5a3f50657fc311319f087cacb8b2e5938e7eb07e9a +size 15984 diff --git a/checkpoint-1400/rng_state_2.pth b/checkpoint-1400/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..2b105f23b205018fb489a337ae2d5b3f7d705293 --- /dev/null +++ b/checkpoint-1400/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:575be61fed0627afbdff204b0816dae4e338f50bd0d2049ecaaf1655573b6da8 +size 15984 diff --git a/checkpoint-1400/rng_state_3.pth b/checkpoint-1400/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..e6015106f13742c4787da603dabde3c77d68a3d8 --- /dev/null +++ b/checkpoint-1400/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78eba4696cda3ab2ac997a1652955234d9cd351ca320a61eebda0d811d802485 +size 15984 diff --git a/checkpoint-1400/rng_state_4.pth b/checkpoint-1400/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..34a4632a882aa5d234a0ee74fc465997f9dc55d6 --- /dev/null +++ b/checkpoint-1400/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:079ee4a128bab7511bd7c5d4c741b16adf9d7557be80143b70136278e26989b6 +size 15984 diff --git a/checkpoint-1400/rng_state_5.pth b/checkpoint-1400/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..9ebabe22764141052f308a11d43efdb26f55c746 --- /dev/null +++ b/checkpoint-1400/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af7a288fd74a41f90a133314e24ec753386013d31f41ff53c5be06f970265382 +size 15984 diff --git a/checkpoint-1400/rng_state_6.pth b/checkpoint-1400/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..291457d8a66ca684b3e5b40da23d5d44f68da956 --- /dev/null +++ b/checkpoint-1400/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d86ea638f561758fd19b01eae63675ca617641f68ebc2017b5613bf3f2cf71ff +size 15984 diff --git a/checkpoint-1400/rng_state_7.pth b/checkpoint-1400/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..289a19856c5fa6f64baecb063315516f8ad2ca77 --- /dev/null +++ b/checkpoint-1400/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19b9e9a5b7eb9ee987ef2641478c13a247070b27f40b93bd75415ecff0952a25 +size 15984 diff --git a/checkpoint-1400/scheduler.pt b/checkpoint-1400/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..a0a26edbd6707caad9939b100160a29bf32569fa --- /dev/null +++ b/checkpoint-1400/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f00dc98dfb48cefd72fc26922748e64c046f326e2d0dc623af08b2bb7f66af78 +size 1064 diff --git a/checkpoint-1400/trainer_state.json b/checkpoint-1400/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..09006fca8c8ba414a04af14b03960244848511fd --- /dev/null +++ b/checkpoint-1400/trainer_state.json @@ -0,0 +1,538 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.7, + "eval_steps": 100, + "global_step": 1400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0125, + "grad_norm": 0.14176353812217712, + "learning_rate": 0.0004, + "loss": 1.1525, + "step": 25 + }, + { + "epoch": 0.025, + "grad_norm": 0.1460508555173874, + "learning_rate": 0.0004998852503731983, + "loss": 1.047, + "step": 50 + }, + { + "epoch": 0.0375, + "grad_norm": 0.2368021011352539, + "learning_rate": 0.0004993848168027977, + "loss": 0.8529, + "step": 75 + }, + { + "epoch": 0.05, + "grad_norm": 0.14488168060779572, + "learning_rate": 0.0004984880506341147, + "loss": 0.9756, + "step": 100 + }, + { + "epoch": 0.05, + "eval_loss": 0.9470569491386414, + "eval_runtime": 845.9115, + "eval_samples_per_second": 1.297, + "eval_steps_per_second": 0.021, + "step": 100 + }, + { + "epoch": 0.0625, + "grad_norm": 0.1415175348520279, + "learning_rate": 0.0004971963770447935, + "loss": 0.9564, + "step": 125 + }, + { + "epoch": 0.075, + "grad_norm": 0.1751064509153366, + "learning_rate": 0.0004955118488155782, + "loss": 0.711, + "step": 150 + }, + { + "epoch": 0.0875, + "grad_norm": 0.1439943015575409, + "learning_rate": 0.0004934371430679492, + "loss": 0.9409, + "step": 175 + }, + { + "epoch": 0.1, + "grad_norm": 0.15947112441062927, + "learning_rate": 0.0004909755570095319, + "loss": 0.8979, + "step": 200 + }, + { + "epoch": 0.1, + "eval_loss": 0.9711233973503113, + "eval_runtime": 845.6433, + "eval_samples_per_second": 1.297, + "eval_steps_per_second": 0.021, + "step": 200 + }, + { + "epoch": 0.1125, + "grad_norm": 0.14685837924480438, + "learning_rate": 0.0004881310026940389, + "loss": 0.6376, + "step": 225 + }, + { + "epoch": 0.125, + "grad_norm": 0.15040776133537292, + "learning_rate": 0.0004849080008040734, + "loss": 0.927, + "step": 250 + }, + { + "epoch": 0.1375, + "grad_norm": 0.16087745130062103, + "learning_rate": 0.00048131167346667446, + "loss": 0.8456, + "step": 275 + }, + { + "epoch": 0.15, + "grad_norm": 0.15025638043880463, + "learning_rate": 0.00047734773611302284, + "loss": 0.6029, + "step": 300 + }, + { + "epoch": 0.15, + "eval_loss": 1.0056413412094116, + "eval_runtime": 848.447, + "eval_samples_per_second": 1.293, + "eval_steps_per_second": 0.021, + "step": 300 + }, + { + "epoch": 0.1625, + "grad_norm": 0.15893957018852234, + "learning_rate": 0.0004730224883952422, + "loss": 0.9036, + "step": 325 + }, + { + "epoch": 0.175, + "grad_norm": 0.1535714715719223, + "learning_rate": 0.0004683428041747334, + "loss": 0.828, + "step": 350 + }, + { + "epoch": 0.1875, + "grad_norm": 0.1718970686197281, + "learning_rate": 0.0004633161205979517, + "loss": 0.5944, + "step": 375 + }, + { + "epoch": 0.2, + "grad_norm": 0.17664535343647003, + "learning_rate": 0.0004579504262769877, + "loss": 0.8654, + "step": 400 + }, + { + "epoch": 0.2, + "eval_loss": 1.0151112079620361, + "eval_runtime": 839.7482, + "eval_samples_per_second": 1.306, + "eval_steps_per_second": 0.021, + "step": 400 + }, + { + "epoch": 0.2125, + "grad_norm": 0.17122088372707367, + "learning_rate": 0.0004522542485937369, + "loss": 0.8078, + "step": 425 + }, + { + "epoch": 0.225, + "grad_norm": 0.19420970976352692, + "learning_rate": 0.00044623664014783386, + "loss": 0.5735, + "step": 450 + }, + { + "epoch": 0.2375, + "grad_norm": 0.18166953325271606, + "learning_rate": 0.00043990716436988924, + "loss": 0.8604, + "step": 475 + }, + { + "epoch": 0.25, + "grad_norm": 0.1502382755279541, + "learning_rate": 0.0004332758803228925, + "loss": 0.7673, + "step": 500 + }, + { + "epoch": 0.25, + "eval_loss": 1.0454745292663574, + "eval_runtime": 841.3951, + "eval_samples_per_second": 1.304, + "eval_steps_per_second": 0.021, + "step": 500 + }, + { + "epoch": 0.2625, + "grad_norm": 0.16875839233398438, + "learning_rate": 0.00042635332671593575, + "loss": 0.5882, + "step": 525 + }, + { + "epoch": 0.275, + "grad_norm": 0.16070039570331573, + "learning_rate": 0.00041915050515566445, + "loss": 0.8175, + "step": 550 + }, + { + "epoch": 0.2875, + "grad_norm": 0.16584184765815735, + "learning_rate": 0.00041167886266207167, + "loss": 0.7795, + "step": 575 + }, + { + "epoch": 0.3, + "grad_norm": 0.15345798432826996, + "learning_rate": 0.0004039502734764241, + "loss": 0.7331, + "step": 600 + }, + { + "epoch": 0.3, + "eval_loss": 1.0627943277359009, + "eval_runtime": 849.4944, + "eval_samples_per_second": 1.291, + "eval_steps_per_second": 0.021, + "step": 600 + }, + { + "epoch": 0.3125, + "grad_norm": 0.16241031885147095, + "learning_rate": 0.0003959770201902294, + "loss": 0.7436, + "step": 625 + }, + { + "epoch": 0.325, + "grad_norm": 0.1531112939119339, + "learning_rate": 0.0003877717742252371, + "loss": 0.6345, + "step": 650 + }, + { + "epoch": 0.3375, + "grad_norm": 0.15790140628814697, + "learning_rate": 0.00037934757569549495, + "loss": 0.7351, + "step": 675 + }, + { + "epoch": 0.35, + "grad_norm": 0.1822807490825653, + "learning_rate": 0.00037071781268346345, + "loss": 0.745, + "step": 700 + }, + { + "epoch": 0.35, + "eval_loss": 1.07315993309021, + "eval_runtime": 837.5276, + "eval_samples_per_second": 1.31, + "eval_steps_per_second": 0.021, + "step": 700 + }, + { + "epoch": 0.3625, + "grad_norm": 0.15983282029628754, + "learning_rate": 0.00036189619996312495, + "loss": 0.5972, + "step": 725 + }, + { + "epoch": 0.375, + "grad_norm": 0.18202389776706696, + "learning_rate": 0.00035289675720390174, + "loss": 0.759, + "step": 750 + }, + { + "epoch": 0.3875, + "grad_norm": 0.16057106852531433, + "learning_rate": 0.00034373378669002105, + "loss": 0.7358, + "step": 775 + }, + { + "epoch": 0.4, + "grad_norm": 0.1680625081062317, + "learning_rate": 0.00033442185059073706, + "loss": 0.5636, + "step": 800 + }, + { + "epoch": 0.4, + "eval_loss": 1.0932456254959106, + "eval_runtime": 840.1331, + "eval_samples_per_second": 1.306, + "eval_steps_per_second": 0.021, + "step": 800 + }, + { + "epoch": 0.4125, + "grad_norm": 0.15613198280334473, + "learning_rate": 0.00032497574781753367, + "loss": 0.7596, + "step": 825 + }, + { + "epoch": 0.425, + "grad_norm": 0.1628854125738144, + "learning_rate": 0.000315410490505086, + "loss": 0.7282, + "step": 850 + }, + { + "epoch": 0.4375, + "grad_norm": 0.16740979254245758, + "learning_rate": 0.0003057412801533589, + "loss": 0.5325, + "step": 875 + }, + { + "epoch": 0.45, + "grad_norm": 0.1634828895330429, + "learning_rate": 0.0002959834834687587, + "loss": 0.778, + "step": 900 + }, + { + "epoch": 0.45, + "eval_loss": 1.0952215194702148, + "eval_runtime": 836.473, + "eval_samples_per_second": 1.311, + "eval_steps_per_second": 0.022, + "step": 900 + }, + { + "epoch": 0.4625, + "grad_norm": 0.161700040102005, + "learning_rate": 0.00028615260794273236, + "loss": 0.7255, + "step": 925 + }, + { + "epoch": 0.475, + "grad_norm": 0.16280074417591095, + "learning_rate": 0.00027626427720662416, + "loss": 0.4993, + "step": 950 + }, + { + "epoch": 0.4875, + "grad_norm": 0.16114428639411926, + "learning_rate": 0.00026633420620195917, + "loss": 0.7765, + "step": 975 + }, + { + "epoch": 0.5, + "grad_norm": 0.16209788620471954, + "learning_rate": 0.00025637817620561263, + "loss": 0.7221, + "step": 1000 + }, + { + "epoch": 0.5, + "eval_loss": 1.1190329790115356, + "eval_runtime": 835.4164, + "eval_samples_per_second": 1.313, + "eval_steps_per_second": 0.022, + "step": 1000 + }, + { + "epoch": 0.5125, + "grad_norm": 0.15057620406150818, + "learning_rate": 0.0002464120097495559, + "loss": 0.4929, + "step": 1025 + }, + { + "epoch": 0.525, + "grad_norm": 0.16715486347675323, + "learning_rate": 0.00023645154547503855, + "loss": 0.7896, + "step": 1050 + }, + { + "epoch": 0.5375, + "grad_norm": 0.1591680645942688, + "learning_rate": 0.00022651261296116894, + "loss": 0.6999, + "step": 1075 + }, + { + "epoch": 0.55, + "grad_norm": 0.17062760889530182, + "learning_rate": 0.00021661100756789666, + "loss": 0.4708, + "step": 1100 + }, + { + "epoch": 0.55, + "eval_loss": 1.1381303071975708, + "eval_runtime": 839.2576, + "eval_samples_per_second": 1.307, + "eval_steps_per_second": 0.021, + "step": 1100 + }, + { + "epoch": 0.5625, + "grad_norm": 0.16746024787425995, + "learning_rate": 0.00020676246533337764, + "loss": 0.8073, + "step": 1125 + }, + { + "epoch": 0.575, + "grad_norm": 0.18011848628520966, + "learning_rate": 0.00019698263796561526, + "loss": 0.7153, + "step": 1150 + }, + { + "epoch": 0.5875, + "grad_norm": 0.16664239764213562, + "learning_rate": 0.00018728706796812333, + "loss": 0.6313, + "step": 1175 + }, + { + "epoch": 0.6, + "grad_norm": 0.1630394607782364, + "learning_rate": 0.00017769116393914037, + "loss": 0.6952, + "step": 1200 + }, + { + "epoch": 0.6, + "eval_loss": 1.1242510080337524, + "eval_runtime": 847.0203, + "eval_samples_per_second": 1.295, + "eval_steps_per_second": 0.021, + "step": 1200 + }, + { + "epoch": 0.6125, + "grad_norm": 0.16016128659248352, + "learning_rate": 0.00016821017608365264, + "loss": 0.6161, + "step": 1225 + }, + { + "epoch": 0.625, + "grad_norm": 0.17186138033866882, + "learning_rate": 0.00015885917197714112, + "loss": 0.623, + "step": 1250 + }, + { + "epoch": 0.6375, + "grad_norm": 0.1764240562915802, + "learning_rate": 0.00014965301261957238, + "loss": 0.6988, + "step": 1275 + }, + { + "epoch": 0.65, + "grad_norm": 0.16019247472286224, + "learning_rate": 0.00014060632881768558, + "loss": 0.599, + "step": 1300 + }, + { + "epoch": 0.65, + "eval_loss": 1.1349693536758423, + "eval_runtime": 842.5056, + "eval_samples_per_second": 1.302, + "eval_steps_per_second": 0.021, + "step": 1300 + }, + { + "epoch": 0.6625, + "grad_norm": 0.17425072193145752, + "learning_rate": 0.00013173349793311424, + "loss": 0.6607, + "step": 1325 + }, + { + "epoch": 0.675, + "grad_norm": 0.17765691876411438, + "learning_rate": 0.0001230486210332916, + "loss": 0.6811, + "step": 1350 + }, + { + "epoch": 0.6875, + "grad_norm": 0.17980526387691498, + "learning_rate": 0.00011456550048145536, + "loss": 0.5755, + "step": 1375 + }, + { + "epoch": 0.7, + "grad_norm": 0.1814012974500656, + "learning_rate": 0.00010629761800136473, + "loss": 0.6642, + "step": 1400 + }, + { + "epoch": 0.7, + "eval_loss": 1.1495640277862549, + "eval_runtime": 840.9164, + "eval_samples_per_second": 1.305, + "eval_steps_per_second": 0.021, + "step": 1400 + } + ], + "logging_steps": 25, + "max_steps": 2000, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.4111041607067238e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1500/optimizer_0/.metadata b/checkpoint-1500/optimizer_0/.metadata new file mode 100644 index 0000000000000000000000000000000000000000..155da2a739ef9e6ad07dab70407fb8fcb48d0b92 --- /dev/null +++ b/checkpoint-1500/optimizer_0/.metadata @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e6ac6f6d03caa4b60589688484e2f1963fb3faaea38531abca62e117a7e39656 +size 869362 diff --git a/checkpoint-1500/optimizer_0/__0_0.distcp b/checkpoint-1500/optimizer_0/__0_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..bc743e8ee93ea9f18018f3382408f81282059053 --- /dev/null +++ b/checkpoint-1500/optimizer_0/__0_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6105dcbc124012ad780415c283015e6736f1f98522bbdf9aa5ffd19db7cb3ee7 +size 6008476 diff --git a/checkpoint-1500/optimizer_0/__1_0.distcp b/checkpoint-1500/optimizer_0/__1_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..440250696ba33d9e9dc14adcf638c981b22758ff --- /dev/null +++ b/checkpoint-1500/optimizer_0/__1_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ffb3f37caa15cb68db43f39203e8d42349f14b2524cefe26773b73915d96a0c +size 6041200 diff --git a/checkpoint-1500/optimizer_0/__2_0.distcp b/checkpoint-1500/optimizer_0/__2_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..3a2d3395b24b6a06517d808640a642d669803821 --- /dev/null +++ b/checkpoint-1500/optimizer_0/__2_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6fea3be553365829a4d6f4bf514c604b9c3fea4e599bbb8214b550d1deafc58b +size 6041200 diff --git a/checkpoint-1500/optimizer_0/__3_0.distcp b/checkpoint-1500/optimizer_0/__3_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..a4f091759ec01fc3dc15c09d0249ca38964c5f70 --- /dev/null +++ b/checkpoint-1500/optimizer_0/__3_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38231669358d16bbd14b8dde9decb35ce533ef3c73aecfec0227cbc44eb141f5 +size 6043476 diff --git a/checkpoint-1500/optimizer_0/__4_0.distcp b/checkpoint-1500/optimizer_0/__4_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..0d289bd6eba69870346c592984af6172b3717fa8 --- /dev/null +++ b/checkpoint-1500/optimizer_0/__4_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:94af25e6f67d5c9f677c127d0bf709ccd4522fb62bd39501cafd9f0ffb5ab55c +size 6057364 diff --git a/checkpoint-1500/optimizer_0/__5_0.distcp b/checkpoint-1500/optimizer_0/__5_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..b2ddbb19c865bb2433f8dd49e417e818a1028a4f --- /dev/null +++ b/checkpoint-1500/optimizer_0/__5_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:96d3016d2bab08536af91ef332165feec3579ca0ac043b5677b51d73ed491f23 +size 6042612 diff --git a/checkpoint-1500/optimizer_0/__6_0.distcp b/checkpoint-1500/optimizer_0/__6_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..f6c2a95f3faf2ea3127ba1ee1edfbe81e1940a63 --- /dev/null +++ b/checkpoint-1500/optimizer_0/__6_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2382a4118e1a06bf461e7b90ff107a92987436e1debbce2f39ec7b0b962419c +size 6042612 diff --git a/checkpoint-1500/optimizer_0/__7_0.distcp b/checkpoint-1500/optimizer_0/__7_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..d129a04965dad03ae78d949280fdcb943b3ed13a --- /dev/null +++ b/checkpoint-1500/optimizer_0/__7_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:17e6859dfcf27a2ed57edbcad0fb8faa0beda48f0e7dafad4f20753de0077ff4 +size 6042612 diff --git a/checkpoint-1500/pytorch_model_fsdp_0/.metadata b/checkpoint-1500/pytorch_model_fsdp_0/.metadata new file mode 100644 index 0000000000000000000000000000000000000000..dde8ca3600ddeed560cc0d9c22e4ae19ddad9ad2 --- /dev/null +++ b/checkpoint-1500/pytorch_model_fsdp_0/.metadata @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d6806ca7d0fd860204b67c021947bc14902284d5f0fa65ef6851d90441261f8 +size 339852 diff --git a/checkpoint-1500/pytorch_model_fsdp_0/__0_0.distcp b/checkpoint-1500/pytorch_model_fsdp_0/__0_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..7104638dae967ac305b707aee486d7f732ec706f --- /dev/null +++ b/checkpoint-1500/pytorch_model_fsdp_0/__0_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91463eb27276e741b53383a5f51ed8e7142d198f277920a9adf76fb32156a616 +size 3003648 diff --git a/checkpoint-1500/pytorch_model_fsdp_0/__1_0.distcp b/checkpoint-1500/pytorch_model_fsdp_0/__1_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..268794393dca3d36f0db33dc05c29b4bca44522f --- /dev/null +++ b/checkpoint-1500/pytorch_model_fsdp_0/__1_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79ff3b0fe98440aa20021ce15d6f6f06433ead5d01134690fe016a3217597318 +size 3003648 diff --git a/checkpoint-1500/pytorch_model_fsdp_0/__2_0.distcp b/checkpoint-1500/pytorch_model_fsdp_0/__2_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..1336f486aa96d4b74268575c6f40132c2d96151e --- /dev/null +++ b/checkpoint-1500/pytorch_model_fsdp_0/__2_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fba4704173927a2484fdd1b87499b5b1192f50dbb67f82d3097dc14059c82226 +size 3003648 diff --git a/checkpoint-1500/pytorch_model_fsdp_0/__3_0.distcp b/checkpoint-1500/pytorch_model_fsdp_0/__3_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..a973d06fe53f2550e4dd220353567aa5e9ba9afd --- /dev/null +++ b/checkpoint-1500/pytorch_model_fsdp_0/__3_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f0ce48aaed32874602c678e9fac2fc1ad618b5d8ee152eb73c3e178c39c6f22 +size 3003648 diff --git a/checkpoint-1500/pytorch_model_fsdp_0/__4_0.distcp b/checkpoint-1500/pytorch_model_fsdp_0/__4_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..8196ca675a0230c1eae713894d100d4d43d38c16 --- /dev/null +++ b/checkpoint-1500/pytorch_model_fsdp_0/__4_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d51fcca092f0b172a17f7031cd3c83f833b7430b35e5b7cecde636fd21b4a00 +size 3003648 diff --git a/checkpoint-1500/pytorch_model_fsdp_0/__5_0.distcp b/checkpoint-1500/pytorch_model_fsdp_0/__5_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..0b6aac4e2b31864b80e07c50b2682a7cf1efd7f2 --- /dev/null +++ b/checkpoint-1500/pytorch_model_fsdp_0/__5_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad2f7b51b790898caa2fe075db86d73588bec0a3132040bfeb9815336f18d33d +size 3003648 diff --git a/checkpoint-1500/pytorch_model_fsdp_0/__6_0.distcp b/checkpoint-1500/pytorch_model_fsdp_0/__6_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..aa37a21428deaab593ea3a44a83ebf198901d40f --- /dev/null +++ b/checkpoint-1500/pytorch_model_fsdp_0/__6_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c25485de614999a95b0cd3d1a97188c97a7af4fae4ecb4d938bba6bb7c2a8ec +size 3003648 diff --git a/checkpoint-1500/pytorch_model_fsdp_0/__7_0.distcp b/checkpoint-1500/pytorch_model_fsdp_0/__7_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..c4185836d0e6bf78faa62d9d2361cd10ff2d9654 --- /dev/null +++ b/checkpoint-1500/pytorch_model_fsdp_0/__7_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:433fb62beb52ba8511c46896fe9a896a0c134ae49207c568bd79abece834bddb +size 3003648 diff --git a/checkpoint-1500/rng_state_0.pth b/checkpoint-1500/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..e54be6e459745bf20a83cb622e1c86cc10ce8522 --- /dev/null +++ b/checkpoint-1500/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4433e68a7ecfbf84d5b59193fed5be299b3c6bd9661c1b1b3d68a8ab696604cb +size 15920 diff --git a/checkpoint-1500/rng_state_1.pth b/checkpoint-1500/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..6777150f01befaef1d07adc26bb66fa963a27b17 --- /dev/null +++ b/checkpoint-1500/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d39875d8cadf9928a3b808d6ef72fe52aae4e255cd7cd4ad5e8e242f7fd2c7fc +size 15984 diff --git a/checkpoint-1500/rng_state_2.pth b/checkpoint-1500/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..b153c00669542a7713a55886f7e0b38fbfa8b6c3 --- /dev/null +++ b/checkpoint-1500/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9cbd45a2db6acfac5caa9d80d63ece12503b0f63fbb7f6b3b0b69084bbef4738 +size 15984 diff --git a/checkpoint-1500/rng_state_3.pth b/checkpoint-1500/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..7d30286c8d6304d83a82b86df629f6de262036a3 --- /dev/null +++ b/checkpoint-1500/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c38724293e33d297e0505ec90e3d2ef0c7a688bdc8ebac4eda63333054d3cf9 +size 15984 diff --git a/checkpoint-1500/rng_state_4.pth b/checkpoint-1500/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..e63dc44d1acdf8dbd85132331ef3498347f00867 --- /dev/null +++ b/checkpoint-1500/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa36cca0956be05ecb2b267f55f966b640b7231db368e3dd74d8f88fbc57f27a +size 15984 diff --git a/checkpoint-1500/rng_state_5.pth b/checkpoint-1500/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..81c44d72b83ef081e691da62e0cedc7da1972ce0 --- /dev/null +++ b/checkpoint-1500/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fac3b399bca51abee30dd72b7fbd3555191fac9a8c694b9b8f6c0c1bd78ab9db +size 15984 diff --git a/checkpoint-1500/rng_state_6.pth b/checkpoint-1500/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..20d4883dd34f64393221674ff342513408662cdc --- /dev/null +++ b/checkpoint-1500/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:edfbdf6345e32141c8b4bce93a6ff87c74295d078ebc83b6b99aa4e9a9d59619 +size 15984 diff --git a/checkpoint-1500/rng_state_7.pth b/checkpoint-1500/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..d982129db3bcab3f2d3ffcae82308af24371c19a --- /dev/null +++ b/checkpoint-1500/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e282b4b95e56ee90ebb39bc1215807db12589d83ab676f7523b66d0ddf085dee +size 15984 diff --git a/checkpoint-1500/scheduler.pt b/checkpoint-1500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..0e0b92a890ced6e2b96196734a43486800291efc --- /dev/null +++ b/checkpoint-1500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06e65cc13c66c2828cdb8f114dee592c488900c0a56fd072ff729fc38f989e26 +size 1064 diff --git a/checkpoint-1500/trainer_state.json b/checkpoint-1500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..98c64b57c2bdc9d04dba9f3d14a8b577bad21d1b --- /dev/null +++ b/checkpoint-1500/trainer_state.json @@ -0,0 +1,574 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.75, + "eval_steps": 100, + "global_step": 1500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0125, + "grad_norm": 0.14176353812217712, + "learning_rate": 0.0004, + "loss": 1.1525, + "step": 25 + }, + { + "epoch": 0.025, + "grad_norm": 0.1460508555173874, + "learning_rate": 0.0004998852503731983, + "loss": 1.047, + "step": 50 + }, + { + "epoch": 0.0375, + "grad_norm": 0.2368021011352539, + "learning_rate": 0.0004993848168027977, + "loss": 0.8529, + "step": 75 + }, + { + "epoch": 0.05, + "grad_norm": 0.14488168060779572, + "learning_rate": 0.0004984880506341147, + "loss": 0.9756, + "step": 100 + }, + { + "epoch": 0.05, + "eval_loss": 0.9470569491386414, + "eval_runtime": 845.9115, + "eval_samples_per_second": 1.297, + "eval_steps_per_second": 0.021, + "step": 100 + }, + { + "epoch": 0.0625, + "grad_norm": 0.1415175348520279, + "learning_rate": 0.0004971963770447935, + "loss": 0.9564, + "step": 125 + }, + { + "epoch": 0.075, + "grad_norm": 0.1751064509153366, + "learning_rate": 0.0004955118488155782, + "loss": 0.711, + "step": 150 + }, + { + "epoch": 0.0875, + "grad_norm": 0.1439943015575409, + "learning_rate": 0.0004934371430679492, + "loss": 0.9409, + "step": 175 + }, + { + "epoch": 0.1, + "grad_norm": 0.15947112441062927, + "learning_rate": 0.0004909755570095319, + "loss": 0.8979, + "step": 200 + }, + { + "epoch": 0.1, + "eval_loss": 0.9711233973503113, + "eval_runtime": 845.6433, + "eval_samples_per_second": 1.297, + "eval_steps_per_second": 0.021, + "step": 200 + }, + { + "epoch": 0.1125, + "grad_norm": 0.14685837924480438, + "learning_rate": 0.0004881310026940389, + "loss": 0.6376, + "step": 225 + }, + { + "epoch": 0.125, + "grad_norm": 0.15040776133537292, + "learning_rate": 0.0004849080008040734, + "loss": 0.927, + "step": 250 + }, + { + "epoch": 0.1375, + "grad_norm": 0.16087745130062103, + "learning_rate": 0.00048131167346667446, + "loss": 0.8456, + "step": 275 + }, + { + "epoch": 0.15, + "grad_norm": 0.15025638043880463, + "learning_rate": 0.00047734773611302284, + "loss": 0.6029, + "step": 300 + }, + { + "epoch": 0.15, + "eval_loss": 1.0056413412094116, + "eval_runtime": 848.447, + "eval_samples_per_second": 1.293, + "eval_steps_per_second": 0.021, + "step": 300 + }, + { + "epoch": 0.1625, + "grad_norm": 0.15893957018852234, + "learning_rate": 0.0004730224883952422, + "loss": 0.9036, + "step": 325 + }, + { + "epoch": 0.175, + "grad_norm": 0.1535714715719223, + "learning_rate": 0.0004683428041747334, + "loss": 0.828, + "step": 350 + }, + { + "epoch": 0.1875, + "grad_norm": 0.1718970686197281, + "learning_rate": 0.0004633161205979517, + "loss": 0.5944, + "step": 375 + }, + { + "epoch": 0.2, + "grad_norm": 0.17664535343647003, + "learning_rate": 0.0004579504262769877, + "loss": 0.8654, + "step": 400 + }, + { + "epoch": 0.2, + "eval_loss": 1.0151112079620361, + "eval_runtime": 839.7482, + "eval_samples_per_second": 1.306, + "eval_steps_per_second": 0.021, + "step": 400 + }, + { + "epoch": 0.2125, + "grad_norm": 0.17122088372707367, + "learning_rate": 0.0004522542485937369, + "loss": 0.8078, + "step": 425 + }, + { + "epoch": 0.225, + "grad_norm": 0.19420970976352692, + "learning_rate": 0.00044623664014783386, + "loss": 0.5735, + "step": 450 + }, + { + "epoch": 0.2375, + "grad_norm": 0.18166953325271606, + "learning_rate": 0.00043990716436988924, + "loss": 0.8604, + "step": 475 + }, + { + "epoch": 0.25, + "grad_norm": 0.1502382755279541, + "learning_rate": 0.0004332758803228925, + "loss": 0.7673, + "step": 500 + }, + { + "epoch": 0.25, + "eval_loss": 1.0454745292663574, + "eval_runtime": 841.3951, + "eval_samples_per_second": 1.304, + "eval_steps_per_second": 0.021, + "step": 500 + }, + { + "epoch": 0.2625, + "grad_norm": 0.16875839233398438, + "learning_rate": 0.00042635332671593575, + "loss": 0.5882, + "step": 525 + }, + { + "epoch": 0.275, + "grad_norm": 0.16070039570331573, + "learning_rate": 0.00041915050515566445, + "loss": 0.8175, + "step": 550 + }, + { + "epoch": 0.2875, + "grad_norm": 0.16584184765815735, + "learning_rate": 0.00041167886266207167, + "loss": 0.7795, + "step": 575 + }, + { + "epoch": 0.3, + "grad_norm": 0.15345798432826996, + "learning_rate": 0.0004039502734764241, + "loss": 0.7331, + "step": 600 + }, + { + "epoch": 0.3, + "eval_loss": 1.0627943277359009, + "eval_runtime": 849.4944, + "eval_samples_per_second": 1.291, + "eval_steps_per_second": 0.021, + "step": 600 + }, + { + "epoch": 0.3125, + "grad_norm": 0.16241031885147095, + "learning_rate": 0.0003959770201902294, + "loss": 0.7436, + "step": 625 + }, + { + "epoch": 0.325, + "grad_norm": 0.1531112939119339, + "learning_rate": 0.0003877717742252371, + "loss": 0.6345, + "step": 650 + }, + { + "epoch": 0.3375, + "grad_norm": 0.15790140628814697, + "learning_rate": 0.00037934757569549495, + "loss": 0.7351, + "step": 675 + }, + { + "epoch": 0.35, + "grad_norm": 0.1822807490825653, + "learning_rate": 0.00037071781268346345, + "loss": 0.745, + "step": 700 + }, + { + "epoch": 0.35, + "eval_loss": 1.07315993309021, + "eval_runtime": 837.5276, + "eval_samples_per_second": 1.31, + "eval_steps_per_second": 0.021, + "step": 700 + }, + { + "epoch": 0.3625, + "grad_norm": 0.15983282029628754, + "learning_rate": 0.00036189619996312495, + "loss": 0.5972, + "step": 725 + }, + { + "epoch": 0.375, + "grad_norm": 0.18202389776706696, + "learning_rate": 0.00035289675720390174, + "loss": 0.759, + "step": 750 + }, + { + "epoch": 0.3875, + "grad_norm": 0.16057106852531433, + "learning_rate": 0.00034373378669002105, + "loss": 0.7358, + "step": 775 + }, + { + "epoch": 0.4, + "grad_norm": 0.1680625081062317, + "learning_rate": 0.00033442185059073706, + "loss": 0.5636, + "step": 800 + }, + { + "epoch": 0.4, + "eval_loss": 1.0932456254959106, + "eval_runtime": 840.1331, + "eval_samples_per_second": 1.306, + "eval_steps_per_second": 0.021, + "step": 800 + }, + { + "epoch": 0.4125, + "grad_norm": 0.15613198280334473, + "learning_rate": 0.00032497574781753367, + "loss": 0.7596, + "step": 825 + }, + { + "epoch": 0.425, + "grad_norm": 0.1628854125738144, + "learning_rate": 0.000315410490505086, + "loss": 0.7282, + "step": 850 + }, + { + "epoch": 0.4375, + "grad_norm": 0.16740979254245758, + "learning_rate": 0.0003057412801533589, + "loss": 0.5325, + "step": 875 + }, + { + "epoch": 0.45, + "grad_norm": 0.1634828895330429, + "learning_rate": 0.0002959834834687587, + "loss": 0.778, + "step": 900 + }, + { + "epoch": 0.45, + "eval_loss": 1.0952215194702148, + "eval_runtime": 836.473, + "eval_samples_per_second": 1.311, + "eval_steps_per_second": 0.022, + "step": 900 + }, + { + "epoch": 0.4625, + "grad_norm": 0.161700040102005, + "learning_rate": 0.00028615260794273236, + "loss": 0.7255, + "step": 925 + }, + { + "epoch": 0.475, + "grad_norm": 0.16280074417591095, + "learning_rate": 0.00027626427720662416, + "loss": 0.4993, + "step": 950 + }, + { + "epoch": 0.4875, + "grad_norm": 0.16114428639411926, + "learning_rate": 0.00026633420620195917, + "loss": 0.7765, + "step": 975 + }, + { + "epoch": 0.5, + "grad_norm": 0.16209788620471954, + "learning_rate": 0.00025637817620561263, + "loss": 0.7221, + "step": 1000 + }, + { + "epoch": 0.5, + "eval_loss": 1.1190329790115356, + "eval_runtime": 835.4164, + "eval_samples_per_second": 1.313, + "eval_steps_per_second": 0.022, + "step": 1000 + }, + { + "epoch": 0.5125, + "grad_norm": 0.15057620406150818, + "learning_rate": 0.0002464120097495559, + "loss": 0.4929, + "step": 1025 + }, + { + "epoch": 0.525, + "grad_norm": 0.16715486347675323, + "learning_rate": 0.00023645154547503855, + "loss": 0.7896, + "step": 1050 + }, + { + "epoch": 0.5375, + "grad_norm": 0.1591680645942688, + "learning_rate": 0.00022651261296116894, + "loss": 0.6999, + "step": 1075 + }, + { + "epoch": 0.55, + "grad_norm": 0.17062760889530182, + "learning_rate": 0.00021661100756789666, + "loss": 0.4708, + "step": 1100 + }, + { + "epoch": 0.55, + "eval_loss": 1.1381303071975708, + "eval_runtime": 839.2576, + "eval_samples_per_second": 1.307, + "eval_steps_per_second": 0.021, + "step": 1100 + }, + { + "epoch": 0.5625, + "grad_norm": 0.16746024787425995, + "learning_rate": 0.00020676246533337764, + "loss": 0.8073, + "step": 1125 + }, + { + "epoch": 0.575, + "grad_norm": 0.18011848628520966, + "learning_rate": 0.00019698263796561526, + "loss": 0.7153, + "step": 1150 + }, + { + "epoch": 0.5875, + "grad_norm": 0.16664239764213562, + "learning_rate": 0.00018728706796812333, + "loss": 0.6313, + "step": 1175 + }, + { + "epoch": 0.6, + "grad_norm": 0.1630394607782364, + "learning_rate": 0.00017769116393914037, + "loss": 0.6952, + "step": 1200 + }, + { + "epoch": 0.6, + "eval_loss": 1.1242510080337524, + "eval_runtime": 847.0203, + "eval_samples_per_second": 1.295, + "eval_steps_per_second": 0.021, + "step": 1200 + }, + { + "epoch": 0.6125, + "grad_norm": 0.16016128659248352, + "learning_rate": 0.00016821017608365264, + "loss": 0.6161, + "step": 1225 + }, + { + "epoch": 0.625, + "grad_norm": 0.17186138033866882, + "learning_rate": 0.00015885917197714112, + "loss": 0.623, + "step": 1250 + }, + { + "epoch": 0.6375, + "grad_norm": 0.1764240562915802, + "learning_rate": 0.00014965301261957238, + "loss": 0.6988, + "step": 1275 + }, + { + "epoch": 0.65, + "grad_norm": 0.16019247472286224, + "learning_rate": 0.00014060632881768558, + "loss": 0.599, + "step": 1300 + }, + { + "epoch": 0.65, + "eval_loss": 1.1349693536758423, + "eval_runtime": 842.5056, + "eval_samples_per_second": 1.302, + "eval_steps_per_second": 0.021, + "step": 1300 + }, + { + "epoch": 0.6625, + "grad_norm": 0.17425072193145752, + "learning_rate": 0.00013173349793311424, + "loss": 0.6607, + "step": 1325 + }, + { + "epoch": 0.675, + "grad_norm": 0.17765691876411438, + "learning_rate": 0.0001230486210332916, + "loss": 0.6811, + "step": 1350 + }, + { + "epoch": 0.6875, + "grad_norm": 0.17980526387691498, + "learning_rate": 0.00011456550048145536, + "loss": 0.5755, + "step": 1375 + }, + { + "epoch": 0.7, + "grad_norm": 0.1814012974500656, + "learning_rate": 0.00010629761800136473, + "loss": 0.6642, + "step": 1400 + }, + { + "epoch": 0.7, + "eval_loss": 1.1495640277862549, + "eval_runtime": 840.9164, + "eval_samples_per_second": 1.305, + "eval_steps_per_second": 0.021, + "step": 1400 + }, + { + "epoch": 0.7125, + "grad_norm": 0.1855439394712448, + "learning_rate": 9.82581132515907e-05, + "loss": 0.6796, + "step": 1425 + }, + { + "epoch": 0.725, + "grad_norm": 0.15141044557094574, + "learning_rate": 9.045976294343145e-05, + "loss": 0.5593, + "step": 1450 + }, + { + "epoch": 0.7375, + "grad_norm": 0.17237244546413422, + "learning_rate": 8.291496053563699e-05, + "loss": 0.69, + "step": 1475 + }, + { + "epoch": 0.75, + "grad_norm": 0.17433880269527435, + "learning_rate": 7.563569653821565e-05, + "loss": 0.6768, + "step": 1500 + }, + { + "epoch": 0.75, + "eval_loss": 1.1441528797149658, + "eval_runtime": 844.4916, + "eval_samples_per_second": 1.299, + "eval_steps_per_second": 0.021, + "step": 1500 + } + ], + "logging_steps": 25, + "max_steps": 2000, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.5118973150429184e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1600/optimizer_0/.metadata b/checkpoint-1600/optimizer_0/.metadata new file mode 100644 index 0000000000000000000000000000000000000000..cc69c9c8a1c84eaa7d735750af35b3050360c14f --- /dev/null +++ b/checkpoint-1600/optimizer_0/.metadata @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:67fd3cbe481323589b05dae2ae04133b6ac41ebb047072912483788b8ac8018f +size 869362 diff --git a/checkpoint-1600/optimizer_0/__0_0.distcp b/checkpoint-1600/optimizer_0/__0_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..7c94fbf2354557db5c18ac19109d4bc52f285c13 --- /dev/null +++ b/checkpoint-1600/optimizer_0/__0_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e4a512df1354d612b6d6f4f2330bfea40e0817c378842591285ff48d75d5ec52 +size 6008476 diff --git a/checkpoint-1600/optimizer_0/__1_0.distcp b/checkpoint-1600/optimizer_0/__1_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..f73af05c2370180bb2124e31baed1fcf51318d92 --- /dev/null +++ b/checkpoint-1600/optimizer_0/__1_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e99e71fbac61c0a05bf4b24834fc3f5aeffa65e78e385292fa41913a01b9082 +size 6041200 diff --git a/checkpoint-1600/optimizer_0/__2_0.distcp b/checkpoint-1600/optimizer_0/__2_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..9f42fe1cc48f724d92857516812f8e7d5698f29f --- /dev/null +++ b/checkpoint-1600/optimizer_0/__2_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:282cf067a752594a773ce104d895894f20bb7077f41263bca1d3d9b9909da08d +size 6041200 diff --git a/checkpoint-1600/optimizer_0/__3_0.distcp b/checkpoint-1600/optimizer_0/__3_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..8e9886b9e93969289e0bc3db9a095d56f1490603 --- /dev/null +++ b/checkpoint-1600/optimizer_0/__3_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40cfbea5de3eb65b350255be41bac09518409ea5be0844ae29cf2c8295e7e08c +size 6043476 diff --git a/checkpoint-1600/optimizer_0/__4_0.distcp b/checkpoint-1600/optimizer_0/__4_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..6fee091972f7cb9a17d321a22a618a61df6d6d51 --- /dev/null +++ b/checkpoint-1600/optimizer_0/__4_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1dc46ee790421df1c53b2c2c48f95a1ec5ae5a2a67b6ec33f12d5ca8dbc6ac21 +size 6057364 diff --git a/checkpoint-1600/optimizer_0/__5_0.distcp b/checkpoint-1600/optimizer_0/__5_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..99a7bb13b432cecf80940fa9772469d6395a2a3d --- /dev/null +++ b/checkpoint-1600/optimizer_0/__5_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b16f3cd1c19dc8d1d556d533485b8590ce6febc91ba64202cf6c9c378cdf0ec +size 6042612 diff --git a/checkpoint-1600/optimizer_0/__6_0.distcp b/checkpoint-1600/optimizer_0/__6_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..dc29b9dfb6738fe52a535a7da9c5def55925fe14 --- /dev/null +++ b/checkpoint-1600/optimizer_0/__6_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb81bd241b2bca4767c671d98f98421a7ea079eb0b3be0c5c8cf1d539463c144 +size 6042612 diff --git a/checkpoint-1600/optimizer_0/__7_0.distcp b/checkpoint-1600/optimizer_0/__7_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..6b582aa986cf11984594fb6e8b1fa275303ce681 --- /dev/null +++ b/checkpoint-1600/optimizer_0/__7_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3888af5617a1139be02672933122d2fd24bad0b22357f0c33f55ddbd1f5c87bb +size 6042612 diff --git a/checkpoint-1600/pytorch_model_fsdp_0/.metadata b/checkpoint-1600/pytorch_model_fsdp_0/.metadata new file mode 100644 index 0000000000000000000000000000000000000000..3706ed84425629f9bdbaef62a9d8d409ac129dce --- /dev/null +++ b/checkpoint-1600/pytorch_model_fsdp_0/.metadata @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df64ec4802d62cff7aa32cb44c76038ce41fe0750e4b2d7a66afd8b6aebcecd5 +size 339852 diff --git a/checkpoint-1600/pytorch_model_fsdp_0/__0_0.distcp b/checkpoint-1600/pytorch_model_fsdp_0/__0_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..ec5ce824a577c84ac3716481090847279c989079 --- /dev/null +++ b/checkpoint-1600/pytorch_model_fsdp_0/__0_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49bbc620c3f5da4c46186e6f713751c43abc7e113fb6c7f70a1d6be98a7b54fc +size 3003648 diff --git a/checkpoint-1600/pytorch_model_fsdp_0/__1_0.distcp b/checkpoint-1600/pytorch_model_fsdp_0/__1_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..5d03347a38e256cb094362c091b580dab6150b0f --- /dev/null +++ b/checkpoint-1600/pytorch_model_fsdp_0/__1_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a87075c4a636014231d88dd7131dd01e35b5ea9430ac81806702f49ff4706778 +size 3003648 diff --git a/checkpoint-1600/pytorch_model_fsdp_0/__2_0.distcp b/checkpoint-1600/pytorch_model_fsdp_0/__2_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..e18b06a7f7963fc15f00d1f4f53a4570fe2c3f6d --- /dev/null +++ b/checkpoint-1600/pytorch_model_fsdp_0/__2_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dcd6780f417edb095a763ef80516d83a39b67faed3cb92adebe2defa8252a779 +size 3003648 diff --git a/checkpoint-1600/pytorch_model_fsdp_0/__3_0.distcp b/checkpoint-1600/pytorch_model_fsdp_0/__3_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..66ef502907bf17793fe04b0fd70298f69c7b8f93 --- /dev/null +++ b/checkpoint-1600/pytorch_model_fsdp_0/__3_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:17189a151c612f422e965ec545b746f026ef258f8bb1a6c7e033623f843299e8 +size 3003648 diff --git a/checkpoint-1600/pytorch_model_fsdp_0/__4_0.distcp b/checkpoint-1600/pytorch_model_fsdp_0/__4_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..05b6522c378b004ecb196021e43b9f464cdeb1a6 --- /dev/null +++ b/checkpoint-1600/pytorch_model_fsdp_0/__4_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a595536cca429fc9cebcd6044d2b14383d084a2135505baa88aec249e8a64aae +size 3003648 diff --git a/checkpoint-1600/pytorch_model_fsdp_0/__5_0.distcp b/checkpoint-1600/pytorch_model_fsdp_0/__5_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..804af9939dc77b082c78aaa678ce092324d0f4d9 --- /dev/null +++ b/checkpoint-1600/pytorch_model_fsdp_0/__5_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fdd6f00451806ee6e63881861f0747986858ad45a36eb0c16dbb3429a4d98c9b +size 3003648 diff --git a/checkpoint-1600/pytorch_model_fsdp_0/__6_0.distcp b/checkpoint-1600/pytorch_model_fsdp_0/__6_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..18cb6c82ada7d5f0d58cb716ecedf0ae7bce13f9 --- /dev/null +++ b/checkpoint-1600/pytorch_model_fsdp_0/__6_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46de639a391e39a0fe40b628725bd17ca99e87b5c43c5a5341ebdaefdc834530 +size 3003648 diff --git a/checkpoint-1600/pytorch_model_fsdp_0/__7_0.distcp b/checkpoint-1600/pytorch_model_fsdp_0/__7_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..40575d55ae5b75a512d2920653c66988d4f75615 --- /dev/null +++ b/checkpoint-1600/pytorch_model_fsdp_0/__7_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d4f5d470b37e7bf0a901c3b92fb8aed47d0995705b7907957b51803b3372d56 +size 3003648 diff --git a/checkpoint-1600/rng_state_0.pth b/checkpoint-1600/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..17d1578450e189113f727a3a0632b3bc530af8ab --- /dev/null +++ b/checkpoint-1600/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5183506a2396df9160b89ce86aee871710b836d566fa74da4d65a9ae5ca85552 +size 15920 diff --git a/checkpoint-1600/rng_state_1.pth b/checkpoint-1600/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..bf041f9429507850844a465334e15bbc5f3d3e75 --- /dev/null +++ b/checkpoint-1600/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:faced7ec28436a2990a43a87872462ff8bcaa07f8aca5783ce1ef461c24a5279 +size 15984 diff --git a/checkpoint-1600/rng_state_2.pth b/checkpoint-1600/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..22398f7f5735ca48f9ce9163659699f7d974599e --- /dev/null +++ b/checkpoint-1600/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da04d74ce1ce614a1bfa4966f86cceb96723501a6333ccc0bb669f27e8c29bcf +size 15984 diff --git a/checkpoint-1600/rng_state_3.pth b/checkpoint-1600/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..ec5815d1a61ba4dacc884cd287e0b14f53a85485 --- /dev/null +++ b/checkpoint-1600/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c2da239fb7c10f993c363a19581ac96c62790624254fd05b4f9411a2c0f8280 +size 15984 diff --git a/checkpoint-1600/rng_state_4.pth b/checkpoint-1600/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..c28c849918f4a4c68f2ba6112492313ad5813fe1 --- /dev/null +++ b/checkpoint-1600/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78bc4edab805c57c2ead6d40ffcfc64de8e97b59e297c2a921893b92f5e9296d +size 15984 diff --git a/checkpoint-1600/rng_state_5.pth b/checkpoint-1600/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..8a6f7f47b3f861e06a4b8b62a48916656b67a3bb --- /dev/null +++ b/checkpoint-1600/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28a3d9fee605e6a474ee8a412dc14b0ab906a07cb21430029959b91a634fc6d7 +size 15984 diff --git a/checkpoint-1600/rng_state_6.pth b/checkpoint-1600/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..c5c29843a667219beeb22f4e97dd3ae688fa8fd2 --- /dev/null +++ b/checkpoint-1600/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe4530cf8642eadbeef6e8e3bb17cc51305d0d35a97ce2a8a6457153b532ffd3 +size 15984 diff --git a/checkpoint-1600/rng_state_7.pth b/checkpoint-1600/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..11a550e52cc0ee1bf4b1b0c8a4ec25843b59c9d9 --- /dev/null +++ b/checkpoint-1600/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3f587f7177f6428865c1ea5f5ca96d5cd2d4fe9f7b6b5b251babc5a37b75a0a +size 15984 diff --git a/checkpoint-1600/scheduler.pt b/checkpoint-1600/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..1b1256b470e8c7334822cf94f43e0000f8a16dd9 --- /dev/null +++ b/checkpoint-1600/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28d5f76e918fa0cbbbd298377811314b7fc9c9c89e747f720e3533cb0c69b09c +size 1064 diff --git a/checkpoint-1600/trainer_state.json b/checkpoint-1600/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..df6ac680d8d27b22c12be4065377432717e9a71a --- /dev/null +++ b/checkpoint-1600/trainer_state.json @@ -0,0 +1,610 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.8, + "eval_steps": 100, + "global_step": 1600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0125, + "grad_norm": 0.14176353812217712, + "learning_rate": 0.0004, + "loss": 1.1525, + "step": 25 + }, + { + "epoch": 0.025, + "grad_norm": 0.1460508555173874, + "learning_rate": 0.0004998852503731983, + "loss": 1.047, + "step": 50 + }, + { + "epoch": 0.0375, + "grad_norm": 0.2368021011352539, + "learning_rate": 0.0004993848168027977, + "loss": 0.8529, + "step": 75 + }, + { + "epoch": 0.05, + "grad_norm": 0.14488168060779572, + "learning_rate": 0.0004984880506341147, + "loss": 0.9756, + "step": 100 + }, + { + "epoch": 0.05, + "eval_loss": 0.9470569491386414, + "eval_runtime": 845.9115, + "eval_samples_per_second": 1.297, + "eval_steps_per_second": 0.021, + "step": 100 + }, + { + "epoch": 0.0625, + "grad_norm": 0.1415175348520279, + "learning_rate": 0.0004971963770447935, + "loss": 0.9564, + "step": 125 + }, + { + "epoch": 0.075, + "grad_norm": 0.1751064509153366, + "learning_rate": 0.0004955118488155782, + "loss": 0.711, + "step": 150 + }, + { + "epoch": 0.0875, + "grad_norm": 0.1439943015575409, + "learning_rate": 0.0004934371430679492, + "loss": 0.9409, + "step": 175 + }, + { + "epoch": 0.1, + "grad_norm": 0.15947112441062927, + "learning_rate": 0.0004909755570095319, + "loss": 0.8979, + "step": 200 + }, + { + "epoch": 0.1, + "eval_loss": 0.9711233973503113, + "eval_runtime": 845.6433, + "eval_samples_per_second": 1.297, + "eval_steps_per_second": 0.021, + "step": 200 + }, + { + "epoch": 0.1125, + "grad_norm": 0.14685837924480438, + "learning_rate": 0.0004881310026940389, + "loss": 0.6376, + "step": 225 + }, + { + "epoch": 0.125, + "grad_norm": 0.15040776133537292, + "learning_rate": 0.0004849080008040734, + "loss": 0.927, + "step": 250 + }, + { + "epoch": 0.1375, + "grad_norm": 0.16087745130062103, + "learning_rate": 0.00048131167346667446, + "loss": 0.8456, + "step": 275 + }, + { + "epoch": 0.15, + "grad_norm": 0.15025638043880463, + "learning_rate": 0.00047734773611302284, + "loss": 0.6029, + "step": 300 + }, + { + "epoch": 0.15, + "eval_loss": 1.0056413412094116, + "eval_runtime": 848.447, + "eval_samples_per_second": 1.293, + "eval_steps_per_second": 0.021, + "step": 300 + }, + { + "epoch": 0.1625, + "grad_norm": 0.15893957018852234, + "learning_rate": 0.0004730224883952422, + "loss": 0.9036, + "step": 325 + }, + { + "epoch": 0.175, + "grad_norm": 0.1535714715719223, + "learning_rate": 0.0004683428041747334, + "loss": 0.828, + "step": 350 + }, + { + "epoch": 0.1875, + "grad_norm": 0.1718970686197281, + "learning_rate": 0.0004633161205979517, + "loss": 0.5944, + "step": 375 + }, + { + "epoch": 0.2, + "grad_norm": 0.17664535343647003, + "learning_rate": 0.0004579504262769877, + "loss": 0.8654, + "step": 400 + }, + { + "epoch": 0.2, + "eval_loss": 1.0151112079620361, + "eval_runtime": 839.7482, + "eval_samples_per_second": 1.306, + "eval_steps_per_second": 0.021, + "step": 400 + }, + { + "epoch": 0.2125, + "grad_norm": 0.17122088372707367, + "learning_rate": 0.0004522542485937369, + "loss": 0.8078, + "step": 425 + }, + { + "epoch": 0.225, + "grad_norm": 0.19420970976352692, + "learning_rate": 0.00044623664014783386, + "loss": 0.5735, + "step": 450 + }, + { + "epoch": 0.2375, + "grad_norm": 0.18166953325271606, + "learning_rate": 0.00043990716436988924, + "loss": 0.8604, + "step": 475 + }, + { + "epoch": 0.25, + "grad_norm": 0.1502382755279541, + "learning_rate": 0.0004332758803228925, + "loss": 0.7673, + "step": 500 + }, + { + "epoch": 0.25, + "eval_loss": 1.0454745292663574, + "eval_runtime": 841.3951, + "eval_samples_per_second": 1.304, + "eval_steps_per_second": 0.021, + "step": 500 + }, + { + "epoch": 0.2625, + "grad_norm": 0.16875839233398438, + "learning_rate": 0.00042635332671593575, + "loss": 0.5882, + "step": 525 + }, + { + "epoch": 0.275, + "grad_norm": 0.16070039570331573, + "learning_rate": 0.00041915050515566445, + "loss": 0.8175, + "step": 550 + }, + { + "epoch": 0.2875, + "grad_norm": 0.16584184765815735, + "learning_rate": 0.00041167886266207167, + "loss": 0.7795, + "step": 575 + }, + { + "epoch": 0.3, + "grad_norm": 0.15345798432826996, + "learning_rate": 0.0004039502734764241, + "loss": 0.7331, + "step": 600 + }, + { + "epoch": 0.3, + "eval_loss": 1.0627943277359009, + "eval_runtime": 849.4944, + "eval_samples_per_second": 1.291, + "eval_steps_per_second": 0.021, + "step": 600 + }, + { + "epoch": 0.3125, + "grad_norm": 0.16241031885147095, + "learning_rate": 0.0003959770201902294, + "loss": 0.7436, + "step": 625 + }, + { + "epoch": 0.325, + "grad_norm": 0.1531112939119339, + "learning_rate": 0.0003877717742252371, + "loss": 0.6345, + "step": 650 + }, + { + "epoch": 0.3375, + "grad_norm": 0.15790140628814697, + "learning_rate": 0.00037934757569549495, + "loss": 0.7351, + "step": 675 + }, + { + "epoch": 0.35, + "grad_norm": 0.1822807490825653, + "learning_rate": 0.00037071781268346345, + "loss": 0.745, + "step": 700 + }, + { + "epoch": 0.35, + "eval_loss": 1.07315993309021, + "eval_runtime": 837.5276, + "eval_samples_per_second": 1.31, + "eval_steps_per_second": 0.021, + "step": 700 + }, + { + "epoch": 0.3625, + "grad_norm": 0.15983282029628754, + "learning_rate": 0.00036189619996312495, + "loss": 0.5972, + "step": 725 + }, + { + "epoch": 0.375, + "grad_norm": 0.18202389776706696, + "learning_rate": 0.00035289675720390174, + "loss": 0.759, + "step": 750 + }, + { + "epoch": 0.3875, + "grad_norm": 0.16057106852531433, + "learning_rate": 0.00034373378669002105, + "loss": 0.7358, + "step": 775 + }, + { + "epoch": 0.4, + "grad_norm": 0.1680625081062317, + "learning_rate": 0.00033442185059073706, + "loss": 0.5636, + "step": 800 + }, + { + "epoch": 0.4, + "eval_loss": 1.0932456254959106, + "eval_runtime": 840.1331, + "eval_samples_per_second": 1.306, + "eval_steps_per_second": 0.021, + "step": 800 + }, + { + "epoch": 0.4125, + "grad_norm": 0.15613198280334473, + "learning_rate": 0.00032497574781753367, + "loss": 0.7596, + "step": 825 + }, + { + "epoch": 0.425, + "grad_norm": 0.1628854125738144, + "learning_rate": 0.000315410490505086, + "loss": 0.7282, + "step": 850 + }, + { + "epoch": 0.4375, + "grad_norm": 0.16740979254245758, + "learning_rate": 0.0003057412801533589, + "loss": 0.5325, + "step": 875 + }, + { + "epoch": 0.45, + "grad_norm": 0.1634828895330429, + "learning_rate": 0.0002959834834687587, + "loss": 0.778, + "step": 900 + }, + { + "epoch": 0.45, + "eval_loss": 1.0952215194702148, + "eval_runtime": 836.473, + "eval_samples_per_second": 1.311, + "eval_steps_per_second": 0.022, + "step": 900 + }, + { + "epoch": 0.4625, + "grad_norm": 0.161700040102005, + "learning_rate": 0.00028615260794273236, + "loss": 0.7255, + "step": 925 + }, + { + "epoch": 0.475, + "grad_norm": 0.16280074417591095, + "learning_rate": 0.00027626427720662416, + "loss": 0.4993, + "step": 950 + }, + { + "epoch": 0.4875, + "grad_norm": 0.16114428639411926, + "learning_rate": 0.00026633420620195917, + "loss": 0.7765, + "step": 975 + }, + { + "epoch": 0.5, + "grad_norm": 0.16209788620471954, + "learning_rate": 0.00025637817620561263, + "loss": 0.7221, + "step": 1000 + }, + { + "epoch": 0.5, + "eval_loss": 1.1190329790115356, + "eval_runtime": 835.4164, + "eval_samples_per_second": 1.313, + "eval_steps_per_second": 0.022, + "step": 1000 + }, + { + "epoch": 0.5125, + "grad_norm": 0.15057620406150818, + "learning_rate": 0.0002464120097495559, + "loss": 0.4929, + "step": 1025 + }, + { + "epoch": 0.525, + "grad_norm": 0.16715486347675323, + "learning_rate": 0.00023645154547503855, + "loss": 0.7896, + "step": 1050 + }, + { + "epoch": 0.5375, + "grad_norm": 0.1591680645942688, + "learning_rate": 0.00022651261296116894, + "loss": 0.6999, + "step": 1075 + }, + { + "epoch": 0.55, + "grad_norm": 0.17062760889530182, + "learning_rate": 0.00021661100756789666, + "loss": 0.4708, + "step": 1100 + }, + { + "epoch": 0.55, + "eval_loss": 1.1381303071975708, + "eval_runtime": 839.2576, + "eval_samples_per_second": 1.307, + "eval_steps_per_second": 0.021, + "step": 1100 + }, + { + "epoch": 0.5625, + "grad_norm": 0.16746024787425995, + "learning_rate": 0.00020676246533337764, + "loss": 0.8073, + "step": 1125 + }, + { + "epoch": 0.575, + "grad_norm": 0.18011848628520966, + "learning_rate": 0.00019698263796561526, + "loss": 0.7153, + "step": 1150 + }, + { + "epoch": 0.5875, + "grad_norm": 0.16664239764213562, + "learning_rate": 0.00018728706796812333, + "loss": 0.6313, + "step": 1175 + }, + { + "epoch": 0.6, + "grad_norm": 0.1630394607782364, + "learning_rate": 0.00017769116393914037, + "loss": 0.6952, + "step": 1200 + }, + { + "epoch": 0.6, + "eval_loss": 1.1242510080337524, + "eval_runtime": 847.0203, + "eval_samples_per_second": 1.295, + "eval_steps_per_second": 0.021, + "step": 1200 + }, + { + "epoch": 0.6125, + "grad_norm": 0.16016128659248352, + "learning_rate": 0.00016821017608365264, + "loss": 0.6161, + "step": 1225 + }, + { + "epoch": 0.625, + "grad_norm": 0.17186138033866882, + "learning_rate": 0.00015885917197714112, + "loss": 0.623, + "step": 1250 + }, + { + "epoch": 0.6375, + "grad_norm": 0.1764240562915802, + "learning_rate": 0.00014965301261957238, + "loss": 0.6988, + "step": 1275 + }, + { + "epoch": 0.65, + "grad_norm": 0.16019247472286224, + "learning_rate": 0.00014060632881768558, + "loss": 0.599, + "step": 1300 + }, + { + "epoch": 0.65, + "eval_loss": 1.1349693536758423, + "eval_runtime": 842.5056, + "eval_samples_per_second": 1.302, + "eval_steps_per_second": 0.021, + "step": 1300 + }, + { + "epoch": 0.6625, + "grad_norm": 0.17425072193145752, + "learning_rate": 0.00013173349793311424, + "loss": 0.6607, + "step": 1325 + }, + { + "epoch": 0.675, + "grad_norm": 0.17765691876411438, + "learning_rate": 0.0001230486210332916, + "loss": 0.6811, + "step": 1350 + }, + { + "epoch": 0.6875, + "grad_norm": 0.17980526387691498, + "learning_rate": 0.00011456550048145536, + "loss": 0.5755, + "step": 1375 + }, + { + "epoch": 0.7, + "grad_norm": 0.1814012974500656, + "learning_rate": 0.00010629761800136473, + "loss": 0.6642, + "step": 1400 + }, + { + "epoch": 0.7, + "eval_loss": 1.1495640277862549, + "eval_runtime": 840.9164, + "eval_samples_per_second": 1.305, + "eval_steps_per_second": 0.021, + "step": 1400 + }, + { + "epoch": 0.7125, + "grad_norm": 0.1855439394712448, + "learning_rate": 9.82581132515907e-05, + "loss": 0.6796, + "step": 1425 + }, + { + "epoch": 0.725, + "grad_norm": 0.15141044557094574, + "learning_rate": 9.045976294343145e-05, + "loss": 0.5593, + "step": 1450 + }, + { + "epoch": 0.7375, + "grad_norm": 0.17237244546413422, + "learning_rate": 8.291496053563699e-05, + "loss": 0.69, + "step": 1475 + }, + { + "epoch": 0.75, + "grad_norm": 0.17433880269527435, + "learning_rate": 7.563569653821565e-05, + "loss": 0.6768, + "step": 1500 + }, + { + "epoch": 0.75, + "eval_loss": 1.1441528797149658, + "eval_runtime": 844.4916, + "eval_samples_per_second": 1.299, + "eval_steps_per_second": 0.021, + "step": 1500 + }, + { + "epoch": 0.7625, + "grad_norm": 0.15868628025054932, + "learning_rate": 6.863353945662288e-05, + "loss": 0.517, + "step": 1525 + }, + { + "epoch": 0.775, + "grad_norm": 0.18802335858345032, + "learning_rate": 6.191961740661687e-05, + "loss": 0.7035, + "step": 1550 + }, + { + "epoch": 0.7875, + "grad_norm": 0.17180030047893524, + "learning_rate": 5.550460042899982e-05, + "loss": 0.6911, + "step": 1575 + }, + { + "epoch": 0.8, + "grad_norm": 0.16377338767051697, + "learning_rate": 4.9398683532350855e-05, + "loss": 0.4876, + "step": 1600 + }, + { + "epoch": 0.8, + "eval_loss": 1.1581627130508423, + "eval_runtime": 844.0055, + "eval_samples_per_second": 1.3, + "eval_steps_per_second": 0.021, + "step": 1600 + } + ], + "logging_steps": 25, + "max_steps": 2000, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.612690469379113e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1700/optimizer_0/.metadata b/checkpoint-1700/optimizer_0/.metadata new file mode 100644 index 0000000000000000000000000000000000000000..6501725a6a10cbf4928f4ac94c98c67f2d75f98c --- /dev/null +++ b/checkpoint-1700/optimizer_0/.metadata @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a9ef12f577af598db7f044f2ed1d5d0c5397e169f4ebaaa4caae34ce30d9dda +size 869362 diff --git a/checkpoint-1700/optimizer_0/__0_0.distcp b/checkpoint-1700/optimizer_0/__0_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..f1bbfd962f9daa964a34c163c4c5453d7d62ef1b --- /dev/null +++ b/checkpoint-1700/optimizer_0/__0_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:800514aa19243c669661aeaa81e727b704220f6f7656f6805fd90ae90ed1d23e +size 6008476 diff --git a/checkpoint-1700/optimizer_0/__1_0.distcp b/checkpoint-1700/optimizer_0/__1_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..30fd41b306943373c5b9489243885eb6d3b927ab --- /dev/null +++ b/checkpoint-1700/optimizer_0/__1_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:585b6a970a56dac8b47527560cb1318f5b8fb79930d685dbb674242fcda559f9 +size 6041200 diff --git a/checkpoint-1700/optimizer_0/__2_0.distcp b/checkpoint-1700/optimizer_0/__2_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..e96bc16745673aa6bba77e40ba164baf79e74ac0 --- /dev/null +++ b/checkpoint-1700/optimizer_0/__2_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6dd66685c65135eb42f9656534bbdf4f49a1db76abf2a65620e516a81486f9fd +size 6041200 diff --git a/checkpoint-1700/optimizer_0/__3_0.distcp b/checkpoint-1700/optimizer_0/__3_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..d833ec94d1bae01dcd1c0f716a212cac05009317 --- /dev/null +++ b/checkpoint-1700/optimizer_0/__3_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee3280450eabba52fd0ef98f35587e613d1d01538aa03f5d7b60e1c0ee2abcd2 +size 6043476 diff --git a/checkpoint-1700/optimizer_0/__4_0.distcp b/checkpoint-1700/optimizer_0/__4_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..ae4d6df2849a9e55079f653d4a49070d240efa5c --- /dev/null +++ b/checkpoint-1700/optimizer_0/__4_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:33ef3a0ffaf65b888ee5240499a55717ba984740eb797884807a4a658625313a +size 6057364 diff --git a/checkpoint-1700/optimizer_0/__5_0.distcp b/checkpoint-1700/optimizer_0/__5_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..814d0a173e5a1f9939f0d223e17cdaf707c58ee9 --- /dev/null +++ b/checkpoint-1700/optimizer_0/__5_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:99b8cee462f59b25367f7282463c8da7d043fd1aefb8740d4cdb260ca508c77d +size 6042612 diff --git a/checkpoint-1700/optimizer_0/__6_0.distcp b/checkpoint-1700/optimizer_0/__6_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..710e2469311dbe96d3c68919e8bcd6b8101aaa6f --- /dev/null +++ b/checkpoint-1700/optimizer_0/__6_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f9aa258cc9b9726d9f399779125061d9f66b0cd06879bcfbb35599c15a375e7 +size 6042612 diff --git a/checkpoint-1700/optimizer_0/__7_0.distcp b/checkpoint-1700/optimizer_0/__7_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..e9c30c7a6d42130f272656bba4275441df1db8c8 --- /dev/null +++ b/checkpoint-1700/optimizer_0/__7_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e1febda8ac74f88f630735b7fdc0d4806823f808c32bae309779c269f6b7fb36 +size 6042612 diff --git a/checkpoint-1700/pytorch_model_fsdp_0/.metadata b/checkpoint-1700/pytorch_model_fsdp_0/.metadata new file mode 100644 index 0000000000000000000000000000000000000000..4e81a9123e374cb23a2e1406c988232a3e966029 --- /dev/null +++ b/checkpoint-1700/pytorch_model_fsdp_0/.metadata @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5381ff55f2982f3d167ace25774c39e4480e79d22049c9109b0f772499fee02d +size 339852 diff --git a/checkpoint-1700/pytorch_model_fsdp_0/__0_0.distcp b/checkpoint-1700/pytorch_model_fsdp_0/__0_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..2099d9418d5bfcd1c88e6d7f67ec82621737bf08 --- /dev/null +++ b/checkpoint-1700/pytorch_model_fsdp_0/__0_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac8a9b2eae1631b1e1132608a85fe7baf580cdc579cb689826398516e2cf07e8 +size 3003648 diff --git a/checkpoint-1700/pytorch_model_fsdp_0/__1_0.distcp b/checkpoint-1700/pytorch_model_fsdp_0/__1_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..2b086e19c9a8d913fcd70ee71e02fb91bda9e923 --- /dev/null +++ b/checkpoint-1700/pytorch_model_fsdp_0/__1_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22b25f722c12df53eea209d25c70ce3634dfd5e274918a40899e073474855135 +size 3003648 diff --git a/checkpoint-1700/pytorch_model_fsdp_0/__2_0.distcp b/checkpoint-1700/pytorch_model_fsdp_0/__2_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..7f78e3572841ec79327afabe036e1baa51ae9ad1 --- /dev/null +++ b/checkpoint-1700/pytorch_model_fsdp_0/__2_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:166508c68c2838deb021b959882fa3b136e176d1d0b43ab3ac695f02f8f881fb +size 3003648 diff --git a/checkpoint-1700/pytorch_model_fsdp_0/__3_0.distcp b/checkpoint-1700/pytorch_model_fsdp_0/__3_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..4343ce4229ceb565347e54465dcc7e6394e6315c --- /dev/null +++ b/checkpoint-1700/pytorch_model_fsdp_0/__3_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34e1988cebd36c672c99e5a511fcae9b8a10c012371da34276dae0fbae0cefce +size 3003648 diff --git a/checkpoint-1700/pytorch_model_fsdp_0/__4_0.distcp b/checkpoint-1700/pytorch_model_fsdp_0/__4_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..6ac92d98e0d7b491d6b94ed69acd5d16a9ef0905 --- /dev/null +++ b/checkpoint-1700/pytorch_model_fsdp_0/__4_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c15c106f24e4aa970573af126e4b525e6908d1d862d87f48447acaca3bb06f9e +size 3003648 diff --git a/checkpoint-1700/pytorch_model_fsdp_0/__5_0.distcp b/checkpoint-1700/pytorch_model_fsdp_0/__5_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..8707334767959b0142134b04f98950a7b0211d46 --- /dev/null +++ b/checkpoint-1700/pytorch_model_fsdp_0/__5_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1184744a5e5401b8c3121f9986bfcdda8045b2ee0dd5e40513570e36a54aa45 +size 3003648 diff --git a/checkpoint-1700/pytorch_model_fsdp_0/__6_0.distcp b/checkpoint-1700/pytorch_model_fsdp_0/__6_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..22a8db4dca31755fd79c203c0c09a2da9b7d68a0 --- /dev/null +++ b/checkpoint-1700/pytorch_model_fsdp_0/__6_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4140ec56445042a73f48c265ce8e83d997d66d5e47ef0f94c7c11bc704a0a4e3 +size 3003648 diff --git a/checkpoint-1700/pytorch_model_fsdp_0/__7_0.distcp b/checkpoint-1700/pytorch_model_fsdp_0/__7_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..b4f7696538cdbaa774948958b164f488b2726cdb --- /dev/null +++ b/checkpoint-1700/pytorch_model_fsdp_0/__7_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7baf0aa3e46c1c757e04ab4335d1d7bdf1b7624b33c4181a9e6843db38bc4fa +size 3003648 diff --git a/checkpoint-1700/rng_state_0.pth b/checkpoint-1700/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..bbbb93443d8ac74f5921b1db6a190e4dbf1cb152 --- /dev/null +++ b/checkpoint-1700/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c368f2d5e142f8ec4d8e832f12c3a242f58f04943b01abc4f24753716227bc3 +size 15984 diff --git a/checkpoint-1700/rng_state_1.pth b/checkpoint-1700/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..a1c8a9a67875949ca088e24b6ddb6d7abb8cbe1b --- /dev/null +++ b/checkpoint-1700/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7495eab1626b234203500cb092a76fe0e8ee29326c4a214d406965f11ae00f1d +size 15984 diff --git a/checkpoint-1700/rng_state_2.pth b/checkpoint-1700/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..927034d062e94e6f7480eb76f009cc6f088f4eae --- /dev/null +++ b/checkpoint-1700/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e719f7df6c26476159f65277915dfa0fb3de15d19a7f51fb58f93a6e451f669a +size 15984 diff --git a/checkpoint-1700/rng_state_3.pth b/checkpoint-1700/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..d75d05aa7ee1eafd7e6c78c7d65a48fade012846 --- /dev/null +++ b/checkpoint-1700/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8772b6e0f5613097d80d8a1b900a879f18269d411c7702f8553115591658d5d6 +size 15984 diff --git a/checkpoint-1700/rng_state_4.pth b/checkpoint-1700/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..8af08ad4aea65a30a236b7fc47b84e7cd188dffb --- /dev/null +++ b/checkpoint-1700/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:007aca66dfa0ad6f05dbd187566dd645c4fe201a60ffb68ffbbf0988506518ad +size 15984 diff --git a/checkpoint-1700/rng_state_5.pth b/checkpoint-1700/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..be0cec73b636df956aac221669db063e8e848c9b --- /dev/null +++ b/checkpoint-1700/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b82f8686622b3cb34cb432fe0331d1c0ead3a513a521e46bfc46b764a797f003 +size 15984 diff --git a/checkpoint-1700/rng_state_6.pth b/checkpoint-1700/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..793ed4585e58a46c9096ceb7444786d7abbbb105 --- /dev/null +++ b/checkpoint-1700/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:238bdc28b08c7e2c5690436b2d75c6c89bce8c123cb1195a3dcec4e4923064de +size 15984 diff --git a/checkpoint-1700/rng_state_7.pth b/checkpoint-1700/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..bafba19b5a76475813eaaac5f8c7d956bdb62319 --- /dev/null +++ b/checkpoint-1700/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:823f8a6617c96025013f66bf901be2a45b4251e14fe49e4147b91854b5a49d0e +size 15984 diff --git a/checkpoint-1700/scheduler.pt b/checkpoint-1700/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..f32a2b2cd826add882328cd593354c5b68429a27 --- /dev/null +++ b/checkpoint-1700/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bddc22e089a0095581797df13adaa44992df016d73b873c38a5d6d0910baea06 +size 1064 diff --git a/checkpoint-1700/trainer_state.json b/checkpoint-1700/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..e38c0d048154d60045f96100fd518272b986c450 --- /dev/null +++ b/checkpoint-1700/trainer_state.json @@ -0,0 +1,646 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.85, + "eval_steps": 100, + "global_step": 1700, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0125, + "grad_norm": 0.14176353812217712, + "learning_rate": 0.0004, + "loss": 1.1525, + "step": 25 + }, + { + "epoch": 0.025, + "grad_norm": 0.1460508555173874, + "learning_rate": 0.0004998852503731983, + "loss": 1.047, + "step": 50 + }, + { + "epoch": 0.0375, + "grad_norm": 0.2368021011352539, + "learning_rate": 0.0004993848168027977, + "loss": 0.8529, + "step": 75 + }, + { + "epoch": 0.05, + "grad_norm": 0.14488168060779572, + "learning_rate": 0.0004984880506341147, + "loss": 0.9756, + "step": 100 + }, + { + "epoch": 0.05, + "eval_loss": 0.9470569491386414, + "eval_runtime": 845.9115, + "eval_samples_per_second": 1.297, + "eval_steps_per_second": 0.021, + "step": 100 + }, + { + "epoch": 0.0625, + "grad_norm": 0.1415175348520279, + "learning_rate": 0.0004971963770447935, + "loss": 0.9564, + "step": 125 + }, + { + "epoch": 0.075, + "grad_norm": 0.1751064509153366, + "learning_rate": 0.0004955118488155782, + "loss": 0.711, + "step": 150 + }, + { + "epoch": 0.0875, + "grad_norm": 0.1439943015575409, + "learning_rate": 0.0004934371430679492, + "loss": 0.9409, + "step": 175 + }, + { + "epoch": 0.1, + "grad_norm": 0.15947112441062927, + "learning_rate": 0.0004909755570095319, + "loss": 0.8979, + "step": 200 + }, + { + "epoch": 0.1, + "eval_loss": 0.9711233973503113, + "eval_runtime": 845.6433, + "eval_samples_per_second": 1.297, + "eval_steps_per_second": 0.021, + "step": 200 + }, + { + "epoch": 0.1125, + "grad_norm": 0.14685837924480438, + "learning_rate": 0.0004881310026940389, + "loss": 0.6376, + "step": 225 + }, + { + "epoch": 0.125, + "grad_norm": 0.15040776133537292, + "learning_rate": 0.0004849080008040734, + "loss": 0.927, + "step": 250 + }, + { + "epoch": 0.1375, + "grad_norm": 0.16087745130062103, + "learning_rate": 0.00048131167346667446, + "loss": 0.8456, + "step": 275 + }, + { + "epoch": 0.15, + "grad_norm": 0.15025638043880463, + "learning_rate": 0.00047734773611302284, + "loss": 0.6029, + "step": 300 + }, + { + "epoch": 0.15, + "eval_loss": 1.0056413412094116, + "eval_runtime": 848.447, + "eval_samples_per_second": 1.293, + "eval_steps_per_second": 0.021, + "step": 300 + }, + { + "epoch": 0.1625, + "grad_norm": 0.15893957018852234, + "learning_rate": 0.0004730224883952422, + "loss": 0.9036, + "step": 325 + }, + { + "epoch": 0.175, + "grad_norm": 0.1535714715719223, + "learning_rate": 0.0004683428041747334, + "loss": 0.828, + "step": 350 + }, + { + "epoch": 0.1875, + "grad_norm": 0.1718970686197281, + "learning_rate": 0.0004633161205979517, + "loss": 0.5944, + "step": 375 + }, + { + "epoch": 0.2, + "grad_norm": 0.17664535343647003, + "learning_rate": 0.0004579504262769877, + "loss": 0.8654, + "step": 400 + }, + { + "epoch": 0.2, + "eval_loss": 1.0151112079620361, + "eval_runtime": 839.7482, + "eval_samples_per_second": 1.306, + "eval_steps_per_second": 0.021, + "step": 400 + }, + { + "epoch": 0.2125, + "grad_norm": 0.17122088372707367, + "learning_rate": 0.0004522542485937369, + "loss": 0.8078, + "step": 425 + }, + { + "epoch": 0.225, + "grad_norm": 0.19420970976352692, + "learning_rate": 0.00044623664014783386, + "loss": 0.5735, + "step": 450 + }, + { + "epoch": 0.2375, + "grad_norm": 0.18166953325271606, + "learning_rate": 0.00043990716436988924, + "loss": 0.8604, + "step": 475 + }, + { + "epoch": 0.25, + "grad_norm": 0.1502382755279541, + "learning_rate": 0.0004332758803228925, + "loss": 0.7673, + "step": 500 + }, + { + "epoch": 0.25, + "eval_loss": 1.0454745292663574, + "eval_runtime": 841.3951, + "eval_samples_per_second": 1.304, + "eval_steps_per_second": 0.021, + "step": 500 + }, + { + "epoch": 0.2625, + "grad_norm": 0.16875839233398438, + "learning_rate": 0.00042635332671593575, + "loss": 0.5882, + "step": 525 + }, + { + "epoch": 0.275, + "grad_norm": 0.16070039570331573, + "learning_rate": 0.00041915050515566445, + "loss": 0.8175, + "step": 550 + }, + { + "epoch": 0.2875, + "grad_norm": 0.16584184765815735, + "learning_rate": 0.00041167886266207167, + "loss": 0.7795, + "step": 575 + }, + { + "epoch": 0.3, + "grad_norm": 0.15345798432826996, + "learning_rate": 0.0004039502734764241, + "loss": 0.7331, + "step": 600 + }, + { + "epoch": 0.3, + "eval_loss": 1.0627943277359009, + "eval_runtime": 849.4944, + "eval_samples_per_second": 1.291, + "eval_steps_per_second": 0.021, + "step": 600 + }, + { + "epoch": 0.3125, + "grad_norm": 0.16241031885147095, + "learning_rate": 0.0003959770201902294, + "loss": 0.7436, + "step": 625 + }, + { + "epoch": 0.325, + "grad_norm": 0.1531112939119339, + "learning_rate": 0.0003877717742252371, + "loss": 0.6345, + "step": 650 + }, + { + "epoch": 0.3375, + "grad_norm": 0.15790140628814697, + "learning_rate": 0.00037934757569549495, + "loss": 0.7351, + "step": 675 + }, + { + "epoch": 0.35, + "grad_norm": 0.1822807490825653, + "learning_rate": 0.00037071781268346345, + "loss": 0.745, + "step": 700 + }, + { + "epoch": 0.35, + "eval_loss": 1.07315993309021, + "eval_runtime": 837.5276, + "eval_samples_per_second": 1.31, + "eval_steps_per_second": 0.021, + "step": 700 + }, + { + "epoch": 0.3625, + "grad_norm": 0.15983282029628754, + "learning_rate": 0.00036189619996312495, + "loss": 0.5972, + "step": 725 + }, + { + "epoch": 0.375, + "grad_norm": 0.18202389776706696, + "learning_rate": 0.00035289675720390174, + "loss": 0.759, + "step": 750 + }, + { + "epoch": 0.3875, + "grad_norm": 0.16057106852531433, + "learning_rate": 0.00034373378669002105, + "loss": 0.7358, + "step": 775 + }, + { + "epoch": 0.4, + "grad_norm": 0.1680625081062317, + "learning_rate": 0.00033442185059073706, + "loss": 0.5636, + "step": 800 + }, + { + "epoch": 0.4, + "eval_loss": 1.0932456254959106, + "eval_runtime": 840.1331, + "eval_samples_per_second": 1.306, + "eval_steps_per_second": 0.021, + "step": 800 + }, + { + "epoch": 0.4125, + "grad_norm": 0.15613198280334473, + "learning_rate": 0.00032497574781753367, + "loss": 0.7596, + "step": 825 + }, + { + "epoch": 0.425, + "grad_norm": 0.1628854125738144, + "learning_rate": 0.000315410490505086, + "loss": 0.7282, + "step": 850 + }, + { + "epoch": 0.4375, + "grad_norm": 0.16740979254245758, + "learning_rate": 0.0003057412801533589, + "loss": 0.5325, + "step": 875 + }, + { + "epoch": 0.45, + "grad_norm": 0.1634828895330429, + "learning_rate": 0.0002959834834687587, + "loss": 0.778, + "step": 900 + }, + { + "epoch": 0.45, + "eval_loss": 1.0952215194702148, + "eval_runtime": 836.473, + "eval_samples_per_second": 1.311, + "eval_steps_per_second": 0.022, + "step": 900 + }, + { + "epoch": 0.4625, + "grad_norm": 0.161700040102005, + "learning_rate": 0.00028615260794273236, + "loss": 0.7255, + "step": 925 + }, + { + "epoch": 0.475, + "grad_norm": 0.16280074417591095, + "learning_rate": 0.00027626427720662416, + "loss": 0.4993, + "step": 950 + }, + { + "epoch": 0.4875, + "grad_norm": 0.16114428639411926, + "learning_rate": 0.00026633420620195917, + "loss": 0.7765, + "step": 975 + }, + { + "epoch": 0.5, + "grad_norm": 0.16209788620471954, + "learning_rate": 0.00025637817620561263, + "loss": 0.7221, + "step": 1000 + }, + { + "epoch": 0.5, + "eval_loss": 1.1190329790115356, + "eval_runtime": 835.4164, + "eval_samples_per_second": 1.313, + "eval_steps_per_second": 0.022, + "step": 1000 + }, + { + "epoch": 0.5125, + "grad_norm": 0.15057620406150818, + "learning_rate": 0.0002464120097495559, + "loss": 0.4929, + "step": 1025 + }, + { + "epoch": 0.525, + "grad_norm": 0.16715486347675323, + "learning_rate": 0.00023645154547503855, + "loss": 0.7896, + "step": 1050 + }, + { + "epoch": 0.5375, + "grad_norm": 0.1591680645942688, + "learning_rate": 0.00022651261296116894, + "loss": 0.6999, + "step": 1075 + }, + { + "epoch": 0.55, + "grad_norm": 0.17062760889530182, + "learning_rate": 0.00021661100756789666, + "loss": 0.4708, + "step": 1100 + }, + { + "epoch": 0.55, + "eval_loss": 1.1381303071975708, + "eval_runtime": 839.2576, + "eval_samples_per_second": 1.307, + "eval_steps_per_second": 0.021, + "step": 1100 + }, + { + "epoch": 0.5625, + "grad_norm": 0.16746024787425995, + "learning_rate": 0.00020676246533337764, + "loss": 0.8073, + "step": 1125 + }, + { + "epoch": 0.575, + "grad_norm": 0.18011848628520966, + "learning_rate": 0.00019698263796561526, + "loss": 0.7153, + "step": 1150 + }, + { + "epoch": 0.5875, + "grad_norm": 0.16664239764213562, + "learning_rate": 0.00018728706796812333, + "loss": 0.6313, + "step": 1175 + }, + { + "epoch": 0.6, + "grad_norm": 0.1630394607782364, + "learning_rate": 0.00017769116393914037, + "loss": 0.6952, + "step": 1200 + }, + { + "epoch": 0.6, + "eval_loss": 1.1242510080337524, + "eval_runtime": 847.0203, + "eval_samples_per_second": 1.295, + "eval_steps_per_second": 0.021, + "step": 1200 + }, + { + "epoch": 0.6125, + "grad_norm": 0.16016128659248352, + "learning_rate": 0.00016821017608365264, + "loss": 0.6161, + "step": 1225 + }, + { + "epoch": 0.625, + "grad_norm": 0.17186138033866882, + "learning_rate": 0.00015885917197714112, + "loss": 0.623, + "step": 1250 + }, + { + "epoch": 0.6375, + "grad_norm": 0.1764240562915802, + "learning_rate": 0.00014965301261957238, + "loss": 0.6988, + "step": 1275 + }, + { + "epoch": 0.65, + "grad_norm": 0.16019247472286224, + "learning_rate": 0.00014060632881768558, + "loss": 0.599, + "step": 1300 + }, + { + "epoch": 0.65, + "eval_loss": 1.1349693536758423, + "eval_runtime": 842.5056, + "eval_samples_per_second": 1.302, + "eval_steps_per_second": 0.021, + "step": 1300 + }, + { + "epoch": 0.6625, + "grad_norm": 0.17425072193145752, + "learning_rate": 0.00013173349793311424, + "loss": 0.6607, + "step": 1325 + }, + { + "epoch": 0.675, + "grad_norm": 0.17765691876411438, + "learning_rate": 0.0001230486210332916, + "loss": 0.6811, + "step": 1350 + }, + { + "epoch": 0.6875, + "grad_norm": 0.17980526387691498, + "learning_rate": 0.00011456550048145536, + "loss": 0.5755, + "step": 1375 + }, + { + "epoch": 0.7, + "grad_norm": 0.1814012974500656, + "learning_rate": 0.00010629761800136473, + "loss": 0.6642, + "step": 1400 + }, + { + "epoch": 0.7, + "eval_loss": 1.1495640277862549, + "eval_runtime": 840.9164, + "eval_samples_per_second": 1.305, + "eval_steps_per_second": 0.021, + "step": 1400 + }, + { + "epoch": 0.7125, + "grad_norm": 0.1855439394712448, + "learning_rate": 9.82581132515907e-05, + "loss": 0.6796, + "step": 1425 + }, + { + "epoch": 0.725, + "grad_norm": 0.15141044557094574, + "learning_rate": 9.045976294343145e-05, + "loss": 0.5593, + "step": 1450 + }, + { + "epoch": 0.7375, + "grad_norm": 0.17237244546413422, + "learning_rate": 8.291496053563699e-05, + "loss": 0.69, + "step": 1475 + }, + { + "epoch": 0.75, + "grad_norm": 0.17433880269527435, + "learning_rate": 7.563569653821565e-05, + "loss": 0.6768, + "step": 1500 + }, + { + "epoch": 0.75, + "eval_loss": 1.1441528797149658, + "eval_runtime": 844.4916, + "eval_samples_per_second": 1.299, + "eval_steps_per_second": 0.021, + "step": 1500 + }, + { + "epoch": 0.7625, + "grad_norm": 0.15868628025054932, + "learning_rate": 6.863353945662288e-05, + "loss": 0.517, + "step": 1525 + }, + { + "epoch": 0.775, + "grad_norm": 0.18802335858345032, + "learning_rate": 6.191961740661687e-05, + "loss": 0.7035, + "step": 1550 + }, + { + "epoch": 0.7875, + "grad_norm": 0.17180030047893524, + "learning_rate": 5.550460042899982e-05, + "loss": 0.6911, + "step": 1575 + }, + { + "epoch": 0.8, + "grad_norm": 0.16377338767051697, + "learning_rate": 4.9398683532350855e-05, + "loss": 0.4876, + "step": 1600 + }, + { + "epoch": 0.8, + "eval_loss": 1.1581627130508423, + "eval_runtime": 844.0055, + "eval_samples_per_second": 1.3, + "eval_steps_per_second": 0.021, + "step": 1600 + }, + { + "epoch": 0.8125, + "grad_norm": 0.16953328251838684, + "learning_rate": 4.3611570490698945e-05, + "loss": 0.745, + "step": 1625 + }, + { + "epoch": 0.825, + "grad_norm": 0.16867636144161224, + "learning_rate": 3.815245842188697e-05, + "loss": 0.6623, + "step": 1650 + }, + { + "epoch": 0.8375, + "grad_norm": 0.16276288032531738, + "learning_rate": 3.30300231711339e-05, + "loss": 0.4716, + "step": 1675 + }, + { + "epoch": 0.85, + "grad_norm": 0.17455314099788666, + "learning_rate": 2.8252405523025106e-05, + "loss": 0.7464, + "step": 1700 + }, + { + "epoch": 0.85, + "eval_loss": 1.1617317199707031, + "eval_runtime": 841.9311, + "eval_samples_per_second": 1.303, + "eval_steps_per_second": 0.021, + "step": 1700 + } + ], + "logging_steps": 25, + "max_steps": 2000, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.7134836237153075e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1800/optimizer_0/.metadata b/checkpoint-1800/optimizer_0/.metadata new file mode 100644 index 0000000000000000000000000000000000000000..0a7ae246c30cc0a88d58201da68714d2057135da --- /dev/null +++ b/checkpoint-1800/optimizer_0/.metadata @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7774805d3d40902d4ac85e58b40cb7e070e669af06f0ddef6ca99e1b91c1e46 +size 869362 diff --git a/checkpoint-1800/optimizer_0/__0_0.distcp b/checkpoint-1800/optimizer_0/__0_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..c5a194d673e4a8fc63a489c805161a8e8d5e8669 --- /dev/null +++ b/checkpoint-1800/optimizer_0/__0_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:afc3f90f3fb2094c155e4ea9b685ef4a49ba820657ca3cfe3da49aece6b76c08 +size 6008476 diff --git a/checkpoint-1800/optimizer_0/__1_0.distcp b/checkpoint-1800/optimizer_0/__1_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..814023047a31a57ead6b035754d9395e7b3a2051 --- /dev/null +++ b/checkpoint-1800/optimizer_0/__1_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:74021d5f37f0ca4fc5b422d8d8690caef7a8b1ca07728c15698393f7f8c5a77d +size 6041200 diff --git a/checkpoint-1800/optimizer_0/__2_0.distcp b/checkpoint-1800/optimizer_0/__2_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..6e0ec6ca1d7ce7dc25967b0b9da621d68010d406 --- /dev/null +++ b/checkpoint-1800/optimizer_0/__2_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4050ee033268c4ca873a9f91fd5c18377056b5126c723f57808ff4bad3973abd +size 6041200 diff --git a/checkpoint-1800/optimizer_0/__3_0.distcp b/checkpoint-1800/optimizer_0/__3_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..429e8b9696c5d2fa6aa3f684c3033dbae92adc94 --- /dev/null +++ b/checkpoint-1800/optimizer_0/__3_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a77754214eb48c047d3dbc132277bb369a4cb315d6410b2118b12af95eeef12 +size 6043476 diff --git a/checkpoint-1800/optimizer_0/__4_0.distcp b/checkpoint-1800/optimizer_0/__4_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..4f9786d9d91d4c7be7fffd20be9dd5601989347a --- /dev/null +++ b/checkpoint-1800/optimizer_0/__4_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b71443f25d0567ed50d9799b2b1894836813b84cd21b4029a3e89f3c1464552 +size 6057364 diff --git a/checkpoint-1800/optimizer_0/__5_0.distcp b/checkpoint-1800/optimizer_0/__5_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..817859533fefe64aeed39090dd54a0b7dd5d7548 --- /dev/null +++ b/checkpoint-1800/optimizer_0/__5_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2570e40e1360014d7352b86ce563838c4fecafb3e858ff305d45388057519821 +size 6042612 diff --git a/checkpoint-1800/optimizer_0/__6_0.distcp b/checkpoint-1800/optimizer_0/__6_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..c4ebeb8b36870ef6e97cc56ff586a3cbf983ce20 --- /dev/null +++ b/checkpoint-1800/optimizer_0/__6_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e8852776768074917f2ca0233830b0a868eb2e6ccb709d9f650a9797f09d8dc +size 6042612 diff --git a/checkpoint-1800/optimizer_0/__7_0.distcp b/checkpoint-1800/optimizer_0/__7_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..236581dfb27fe53bf22a3cb7d588def8f16ab737 --- /dev/null +++ b/checkpoint-1800/optimizer_0/__7_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fde126408ee1659d1e5946268eb05132b4916e8db42641aa78c1920c7f4d101e +size 6042612 diff --git a/checkpoint-1800/pytorch_model_fsdp_0/.metadata b/checkpoint-1800/pytorch_model_fsdp_0/.metadata new file mode 100644 index 0000000000000000000000000000000000000000..2dabbd4dbdba671ec6d3ba048c16da580e3c0adc --- /dev/null +++ b/checkpoint-1800/pytorch_model_fsdp_0/.metadata @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e01fb88f10ab7d37a18a6298c62006ee26306fc1b935f4aa46f53ccbd5976c0b +size 339852 diff --git a/checkpoint-1800/pytorch_model_fsdp_0/__0_0.distcp b/checkpoint-1800/pytorch_model_fsdp_0/__0_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..cc27a52fe664c6efcb352b86b91c191391039f22 --- /dev/null +++ b/checkpoint-1800/pytorch_model_fsdp_0/__0_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fbc575b7a5c00381b4a8c10a29e738ef0fe430e623aa9c91623d2ddca111a79a +size 3003648 diff --git a/checkpoint-1800/pytorch_model_fsdp_0/__1_0.distcp b/checkpoint-1800/pytorch_model_fsdp_0/__1_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..fdc88bbbc3b37b881958b33f2acce72d56cc511e --- /dev/null +++ b/checkpoint-1800/pytorch_model_fsdp_0/__1_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2af6231ff4a638e24084cbe364e888ea3006bc5e7483113de0316bc0feb6c0c7 +size 3003648 diff --git a/checkpoint-1800/pytorch_model_fsdp_0/__2_0.distcp b/checkpoint-1800/pytorch_model_fsdp_0/__2_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..ae84c4596759c5add41654dfa95d01e62fa873c3 --- /dev/null +++ b/checkpoint-1800/pytorch_model_fsdp_0/__2_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:603a670415e8f795abcb3248bc263b29d9f268456dc957893efbae8997a2dad5 +size 3003648 diff --git a/checkpoint-1800/pytorch_model_fsdp_0/__3_0.distcp b/checkpoint-1800/pytorch_model_fsdp_0/__3_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..7bfd1e81c58e4db3e66e2f55de860384b0b39e1e --- /dev/null +++ b/checkpoint-1800/pytorch_model_fsdp_0/__3_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2697a4a67d1db5ef4eb83d319b5324c2ab6c27783ad9938c00ddf10c4fdc947 +size 3003648 diff --git a/checkpoint-1800/pytorch_model_fsdp_0/__4_0.distcp b/checkpoint-1800/pytorch_model_fsdp_0/__4_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..67d6264ef32384b2f910da7da880d7e1f8e321c2 --- /dev/null +++ b/checkpoint-1800/pytorch_model_fsdp_0/__4_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cec38856d855f8504c57523953275f72f382d44c9eb7bd66be5425a4b91230b7 +size 3003648 diff --git a/checkpoint-1800/pytorch_model_fsdp_0/__5_0.distcp b/checkpoint-1800/pytorch_model_fsdp_0/__5_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..de1bbf16e745d02820cc1c4187ebe0384d9e547f --- /dev/null +++ b/checkpoint-1800/pytorch_model_fsdp_0/__5_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e54b423288476d53d99ab780cc5557fc6325bf738b7a2a82d5d67857a0acec0 +size 3003648 diff --git a/checkpoint-1800/pytorch_model_fsdp_0/__6_0.distcp b/checkpoint-1800/pytorch_model_fsdp_0/__6_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..b42c2f2159ecc2ed486e51dad01b8e19cd713ad8 --- /dev/null +++ b/checkpoint-1800/pytorch_model_fsdp_0/__6_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:619aeea448f351f1e6adf0e5ead65b43676d3988b2f46225e06e55b56656d127 +size 3003648 diff --git a/checkpoint-1800/pytorch_model_fsdp_0/__7_0.distcp b/checkpoint-1800/pytorch_model_fsdp_0/__7_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..0db6824d7c16b181dcb0765439f1ce59de031053 --- /dev/null +++ b/checkpoint-1800/pytorch_model_fsdp_0/__7_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e4edb49a2f1fb4d15a05d1850486cd5f9a96fe7dcb00fc8bbfee649fa69809d +size 3003648 diff --git a/checkpoint-1800/rng_state_0.pth b/checkpoint-1800/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..505e48e1c04b347b57874cd7f13152a3dc45a0fe --- /dev/null +++ b/checkpoint-1800/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e5cb1ddfe0c1c113a20f40807a9ad8a341a0f5f2da8d9dcb3174ddc5cf42a45 +size 15920 diff --git a/checkpoint-1800/rng_state_1.pth b/checkpoint-1800/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..fe5748fb3992b8bcbd1be2615a41e0f76c9cfda0 --- /dev/null +++ b/checkpoint-1800/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:257ec6a5e84a6f656652535f26794b006e614fc4cb0992ade63cffdc8b66bc3c +size 15984 diff --git a/checkpoint-1800/rng_state_2.pth b/checkpoint-1800/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..6e4391b1af62e2a91418f873b5f4ae861d1aad2a --- /dev/null +++ b/checkpoint-1800/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b869fbc5f677cfa936b763dce05a4b33480318d270959b91a7b2f16349b67fb3 +size 15984 diff --git a/checkpoint-1800/rng_state_3.pth b/checkpoint-1800/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..5af57fe4b8fa5a039585c0691bd55735f58d70a9 --- /dev/null +++ b/checkpoint-1800/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f0bae2eff0323fac0bd9c07883bce61210db1d9f3805d41ab334ed75c8981f0e +size 15984 diff --git a/checkpoint-1800/rng_state_4.pth b/checkpoint-1800/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..11da143134b0c23f06ce1046f65b5cf00acfac26 --- /dev/null +++ b/checkpoint-1800/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b70c37cdd018075af133d8de2fa13cffdff177f5300346385daa81e4e8106320 +size 15984 diff --git a/checkpoint-1800/rng_state_5.pth b/checkpoint-1800/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..4a6eacc3ecbe6bb10f5324b2ac6e1aa7bcaf0c75 --- /dev/null +++ b/checkpoint-1800/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c26e1c981b255bbc49483ee7b1834d16eb6d6ba193d8f8318f9403b16962c959 +size 15984 diff --git a/checkpoint-1800/rng_state_6.pth b/checkpoint-1800/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..7f797500c9870775d0a99fe1ecb3bbb2e52e0c70 --- /dev/null +++ b/checkpoint-1800/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f928fca3214ad4844bda0e7b59b964b307eacf836c78d92010134bcf2ae7b76 +size 15984 diff --git a/checkpoint-1800/rng_state_7.pth b/checkpoint-1800/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..82692af4a6c1b7f536bf6f26495b604c264ea19a --- /dev/null +++ b/checkpoint-1800/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:997cceb19a4774c17f6fd70d3635ecb66a8a48b908144a94910112e19f13bba1 +size 15984 diff --git a/checkpoint-1800/scheduler.pt b/checkpoint-1800/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..221f9ffd1eb177554f1e976499d655666e61abff --- /dev/null +++ b/checkpoint-1800/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d973c55cadccf6ff6ef9e217145fc1e52fea2fbcc1c6a26f6ebb6c7cd1ee47c +size 1064 diff --git a/checkpoint-1800/trainer_state.json b/checkpoint-1800/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..6bb4ce2a7da73b8ec4c3bdde868e2f95d11362dc --- /dev/null +++ b/checkpoint-1800/trainer_state.json @@ -0,0 +1,682 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9, + "eval_steps": 100, + "global_step": 1800, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0125, + "grad_norm": 0.14176353812217712, + "learning_rate": 0.0004, + "loss": 1.1525, + "step": 25 + }, + { + "epoch": 0.025, + "grad_norm": 0.1460508555173874, + "learning_rate": 0.0004998852503731983, + "loss": 1.047, + "step": 50 + }, + { + "epoch": 0.0375, + "grad_norm": 0.2368021011352539, + "learning_rate": 0.0004993848168027977, + "loss": 0.8529, + "step": 75 + }, + { + "epoch": 0.05, + "grad_norm": 0.14488168060779572, + "learning_rate": 0.0004984880506341147, + "loss": 0.9756, + "step": 100 + }, + { + "epoch": 0.05, + "eval_loss": 0.9470569491386414, + "eval_runtime": 845.9115, + "eval_samples_per_second": 1.297, + "eval_steps_per_second": 0.021, + "step": 100 + }, + { + "epoch": 0.0625, + "grad_norm": 0.1415175348520279, + "learning_rate": 0.0004971963770447935, + "loss": 0.9564, + "step": 125 + }, + { + "epoch": 0.075, + "grad_norm": 0.1751064509153366, + "learning_rate": 0.0004955118488155782, + "loss": 0.711, + "step": 150 + }, + { + "epoch": 0.0875, + "grad_norm": 0.1439943015575409, + "learning_rate": 0.0004934371430679492, + "loss": 0.9409, + "step": 175 + }, + { + "epoch": 0.1, + "grad_norm": 0.15947112441062927, + "learning_rate": 0.0004909755570095319, + "loss": 0.8979, + "step": 200 + }, + { + "epoch": 0.1, + "eval_loss": 0.9711233973503113, + "eval_runtime": 845.6433, + "eval_samples_per_second": 1.297, + "eval_steps_per_second": 0.021, + "step": 200 + }, + { + "epoch": 0.1125, + "grad_norm": 0.14685837924480438, + "learning_rate": 0.0004881310026940389, + "loss": 0.6376, + "step": 225 + }, + { + "epoch": 0.125, + "grad_norm": 0.15040776133537292, + "learning_rate": 0.0004849080008040734, + "loss": 0.927, + "step": 250 + }, + { + "epoch": 0.1375, + "grad_norm": 0.16087745130062103, + "learning_rate": 0.00048131167346667446, + "loss": 0.8456, + "step": 275 + }, + { + "epoch": 0.15, + "grad_norm": 0.15025638043880463, + "learning_rate": 0.00047734773611302284, + "loss": 0.6029, + "step": 300 + }, + { + "epoch": 0.15, + "eval_loss": 1.0056413412094116, + "eval_runtime": 848.447, + "eval_samples_per_second": 1.293, + "eval_steps_per_second": 0.021, + "step": 300 + }, + { + "epoch": 0.1625, + "grad_norm": 0.15893957018852234, + "learning_rate": 0.0004730224883952422, + "loss": 0.9036, + "step": 325 + }, + { + "epoch": 0.175, + "grad_norm": 0.1535714715719223, + "learning_rate": 0.0004683428041747334, + "loss": 0.828, + "step": 350 + }, + { + "epoch": 0.1875, + "grad_norm": 0.1718970686197281, + "learning_rate": 0.0004633161205979517, + "loss": 0.5944, + "step": 375 + }, + { + "epoch": 0.2, + "grad_norm": 0.17664535343647003, + "learning_rate": 0.0004579504262769877, + "loss": 0.8654, + "step": 400 + }, + { + "epoch": 0.2, + "eval_loss": 1.0151112079620361, + "eval_runtime": 839.7482, + "eval_samples_per_second": 1.306, + "eval_steps_per_second": 0.021, + "step": 400 + }, + { + "epoch": 0.2125, + "grad_norm": 0.17122088372707367, + "learning_rate": 0.0004522542485937369, + "loss": 0.8078, + "step": 425 + }, + { + "epoch": 0.225, + "grad_norm": 0.19420970976352692, + "learning_rate": 0.00044623664014783386, + "loss": 0.5735, + "step": 450 + }, + { + "epoch": 0.2375, + "grad_norm": 0.18166953325271606, + "learning_rate": 0.00043990716436988924, + "loss": 0.8604, + "step": 475 + }, + { + "epoch": 0.25, + "grad_norm": 0.1502382755279541, + "learning_rate": 0.0004332758803228925, + "loss": 0.7673, + "step": 500 + }, + { + "epoch": 0.25, + "eval_loss": 1.0454745292663574, + "eval_runtime": 841.3951, + "eval_samples_per_second": 1.304, + "eval_steps_per_second": 0.021, + "step": 500 + }, + { + "epoch": 0.2625, + "grad_norm": 0.16875839233398438, + "learning_rate": 0.00042635332671593575, + "loss": 0.5882, + "step": 525 + }, + { + "epoch": 0.275, + "grad_norm": 0.16070039570331573, + "learning_rate": 0.00041915050515566445, + "loss": 0.8175, + "step": 550 + }, + { + "epoch": 0.2875, + "grad_norm": 0.16584184765815735, + "learning_rate": 0.00041167886266207167, + "loss": 0.7795, + "step": 575 + }, + { + "epoch": 0.3, + "grad_norm": 0.15345798432826996, + "learning_rate": 0.0004039502734764241, + "loss": 0.7331, + "step": 600 + }, + { + "epoch": 0.3, + "eval_loss": 1.0627943277359009, + "eval_runtime": 849.4944, + "eval_samples_per_second": 1.291, + "eval_steps_per_second": 0.021, + "step": 600 + }, + { + "epoch": 0.3125, + "grad_norm": 0.16241031885147095, + "learning_rate": 0.0003959770201902294, + "loss": 0.7436, + "step": 625 + }, + { + "epoch": 0.325, + "grad_norm": 0.1531112939119339, + "learning_rate": 0.0003877717742252371, + "loss": 0.6345, + "step": 650 + }, + { + "epoch": 0.3375, + "grad_norm": 0.15790140628814697, + "learning_rate": 0.00037934757569549495, + "loss": 0.7351, + "step": 675 + }, + { + "epoch": 0.35, + "grad_norm": 0.1822807490825653, + "learning_rate": 0.00037071781268346345, + "loss": 0.745, + "step": 700 + }, + { + "epoch": 0.35, + "eval_loss": 1.07315993309021, + "eval_runtime": 837.5276, + "eval_samples_per_second": 1.31, + "eval_steps_per_second": 0.021, + "step": 700 + }, + { + "epoch": 0.3625, + "grad_norm": 0.15983282029628754, + "learning_rate": 0.00036189619996312495, + "loss": 0.5972, + "step": 725 + }, + { + "epoch": 0.375, + "grad_norm": 0.18202389776706696, + "learning_rate": 0.00035289675720390174, + "loss": 0.759, + "step": 750 + }, + { + "epoch": 0.3875, + "grad_norm": 0.16057106852531433, + "learning_rate": 0.00034373378669002105, + "loss": 0.7358, + "step": 775 + }, + { + "epoch": 0.4, + "grad_norm": 0.1680625081062317, + "learning_rate": 0.00033442185059073706, + "loss": 0.5636, + "step": 800 + }, + { + "epoch": 0.4, + "eval_loss": 1.0932456254959106, + "eval_runtime": 840.1331, + "eval_samples_per_second": 1.306, + "eval_steps_per_second": 0.021, + "step": 800 + }, + { + "epoch": 0.4125, + "grad_norm": 0.15613198280334473, + "learning_rate": 0.00032497574781753367, + "loss": 0.7596, + "step": 825 + }, + { + "epoch": 0.425, + "grad_norm": 0.1628854125738144, + "learning_rate": 0.000315410490505086, + "loss": 0.7282, + "step": 850 + }, + { + "epoch": 0.4375, + "grad_norm": 0.16740979254245758, + "learning_rate": 0.0003057412801533589, + "loss": 0.5325, + "step": 875 + }, + { + "epoch": 0.45, + "grad_norm": 0.1634828895330429, + "learning_rate": 0.0002959834834687587, + "loss": 0.778, + "step": 900 + }, + { + "epoch": 0.45, + "eval_loss": 1.0952215194702148, + "eval_runtime": 836.473, + "eval_samples_per_second": 1.311, + "eval_steps_per_second": 0.022, + "step": 900 + }, + { + "epoch": 0.4625, + "grad_norm": 0.161700040102005, + "learning_rate": 0.00028615260794273236, + "loss": 0.7255, + "step": 925 + }, + { + "epoch": 0.475, + "grad_norm": 0.16280074417591095, + "learning_rate": 0.00027626427720662416, + "loss": 0.4993, + "step": 950 + }, + { + "epoch": 0.4875, + "grad_norm": 0.16114428639411926, + "learning_rate": 0.00026633420620195917, + "loss": 0.7765, + "step": 975 + }, + { + "epoch": 0.5, + "grad_norm": 0.16209788620471954, + "learning_rate": 0.00025637817620561263, + "loss": 0.7221, + "step": 1000 + }, + { + "epoch": 0.5, + "eval_loss": 1.1190329790115356, + "eval_runtime": 835.4164, + "eval_samples_per_second": 1.313, + "eval_steps_per_second": 0.022, + "step": 1000 + }, + { + "epoch": 0.5125, + "grad_norm": 0.15057620406150818, + "learning_rate": 0.0002464120097495559, + "loss": 0.4929, + "step": 1025 + }, + { + "epoch": 0.525, + "grad_norm": 0.16715486347675323, + "learning_rate": 0.00023645154547503855, + "loss": 0.7896, + "step": 1050 + }, + { + "epoch": 0.5375, + "grad_norm": 0.1591680645942688, + "learning_rate": 0.00022651261296116894, + "loss": 0.6999, + "step": 1075 + }, + { + "epoch": 0.55, + "grad_norm": 0.17062760889530182, + "learning_rate": 0.00021661100756789666, + "loss": 0.4708, + "step": 1100 + }, + { + "epoch": 0.55, + "eval_loss": 1.1381303071975708, + "eval_runtime": 839.2576, + "eval_samples_per_second": 1.307, + "eval_steps_per_second": 0.021, + "step": 1100 + }, + { + "epoch": 0.5625, + "grad_norm": 0.16746024787425995, + "learning_rate": 0.00020676246533337764, + "loss": 0.8073, + "step": 1125 + }, + { + "epoch": 0.575, + "grad_norm": 0.18011848628520966, + "learning_rate": 0.00019698263796561526, + "loss": 0.7153, + "step": 1150 + }, + { + "epoch": 0.5875, + "grad_norm": 0.16664239764213562, + "learning_rate": 0.00018728706796812333, + "loss": 0.6313, + "step": 1175 + }, + { + "epoch": 0.6, + "grad_norm": 0.1630394607782364, + "learning_rate": 0.00017769116393914037, + "loss": 0.6952, + "step": 1200 + }, + { + "epoch": 0.6, + "eval_loss": 1.1242510080337524, + "eval_runtime": 847.0203, + "eval_samples_per_second": 1.295, + "eval_steps_per_second": 0.021, + "step": 1200 + }, + { + "epoch": 0.6125, + "grad_norm": 0.16016128659248352, + "learning_rate": 0.00016821017608365264, + "loss": 0.6161, + "step": 1225 + }, + { + "epoch": 0.625, + "grad_norm": 0.17186138033866882, + "learning_rate": 0.00015885917197714112, + "loss": 0.623, + "step": 1250 + }, + { + "epoch": 0.6375, + "grad_norm": 0.1764240562915802, + "learning_rate": 0.00014965301261957238, + "loss": 0.6988, + "step": 1275 + }, + { + "epoch": 0.65, + "grad_norm": 0.16019247472286224, + "learning_rate": 0.00014060632881768558, + "loss": 0.599, + "step": 1300 + }, + { + "epoch": 0.65, + "eval_loss": 1.1349693536758423, + "eval_runtime": 842.5056, + "eval_samples_per_second": 1.302, + "eval_steps_per_second": 0.021, + "step": 1300 + }, + { + "epoch": 0.6625, + "grad_norm": 0.17425072193145752, + "learning_rate": 0.00013173349793311424, + "loss": 0.6607, + "step": 1325 + }, + { + "epoch": 0.675, + "grad_norm": 0.17765691876411438, + "learning_rate": 0.0001230486210332916, + "loss": 0.6811, + "step": 1350 + }, + { + "epoch": 0.6875, + "grad_norm": 0.17980526387691498, + "learning_rate": 0.00011456550048145536, + "loss": 0.5755, + "step": 1375 + }, + { + "epoch": 0.7, + "grad_norm": 0.1814012974500656, + "learning_rate": 0.00010629761800136473, + "loss": 0.6642, + "step": 1400 + }, + { + "epoch": 0.7, + "eval_loss": 1.1495640277862549, + "eval_runtime": 840.9164, + "eval_samples_per_second": 1.305, + "eval_steps_per_second": 0.021, + "step": 1400 + }, + { + "epoch": 0.7125, + "grad_norm": 0.1855439394712448, + "learning_rate": 9.82581132515907e-05, + "loss": 0.6796, + "step": 1425 + }, + { + "epoch": 0.725, + "grad_norm": 0.15141044557094574, + "learning_rate": 9.045976294343145e-05, + "loss": 0.5593, + "step": 1450 + }, + { + "epoch": 0.7375, + "grad_norm": 0.17237244546413422, + "learning_rate": 8.291496053563699e-05, + "loss": 0.69, + "step": 1475 + }, + { + "epoch": 0.75, + "grad_norm": 0.17433880269527435, + "learning_rate": 7.563569653821565e-05, + "loss": 0.6768, + "step": 1500 + }, + { + "epoch": 0.75, + "eval_loss": 1.1441528797149658, + "eval_runtime": 844.4916, + "eval_samples_per_second": 1.299, + "eval_steps_per_second": 0.021, + "step": 1500 + }, + { + "epoch": 0.7625, + "grad_norm": 0.15868628025054932, + "learning_rate": 6.863353945662288e-05, + "loss": 0.517, + "step": 1525 + }, + { + "epoch": 0.775, + "grad_norm": 0.18802335858345032, + "learning_rate": 6.191961740661687e-05, + "loss": 0.7035, + "step": 1550 + }, + { + "epoch": 0.7875, + "grad_norm": 0.17180030047893524, + "learning_rate": 5.550460042899982e-05, + "loss": 0.6911, + "step": 1575 + }, + { + "epoch": 0.8, + "grad_norm": 0.16377338767051697, + "learning_rate": 4.9398683532350855e-05, + "loss": 0.4876, + "step": 1600 + }, + { + "epoch": 0.8, + "eval_loss": 1.1581627130508423, + "eval_runtime": 844.0055, + "eval_samples_per_second": 1.3, + "eval_steps_per_second": 0.021, + "step": 1600 + }, + { + "epoch": 0.8125, + "grad_norm": 0.16953328251838684, + "learning_rate": 4.3611570490698945e-05, + "loss": 0.745, + "step": 1625 + }, + { + "epoch": 0.825, + "grad_norm": 0.16867636144161224, + "learning_rate": 3.815245842188697e-05, + "loss": 0.6623, + "step": 1650 + }, + { + "epoch": 0.8375, + "grad_norm": 0.16276288032531738, + "learning_rate": 3.30300231711339e-05, + "loss": 0.4716, + "step": 1675 + }, + { + "epoch": 0.85, + "grad_norm": 0.17455314099788666, + "learning_rate": 2.8252405523025106e-05, + "loss": 0.7464, + "step": 1700 + }, + { + "epoch": 0.85, + "eval_loss": 1.1617317199707031, + "eval_runtime": 841.9311, + "eval_samples_per_second": 1.303, + "eval_steps_per_second": 0.021, + "step": 1700 + }, + { + "epoch": 0.8625, + "grad_norm": 0.16539457440376282, + "learning_rate": 2.3827198263843162e-05, + "loss": 0.7088, + "step": 1725 + }, + { + "epoch": 0.875, + "grad_norm": 0.17668606340885162, + "learning_rate": 1.9761434114799497e-05, + "loss": 0.5753, + "step": 1750 + }, + { + "epoch": 0.8875, + "grad_norm": 0.17462626099586487, + "learning_rate": 1.606157455534535e-05, + "loss": 0.6541, + "step": 1775 + }, + { + "epoch": 0.9, + "grad_norm": 0.1645047813653946, + "learning_rate": 1.2733499554322708e-05, + "loss": 0.6349, + "step": 1800 + }, + { + "epoch": 0.9, + "eval_loss": 1.1562622785568237, + "eval_runtime": 838.8692, + "eval_samples_per_second": 1.308, + "eval_steps_per_second": 0.021, + "step": 1800 + } + ], + "logging_steps": 25, + "max_steps": 2000, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.814276778051502e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1900/optimizer_0/.metadata b/checkpoint-1900/optimizer_0/.metadata new file mode 100644 index 0000000000000000000000000000000000000000..3b8784289a577a7200154d192e716a9daf18f223 --- /dev/null +++ b/checkpoint-1900/optimizer_0/.metadata @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5eca47e38881d3b1342a437c39a33f0493fa524137710bd1c1b7d39af2d58cfc +size 869362 diff --git a/checkpoint-1900/optimizer_0/__0_0.distcp b/checkpoint-1900/optimizer_0/__0_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..3195cf36641132db7baaf16f1adf81476242ae1d --- /dev/null +++ b/checkpoint-1900/optimizer_0/__0_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e89eb3fee9a9cac80f381744b3dd1b5f2ac0a3b41e89ae8f83cbdb6fbd94ab8d +size 6008476 diff --git a/checkpoint-1900/optimizer_0/__1_0.distcp b/checkpoint-1900/optimizer_0/__1_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..37f1b6df7d3ca1fcb73e99ac8204b1d9577888b1 --- /dev/null +++ b/checkpoint-1900/optimizer_0/__1_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c7becd285801ec21dcd1fd4c6b14eca1b9e148a7e35a01f9bbc0795f7bacfc8 +size 6041200 diff --git a/checkpoint-1900/optimizer_0/__2_0.distcp b/checkpoint-1900/optimizer_0/__2_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..535968d1ed056e1d4f33830fb964237daded0aee --- /dev/null +++ b/checkpoint-1900/optimizer_0/__2_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:71698719d9682bb2030056ef6cd82cfa7348fdc9b3701e6802af8975529b21a4 +size 6041200 diff --git a/checkpoint-1900/optimizer_0/__3_0.distcp b/checkpoint-1900/optimizer_0/__3_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..97c46b0776b3f8252950c7c8fc04ea737b7a34c0 --- /dev/null +++ b/checkpoint-1900/optimizer_0/__3_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fded99eed4ccdc05509326ae2e1b13f26ed955232d35872f850cc65c2a6991a4 +size 6043476 diff --git a/checkpoint-1900/optimizer_0/__4_0.distcp b/checkpoint-1900/optimizer_0/__4_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..214d6b02cada5211263d052eed844e8fe3f2e5df --- /dev/null +++ b/checkpoint-1900/optimizer_0/__4_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:acc6c578a4e674e2d6df8ef2dd30403646c2c8d8616a15d2f8a3726f57c65be6 +size 6057364 diff --git a/checkpoint-1900/optimizer_0/__5_0.distcp b/checkpoint-1900/optimizer_0/__5_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..9994419c6eeecbac23f6a057988d4a3813f202c9 --- /dev/null +++ b/checkpoint-1900/optimizer_0/__5_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38b70d090aac10ddb3aecb083fb6bf0ca00ad2e8c923b2711b41684d14ae52f6 +size 6042612 diff --git a/checkpoint-1900/optimizer_0/__6_0.distcp b/checkpoint-1900/optimizer_0/__6_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..5412e5457e3fe196e0d3e4d3832153292f925ca6 --- /dev/null +++ b/checkpoint-1900/optimizer_0/__6_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9465cdfb2ccc20321c4669f47d891d2c5423a5f2acd5adbc1e88aa3f4a134050 +size 6042612 diff --git a/checkpoint-1900/optimizer_0/__7_0.distcp b/checkpoint-1900/optimizer_0/__7_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..aa55031e6230d7887f3a4384075189a595e84a51 --- /dev/null +++ b/checkpoint-1900/optimizer_0/__7_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c4be1087f7e4a40bc99d83086bff0e53ec5e7e48fa60ce61e19226ced66a6b7 +size 6042612 diff --git a/checkpoint-1900/pytorch_model_fsdp_0/.metadata b/checkpoint-1900/pytorch_model_fsdp_0/.metadata new file mode 100644 index 0000000000000000000000000000000000000000..2a08716752e36a628352fd8eb2ecfd6dad82c244 --- /dev/null +++ b/checkpoint-1900/pytorch_model_fsdp_0/.metadata @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:626f1eff81b107b5afb4700b2c9c20e2dd032adfdd5d5ad60791877870d29316 +size 339852 diff --git a/checkpoint-1900/pytorch_model_fsdp_0/__0_0.distcp b/checkpoint-1900/pytorch_model_fsdp_0/__0_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..154b2e432e9868d68768396ea8e490073dabe239 --- /dev/null +++ b/checkpoint-1900/pytorch_model_fsdp_0/__0_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3779ed45f1df18a8703a6401dad3c36f6176f75195b209d11c134d342457d17 +size 3003648 diff --git a/checkpoint-1900/pytorch_model_fsdp_0/__1_0.distcp b/checkpoint-1900/pytorch_model_fsdp_0/__1_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..dc66c30ee97c0b01a60ce93850f729661b1a349e --- /dev/null +++ b/checkpoint-1900/pytorch_model_fsdp_0/__1_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:378baf946e9c813f0b170b2ba5705d6ecc6297bfe6bfbb6d6a8e8df5ae56e558 +size 3003648 diff --git a/checkpoint-1900/pytorch_model_fsdp_0/__2_0.distcp b/checkpoint-1900/pytorch_model_fsdp_0/__2_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..3e08198514a19ba10e495af7051d311585ecb0a0 --- /dev/null +++ b/checkpoint-1900/pytorch_model_fsdp_0/__2_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d58f83bd638acc0cb9b3057a6ca8841280c1b141ba5dac2b7d989d79c757c945 +size 3003648 diff --git a/checkpoint-1900/pytorch_model_fsdp_0/__3_0.distcp b/checkpoint-1900/pytorch_model_fsdp_0/__3_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..6854e7ad4f0eea74d8f570007b7c377f4536ce49 --- /dev/null +++ b/checkpoint-1900/pytorch_model_fsdp_0/__3_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e184bf122d3083f5ce8887e46543b42dcd52dcb540816f67453ac7c0bec9cec3 +size 3003648 diff --git a/checkpoint-1900/pytorch_model_fsdp_0/__4_0.distcp b/checkpoint-1900/pytorch_model_fsdp_0/__4_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..ad2675418f9079e98709e544da771224aef6a35e --- /dev/null +++ b/checkpoint-1900/pytorch_model_fsdp_0/__4_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc67d663bed34c2b93cd34a4e15d0536f30bcfa52d87aaed6169b586e2f4840d +size 3003648 diff --git a/checkpoint-1900/pytorch_model_fsdp_0/__5_0.distcp b/checkpoint-1900/pytorch_model_fsdp_0/__5_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..b2a5b62368b0034b92ed6d2ef1c363b18b7c7f76 --- /dev/null +++ b/checkpoint-1900/pytorch_model_fsdp_0/__5_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:867add648b05212edcaf449313503d961539e342c475a3d24b4c85485d44238f +size 3003648 diff --git a/checkpoint-1900/pytorch_model_fsdp_0/__6_0.distcp b/checkpoint-1900/pytorch_model_fsdp_0/__6_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..35c4c329df80a18410ae56db13daa41a0f053849 --- /dev/null +++ b/checkpoint-1900/pytorch_model_fsdp_0/__6_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62d1d92e57815d824b199ccfb3420ad61e338100bcb0e62f3bace5f96e03e77f +size 3003648 diff --git a/checkpoint-1900/pytorch_model_fsdp_0/__7_0.distcp b/checkpoint-1900/pytorch_model_fsdp_0/__7_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..1a4b3b720374ba3577372bf7e53d55104e11c5c7 --- /dev/null +++ b/checkpoint-1900/pytorch_model_fsdp_0/__7_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2701e99e7a0387eb0748dcbaafbfdff164487cdfd6dbff97969c320ed4411e63 +size 3003648 diff --git a/checkpoint-1900/rng_state_0.pth b/checkpoint-1900/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..93dc3bdd8b544c34e17bf9996dbcdab801ea0f97 --- /dev/null +++ b/checkpoint-1900/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:33e409faa47bc79c0ad718c95292033fc1402bf7b7ab48bbec1e96ee3b980028 +size 15920 diff --git a/checkpoint-1900/rng_state_1.pth b/checkpoint-1900/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..1b8768e2aeb7dd408d7467dfd254c45c618b7390 --- /dev/null +++ b/checkpoint-1900/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a55605f0f83843e4c91757a79dac5b24454a60f0028fc9599812d32f683d434a +size 15984 diff --git a/checkpoint-1900/rng_state_2.pth b/checkpoint-1900/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..3b2f621de46c3576e38f4f41e43532d1b3f0e919 --- /dev/null +++ b/checkpoint-1900/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8c31c4f35ed9f40bf17f26f645bbb184ad6e9f7def045d9d0266215523c86b2 +size 15984 diff --git a/checkpoint-1900/rng_state_3.pth b/checkpoint-1900/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..ceb10e858887091a0caae6cb0d452bb1d51f3314 --- /dev/null +++ b/checkpoint-1900/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce24c14f7dabcc1e9c93031afb3e354d5cdf49d6eb3bef0dbaa358b0fad50527 +size 15984 diff --git a/checkpoint-1900/rng_state_4.pth b/checkpoint-1900/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..36efcc64f5934958950738bafcdfe20b132230d5 --- /dev/null +++ b/checkpoint-1900/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2122ecd749d6091c466d968c7b005c78eaa2b9179e618afc8c409b6236761ef0 +size 15984 diff --git a/checkpoint-1900/rng_state_5.pth b/checkpoint-1900/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..da8c4dcb813b72b75aabd47c95060126cb65216e --- /dev/null +++ b/checkpoint-1900/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24d192fa9d3d7305db8099dc5198a4c974a1210b9e552f80c8ae30c88456dbfa +size 15984 diff --git a/checkpoint-1900/rng_state_6.pth b/checkpoint-1900/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..74358107123f2f8aa3277a1e65a7a052f9f66c82 --- /dev/null +++ b/checkpoint-1900/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de15ef0c441e909cf4d389f94942f48bf79c0569709ff15dadb83798e850a4a1 +size 15984 diff --git a/checkpoint-1900/rng_state_7.pth b/checkpoint-1900/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..3e56d9c206ef2e43663ba68c1476535cbc6551f5 --- /dev/null +++ b/checkpoint-1900/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52c8b9b13c12a6e57a19e8cb21bc74781b7df61084f81874e75d9dfdbfd68ab0 +size 15984 diff --git a/checkpoint-1900/scheduler.pt b/checkpoint-1900/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..88d6546af3495fc26241dfd833bbbb2f711934ac --- /dev/null +++ b/checkpoint-1900/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5ea56670e55a6b485cbf0f38ec022a88b913275368ecee1cfea14b0bc438f1b +size 1064 diff --git a/checkpoint-1900/trainer_state.json b/checkpoint-1900/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..8bccee9bbc5eeae237b0a4639a2d868bb4a40162 --- /dev/null +++ b/checkpoint-1900/trainer_state.json @@ -0,0 +1,718 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.95, + "eval_steps": 100, + "global_step": 1900, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0125, + "grad_norm": 0.14176353812217712, + "learning_rate": 0.0004, + "loss": 1.1525, + "step": 25 + }, + { + "epoch": 0.025, + "grad_norm": 0.1460508555173874, + "learning_rate": 0.0004998852503731983, + "loss": 1.047, + "step": 50 + }, + { + "epoch": 0.0375, + "grad_norm": 0.2368021011352539, + "learning_rate": 0.0004993848168027977, + "loss": 0.8529, + "step": 75 + }, + { + "epoch": 0.05, + "grad_norm": 0.14488168060779572, + "learning_rate": 0.0004984880506341147, + "loss": 0.9756, + "step": 100 + }, + { + "epoch": 0.05, + "eval_loss": 0.9470569491386414, + "eval_runtime": 845.9115, + "eval_samples_per_second": 1.297, + "eval_steps_per_second": 0.021, + "step": 100 + }, + { + "epoch": 0.0625, + "grad_norm": 0.1415175348520279, + "learning_rate": 0.0004971963770447935, + "loss": 0.9564, + "step": 125 + }, + { + "epoch": 0.075, + "grad_norm": 0.1751064509153366, + "learning_rate": 0.0004955118488155782, + "loss": 0.711, + "step": 150 + }, + { + "epoch": 0.0875, + "grad_norm": 0.1439943015575409, + "learning_rate": 0.0004934371430679492, + "loss": 0.9409, + "step": 175 + }, + { + "epoch": 0.1, + "grad_norm": 0.15947112441062927, + "learning_rate": 0.0004909755570095319, + "loss": 0.8979, + "step": 200 + }, + { + "epoch": 0.1, + "eval_loss": 0.9711233973503113, + "eval_runtime": 845.6433, + "eval_samples_per_second": 1.297, + "eval_steps_per_second": 0.021, + "step": 200 + }, + { + "epoch": 0.1125, + "grad_norm": 0.14685837924480438, + "learning_rate": 0.0004881310026940389, + "loss": 0.6376, + "step": 225 + }, + { + "epoch": 0.125, + "grad_norm": 0.15040776133537292, + "learning_rate": 0.0004849080008040734, + "loss": 0.927, + "step": 250 + }, + { + "epoch": 0.1375, + "grad_norm": 0.16087745130062103, + "learning_rate": 0.00048131167346667446, + "loss": 0.8456, + "step": 275 + }, + { + "epoch": 0.15, + "grad_norm": 0.15025638043880463, + "learning_rate": 0.00047734773611302284, + "loss": 0.6029, + "step": 300 + }, + { + "epoch": 0.15, + "eval_loss": 1.0056413412094116, + "eval_runtime": 848.447, + "eval_samples_per_second": 1.293, + "eval_steps_per_second": 0.021, + "step": 300 + }, + { + "epoch": 0.1625, + "grad_norm": 0.15893957018852234, + "learning_rate": 0.0004730224883952422, + "loss": 0.9036, + "step": 325 + }, + { + "epoch": 0.175, + "grad_norm": 0.1535714715719223, + "learning_rate": 0.0004683428041747334, + "loss": 0.828, + "step": 350 + }, + { + "epoch": 0.1875, + "grad_norm": 0.1718970686197281, + "learning_rate": 0.0004633161205979517, + "loss": 0.5944, + "step": 375 + }, + { + "epoch": 0.2, + "grad_norm": 0.17664535343647003, + "learning_rate": 0.0004579504262769877, + "loss": 0.8654, + "step": 400 + }, + { + "epoch": 0.2, + "eval_loss": 1.0151112079620361, + "eval_runtime": 839.7482, + "eval_samples_per_second": 1.306, + "eval_steps_per_second": 0.021, + "step": 400 + }, + { + "epoch": 0.2125, + "grad_norm": 0.17122088372707367, + "learning_rate": 0.0004522542485937369, + "loss": 0.8078, + "step": 425 + }, + { + "epoch": 0.225, + "grad_norm": 0.19420970976352692, + "learning_rate": 0.00044623664014783386, + "loss": 0.5735, + "step": 450 + }, + { + "epoch": 0.2375, + "grad_norm": 0.18166953325271606, + "learning_rate": 0.00043990716436988924, + "loss": 0.8604, + "step": 475 + }, + { + "epoch": 0.25, + "grad_norm": 0.1502382755279541, + "learning_rate": 0.0004332758803228925, + "loss": 0.7673, + "step": 500 + }, + { + "epoch": 0.25, + "eval_loss": 1.0454745292663574, + "eval_runtime": 841.3951, + "eval_samples_per_second": 1.304, + "eval_steps_per_second": 0.021, + "step": 500 + }, + { + "epoch": 0.2625, + "grad_norm": 0.16875839233398438, + "learning_rate": 0.00042635332671593575, + "loss": 0.5882, + "step": 525 + }, + { + "epoch": 0.275, + "grad_norm": 0.16070039570331573, + "learning_rate": 0.00041915050515566445, + "loss": 0.8175, + "step": 550 + }, + { + "epoch": 0.2875, + "grad_norm": 0.16584184765815735, + "learning_rate": 0.00041167886266207167, + "loss": 0.7795, + "step": 575 + }, + { + "epoch": 0.3, + "grad_norm": 0.15345798432826996, + "learning_rate": 0.0004039502734764241, + "loss": 0.7331, + "step": 600 + }, + { + "epoch": 0.3, + "eval_loss": 1.0627943277359009, + "eval_runtime": 849.4944, + "eval_samples_per_second": 1.291, + "eval_steps_per_second": 0.021, + "step": 600 + }, + { + "epoch": 0.3125, + "grad_norm": 0.16241031885147095, + "learning_rate": 0.0003959770201902294, + "loss": 0.7436, + "step": 625 + }, + { + "epoch": 0.325, + "grad_norm": 0.1531112939119339, + "learning_rate": 0.0003877717742252371, + "loss": 0.6345, + "step": 650 + }, + { + "epoch": 0.3375, + "grad_norm": 0.15790140628814697, + "learning_rate": 0.00037934757569549495, + "loss": 0.7351, + "step": 675 + }, + { + "epoch": 0.35, + "grad_norm": 0.1822807490825653, + "learning_rate": 0.00037071781268346345, + "loss": 0.745, + "step": 700 + }, + { + "epoch": 0.35, + "eval_loss": 1.07315993309021, + "eval_runtime": 837.5276, + "eval_samples_per_second": 1.31, + "eval_steps_per_second": 0.021, + "step": 700 + }, + { + "epoch": 0.3625, + "grad_norm": 0.15983282029628754, + "learning_rate": 0.00036189619996312495, + "loss": 0.5972, + "step": 725 + }, + { + "epoch": 0.375, + "grad_norm": 0.18202389776706696, + "learning_rate": 0.00035289675720390174, + "loss": 0.759, + "step": 750 + }, + { + "epoch": 0.3875, + "grad_norm": 0.16057106852531433, + "learning_rate": 0.00034373378669002105, + "loss": 0.7358, + "step": 775 + }, + { + "epoch": 0.4, + "grad_norm": 0.1680625081062317, + "learning_rate": 0.00033442185059073706, + "loss": 0.5636, + "step": 800 + }, + { + "epoch": 0.4, + "eval_loss": 1.0932456254959106, + "eval_runtime": 840.1331, + "eval_samples_per_second": 1.306, + "eval_steps_per_second": 0.021, + "step": 800 + }, + { + "epoch": 0.4125, + "grad_norm": 0.15613198280334473, + "learning_rate": 0.00032497574781753367, + "loss": 0.7596, + "step": 825 + }, + { + "epoch": 0.425, + "grad_norm": 0.1628854125738144, + "learning_rate": 0.000315410490505086, + "loss": 0.7282, + "step": 850 + }, + { + "epoch": 0.4375, + "grad_norm": 0.16740979254245758, + "learning_rate": 0.0003057412801533589, + "loss": 0.5325, + "step": 875 + }, + { + "epoch": 0.45, + "grad_norm": 0.1634828895330429, + "learning_rate": 0.0002959834834687587, + "loss": 0.778, + "step": 900 + }, + { + "epoch": 0.45, + "eval_loss": 1.0952215194702148, + "eval_runtime": 836.473, + "eval_samples_per_second": 1.311, + "eval_steps_per_second": 0.022, + "step": 900 + }, + { + "epoch": 0.4625, + "grad_norm": 0.161700040102005, + "learning_rate": 0.00028615260794273236, + "loss": 0.7255, + "step": 925 + }, + { + "epoch": 0.475, + "grad_norm": 0.16280074417591095, + "learning_rate": 0.00027626427720662416, + "loss": 0.4993, + "step": 950 + }, + { + "epoch": 0.4875, + "grad_norm": 0.16114428639411926, + "learning_rate": 0.00026633420620195917, + "loss": 0.7765, + "step": 975 + }, + { + "epoch": 0.5, + "grad_norm": 0.16209788620471954, + "learning_rate": 0.00025637817620561263, + "loss": 0.7221, + "step": 1000 + }, + { + "epoch": 0.5, + "eval_loss": 1.1190329790115356, + "eval_runtime": 835.4164, + "eval_samples_per_second": 1.313, + "eval_steps_per_second": 0.022, + "step": 1000 + }, + { + "epoch": 0.5125, + "grad_norm": 0.15057620406150818, + "learning_rate": 0.0002464120097495559, + "loss": 0.4929, + "step": 1025 + }, + { + "epoch": 0.525, + "grad_norm": 0.16715486347675323, + "learning_rate": 0.00023645154547503855, + "loss": 0.7896, + "step": 1050 + }, + { + "epoch": 0.5375, + "grad_norm": 0.1591680645942688, + "learning_rate": 0.00022651261296116894, + "loss": 0.6999, + "step": 1075 + }, + { + "epoch": 0.55, + "grad_norm": 0.17062760889530182, + "learning_rate": 0.00021661100756789666, + "loss": 0.4708, + "step": 1100 + }, + { + "epoch": 0.55, + "eval_loss": 1.1381303071975708, + "eval_runtime": 839.2576, + "eval_samples_per_second": 1.307, + "eval_steps_per_second": 0.021, + "step": 1100 + }, + { + "epoch": 0.5625, + "grad_norm": 0.16746024787425995, + "learning_rate": 0.00020676246533337764, + "loss": 0.8073, + "step": 1125 + }, + { + "epoch": 0.575, + "grad_norm": 0.18011848628520966, + "learning_rate": 0.00019698263796561526, + "loss": 0.7153, + "step": 1150 + }, + { + "epoch": 0.5875, + "grad_norm": 0.16664239764213562, + "learning_rate": 0.00018728706796812333, + "loss": 0.6313, + "step": 1175 + }, + { + "epoch": 0.6, + "grad_norm": 0.1630394607782364, + "learning_rate": 0.00017769116393914037, + "loss": 0.6952, + "step": 1200 + }, + { + "epoch": 0.6, + "eval_loss": 1.1242510080337524, + "eval_runtime": 847.0203, + "eval_samples_per_second": 1.295, + "eval_steps_per_second": 0.021, + "step": 1200 + }, + { + "epoch": 0.6125, + "grad_norm": 0.16016128659248352, + "learning_rate": 0.00016821017608365264, + "loss": 0.6161, + "step": 1225 + }, + { + "epoch": 0.625, + "grad_norm": 0.17186138033866882, + "learning_rate": 0.00015885917197714112, + "loss": 0.623, + "step": 1250 + }, + { + "epoch": 0.6375, + "grad_norm": 0.1764240562915802, + "learning_rate": 0.00014965301261957238, + "loss": 0.6988, + "step": 1275 + }, + { + "epoch": 0.65, + "grad_norm": 0.16019247472286224, + "learning_rate": 0.00014060632881768558, + "loss": 0.599, + "step": 1300 + }, + { + "epoch": 0.65, + "eval_loss": 1.1349693536758423, + "eval_runtime": 842.5056, + "eval_samples_per_second": 1.302, + "eval_steps_per_second": 0.021, + "step": 1300 + }, + { + "epoch": 0.6625, + "grad_norm": 0.17425072193145752, + "learning_rate": 0.00013173349793311424, + "loss": 0.6607, + "step": 1325 + }, + { + "epoch": 0.675, + "grad_norm": 0.17765691876411438, + "learning_rate": 0.0001230486210332916, + "loss": 0.6811, + "step": 1350 + }, + { + "epoch": 0.6875, + "grad_norm": 0.17980526387691498, + "learning_rate": 0.00011456550048145536, + "loss": 0.5755, + "step": 1375 + }, + { + "epoch": 0.7, + "grad_norm": 0.1814012974500656, + "learning_rate": 0.00010629761800136473, + "loss": 0.6642, + "step": 1400 + }, + { + "epoch": 0.7, + "eval_loss": 1.1495640277862549, + "eval_runtime": 840.9164, + "eval_samples_per_second": 1.305, + "eval_steps_per_second": 0.021, + "step": 1400 + }, + { + "epoch": 0.7125, + "grad_norm": 0.1855439394712448, + "learning_rate": 9.82581132515907e-05, + "loss": 0.6796, + "step": 1425 + }, + { + "epoch": 0.725, + "grad_norm": 0.15141044557094574, + "learning_rate": 9.045976294343145e-05, + "loss": 0.5593, + "step": 1450 + }, + { + "epoch": 0.7375, + "grad_norm": 0.17237244546413422, + "learning_rate": 8.291496053563699e-05, + "loss": 0.69, + "step": 1475 + }, + { + "epoch": 0.75, + "grad_norm": 0.17433880269527435, + "learning_rate": 7.563569653821565e-05, + "loss": 0.6768, + "step": 1500 + }, + { + "epoch": 0.75, + "eval_loss": 1.1441528797149658, + "eval_runtime": 844.4916, + "eval_samples_per_second": 1.299, + "eval_steps_per_second": 0.021, + "step": 1500 + }, + { + "epoch": 0.7625, + "grad_norm": 0.15868628025054932, + "learning_rate": 6.863353945662288e-05, + "loss": 0.517, + "step": 1525 + }, + { + "epoch": 0.775, + "grad_norm": 0.18802335858345032, + "learning_rate": 6.191961740661687e-05, + "loss": 0.7035, + "step": 1550 + }, + { + "epoch": 0.7875, + "grad_norm": 0.17180030047893524, + "learning_rate": 5.550460042899982e-05, + "loss": 0.6911, + "step": 1575 + }, + { + "epoch": 0.8, + "grad_norm": 0.16377338767051697, + "learning_rate": 4.9398683532350855e-05, + "loss": 0.4876, + "step": 1600 + }, + { + "epoch": 0.8, + "eval_loss": 1.1581627130508423, + "eval_runtime": 844.0055, + "eval_samples_per_second": 1.3, + "eval_steps_per_second": 0.021, + "step": 1600 + }, + { + "epoch": 0.8125, + "grad_norm": 0.16953328251838684, + "learning_rate": 4.3611570490698945e-05, + "loss": 0.745, + "step": 1625 + }, + { + "epoch": 0.825, + "grad_norm": 0.16867636144161224, + "learning_rate": 3.815245842188697e-05, + "loss": 0.6623, + "step": 1650 + }, + { + "epoch": 0.8375, + "grad_norm": 0.16276288032531738, + "learning_rate": 3.30300231711339e-05, + "loss": 0.4716, + "step": 1675 + }, + { + "epoch": 0.85, + "grad_norm": 0.17455314099788666, + "learning_rate": 2.8252405523025106e-05, + "loss": 0.7464, + "step": 1700 + }, + { + "epoch": 0.85, + "eval_loss": 1.1617317199707031, + "eval_runtime": 841.9311, + "eval_samples_per_second": 1.303, + "eval_steps_per_second": 0.021, + "step": 1700 + }, + { + "epoch": 0.8625, + "grad_norm": 0.16539457440376282, + "learning_rate": 2.3827198263843162e-05, + "loss": 0.7088, + "step": 1725 + }, + { + "epoch": 0.875, + "grad_norm": 0.17668606340885162, + "learning_rate": 1.9761434114799497e-05, + "loss": 0.5753, + "step": 1750 + }, + { + "epoch": 0.8875, + "grad_norm": 0.17462626099586487, + "learning_rate": 1.606157455534535e-05, + "loss": 0.6541, + "step": 1775 + }, + { + "epoch": 0.9, + "grad_norm": 0.1645047813653946, + "learning_rate": 1.2733499554322708e-05, + "loss": 0.6349, + "step": 1800 + }, + { + "epoch": 0.9, + "eval_loss": 1.1562622785568237, + "eval_runtime": 838.8692, + "eval_samples_per_second": 1.308, + "eval_steps_per_second": 0.021, + "step": 1800 + }, + { + "epoch": 0.9125, + "grad_norm": 0.17606309056282043, + "learning_rate": 9.782498225276437e-06, + "loss": 0.5512, + "step": 1825 + }, + { + "epoch": 0.925, + "grad_norm": 0.17557215690612793, + "learning_rate": 7.213260420777607e-06, + "loss": 0.6858, + "step": 1850 + }, + { + "epoch": 0.9375, + "grad_norm": 0.15605369210243225, + "learning_rate": 5.029869279117167e-06, + "loss": 0.6293, + "step": 1875 + }, + { + "epoch": 0.95, + "grad_norm": 0.17496590316295624, + "learning_rate": 3.235794735214709e-06, + "loss": 0.5686, + "step": 1900 + }, + { + "epoch": 0.95, + "eval_loss": 1.1630581617355347, + "eval_runtime": 829.7381, + "eval_samples_per_second": 1.322, + "eval_steps_per_second": 0.022, + "step": 1900 + } + ], + "logging_steps": 25, + "max_steps": 2000, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.9150699323876966e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-200/optimizer_0/.metadata b/checkpoint-200/optimizer_0/.metadata new file mode 100644 index 0000000000000000000000000000000000000000..ac00058e3775443bd2b080d5669674bf9e0c483e --- /dev/null +++ b/checkpoint-200/optimizer_0/.metadata @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e4f5c17f1bd064f50a6ec4cad8fb2ceac68c551a224da87ed1e89115eac9a6d +size 869361 diff --git a/checkpoint-200/optimizer_0/__0_0.distcp b/checkpoint-200/optimizer_0/__0_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..12d9907d31e95c42f46dae38538f2f8abab5d43f --- /dev/null +++ b/checkpoint-200/optimizer_0/__0_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:48e01be86864e70d9d1ce4a4382051f6bd1a970f5761416e3e987d80517df0cb +size 6008476 diff --git a/checkpoint-200/optimizer_0/__1_0.distcp b/checkpoint-200/optimizer_0/__1_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..837a01c81c74ac31a5360bcb027f398f4a0fb723 --- /dev/null +++ b/checkpoint-200/optimizer_0/__1_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:338217a24bc27b2148496174dbe15400f5e174667ade221494586187e332b12e +size 6041200 diff --git a/checkpoint-200/optimizer_0/__2_0.distcp b/checkpoint-200/optimizer_0/__2_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..54532b79329b2fd6a9a3065b73e5d37920904b6b --- /dev/null +++ b/checkpoint-200/optimizer_0/__2_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b0d596407b0a732a6f70aca50bc19701fcc97f122020b7b6c79aa1f771238e3 +size 6041200 diff --git a/checkpoint-200/optimizer_0/__3_0.distcp b/checkpoint-200/optimizer_0/__3_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..ffef886c65a5379f23baf58c0cc825a641ec393a --- /dev/null +++ b/checkpoint-200/optimizer_0/__3_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e5c9900dee0f41477a07d14a4a4d8ee335f744533b600a069382072a736928c +size 6043476 diff --git a/checkpoint-200/optimizer_0/__4_0.distcp b/checkpoint-200/optimizer_0/__4_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..20cbb9a346f3da7bc4c79c8deb76b24225e078b7 --- /dev/null +++ b/checkpoint-200/optimizer_0/__4_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:784504097a12c4fc1e77bab4d6cbd6b4984e36c10b0ef65b384476d47b7b44da +size 6057364 diff --git a/checkpoint-200/optimizer_0/__5_0.distcp b/checkpoint-200/optimizer_0/__5_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..92d806fc2152eb8e16b4d538490c537c51cd0e7c --- /dev/null +++ b/checkpoint-200/optimizer_0/__5_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3222a70384bef1c12dfbea859ed4630e8a05ab1c7e90834f9453ddb9919f8b5c +size 6042612 diff --git a/checkpoint-200/optimizer_0/__6_0.distcp b/checkpoint-200/optimizer_0/__6_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..f2f5e7307da58fe7e4ec3221852e6775e78f706e --- /dev/null +++ b/checkpoint-200/optimizer_0/__6_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc94fb729d18e65eeb55684554d2b04f851ea0d81a04b76989a5ff3860889c96 +size 6042612 diff --git a/checkpoint-200/optimizer_0/__7_0.distcp b/checkpoint-200/optimizer_0/__7_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..b06636edce60c27c7c3d8291d0631282fa76aac1 --- /dev/null +++ b/checkpoint-200/optimizer_0/__7_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61ae25cd03a327af39256328f2031c1616d7fadfeb4d3d973792279c6df3fffb +size 6042612 diff --git a/checkpoint-200/pytorch_model_fsdp_0/.metadata b/checkpoint-200/pytorch_model_fsdp_0/.metadata new file mode 100644 index 0000000000000000000000000000000000000000..7d97e724b50e13845f6b3d27a2d7cbe12ea7ee4c --- /dev/null +++ b/checkpoint-200/pytorch_model_fsdp_0/.metadata @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b63faf05d94d50c7c976528213a0c14310fc52ba7b27b5c2a6c4a57307a284af +size 339851 diff --git a/checkpoint-200/pytorch_model_fsdp_0/__0_0.distcp b/checkpoint-200/pytorch_model_fsdp_0/__0_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..0516e9e9d3af17cc14808757c216e32c19122f9e --- /dev/null +++ b/checkpoint-200/pytorch_model_fsdp_0/__0_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d45a3dc1ab565785c531639230a6a293b2220b4f9ca145e524a915a6db895d17 +size 3003648 diff --git a/checkpoint-200/pytorch_model_fsdp_0/__1_0.distcp b/checkpoint-200/pytorch_model_fsdp_0/__1_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..9787b7cba0c9c447e507368193fa3ec02e28e6d2 --- /dev/null +++ b/checkpoint-200/pytorch_model_fsdp_0/__1_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1124ff66893839fcca4a2cd7ba4d8e8c3a8fc36ed19bceaa74cd4a22c6bdd5d4 +size 3003648 diff --git a/checkpoint-200/pytorch_model_fsdp_0/__2_0.distcp b/checkpoint-200/pytorch_model_fsdp_0/__2_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..b1457a24c6adf57b107544171d2ecc5bc0af2290 --- /dev/null +++ b/checkpoint-200/pytorch_model_fsdp_0/__2_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e849e02f678f6cf1e87c324e560641ea43d8d222e703894ff1c414fa754691fe +size 3003648 diff --git a/checkpoint-200/pytorch_model_fsdp_0/__3_0.distcp b/checkpoint-200/pytorch_model_fsdp_0/__3_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..f41518d4d362da93d80d6a364c0f7edb14ccb296 --- /dev/null +++ b/checkpoint-200/pytorch_model_fsdp_0/__3_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:591bc9b44ed4165cdd1c26c8918375dff955beb3890d1f35881d6dd06a5e6863 +size 3003648 diff --git a/checkpoint-200/pytorch_model_fsdp_0/__4_0.distcp b/checkpoint-200/pytorch_model_fsdp_0/__4_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..b6a46161a9e2a660c1f2642ead601e2aadcbce5e --- /dev/null +++ b/checkpoint-200/pytorch_model_fsdp_0/__4_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b0693043df5304c09056641ed42c36c7b365d9a13023751087e991dc911262ea +size 3003648 diff --git a/checkpoint-200/pytorch_model_fsdp_0/__5_0.distcp b/checkpoint-200/pytorch_model_fsdp_0/__5_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..7f81abb90ef44344535bf814592848044217d2db --- /dev/null +++ b/checkpoint-200/pytorch_model_fsdp_0/__5_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:236d3100bb5002265f850d5066591c357ac203a99794d452a6db4ac6e21945ae +size 3003648 diff --git a/checkpoint-200/pytorch_model_fsdp_0/__6_0.distcp b/checkpoint-200/pytorch_model_fsdp_0/__6_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..5e2b6a85c51b4c3ea6922382311dd6b1cc83c510 --- /dev/null +++ b/checkpoint-200/pytorch_model_fsdp_0/__6_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:239e64a49d9401bfdd63c4a51b4c7e1508753866528a81e0188c017ed883f0f2 +size 3003648 diff --git a/checkpoint-200/pytorch_model_fsdp_0/__7_0.distcp b/checkpoint-200/pytorch_model_fsdp_0/__7_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..378d62ad5791254522b977f75e47465c8718b144 --- /dev/null +++ b/checkpoint-200/pytorch_model_fsdp_0/__7_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5922013ac89f73f26afd9ce433355ed8f0d65c899fa374fcb92bb76a1a7b5f35 +size 3003648 diff --git a/checkpoint-200/rng_state_0.pth b/checkpoint-200/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..16b69078618b6c5c44fe97eb4a33f5d7f2c2b6d1 --- /dev/null +++ b/checkpoint-200/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0639e2524b3606de92cf704efe87f4f42e6b531536716338096cdcb997c8f523 +size 15984 diff --git a/checkpoint-200/rng_state_1.pth b/checkpoint-200/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..d9c62011595a852aae341a8454b41adf3693e94d --- /dev/null +++ b/checkpoint-200/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf7f49d9db6183dc24e6704956551cd47c1f5a209075611fe04ca451437a895e +size 15984 diff --git a/checkpoint-200/rng_state_2.pth b/checkpoint-200/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..46f7047665cf1a3970a3b63ed2c2a0c96af8ab3a --- /dev/null +++ b/checkpoint-200/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e46e59be97e565494bebb1430b0c9995dec568fb4b79287f2dff3dbf2730430a +size 15984 diff --git a/checkpoint-200/rng_state_3.pth b/checkpoint-200/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..d987862d727896ff71382bdfdae9b0bcdd01daf2 --- /dev/null +++ b/checkpoint-200/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc0bb1325f8952acda6b977e4e52f785ac1892d58ddb3f31f0d60ae566525666 +size 15984 diff --git a/checkpoint-200/rng_state_4.pth b/checkpoint-200/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..8b36f99e0e9ebc014c97a490a15ccedf89960d6a --- /dev/null +++ b/checkpoint-200/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:338387d0d962d6ea549d773166e09059d382f352f9b68a7f8f49f176fdb24478 +size 15984 diff --git a/checkpoint-200/rng_state_5.pth b/checkpoint-200/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..2ce5464236860b95d6c73e13afc55f7c87f56249 --- /dev/null +++ b/checkpoint-200/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:57875a0f71eaf92d41a7e95ed7a6061e2351c52735063fb8199f0a2528b42b27 +size 15984 diff --git a/checkpoint-200/rng_state_6.pth b/checkpoint-200/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..9a5976d4ad4a4ee1c3a7350e5dc3e92d9d9b63a3 --- /dev/null +++ b/checkpoint-200/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8634b4b2a740673155b568734eb1f609f0037798280a25133c4a979cfbf6c1c2 +size 15984 diff --git a/checkpoint-200/rng_state_7.pth b/checkpoint-200/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..db7f177e1377fe2219ab10be57b5cdcd399980c8 --- /dev/null +++ b/checkpoint-200/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab80e51cd15becf6304340fc463ec3aa562bffed9ca0ae82e20eacbd1641316e +size 15984 diff --git a/checkpoint-200/scheduler.pt b/checkpoint-200/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..85540d266d7ef6a9fd6cc6c6a50cee279aee5c43 --- /dev/null +++ b/checkpoint-200/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1407dea8b779520dd0c3e208f8c82d3dffd12c0548e2e910a4f9aca30c2908c +size 1064 diff --git a/checkpoint-200/trainer_state.json b/checkpoint-200/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..6dae07fc747ea9513442fdf4f6c0ebc47258461a --- /dev/null +++ b/checkpoint-200/trainer_state.json @@ -0,0 +1,106 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.1, + "eval_steps": 100, + "global_step": 200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0125, + "grad_norm": 0.14176353812217712, + "learning_rate": 0.0004, + "loss": 1.1525, + "step": 25 + }, + { + "epoch": 0.025, + "grad_norm": 0.1460508555173874, + "learning_rate": 0.0004998852503731983, + "loss": 1.047, + "step": 50 + }, + { + "epoch": 0.0375, + "grad_norm": 0.2368021011352539, + "learning_rate": 0.0004993848168027977, + "loss": 0.8529, + "step": 75 + }, + { + "epoch": 0.05, + "grad_norm": 0.14488168060779572, + "learning_rate": 0.0004984880506341147, + "loss": 0.9756, + "step": 100 + }, + { + "epoch": 0.05, + "eval_loss": 0.9470569491386414, + "eval_runtime": 845.9115, + "eval_samples_per_second": 1.297, + "eval_steps_per_second": 0.021, + "step": 100 + }, + { + "epoch": 0.0625, + "grad_norm": 0.1415175348520279, + "learning_rate": 0.0004971963770447935, + "loss": 0.9564, + "step": 125 + }, + { + "epoch": 0.075, + "grad_norm": 0.1751064509153366, + "learning_rate": 0.0004955118488155782, + "loss": 0.711, + "step": 150 + }, + { + "epoch": 0.0875, + "grad_norm": 0.1439943015575409, + "learning_rate": 0.0004934371430679492, + "loss": 0.9409, + "step": 175 + }, + { + "epoch": 0.1, + "grad_norm": 0.15947112441062927, + "learning_rate": 0.0004909755570095319, + "loss": 0.8979, + "step": 200 + }, + { + "epoch": 0.1, + "eval_loss": 0.9711233973503113, + "eval_runtime": 845.6433, + "eval_samples_per_second": 1.297, + "eval_steps_per_second": 0.021, + "step": 200 + } + ], + "logging_steps": 25, + "max_steps": 2000, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.015863086723891e+16, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-2000/optimizer_0/.metadata b/checkpoint-2000/optimizer_0/.metadata new file mode 100644 index 0000000000000000000000000000000000000000..6f8123eb59aaa31587481fb3c77ae86f4940685e --- /dev/null +++ b/checkpoint-2000/optimizer_0/.metadata @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:74cd80d7bbfef849d371bd655c841ca2823f83f114e404c2e1a374be0c44a295 +size 869362 diff --git a/checkpoint-2000/optimizer_0/__0_0.distcp b/checkpoint-2000/optimizer_0/__0_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..32ed14944d5482ddf7494ff5358f0c5390679798 --- /dev/null +++ b/checkpoint-2000/optimizer_0/__0_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab291a53fa1eeb91f959cd131da42302da86cec0e99c080f756686e181dccde1 +size 6008476 diff --git a/checkpoint-2000/optimizer_0/__1_0.distcp b/checkpoint-2000/optimizer_0/__1_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..81b3dd45e072677b3884258428fc70373028242d --- /dev/null +++ b/checkpoint-2000/optimizer_0/__1_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2be06e70fe01d83f4d1bd737f94fd8f9d5d6bd2bc9414926c8566fd49a02e7c2 +size 6041200 diff --git a/checkpoint-2000/optimizer_0/__2_0.distcp b/checkpoint-2000/optimizer_0/__2_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..740bf69306e3e38b5b6164586b8cf580cdee1afd --- /dev/null +++ b/checkpoint-2000/optimizer_0/__2_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:008c4c1758871793e3757d7e62a4ea4fb0eaaf480757586e8a00024e17da5763 +size 6041200 diff --git a/checkpoint-2000/optimizer_0/__3_0.distcp b/checkpoint-2000/optimizer_0/__3_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..f54c794849d00b60f1ac513c5804fb1ff86af1e9 --- /dev/null +++ b/checkpoint-2000/optimizer_0/__3_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7fa6e6c85c43d0b999683a8bffc8803b1badfa7f6969cb906ffdfc63d9a53f91 +size 6043476 diff --git a/checkpoint-2000/optimizer_0/__4_0.distcp b/checkpoint-2000/optimizer_0/__4_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..544c1e827db0893c1c1123f7b74f63457c9c2e71 --- /dev/null +++ b/checkpoint-2000/optimizer_0/__4_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4fcc0196ab16feea642a6d68449eca960bef7442e25fac8e1b3158d5db3f1c81 +size 6057364 diff --git a/checkpoint-2000/optimizer_0/__5_0.distcp b/checkpoint-2000/optimizer_0/__5_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..8ec3e1285dc0350d854ac166ae775a2fde8f018d --- /dev/null +++ b/checkpoint-2000/optimizer_0/__5_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec1e80a4d326b3efc6b6190539f7e7c835ac0cd50221be0ab755f5201b12f0fd +size 6042612 diff --git a/checkpoint-2000/optimizer_0/__6_0.distcp b/checkpoint-2000/optimizer_0/__6_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..a17e02095bb7f8e5a02bcc3293691b6ef274b26c --- /dev/null +++ b/checkpoint-2000/optimizer_0/__6_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be496adc8b729397ba103991fdb6603084f8207d42ca21632bb7c7d027623ae7 +size 6042612 diff --git a/checkpoint-2000/optimizer_0/__7_0.distcp b/checkpoint-2000/optimizer_0/__7_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..78f8088e1be850396899dd05d4e545e3848f8a97 --- /dev/null +++ b/checkpoint-2000/optimizer_0/__7_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:155da9c9251af0ba8dc618873877ad002ad53ef49b490e1967cf3c2a683dd63b +size 6042612 diff --git a/checkpoint-2000/pytorch_model_fsdp_0/.metadata b/checkpoint-2000/pytorch_model_fsdp_0/.metadata new file mode 100644 index 0000000000000000000000000000000000000000..5551630f79ea8fd27aecfdb657ffa0ff92a6db6e --- /dev/null +++ b/checkpoint-2000/pytorch_model_fsdp_0/.metadata @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:17906ad725717e50eb95464ef09677598edf06fbe299d5b08979edf8a62cfb91 +size 339852 diff --git a/checkpoint-2000/pytorch_model_fsdp_0/__0_0.distcp b/checkpoint-2000/pytorch_model_fsdp_0/__0_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..bdc87131b2b4c5ec552433f45731938371edaca5 --- /dev/null +++ b/checkpoint-2000/pytorch_model_fsdp_0/__0_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0235286666c0188e8224662cfa02e213e880d14d6bf6698fe1272c3cc29e5dba +size 3003648 diff --git a/checkpoint-2000/pytorch_model_fsdp_0/__1_0.distcp b/checkpoint-2000/pytorch_model_fsdp_0/__1_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..3cb284ea927739414e7cb6e9e0851696cdb22e26 --- /dev/null +++ b/checkpoint-2000/pytorch_model_fsdp_0/__1_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51d1dadde6b52281c19fe71b2d4970f4ff4606a3bc82fb1f52fc7a057160e0d8 +size 3003648 diff --git a/checkpoint-2000/pytorch_model_fsdp_0/__2_0.distcp b/checkpoint-2000/pytorch_model_fsdp_0/__2_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..6e716be13a18ce7ecba80dff62a7b67454cc761f --- /dev/null +++ b/checkpoint-2000/pytorch_model_fsdp_0/__2_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4508f31e675835587773a80878afd7e1be119d5ea74d33fa04c04b6d73070677 +size 3003648 diff --git a/checkpoint-2000/pytorch_model_fsdp_0/__3_0.distcp b/checkpoint-2000/pytorch_model_fsdp_0/__3_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..8b8ea3e1fc4ecd53f8a6f794475d54b82990aa49 --- /dev/null +++ b/checkpoint-2000/pytorch_model_fsdp_0/__3_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76a5b3c766cf9df1f6e8429cebf53b09af6327244bcfa42d3443d6f10ad7ea2d +size 3003648 diff --git a/checkpoint-2000/pytorch_model_fsdp_0/__4_0.distcp b/checkpoint-2000/pytorch_model_fsdp_0/__4_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..00ba0a01477d7d2baa0c0972a1c8ee8a9deb494d --- /dev/null +++ b/checkpoint-2000/pytorch_model_fsdp_0/__4_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69181921ef9924a11e809373dd27adf926a03dad17a7ababa735382f7e21f8b5 +size 3003648 diff --git a/checkpoint-2000/pytorch_model_fsdp_0/__5_0.distcp b/checkpoint-2000/pytorch_model_fsdp_0/__5_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..3e321b92ab33a9c2de3d7dde30aed08dc4449499 --- /dev/null +++ b/checkpoint-2000/pytorch_model_fsdp_0/__5_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54d1d53816123d25e8b969f74b6d3b7833bd00d56de096cc5da161d3d095fb97 +size 3003648 diff --git a/checkpoint-2000/pytorch_model_fsdp_0/__6_0.distcp b/checkpoint-2000/pytorch_model_fsdp_0/__6_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..25673bddbdaff1d3de3fd053e01692eff6f1b06f --- /dev/null +++ b/checkpoint-2000/pytorch_model_fsdp_0/__6_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9de9113a3b074b7ce76ded15ab0f39ee851f96f5f0abcd4a42ec613a41806c4a +size 3003648 diff --git a/checkpoint-2000/pytorch_model_fsdp_0/__7_0.distcp b/checkpoint-2000/pytorch_model_fsdp_0/__7_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..ded89257e46bf9b5594493d5fb34e0f282f8cceb --- /dev/null +++ b/checkpoint-2000/pytorch_model_fsdp_0/__7_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:348f5fd82b923cfe89fa788fde7a225accf82c9e278306f343daae9b6457271a +size 3003648 diff --git a/checkpoint-2000/rng_state_0.pth b/checkpoint-2000/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..5081f4643aed8e540561d0a4379dfa4490bb1ccd --- /dev/null +++ b/checkpoint-2000/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:84e13e042fb10504d07fab2f1ec3eb2fa0a34c92dc5fe441ac5ab9d5b42330a4 +size 15920 diff --git a/checkpoint-2000/rng_state_1.pth b/checkpoint-2000/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..d88142f8c20d884c5128f5d1117de5044e0ffb0a --- /dev/null +++ b/checkpoint-2000/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d0b470305d099d6097f58f0dcefdc4f8a97dd53e726fb377a8b71b69d667b20 +size 15984 diff --git a/checkpoint-2000/rng_state_2.pth b/checkpoint-2000/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..15b4b1ce3e2d63e38c45fd9e3b06ddb2c03994cb --- /dev/null +++ b/checkpoint-2000/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d5c6e421cff6a3c78e8e1f97418a767aeb2642aa9dee41ebdfdc68feea96143 +size 15984 diff --git a/checkpoint-2000/rng_state_3.pth b/checkpoint-2000/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..def6ee1890ef6f56f2e88d33438e382e3d66f4cc --- /dev/null +++ b/checkpoint-2000/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d985edaa2c6b455c79c0e8fb434449314207f69f7896473ec6dcfc0512e694f +size 15984 diff --git a/checkpoint-2000/rng_state_4.pth b/checkpoint-2000/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..6e6466f40ab982262a66ddc2945524626eb48360 --- /dev/null +++ b/checkpoint-2000/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51f6638a61cecbe433054d715c778adb0e98d1216ee478eb335aa90e4b11324e +size 15984 diff --git a/checkpoint-2000/rng_state_5.pth b/checkpoint-2000/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..a0f9a126eb2201b88083979892d33e4b39c747eb --- /dev/null +++ b/checkpoint-2000/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:10695be217f2cb83adfccd3704472700ad7d3848e568a35a138e892e94a3698e +size 15984 diff --git a/checkpoint-2000/rng_state_6.pth b/checkpoint-2000/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..bf78c9946c999d5414f01fab17621ad41da200ca --- /dev/null +++ b/checkpoint-2000/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a0e7dd56dc37eabbb37cfb18dc0a8b0c66fff54746ac0f68012da8409cf9f3d +size 15984 diff --git a/checkpoint-2000/rng_state_7.pth b/checkpoint-2000/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..12487fd469c9018e6ca5b05dd4ccf44219da6912 --- /dev/null +++ b/checkpoint-2000/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4a358eee0f2b44755165b449b7b02df94e167ea66869d2475f5cd8df3f5cd6c +size 15984 diff --git a/checkpoint-2000/scheduler.pt b/checkpoint-2000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..c4daf7f0b80cf589d696b1c0cece5028868dfa3d --- /dev/null +++ b/checkpoint-2000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49dc8c5c688c0f02c11f16f1d370c55a4c1e62ffdc212b9c6748e19f0536a865 +size 1064 diff --git a/checkpoint-2000/trainer_state.json b/checkpoint-2000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..4cf2db083a932cc0fd9b183241d00d9b04917446 --- /dev/null +++ b/checkpoint-2000/trainer_state.json @@ -0,0 +1,754 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 100, + "global_step": 2000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0125, + "grad_norm": 0.14176353812217712, + "learning_rate": 0.0004, + "loss": 1.1525, + "step": 25 + }, + { + "epoch": 0.025, + "grad_norm": 0.1460508555173874, + "learning_rate": 0.0004998852503731983, + "loss": 1.047, + "step": 50 + }, + { + "epoch": 0.0375, + "grad_norm": 0.2368021011352539, + "learning_rate": 0.0004993848168027977, + "loss": 0.8529, + "step": 75 + }, + { + "epoch": 0.05, + "grad_norm": 0.14488168060779572, + "learning_rate": 0.0004984880506341147, + "loss": 0.9756, + "step": 100 + }, + { + "epoch": 0.05, + "eval_loss": 0.9470569491386414, + "eval_runtime": 845.9115, + "eval_samples_per_second": 1.297, + "eval_steps_per_second": 0.021, + "step": 100 + }, + { + "epoch": 0.0625, + "grad_norm": 0.1415175348520279, + "learning_rate": 0.0004971963770447935, + "loss": 0.9564, + "step": 125 + }, + { + "epoch": 0.075, + "grad_norm": 0.1751064509153366, + "learning_rate": 0.0004955118488155782, + "loss": 0.711, + "step": 150 + }, + { + "epoch": 0.0875, + "grad_norm": 0.1439943015575409, + "learning_rate": 0.0004934371430679492, + "loss": 0.9409, + "step": 175 + }, + { + "epoch": 0.1, + "grad_norm": 0.15947112441062927, + "learning_rate": 0.0004909755570095319, + "loss": 0.8979, + "step": 200 + }, + { + "epoch": 0.1, + "eval_loss": 0.9711233973503113, + "eval_runtime": 845.6433, + "eval_samples_per_second": 1.297, + "eval_steps_per_second": 0.021, + "step": 200 + }, + { + "epoch": 0.1125, + "grad_norm": 0.14685837924480438, + "learning_rate": 0.0004881310026940389, + "loss": 0.6376, + "step": 225 + }, + { + "epoch": 0.125, + "grad_norm": 0.15040776133537292, + "learning_rate": 0.0004849080008040734, + "loss": 0.927, + "step": 250 + }, + { + "epoch": 0.1375, + "grad_norm": 0.16087745130062103, + "learning_rate": 0.00048131167346667446, + "loss": 0.8456, + "step": 275 + }, + { + "epoch": 0.15, + "grad_norm": 0.15025638043880463, + "learning_rate": 0.00047734773611302284, + "loss": 0.6029, + "step": 300 + }, + { + "epoch": 0.15, + "eval_loss": 1.0056413412094116, + "eval_runtime": 848.447, + "eval_samples_per_second": 1.293, + "eval_steps_per_second": 0.021, + "step": 300 + }, + { + "epoch": 0.1625, + "grad_norm": 0.15893957018852234, + "learning_rate": 0.0004730224883952422, + "loss": 0.9036, + "step": 325 + }, + { + "epoch": 0.175, + "grad_norm": 0.1535714715719223, + "learning_rate": 0.0004683428041747334, + "loss": 0.828, + "step": 350 + }, + { + "epoch": 0.1875, + "grad_norm": 0.1718970686197281, + "learning_rate": 0.0004633161205979517, + "loss": 0.5944, + "step": 375 + }, + { + "epoch": 0.2, + "grad_norm": 0.17664535343647003, + "learning_rate": 0.0004579504262769877, + "loss": 0.8654, + "step": 400 + }, + { + "epoch": 0.2, + "eval_loss": 1.0151112079620361, + "eval_runtime": 839.7482, + "eval_samples_per_second": 1.306, + "eval_steps_per_second": 0.021, + "step": 400 + }, + { + "epoch": 0.2125, + "grad_norm": 0.17122088372707367, + "learning_rate": 0.0004522542485937369, + "loss": 0.8078, + "step": 425 + }, + { + "epoch": 0.225, + "grad_norm": 0.19420970976352692, + "learning_rate": 0.00044623664014783386, + "loss": 0.5735, + "step": 450 + }, + { + "epoch": 0.2375, + "grad_norm": 0.18166953325271606, + "learning_rate": 0.00043990716436988924, + "loss": 0.8604, + "step": 475 + }, + { + "epoch": 0.25, + "grad_norm": 0.1502382755279541, + "learning_rate": 0.0004332758803228925, + "loss": 0.7673, + "step": 500 + }, + { + "epoch": 0.25, + "eval_loss": 1.0454745292663574, + "eval_runtime": 841.3951, + "eval_samples_per_second": 1.304, + "eval_steps_per_second": 0.021, + "step": 500 + }, + { + "epoch": 0.2625, + "grad_norm": 0.16875839233398438, + "learning_rate": 0.00042635332671593575, + "loss": 0.5882, + "step": 525 + }, + { + "epoch": 0.275, + "grad_norm": 0.16070039570331573, + "learning_rate": 0.00041915050515566445, + "loss": 0.8175, + "step": 550 + }, + { + "epoch": 0.2875, + "grad_norm": 0.16584184765815735, + "learning_rate": 0.00041167886266207167, + "loss": 0.7795, + "step": 575 + }, + { + "epoch": 0.3, + "grad_norm": 0.15345798432826996, + "learning_rate": 0.0004039502734764241, + "loss": 0.7331, + "step": 600 + }, + { + "epoch": 0.3, + "eval_loss": 1.0627943277359009, + "eval_runtime": 849.4944, + "eval_samples_per_second": 1.291, + "eval_steps_per_second": 0.021, + "step": 600 + }, + { + "epoch": 0.3125, + "grad_norm": 0.16241031885147095, + "learning_rate": 0.0003959770201902294, + "loss": 0.7436, + "step": 625 + }, + { + "epoch": 0.325, + "grad_norm": 0.1531112939119339, + "learning_rate": 0.0003877717742252371, + "loss": 0.6345, + "step": 650 + }, + { + "epoch": 0.3375, + "grad_norm": 0.15790140628814697, + "learning_rate": 0.00037934757569549495, + "loss": 0.7351, + "step": 675 + }, + { + "epoch": 0.35, + "grad_norm": 0.1822807490825653, + "learning_rate": 0.00037071781268346345, + "loss": 0.745, + "step": 700 + }, + { + "epoch": 0.35, + "eval_loss": 1.07315993309021, + "eval_runtime": 837.5276, + "eval_samples_per_second": 1.31, + "eval_steps_per_second": 0.021, + "step": 700 + }, + { + "epoch": 0.3625, + "grad_norm": 0.15983282029628754, + "learning_rate": 0.00036189619996312495, + "loss": 0.5972, + "step": 725 + }, + { + "epoch": 0.375, + "grad_norm": 0.18202389776706696, + "learning_rate": 0.00035289675720390174, + "loss": 0.759, + "step": 750 + }, + { + "epoch": 0.3875, + "grad_norm": 0.16057106852531433, + "learning_rate": 0.00034373378669002105, + "loss": 0.7358, + "step": 775 + }, + { + "epoch": 0.4, + "grad_norm": 0.1680625081062317, + "learning_rate": 0.00033442185059073706, + "loss": 0.5636, + "step": 800 + }, + { + "epoch": 0.4, + "eval_loss": 1.0932456254959106, + "eval_runtime": 840.1331, + "eval_samples_per_second": 1.306, + "eval_steps_per_second": 0.021, + "step": 800 + }, + { + "epoch": 0.4125, + "grad_norm": 0.15613198280334473, + "learning_rate": 0.00032497574781753367, + "loss": 0.7596, + "step": 825 + }, + { + "epoch": 0.425, + "grad_norm": 0.1628854125738144, + "learning_rate": 0.000315410490505086, + "loss": 0.7282, + "step": 850 + }, + { + "epoch": 0.4375, + "grad_norm": 0.16740979254245758, + "learning_rate": 0.0003057412801533589, + "loss": 0.5325, + "step": 875 + }, + { + "epoch": 0.45, + "grad_norm": 0.1634828895330429, + "learning_rate": 0.0002959834834687587, + "loss": 0.778, + "step": 900 + }, + { + "epoch": 0.45, + "eval_loss": 1.0952215194702148, + "eval_runtime": 836.473, + "eval_samples_per_second": 1.311, + "eval_steps_per_second": 0.022, + "step": 900 + }, + { + "epoch": 0.4625, + "grad_norm": 0.161700040102005, + "learning_rate": 0.00028615260794273236, + "loss": 0.7255, + "step": 925 + }, + { + "epoch": 0.475, + "grad_norm": 0.16280074417591095, + "learning_rate": 0.00027626427720662416, + "loss": 0.4993, + "step": 950 + }, + { + "epoch": 0.4875, + "grad_norm": 0.16114428639411926, + "learning_rate": 0.00026633420620195917, + "loss": 0.7765, + "step": 975 + }, + { + "epoch": 0.5, + "grad_norm": 0.16209788620471954, + "learning_rate": 0.00025637817620561263, + "loss": 0.7221, + "step": 1000 + }, + { + "epoch": 0.5, + "eval_loss": 1.1190329790115356, + "eval_runtime": 835.4164, + "eval_samples_per_second": 1.313, + "eval_steps_per_second": 0.022, + "step": 1000 + }, + { + "epoch": 0.5125, + "grad_norm": 0.15057620406150818, + "learning_rate": 0.0002464120097495559, + "loss": 0.4929, + "step": 1025 + }, + { + "epoch": 0.525, + "grad_norm": 0.16715486347675323, + "learning_rate": 0.00023645154547503855, + "loss": 0.7896, + "step": 1050 + }, + { + "epoch": 0.5375, + "grad_norm": 0.1591680645942688, + "learning_rate": 0.00022651261296116894, + "loss": 0.6999, + "step": 1075 + }, + { + "epoch": 0.55, + "grad_norm": 0.17062760889530182, + "learning_rate": 0.00021661100756789666, + "loss": 0.4708, + "step": 1100 + }, + { + "epoch": 0.55, + "eval_loss": 1.1381303071975708, + "eval_runtime": 839.2576, + "eval_samples_per_second": 1.307, + "eval_steps_per_second": 0.021, + "step": 1100 + }, + { + "epoch": 0.5625, + "grad_norm": 0.16746024787425995, + "learning_rate": 0.00020676246533337764, + "loss": 0.8073, + "step": 1125 + }, + { + "epoch": 0.575, + "grad_norm": 0.18011848628520966, + "learning_rate": 0.00019698263796561526, + "loss": 0.7153, + "step": 1150 + }, + { + "epoch": 0.5875, + "grad_norm": 0.16664239764213562, + "learning_rate": 0.00018728706796812333, + "loss": 0.6313, + "step": 1175 + }, + { + "epoch": 0.6, + "grad_norm": 0.1630394607782364, + "learning_rate": 0.00017769116393914037, + "loss": 0.6952, + "step": 1200 + }, + { + "epoch": 0.6, + "eval_loss": 1.1242510080337524, + "eval_runtime": 847.0203, + "eval_samples_per_second": 1.295, + "eval_steps_per_second": 0.021, + "step": 1200 + }, + { + "epoch": 0.6125, + "grad_norm": 0.16016128659248352, + "learning_rate": 0.00016821017608365264, + "loss": 0.6161, + "step": 1225 + }, + { + "epoch": 0.625, + "grad_norm": 0.17186138033866882, + "learning_rate": 0.00015885917197714112, + "loss": 0.623, + "step": 1250 + }, + { + "epoch": 0.6375, + "grad_norm": 0.1764240562915802, + "learning_rate": 0.00014965301261957238, + "loss": 0.6988, + "step": 1275 + }, + { + "epoch": 0.65, + "grad_norm": 0.16019247472286224, + "learning_rate": 0.00014060632881768558, + "loss": 0.599, + "step": 1300 + }, + { + "epoch": 0.65, + "eval_loss": 1.1349693536758423, + "eval_runtime": 842.5056, + "eval_samples_per_second": 1.302, + "eval_steps_per_second": 0.021, + "step": 1300 + }, + { + "epoch": 0.6625, + "grad_norm": 0.17425072193145752, + "learning_rate": 0.00013173349793311424, + "loss": 0.6607, + "step": 1325 + }, + { + "epoch": 0.675, + "grad_norm": 0.17765691876411438, + "learning_rate": 0.0001230486210332916, + "loss": 0.6811, + "step": 1350 + }, + { + "epoch": 0.6875, + "grad_norm": 0.17980526387691498, + "learning_rate": 0.00011456550048145536, + "loss": 0.5755, + "step": 1375 + }, + { + "epoch": 0.7, + "grad_norm": 0.1814012974500656, + "learning_rate": 0.00010629761800136473, + "loss": 0.6642, + "step": 1400 + }, + { + "epoch": 0.7, + "eval_loss": 1.1495640277862549, + "eval_runtime": 840.9164, + "eval_samples_per_second": 1.305, + "eval_steps_per_second": 0.021, + "step": 1400 + }, + { + "epoch": 0.7125, + "grad_norm": 0.1855439394712448, + "learning_rate": 9.82581132515907e-05, + "loss": 0.6796, + "step": 1425 + }, + { + "epoch": 0.725, + "grad_norm": 0.15141044557094574, + "learning_rate": 9.045976294343145e-05, + "loss": 0.5593, + "step": 1450 + }, + { + "epoch": 0.7375, + "grad_norm": 0.17237244546413422, + "learning_rate": 8.291496053563699e-05, + "loss": 0.69, + "step": 1475 + }, + { + "epoch": 0.75, + "grad_norm": 0.17433880269527435, + "learning_rate": 7.563569653821565e-05, + "loss": 0.6768, + "step": 1500 + }, + { + "epoch": 0.75, + "eval_loss": 1.1441528797149658, + "eval_runtime": 844.4916, + "eval_samples_per_second": 1.299, + "eval_steps_per_second": 0.021, + "step": 1500 + }, + { + "epoch": 0.7625, + "grad_norm": 0.15868628025054932, + "learning_rate": 6.863353945662288e-05, + "loss": 0.517, + "step": 1525 + }, + { + "epoch": 0.775, + "grad_norm": 0.18802335858345032, + "learning_rate": 6.191961740661687e-05, + "loss": 0.7035, + "step": 1550 + }, + { + "epoch": 0.7875, + "grad_norm": 0.17180030047893524, + "learning_rate": 5.550460042899982e-05, + "loss": 0.6911, + "step": 1575 + }, + { + "epoch": 0.8, + "grad_norm": 0.16377338767051697, + "learning_rate": 4.9398683532350855e-05, + "loss": 0.4876, + "step": 1600 + }, + { + "epoch": 0.8, + "eval_loss": 1.1581627130508423, + "eval_runtime": 844.0055, + "eval_samples_per_second": 1.3, + "eval_steps_per_second": 0.021, + "step": 1600 + }, + { + "epoch": 0.8125, + "grad_norm": 0.16953328251838684, + "learning_rate": 4.3611570490698945e-05, + "loss": 0.745, + "step": 1625 + }, + { + "epoch": 0.825, + "grad_norm": 0.16867636144161224, + "learning_rate": 3.815245842188697e-05, + "loss": 0.6623, + "step": 1650 + }, + { + "epoch": 0.8375, + "grad_norm": 0.16276288032531738, + "learning_rate": 3.30300231711339e-05, + "loss": 0.4716, + "step": 1675 + }, + { + "epoch": 0.85, + "grad_norm": 0.17455314099788666, + "learning_rate": 2.8252405523025106e-05, + "loss": 0.7464, + "step": 1700 + }, + { + "epoch": 0.85, + "eval_loss": 1.1617317199707031, + "eval_runtime": 841.9311, + "eval_samples_per_second": 1.303, + "eval_steps_per_second": 0.021, + "step": 1700 + }, + { + "epoch": 0.8625, + "grad_norm": 0.16539457440376282, + "learning_rate": 2.3827198263843162e-05, + "loss": 0.7088, + "step": 1725 + }, + { + "epoch": 0.875, + "grad_norm": 0.17668606340885162, + "learning_rate": 1.9761434114799497e-05, + "loss": 0.5753, + "step": 1750 + }, + { + "epoch": 0.8875, + "grad_norm": 0.17462626099586487, + "learning_rate": 1.606157455534535e-05, + "loss": 0.6541, + "step": 1775 + }, + { + "epoch": 0.9, + "grad_norm": 0.1645047813653946, + "learning_rate": 1.2733499554322708e-05, + "loss": 0.6349, + "step": 1800 + }, + { + "epoch": 0.9, + "eval_loss": 1.1562622785568237, + "eval_runtime": 838.8692, + "eval_samples_per_second": 1.308, + "eval_steps_per_second": 0.021, + "step": 1800 + }, + { + "epoch": 0.9125, + "grad_norm": 0.17606309056282043, + "learning_rate": 9.782498225276437e-06, + "loss": 0.5512, + "step": 1825 + }, + { + "epoch": 0.925, + "grad_norm": 0.17557215690612793, + "learning_rate": 7.213260420777607e-06, + "loss": 0.6858, + "step": 1850 + }, + { + "epoch": 0.9375, + "grad_norm": 0.15605369210243225, + "learning_rate": 5.029869279117167e-06, + "loss": 0.6293, + "step": 1875 + }, + { + "epoch": 0.95, + "grad_norm": 0.17496590316295624, + "learning_rate": 3.235794735214709e-06, + "loss": 0.5686, + "step": 1900 + }, + { + "epoch": 0.95, + "eval_loss": 1.1630581617355347, + "eval_runtime": 829.7381, + "eval_samples_per_second": 1.322, + "eval_steps_per_second": 0.022, + "step": 1900 + }, + { + "epoch": 0.9625, + "grad_norm": 0.17726068198680878, + "learning_rate": 1.8338880060553287e-06, + "loss": 0.6856, + "step": 1925 + }, + { + "epoch": 0.975, + "grad_norm": 0.165832057595253, + "learning_rate": 8.263770594185149e-07, + "loss": 0.6185, + "step": 1950 + }, + { + "epoch": 0.9875, + "grad_norm": 0.16085895895957947, + "learning_rate": 2.1486307310000787e-07, + "loss": 0.5909, + "step": 1975 + }, + { + "epoch": 1.0, + "grad_norm": 0.166295126080513, + "learning_rate": 3.1789025450867925e-10, + "loss": 0.6814, + "step": 2000 + }, + { + "epoch": 1.0, + "eval_loss": 1.1603492498397827, + "eval_runtime": 844.3926, + "eval_samples_per_second": 1.299, + "eval_steps_per_second": 0.021, + "step": 2000 + } + ], + "logging_steps": 25, + "max_steps": 2000, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.0158630867238912e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-300/optimizer_0/.metadata b/checkpoint-300/optimizer_0/.metadata new file mode 100644 index 0000000000000000000000000000000000000000..c5d72a941216df5272f54b4f3461be500bc4b350 --- /dev/null +++ b/checkpoint-300/optimizer_0/.metadata @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34f89f460ed643c282b02e9725c7ab4238a652b99aa11c46b42aadb034a3a0ca +size 869361 diff --git a/checkpoint-300/optimizer_0/__0_0.distcp b/checkpoint-300/optimizer_0/__0_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..6b4f770667e228cbd4eec49bce008379b528e0ee --- /dev/null +++ b/checkpoint-300/optimizer_0/__0_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:130ecd0fd18c24363c1633c4d3c2441bab7465a04c64e4a0d558ba25de18dc33 +size 6008476 diff --git a/checkpoint-300/optimizer_0/__1_0.distcp b/checkpoint-300/optimizer_0/__1_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..ffebc47fbba3dcc5bc99b8cc921f1a962aa12a74 --- /dev/null +++ b/checkpoint-300/optimizer_0/__1_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:170d3adc605f5ea4dfe25b3ceba8cdb0382b9d5df2d4a9db77a11c8ab619ab51 +size 6041200 diff --git a/checkpoint-300/optimizer_0/__2_0.distcp b/checkpoint-300/optimizer_0/__2_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..49b153679d48b4b14f61958861ad87999c6350e4 --- /dev/null +++ b/checkpoint-300/optimizer_0/__2_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8f2abfd37df5372abcc3443ba0821751425a17a5fcddbe59626b557ec2714f1 +size 6041200 diff --git a/checkpoint-300/optimizer_0/__3_0.distcp b/checkpoint-300/optimizer_0/__3_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..aab3d7e7cafe556b19022045f724720489a3fb52 --- /dev/null +++ b/checkpoint-300/optimizer_0/__3_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd6fafce369e5305badb6cc18ec62f507a30fafbc48266f435556d66838c0bc6 +size 6043476 diff --git a/checkpoint-300/optimizer_0/__4_0.distcp b/checkpoint-300/optimizer_0/__4_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..0e811dd1896a73e1582772ce15d17a4cbd7d84ff --- /dev/null +++ b/checkpoint-300/optimizer_0/__4_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fbe15102639c200bea7b15da8e19b9723c2c722a2fd46b62d91a66e42af52d24 +size 6057364 diff --git a/checkpoint-300/optimizer_0/__5_0.distcp b/checkpoint-300/optimizer_0/__5_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..40c4d74493e1eec69f1c8f807235151ad383f4ff --- /dev/null +++ b/checkpoint-300/optimizer_0/__5_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7f6fa489ee6c903170b1f18d6732f7bf2aa54947d68ffe98f8a78a5d0445b52 +size 6042612 diff --git a/checkpoint-300/optimizer_0/__6_0.distcp b/checkpoint-300/optimizer_0/__6_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..060e7dc180d5888d595c4db05923fa3d77b9d9c1 --- /dev/null +++ b/checkpoint-300/optimizer_0/__6_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98dfbcad53058a481585dcd4b8e5a41d42978a1ec23f2500ada1d96f7815dd21 +size 6042612 diff --git a/checkpoint-300/optimizer_0/__7_0.distcp b/checkpoint-300/optimizer_0/__7_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..bb092f5ec407c6ea983eb54e1fa0088f939640cb --- /dev/null +++ b/checkpoint-300/optimizer_0/__7_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08420ea2b61f79cc7e3a5ba3c760b32f2232d0bc7e793b2eed15536aef6300b9 +size 6042612 diff --git a/checkpoint-300/pytorch_model_fsdp_0/.metadata b/checkpoint-300/pytorch_model_fsdp_0/.metadata new file mode 100644 index 0000000000000000000000000000000000000000..07f5feffd9df4fa771537b21096e6c62cd13aef1 --- /dev/null +++ b/checkpoint-300/pytorch_model_fsdp_0/.metadata @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3807a661376630ab7d298a41f61851b1be92e01e9f1e24c81d83b41bf94c5596 +size 339851 diff --git a/checkpoint-300/pytorch_model_fsdp_0/__0_0.distcp b/checkpoint-300/pytorch_model_fsdp_0/__0_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..54bfb61bb230c8c6e61c90b75ca102c359ad2a98 --- /dev/null +++ b/checkpoint-300/pytorch_model_fsdp_0/__0_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb36adb040c7f951bcfdec15d6f754187aa8de74675fd563f3b35a224a826d8c +size 3003648 diff --git a/checkpoint-300/pytorch_model_fsdp_0/__1_0.distcp b/checkpoint-300/pytorch_model_fsdp_0/__1_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..795d6ad48469f2bd9708cbde1f1c1a85f383b2f4 --- /dev/null +++ b/checkpoint-300/pytorch_model_fsdp_0/__1_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f81caca32f2fa9cf04485d6256a40872e96d1862b9eb650f74c3b6b617a87c33 +size 3003648 diff --git a/checkpoint-300/pytorch_model_fsdp_0/__2_0.distcp b/checkpoint-300/pytorch_model_fsdp_0/__2_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..e85c8cac63dad954f0be72296552d8de15f920fd --- /dev/null +++ b/checkpoint-300/pytorch_model_fsdp_0/__2_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8bbc40b9015b9633df9edd413a691a4eecb4d0aff7887de5f450c73fcf223fb7 +size 3003648 diff --git a/checkpoint-300/pytorch_model_fsdp_0/__3_0.distcp b/checkpoint-300/pytorch_model_fsdp_0/__3_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..5816503dc2c110b6f823c4403aabc65ff4710dc2 --- /dev/null +++ b/checkpoint-300/pytorch_model_fsdp_0/__3_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:85d407101b110026a27f80ea4dd503675e241ba396bf5e7a1640f23804fda1c2 +size 3003648 diff --git a/checkpoint-300/pytorch_model_fsdp_0/__4_0.distcp b/checkpoint-300/pytorch_model_fsdp_0/__4_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..c6a1a93cc021fe374a299f37a5fe2d904e13466a --- /dev/null +++ b/checkpoint-300/pytorch_model_fsdp_0/__4_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ac782a84800a32a57ec989567c6dc6258b03b74d1f8da1531efae7ed1f810fd +size 3003648 diff --git a/checkpoint-300/pytorch_model_fsdp_0/__5_0.distcp b/checkpoint-300/pytorch_model_fsdp_0/__5_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..0e34d729f1f41b3503e5eb101110b8081f7caefe --- /dev/null +++ b/checkpoint-300/pytorch_model_fsdp_0/__5_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a337eb118e1a4a2b3bb8bf827b293f784634676c6ed857726a56029b4e68361 +size 3003648 diff --git a/checkpoint-300/pytorch_model_fsdp_0/__6_0.distcp b/checkpoint-300/pytorch_model_fsdp_0/__6_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..2504f460b8bacf5bf5e984fae52b3813840f6af6 --- /dev/null +++ b/checkpoint-300/pytorch_model_fsdp_0/__6_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90dea380c8f111cdeb750a959cc8833eb80ba69ca312aa3286b53b91571dc70b +size 3003648 diff --git a/checkpoint-300/pytorch_model_fsdp_0/__7_0.distcp b/checkpoint-300/pytorch_model_fsdp_0/__7_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..ae2ab5d72e9a0bb8d473be1ff8bd72ef3dbee24b --- /dev/null +++ b/checkpoint-300/pytorch_model_fsdp_0/__7_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4dc80fbbb630347d9ba56888a96e738a28c9045a06ef374f6bded8fc3e91bc8 +size 3003648 diff --git a/checkpoint-300/rng_state_0.pth b/checkpoint-300/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..44607170276938b9d9b86f56ebe7a61418d7dca6 --- /dev/null +++ b/checkpoint-300/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78635405a75ff05020872a238d2d5fdc4ef85c11a5aa732aaf9e33f0fb8e6585 +size 15984 diff --git a/checkpoint-300/rng_state_1.pth b/checkpoint-300/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..473886890d701a7a85e4c02958a316903a2765c9 --- /dev/null +++ b/checkpoint-300/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c7d526848276a402f7bc1beae10b45087e901b93154c2c78f477e274d809f9c3 +size 15984 diff --git a/checkpoint-300/rng_state_2.pth b/checkpoint-300/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..1dc41557c2eca229a3d3952e02a32fe1fd75b3b0 --- /dev/null +++ b/checkpoint-300/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5996a515b3450a0364280c9b9bb73f90207e512975241b5f61ef7321ae5cc30f +size 15984 diff --git a/checkpoint-300/rng_state_3.pth b/checkpoint-300/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..4e5e3074fc1fd124389bcdb4a23a0011c7746d3a --- /dev/null +++ b/checkpoint-300/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:137e0d50809da2617d6e0ed6c57dd8d60eadf6c2b4ebd9ed853378846c8f6bf5 +size 15984 diff --git a/checkpoint-300/rng_state_4.pth b/checkpoint-300/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..132e2a133b99fe66dab9093da203aa0f0e3bbdfd --- /dev/null +++ b/checkpoint-300/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b268a58ef386239793155fd0899c5233e71fa56e6fd3e57ea0dd3bda7e952e1e +size 15984 diff --git a/checkpoint-300/rng_state_5.pth b/checkpoint-300/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..af26fca67e87daa5f7dd8707e6996ff055ae6e92 --- /dev/null +++ b/checkpoint-300/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d11709cd2016c543dd6d19b0f04760f961a2ef1cbd3d0c35107ad9a94b95665 +size 15984 diff --git a/checkpoint-300/rng_state_6.pth b/checkpoint-300/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..559f3f58001a0620b426704ba864225367e421f5 --- /dev/null +++ b/checkpoint-300/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3097153c203c8aac2363aa681f5a0996960cf90714eb5b681f7757df49e9a4e +size 15984 diff --git a/checkpoint-300/rng_state_7.pth b/checkpoint-300/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..ca7376cc0fb27fbffd6603b001029e46bf790277 --- /dev/null +++ b/checkpoint-300/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bbe6c7b052c669c6b10fcc44b899680eb1044abdbb7045fb7926eab1e7d098c4 +size 15984 diff --git a/checkpoint-300/scheduler.pt b/checkpoint-300/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..e4a593887a192de425ef90f1dd945f6d4a63561b --- /dev/null +++ b/checkpoint-300/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8948ab0329c2a6866caa5cd565decc3046a34770df93f963374ee2edaa3e1fbf +size 1064 diff --git a/checkpoint-300/trainer_state.json b/checkpoint-300/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c351cb9e9a42b1af346b9974a3671df0c986340f --- /dev/null +++ b/checkpoint-300/trainer_state.json @@ -0,0 +1,142 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.15, + "eval_steps": 100, + "global_step": 300, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0125, + "grad_norm": 0.14176353812217712, + "learning_rate": 0.0004, + "loss": 1.1525, + "step": 25 + }, + { + "epoch": 0.025, + "grad_norm": 0.1460508555173874, + "learning_rate": 0.0004998852503731983, + "loss": 1.047, + "step": 50 + }, + { + "epoch": 0.0375, + "grad_norm": 0.2368021011352539, + "learning_rate": 0.0004993848168027977, + "loss": 0.8529, + "step": 75 + }, + { + "epoch": 0.05, + "grad_norm": 0.14488168060779572, + "learning_rate": 0.0004984880506341147, + "loss": 0.9756, + "step": 100 + }, + { + "epoch": 0.05, + "eval_loss": 0.9470569491386414, + "eval_runtime": 845.9115, + "eval_samples_per_second": 1.297, + "eval_steps_per_second": 0.021, + "step": 100 + }, + { + "epoch": 0.0625, + "grad_norm": 0.1415175348520279, + "learning_rate": 0.0004971963770447935, + "loss": 0.9564, + "step": 125 + }, + { + "epoch": 0.075, + "grad_norm": 0.1751064509153366, + "learning_rate": 0.0004955118488155782, + "loss": 0.711, + "step": 150 + }, + { + "epoch": 0.0875, + "grad_norm": 0.1439943015575409, + "learning_rate": 0.0004934371430679492, + "loss": 0.9409, + "step": 175 + }, + { + "epoch": 0.1, + "grad_norm": 0.15947112441062927, + "learning_rate": 0.0004909755570095319, + "loss": 0.8979, + "step": 200 + }, + { + "epoch": 0.1, + "eval_loss": 0.9711233973503113, + "eval_runtime": 845.6433, + "eval_samples_per_second": 1.297, + "eval_steps_per_second": 0.021, + "step": 200 + }, + { + "epoch": 0.1125, + "grad_norm": 0.14685837924480438, + "learning_rate": 0.0004881310026940389, + "loss": 0.6376, + "step": 225 + }, + { + "epoch": 0.125, + "grad_norm": 0.15040776133537292, + "learning_rate": 0.0004849080008040734, + "loss": 0.927, + "step": 250 + }, + { + "epoch": 0.1375, + "grad_norm": 0.16087745130062103, + "learning_rate": 0.00048131167346667446, + "loss": 0.8456, + "step": 275 + }, + { + "epoch": 0.15, + "grad_norm": 0.15025638043880463, + "learning_rate": 0.00047734773611302284, + "loss": 0.6029, + "step": 300 + }, + { + "epoch": 0.15, + "eval_loss": 1.0056413412094116, + "eval_runtime": 848.447, + "eval_samples_per_second": 1.293, + "eval_steps_per_second": 0.021, + "step": 300 + } + ], + "logging_steps": 25, + "max_steps": 2000, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.023794630085837e+16, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-400/optimizer_0/.metadata b/checkpoint-400/optimizer_0/.metadata new file mode 100644 index 0000000000000000000000000000000000000000..41741a706408df51ea3d62ee8d388fba5618c486 --- /dev/null +++ b/checkpoint-400/optimizer_0/.metadata @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d13e7e6005a3ff227b5a5a9f74ab59ffd8e55a6ee2e5a1d4335c8ffb0683644c +size 869361 diff --git a/checkpoint-400/optimizer_0/__0_0.distcp b/checkpoint-400/optimizer_0/__0_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..6ca8358fb54cccac0ad330f0f3a213c97223dc49 --- /dev/null +++ b/checkpoint-400/optimizer_0/__0_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7a44de3f8d67a1557dd0f78e2cf48623d4d5d6f7711dd8666a9b13581e75304 +size 6008476 diff --git a/checkpoint-400/optimizer_0/__1_0.distcp b/checkpoint-400/optimizer_0/__1_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..5746c35c958a140c94cd6791eaf2bd3d2270f5b9 --- /dev/null +++ b/checkpoint-400/optimizer_0/__1_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca2c14ac98f0521dac04a0ed16a601a688fe7dbbc45897aa441652d6d68b7600 +size 6041200 diff --git a/checkpoint-400/optimizer_0/__2_0.distcp b/checkpoint-400/optimizer_0/__2_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..2f9c6e61c2ed4d590df20ffe9897c8a6ac6e60d8 --- /dev/null +++ b/checkpoint-400/optimizer_0/__2_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:05da06adc340d004f8035804c54f26b04c212e33634c69b5963033d75a4d5ca3 +size 6041200 diff --git a/checkpoint-400/optimizer_0/__3_0.distcp b/checkpoint-400/optimizer_0/__3_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..c13d6d4d9fd433f5556acfd3c2dfe5a0ef09fd60 --- /dev/null +++ b/checkpoint-400/optimizer_0/__3_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3340b81f891d5ec97ec39e7b5631e6bf9c1331c02157a3b81e7f1de29e077fc8 +size 6043476 diff --git a/checkpoint-400/optimizer_0/__4_0.distcp b/checkpoint-400/optimizer_0/__4_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..ff8124e7e273b68192455c1602f0b03f7e967431 --- /dev/null +++ b/checkpoint-400/optimizer_0/__4_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08d3a882a20397dc0dbc8ffadc07757e081c7cd695946f90434a9826fd616a73 +size 6057364 diff --git a/checkpoint-400/optimizer_0/__5_0.distcp b/checkpoint-400/optimizer_0/__5_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..f00b2510f8a6da1c8185f714bc2916e95dedf506 --- /dev/null +++ b/checkpoint-400/optimizer_0/__5_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:48b2ade8ee38d7bae79fbdbf56dce95f517970af21bc19a8419ab1598b76e6ce +size 6042612 diff --git a/checkpoint-400/optimizer_0/__6_0.distcp b/checkpoint-400/optimizer_0/__6_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..7793914d94658be94cca37570189e6b17095254d --- /dev/null +++ b/checkpoint-400/optimizer_0/__6_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07d77ff2ea0095f8bfa52a31cb904ddf9e00baeed0457bee9eb0563558ec2917 +size 6042612 diff --git a/checkpoint-400/optimizer_0/__7_0.distcp b/checkpoint-400/optimizer_0/__7_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..4eb98e5b8a88107470a10c9cc4cf354114c82d3c --- /dev/null +++ b/checkpoint-400/optimizer_0/__7_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:844b8d40da1998db6040c71cfc172d6310ce884fc2c0427a41c2f0d449e4cf15 +size 6042612 diff --git a/checkpoint-400/pytorch_model_fsdp_0/.metadata b/checkpoint-400/pytorch_model_fsdp_0/.metadata new file mode 100644 index 0000000000000000000000000000000000000000..5f0114b91f6741ab0b863b542d087687921d3485 --- /dev/null +++ b/checkpoint-400/pytorch_model_fsdp_0/.metadata @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:895ce4e2ed1e705dd1d2c4829b46670859940ed31e11b4f776fe19610d8f57e3 +size 339851 diff --git a/checkpoint-400/pytorch_model_fsdp_0/__0_0.distcp b/checkpoint-400/pytorch_model_fsdp_0/__0_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..86dee807e5a007202fde4d54b964f9cd1201825c --- /dev/null +++ b/checkpoint-400/pytorch_model_fsdp_0/__0_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:44c65ae9f1040374ee7daf887c6bafc18588ff56ca141bbbe65725ea632cd69a +size 3003648 diff --git a/checkpoint-400/pytorch_model_fsdp_0/__1_0.distcp b/checkpoint-400/pytorch_model_fsdp_0/__1_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..87cebe07c12d872be42c842dcfa06c2cb669a04f --- /dev/null +++ b/checkpoint-400/pytorch_model_fsdp_0/__1_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f4a56b4fd20923a816a677ee2a52e49336f4cffa747c59d1e4f8dfec9c71964 +size 3003648 diff --git a/checkpoint-400/pytorch_model_fsdp_0/__2_0.distcp b/checkpoint-400/pytorch_model_fsdp_0/__2_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..47982ba00e797a561ff0b452bb8d9d53544628dd --- /dev/null +++ b/checkpoint-400/pytorch_model_fsdp_0/__2_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b19de0bea4432ae7acc6dd29912d7f0543e2123761739c62aa416a76ce513803 +size 3003648 diff --git a/checkpoint-400/pytorch_model_fsdp_0/__3_0.distcp b/checkpoint-400/pytorch_model_fsdp_0/__3_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..ca04109f4317df980f1f73a6d2bf30fd012bb3c0 --- /dev/null +++ b/checkpoint-400/pytorch_model_fsdp_0/__3_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a0dba64b79cdc90fefeb759c38408b226a9dc64acbf0051e68e1d73c80041498 +size 3003648 diff --git a/checkpoint-400/pytorch_model_fsdp_0/__4_0.distcp b/checkpoint-400/pytorch_model_fsdp_0/__4_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..6d5ed4e6e4f106c4b18003ab29cb0e889a811b6a --- /dev/null +++ b/checkpoint-400/pytorch_model_fsdp_0/__4_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4930b94bc407940ecab8fc75aafd9acbcabdb680ccd9d4d1a85eb9535315bb47 +size 3003648 diff --git a/checkpoint-400/pytorch_model_fsdp_0/__5_0.distcp b/checkpoint-400/pytorch_model_fsdp_0/__5_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..650f4d703c36f138d03fa3a8827a2cb6fe52bd77 --- /dev/null +++ b/checkpoint-400/pytorch_model_fsdp_0/__5_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a625c6e381923e95ef0499a3c95e49c71d585ba348b6939aae28aae3819a61e2 +size 3003648 diff --git a/checkpoint-400/pytorch_model_fsdp_0/__6_0.distcp b/checkpoint-400/pytorch_model_fsdp_0/__6_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..1855bb150e8c6cf26fc17e464a4aafc2a8f9f904 --- /dev/null +++ b/checkpoint-400/pytorch_model_fsdp_0/__6_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:50fc2f3087ffb4c14f67713ba52416a4fd979b4d592dff74b92716d344d95c40 +size 3003648 diff --git a/checkpoint-400/pytorch_model_fsdp_0/__7_0.distcp b/checkpoint-400/pytorch_model_fsdp_0/__7_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..79182e0d74f8a3d4c49b9034e8d20d8b521bb0d0 --- /dev/null +++ b/checkpoint-400/pytorch_model_fsdp_0/__7_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ff57e8ec66083794e419ec555073e8af56165bb0c186f06962bb24a2af3f474 +size 3003648 diff --git a/checkpoint-400/rng_state_0.pth b/checkpoint-400/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..08ab431ed3d6023acb46afeb060b2bbc4e17e7d2 --- /dev/null +++ b/checkpoint-400/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:21b8976a1d18f6b03f88945806f1ce2916bbfcae48e4b272dded3a6d29242251 +size 15984 diff --git a/checkpoint-400/rng_state_1.pth b/checkpoint-400/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..30e500c3cb2e2ad2da307bf4f23bfe0292318b56 --- /dev/null +++ b/checkpoint-400/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:098d6bac0efc38fc3b20a23d1fe696b2b1bd80001f27e26e9d2e80d6a9bc914d +size 15984 diff --git a/checkpoint-400/rng_state_2.pth b/checkpoint-400/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..cbc4f544882477314dc3c6e8813da4d593171e88 --- /dev/null +++ b/checkpoint-400/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:757130cd10dd14fcc61c8e47b0423982e9f6232c009e802da8d3016005900192 +size 15984 diff --git a/checkpoint-400/rng_state_3.pth b/checkpoint-400/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..7ed2ecae26799f84eababd04ff7802674b0035ea --- /dev/null +++ b/checkpoint-400/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:72943d33e58e0f9960fe52b4d961c4aa6906a68c34e17129e0d5333b787e1208 +size 15984 diff --git a/checkpoint-400/rng_state_4.pth b/checkpoint-400/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..9efa819df6d036dbd8ded3420ebc7e7fa05234a2 --- /dev/null +++ b/checkpoint-400/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d714ece876a944d0cb591472fd0e3d59fddad92b2c4898c12934c1b75a918bbf +size 15984 diff --git a/checkpoint-400/rng_state_5.pth b/checkpoint-400/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..0365c590a24c3ca2baac2143a3b28f34fbdacea2 --- /dev/null +++ b/checkpoint-400/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e459ac04a35241d3f7e7c709e75d01bc47307c6667df5b234448443eecc478e +size 15984 diff --git a/checkpoint-400/rng_state_6.pth b/checkpoint-400/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..351d3794dfa0fb7bb29c699429a91608befe4d83 --- /dev/null +++ b/checkpoint-400/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41d2bff8d2d1aae1314c6be02255f8b63c4963be07c252a8777856f2fd3694c6 +size 15984 diff --git a/checkpoint-400/rng_state_7.pth b/checkpoint-400/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..1b3127672c15442038bad73eb467448946889382 --- /dev/null +++ b/checkpoint-400/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9789da649a1c984e33a7981799e36bbe7bf52a545da2459c7682edf25c418cfb +size 15984 diff --git a/checkpoint-400/scheduler.pt b/checkpoint-400/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..b235b58bb083cf95264c44d97a93c27205471afb --- /dev/null +++ b/checkpoint-400/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b56d2c6bd1c2aa83c05c7033b3af13e0206ee25c12092046a84dc057490c52d +size 1064 diff --git a/checkpoint-400/trainer_state.json b/checkpoint-400/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..abc5fcfe1a828d8fd0857faaed3a39f0a455ecfa --- /dev/null +++ b/checkpoint-400/trainer_state.json @@ -0,0 +1,178 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.2, + "eval_steps": 100, + "global_step": 400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0125, + "grad_norm": 0.14176353812217712, + "learning_rate": 0.0004, + "loss": 1.1525, + "step": 25 + }, + { + "epoch": 0.025, + "grad_norm": 0.1460508555173874, + "learning_rate": 0.0004998852503731983, + "loss": 1.047, + "step": 50 + }, + { + "epoch": 0.0375, + "grad_norm": 0.2368021011352539, + "learning_rate": 0.0004993848168027977, + "loss": 0.8529, + "step": 75 + }, + { + "epoch": 0.05, + "grad_norm": 0.14488168060779572, + "learning_rate": 0.0004984880506341147, + "loss": 0.9756, + "step": 100 + }, + { + "epoch": 0.05, + "eval_loss": 0.9470569491386414, + "eval_runtime": 845.9115, + "eval_samples_per_second": 1.297, + "eval_steps_per_second": 0.021, + "step": 100 + }, + { + "epoch": 0.0625, + "grad_norm": 0.1415175348520279, + "learning_rate": 0.0004971963770447935, + "loss": 0.9564, + "step": 125 + }, + { + "epoch": 0.075, + "grad_norm": 0.1751064509153366, + "learning_rate": 0.0004955118488155782, + "loss": 0.711, + "step": 150 + }, + { + "epoch": 0.0875, + "grad_norm": 0.1439943015575409, + "learning_rate": 0.0004934371430679492, + "loss": 0.9409, + "step": 175 + }, + { + "epoch": 0.1, + "grad_norm": 0.15947112441062927, + "learning_rate": 0.0004909755570095319, + "loss": 0.8979, + "step": 200 + }, + { + "epoch": 0.1, + "eval_loss": 0.9711233973503113, + "eval_runtime": 845.6433, + "eval_samples_per_second": 1.297, + "eval_steps_per_second": 0.021, + "step": 200 + }, + { + "epoch": 0.1125, + "grad_norm": 0.14685837924480438, + "learning_rate": 0.0004881310026940389, + "loss": 0.6376, + "step": 225 + }, + { + "epoch": 0.125, + "grad_norm": 0.15040776133537292, + "learning_rate": 0.0004849080008040734, + "loss": 0.927, + "step": 250 + }, + { + "epoch": 0.1375, + "grad_norm": 0.16087745130062103, + "learning_rate": 0.00048131167346667446, + "loss": 0.8456, + "step": 275 + }, + { + "epoch": 0.15, + "grad_norm": 0.15025638043880463, + "learning_rate": 0.00047734773611302284, + "loss": 0.6029, + "step": 300 + }, + { + "epoch": 0.15, + "eval_loss": 1.0056413412094116, + "eval_runtime": 848.447, + "eval_samples_per_second": 1.293, + "eval_steps_per_second": 0.021, + "step": 300 + }, + { + "epoch": 0.1625, + "grad_norm": 0.15893957018852234, + "learning_rate": 0.0004730224883952422, + "loss": 0.9036, + "step": 325 + }, + { + "epoch": 0.175, + "grad_norm": 0.1535714715719223, + "learning_rate": 0.0004683428041747334, + "loss": 0.828, + "step": 350 + }, + { + "epoch": 0.1875, + "grad_norm": 0.1718970686197281, + "learning_rate": 0.0004633161205979517, + "loss": 0.5944, + "step": 375 + }, + { + "epoch": 0.2, + "grad_norm": 0.17664535343647003, + "learning_rate": 0.0004579504262769877, + "loss": 0.8654, + "step": 400 + }, + { + "epoch": 0.2, + "eval_loss": 1.0151112079620361, + "eval_runtime": 839.7482, + "eval_samples_per_second": 1.306, + "eval_steps_per_second": 0.021, + "step": 400 + } + ], + "logging_steps": 25, + "max_steps": 2000, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.031726173447782e+16, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-500/optimizer_0/.metadata b/checkpoint-500/optimizer_0/.metadata new file mode 100644 index 0000000000000000000000000000000000000000..1de0b6cfc6b74929d2f23cc644dab72fa2d5848e --- /dev/null +++ b/checkpoint-500/optimizer_0/.metadata @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fcf6a15f62e2a784468db88020de1fd5c9b61877062fe5e5b10816ea4da1b758 +size 869361 diff --git a/checkpoint-500/optimizer_0/__0_0.distcp b/checkpoint-500/optimizer_0/__0_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..683b3aaa34b678252e072debbe5e816f3ab18e2c --- /dev/null +++ b/checkpoint-500/optimizer_0/__0_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e82db23112399510d50c0095fe29d9ffa412f33f6e7cae936d20c36f5e705dc6 +size 6008476 diff --git a/checkpoint-500/optimizer_0/__1_0.distcp b/checkpoint-500/optimizer_0/__1_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..994b9aa4f6c6ea44dc6983ce91cc8361f2bbca4e --- /dev/null +++ b/checkpoint-500/optimizer_0/__1_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5aadd7e2516fe4d37a65cfaf51fbc771c9e0ffeca81519cea2a90eabb314f828 +size 6041200 diff --git a/checkpoint-500/optimizer_0/__2_0.distcp b/checkpoint-500/optimizer_0/__2_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..69cd6339fcc8494e322c7d3bd57269a59c8b6ab1 --- /dev/null +++ b/checkpoint-500/optimizer_0/__2_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:220c1d593cd300aeda257b9af61b4bdcc8b974639616bbecdbe9fcdacdc1d44b +size 6041200 diff --git a/checkpoint-500/optimizer_0/__3_0.distcp b/checkpoint-500/optimizer_0/__3_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..c5befce015f9470de3009529b337cda5b20ea0d2 --- /dev/null +++ b/checkpoint-500/optimizer_0/__3_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:37fbdb77375d3fd106d3d01e2f1152606a37d84dc1911e643bfbfc8975a769ab +size 6043476 diff --git a/checkpoint-500/optimizer_0/__4_0.distcp b/checkpoint-500/optimizer_0/__4_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..0f469afd0b98867e10e22daa81d6a9951a8360d8 --- /dev/null +++ b/checkpoint-500/optimizer_0/__4_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:775677f97b66015d6d48b6b7ba82d7a0d5437efc95a5069a47ca727633090b3a +size 6057364 diff --git a/checkpoint-500/optimizer_0/__5_0.distcp b/checkpoint-500/optimizer_0/__5_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..fae9377b13366d88f336f694c258dee00bc373d2 --- /dev/null +++ b/checkpoint-500/optimizer_0/__5_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b99d39a7d853f00e8966c69cb563a5b8dd469b8120b8d94d4101c050a61bd59b +size 6042612 diff --git a/checkpoint-500/optimizer_0/__6_0.distcp b/checkpoint-500/optimizer_0/__6_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..5d54fc024115091b2837a3d758de6974dba2669f --- /dev/null +++ b/checkpoint-500/optimizer_0/__6_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2033a6e8ab051776afe46a830df044d224316f23b42188738dca129d6823f8a4 +size 6042612 diff --git a/checkpoint-500/optimizer_0/__7_0.distcp b/checkpoint-500/optimizer_0/__7_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..2290b5dc37c7000c1f5642aa2e1470aa50ee2724 --- /dev/null +++ b/checkpoint-500/optimizer_0/__7_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:268da50232b60d028531f2a37df143c4284f235c2b67305c869dc97e3495e5ab +size 6042612 diff --git a/checkpoint-500/pytorch_model_fsdp_0/.metadata b/checkpoint-500/pytorch_model_fsdp_0/.metadata new file mode 100644 index 0000000000000000000000000000000000000000..8a8f754f95c7870aec80777f9a1a54a79d5a5e29 --- /dev/null +++ b/checkpoint-500/pytorch_model_fsdp_0/.metadata @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e2932a7666be4eae77acb30269a99fe22df5150289ccb47ee860b5b62a68b93 +size 339851 diff --git a/checkpoint-500/pytorch_model_fsdp_0/__0_0.distcp b/checkpoint-500/pytorch_model_fsdp_0/__0_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..576555b974f496ccd7461c2484647dd805e22b03 --- /dev/null +++ b/checkpoint-500/pytorch_model_fsdp_0/__0_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:103ca964bc4afd5e2d8f1dd420384506e7f106b609dcad38c907ae3b68c11747 +size 3003648 diff --git a/checkpoint-500/pytorch_model_fsdp_0/__1_0.distcp b/checkpoint-500/pytorch_model_fsdp_0/__1_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..589ffd1594a1e3d870e85ee8109913060b91140e --- /dev/null +++ b/checkpoint-500/pytorch_model_fsdp_0/__1_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59c1f6a31fbb00dbd25e0d53d5b1fc5fe678a08456c37e1a2eefc85c1acf1e8b +size 3003648 diff --git a/checkpoint-500/pytorch_model_fsdp_0/__2_0.distcp b/checkpoint-500/pytorch_model_fsdp_0/__2_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..50aee6045ab11ea7faaffd9539e72dff41e401b3 --- /dev/null +++ b/checkpoint-500/pytorch_model_fsdp_0/__2_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:21d0f832865cb664c159320d2a35abd2dc11ba39a482eb1a12649a78534bdb75 +size 3003648 diff --git a/checkpoint-500/pytorch_model_fsdp_0/__3_0.distcp b/checkpoint-500/pytorch_model_fsdp_0/__3_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..81fe381d1adb54311f4e68193daec15a3647c871 --- /dev/null +++ b/checkpoint-500/pytorch_model_fsdp_0/__3_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ac148f509d967a02cec39466509c8dbc32e72dee6a29b124436a63d480dca2f +size 3003648 diff --git a/checkpoint-500/pytorch_model_fsdp_0/__4_0.distcp b/checkpoint-500/pytorch_model_fsdp_0/__4_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..851619babe6dac21eb6a97c534eaded4d1037e37 --- /dev/null +++ b/checkpoint-500/pytorch_model_fsdp_0/__4_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b60ea3cee892ab7b0cb77875a64ab8b857a2df9d78d6ee8ca7d3009fece1daf +size 3003648 diff --git a/checkpoint-500/pytorch_model_fsdp_0/__5_0.distcp b/checkpoint-500/pytorch_model_fsdp_0/__5_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..2b9dfcab094e79186d080424124b7ecbea22e77e --- /dev/null +++ b/checkpoint-500/pytorch_model_fsdp_0/__5_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9133ce35c013479b694e74784314aab2234acf8fba978fb6fd8a65086e8c5c3d +size 3003648 diff --git a/checkpoint-500/pytorch_model_fsdp_0/__6_0.distcp b/checkpoint-500/pytorch_model_fsdp_0/__6_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..05a2a925e77b95752e80ce2ceca2af447f6362ca --- /dev/null +++ b/checkpoint-500/pytorch_model_fsdp_0/__6_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49891757daa7a4a19740fbb37f55c21652ba300b98369823a5d2924ab9d919a9 +size 3003648 diff --git a/checkpoint-500/pytorch_model_fsdp_0/__7_0.distcp b/checkpoint-500/pytorch_model_fsdp_0/__7_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..8e4bafff19a50ad30cd4bb0a473879a7d3c696f2 --- /dev/null +++ b/checkpoint-500/pytorch_model_fsdp_0/__7_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a469d4aa5479c2b64953295474bd722bf7fdd4d8c5ce8e2af3fb0c18d794b4f +size 3003648 diff --git a/checkpoint-500/rng_state_0.pth b/checkpoint-500/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..699403fdac20c1753b0c0ae0b94a6414158f3257 --- /dev/null +++ b/checkpoint-500/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f620b269378694edf449ca02b2bfeac974979b2e2447e87f07029a0fed826d7 +size 15920 diff --git a/checkpoint-500/rng_state_1.pth b/checkpoint-500/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..050d63a5485b5258fd9c948c66f3b4e3794c4d90 --- /dev/null +++ b/checkpoint-500/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f9cb03eee71de2be3538288b2f1981537922a835272dad01c4069bfdca300bc +size 15984 diff --git a/checkpoint-500/rng_state_2.pth b/checkpoint-500/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..efa423c8fcfb306b8e66e50cb301a6c2ab76ffb7 --- /dev/null +++ b/checkpoint-500/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83178132bb595ebe8b479ea8f105f64cce3a1f9d8ed04b481c05928607ca1513 +size 15984 diff --git a/checkpoint-500/rng_state_3.pth b/checkpoint-500/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..8ac9ee3678df4eb7e0ca5c1ef371205c2d005cc5 --- /dev/null +++ b/checkpoint-500/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:abd3e2d5b57a6bd25638c48f3b5b207d746b7faa0a195b8bbad1b546064dd8f7 +size 15984 diff --git a/checkpoint-500/rng_state_4.pth b/checkpoint-500/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..34a9402a0db7efea3756caf4840667911b86a37b --- /dev/null +++ b/checkpoint-500/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:202a4071f09999f050e9768c1a3a518b6b3bff189c23039fe733f9f24291210f +size 15984 diff --git a/checkpoint-500/rng_state_5.pth b/checkpoint-500/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..d598e109dca963639e8e4f2dba04edaf08d7b9e6 --- /dev/null +++ b/checkpoint-500/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0212e64fa50a81a26672f3500b949ccc2f145f3fcf63b35bc77484ecf00c0c08 +size 15984 diff --git a/checkpoint-500/rng_state_6.pth b/checkpoint-500/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..19e1f04b247c5fc37a394707f3f802eca220c93d --- /dev/null +++ b/checkpoint-500/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:879c494ee06aa75a54e0e810621a4cb24f07dfba633a1043ab54566bc9cd7870 +size 15984 diff --git a/checkpoint-500/rng_state_7.pth b/checkpoint-500/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..4cbc6a61e7ad7add8a5412ad34a6085eb835717a --- /dev/null +++ b/checkpoint-500/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d230e0857cdd767606b7f9cc69f92c1f0b0c46e8929ac5cc39e3a522ede1c7bc +size 15984 diff --git a/checkpoint-500/scheduler.pt b/checkpoint-500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..6a7a8fc1beb3f842bafa7113ae4a09e73a121df7 --- /dev/null +++ b/checkpoint-500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc95850e72613f7a4a1684465dd4f848b800fdca2b51b79a9ac67435988b218d +size 1064 diff --git a/checkpoint-500/trainer_state.json b/checkpoint-500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..e47912a191961c290027e50a96f937031adbe38b --- /dev/null +++ b/checkpoint-500/trainer_state.json @@ -0,0 +1,214 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.25, + "eval_steps": 100, + "global_step": 500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0125, + "grad_norm": 0.14176353812217712, + "learning_rate": 0.0004, + "loss": 1.1525, + "step": 25 + }, + { + "epoch": 0.025, + "grad_norm": 0.1460508555173874, + "learning_rate": 0.0004998852503731983, + "loss": 1.047, + "step": 50 + }, + { + "epoch": 0.0375, + "grad_norm": 0.2368021011352539, + "learning_rate": 0.0004993848168027977, + "loss": 0.8529, + "step": 75 + }, + { + "epoch": 0.05, + "grad_norm": 0.14488168060779572, + "learning_rate": 0.0004984880506341147, + "loss": 0.9756, + "step": 100 + }, + { + "epoch": 0.05, + "eval_loss": 0.9470569491386414, + "eval_runtime": 845.9115, + "eval_samples_per_second": 1.297, + "eval_steps_per_second": 0.021, + "step": 100 + }, + { + "epoch": 0.0625, + "grad_norm": 0.1415175348520279, + "learning_rate": 0.0004971963770447935, + "loss": 0.9564, + "step": 125 + }, + { + "epoch": 0.075, + "grad_norm": 0.1751064509153366, + "learning_rate": 0.0004955118488155782, + "loss": 0.711, + "step": 150 + }, + { + "epoch": 0.0875, + "grad_norm": 0.1439943015575409, + "learning_rate": 0.0004934371430679492, + "loss": 0.9409, + "step": 175 + }, + { + "epoch": 0.1, + "grad_norm": 0.15947112441062927, + "learning_rate": 0.0004909755570095319, + "loss": 0.8979, + "step": 200 + }, + { + "epoch": 0.1, + "eval_loss": 0.9711233973503113, + "eval_runtime": 845.6433, + "eval_samples_per_second": 1.297, + "eval_steps_per_second": 0.021, + "step": 200 + }, + { + "epoch": 0.1125, + "grad_norm": 0.14685837924480438, + "learning_rate": 0.0004881310026940389, + "loss": 0.6376, + "step": 225 + }, + { + "epoch": 0.125, + "grad_norm": 0.15040776133537292, + "learning_rate": 0.0004849080008040734, + "loss": 0.927, + "step": 250 + }, + { + "epoch": 0.1375, + "grad_norm": 0.16087745130062103, + "learning_rate": 0.00048131167346667446, + "loss": 0.8456, + "step": 275 + }, + { + "epoch": 0.15, + "grad_norm": 0.15025638043880463, + "learning_rate": 0.00047734773611302284, + "loss": 0.6029, + "step": 300 + }, + { + "epoch": 0.15, + "eval_loss": 1.0056413412094116, + "eval_runtime": 848.447, + "eval_samples_per_second": 1.293, + "eval_steps_per_second": 0.021, + "step": 300 + }, + { + "epoch": 0.1625, + "grad_norm": 0.15893957018852234, + "learning_rate": 0.0004730224883952422, + "loss": 0.9036, + "step": 325 + }, + { + "epoch": 0.175, + "grad_norm": 0.1535714715719223, + "learning_rate": 0.0004683428041747334, + "loss": 0.828, + "step": 350 + }, + { + "epoch": 0.1875, + "grad_norm": 0.1718970686197281, + "learning_rate": 0.0004633161205979517, + "loss": 0.5944, + "step": 375 + }, + { + "epoch": 0.2, + "grad_norm": 0.17664535343647003, + "learning_rate": 0.0004579504262769877, + "loss": 0.8654, + "step": 400 + }, + { + "epoch": 0.2, + "eval_loss": 1.0151112079620361, + "eval_runtime": 839.7482, + "eval_samples_per_second": 1.306, + "eval_steps_per_second": 0.021, + "step": 400 + }, + { + "epoch": 0.2125, + "grad_norm": 0.17122088372707367, + "learning_rate": 0.0004522542485937369, + "loss": 0.8078, + "step": 425 + }, + { + "epoch": 0.225, + "grad_norm": 0.19420970976352692, + "learning_rate": 0.00044623664014783386, + "loss": 0.5735, + "step": 450 + }, + { + "epoch": 0.2375, + "grad_norm": 0.18166953325271606, + "learning_rate": 0.00043990716436988924, + "loss": 0.8604, + "step": 475 + }, + { + "epoch": 0.25, + "grad_norm": 0.1502382755279541, + "learning_rate": 0.0004332758803228925, + "loss": 0.7673, + "step": 500 + }, + { + "epoch": 0.25, + "eval_loss": 1.0454745292663574, + "eval_runtime": 841.3951, + "eval_samples_per_second": 1.304, + "eval_steps_per_second": 0.021, + "step": 500 + } + ], + "logging_steps": 25, + "max_steps": 2000, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.039657716809728e+16, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-600/optimizer_0/.metadata b/checkpoint-600/optimizer_0/.metadata new file mode 100644 index 0000000000000000000000000000000000000000..61c55b5fecc11d63a530c9289bb0352423cd3ba1 --- /dev/null +++ b/checkpoint-600/optimizer_0/.metadata @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a22cb80540dec888e83231c5e99ac5f705ab64a4e5be7cd619c417cc110dc3b +size 869361 diff --git a/checkpoint-600/optimizer_0/__0_0.distcp b/checkpoint-600/optimizer_0/__0_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..f0ac2c8c3dc758ae5b3bdeeae9dc69b3f63252a6 --- /dev/null +++ b/checkpoint-600/optimizer_0/__0_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9bbb5a1d2ac362b8173671c93a0a478b4ce391e576e8a07b3c40a28506560940 +size 6008476 diff --git a/checkpoint-600/optimizer_0/__1_0.distcp b/checkpoint-600/optimizer_0/__1_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..58ad60ef0d84967731f2e101b4707ed828d6d958 --- /dev/null +++ b/checkpoint-600/optimizer_0/__1_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eabc15a1b6ad60998a5810cc86b9a8047151e1269fe71b1ae599af9252c24f13 +size 6041200 diff --git a/checkpoint-600/optimizer_0/__2_0.distcp b/checkpoint-600/optimizer_0/__2_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..e61aaa187e37b9fabb9972445fdb718017e7ce18 --- /dev/null +++ b/checkpoint-600/optimizer_0/__2_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7faa5ccb185f3eab0602691345345d3cfe62f230cab244e4640afb8d61f111bf +size 6041200 diff --git a/checkpoint-600/optimizer_0/__3_0.distcp b/checkpoint-600/optimizer_0/__3_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..cb9cce874e2340bfb40bb5c2cc9ad2dc1e7b1e3b --- /dev/null +++ b/checkpoint-600/optimizer_0/__3_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:692438a262793bbf0a09afde7917fe76c918e6ab168abb709947e42a906ff962 +size 6043476 diff --git a/checkpoint-600/optimizer_0/__4_0.distcp b/checkpoint-600/optimizer_0/__4_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..fff95947b7a7f9728f2bc25151eac7b7b270a2fc --- /dev/null +++ b/checkpoint-600/optimizer_0/__4_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:799952ec2fc25e98bc325e4e80b3da6b385a86105c801203969f9b1107041690 +size 6057364 diff --git a/checkpoint-600/optimizer_0/__5_0.distcp b/checkpoint-600/optimizer_0/__5_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..3c6ac656a04db7059c3821c8c76aa6881d6ff8b0 --- /dev/null +++ b/checkpoint-600/optimizer_0/__5_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c6ba8d6bc9e32460751bade772e99387083bf91b4fa81b628e618579aee313fe +size 6042612 diff --git a/checkpoint-600/optimizer_0/__6_0.distcp b/checkpoint-600/optimizer_0/__6_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..9da8c621730e92effabca8fc96e66f9a92d4e0f2 --- /dev/null +++ b/checkpoint-600/optimizer_0/__6_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:530be93e3e89a3706c256547184cac0d1ece551960876fd788a50bec9fbc65f7 +size 6042612 diff --git a/checkpoint-600/optimizer_0/__7_0.distcp b/checkpoint-600/optimizer_0/__7_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..25c7810a16efb08957497a97d66a8b5ae3eaf71b --- /dev/null +++ b/checkpoint-600/optimizer_0/__7_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:760676a33c726908e619ddda946aad1fa63058cbe0488c9cb567b8c67c11641f +size 6042612 diff --git a/checkpoint-600/pytorch_model_fsdp_0/.metadata b/checkpoint-600/pytorch_model_fsdp_0/.metadata new file mode 100644 index 0000000000000000000000000000000000000000..14c04953c5269ff64c48c9a2fcf4ff0667df484c --- /dev/null +++ b/checkpoint-600/pytorch_model_fsdp_0/.metadata @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95ff34a00c8404c216617cee0111cbf8a7af88f38496db08656475e53f116a6a +size 339851 diff --git a/checkpoint-600/pytorch_model_fsdp_0/__0_0.distcp b/checkpoint-600/pytorch_model_fsdp_0/__0_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..7bada20c83e51a7d73f3d2faee9038183dc605f8 --- /dev/null +++ b/checkpoint-600/pytorch_model_fsdp_0/__0_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d603e39d2263e86a92411a35675a2ac9c52b6a75a3aede15d8cedcbaa2728d4 +size 3003648 diff --git a/checkpoint-600/pytorch_model_fsdp_0/__1_0.distcp b/checkpoint-600/pytorch_model_fsdp_0/__1_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..9f388d679f434eea6f8d6b34b7beb4b596b9d000 --- /dev/null +++ b/checkpoint-600/pytorch_model_fsdp_0/__1_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:650411799950fa550ad1a8c5f5a19e18bcdcf04bee3fd64f0a10da618744b3ff +size 3003648 diff --git a/checkpoint-600/pytorch_model_fsdp_0/__2_0.distcp b/checkpoint-600/pytorch_model_fsdp_0/__2_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..5d6c7b81423fdbe325c00234c45f473d89638123 --- /dev/null +++ b/checkpoint-600/pytorch_model_fsdp_0/__2_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac96bcd1cd9db124c7315280e09c52c11dc7168587b90226273dd417098b1089 +size 3003648 diff --git a/checkpoint-600/pytorch_model_fsdp_0/__3_0.distcp b/checkpoint-600/pytorch_model_fsdp_0/__3_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..74d7080f12690707593b30f95f63950c53df6591 --- /dev/null +++ b/checkpoint-600/pytorch_model_fsdp_0/__3_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c3ea4a71c5b53197bebd3d07829e835df826314fff944dc83da8f5dc676c443 +size 3003648 diff --git a/checkpoint-600/pytorch_model_fsdp_0/__4_0.distcp b/checkpoint-600/pytorch_model_fsdp_0/__4_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..808470330ea6f0c10b7bb50558ea7a54f006d2cd --- /dev/null +++ b/checkpoint-600/pytorch_model_fsdp_0/__4_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da4fcae0eb686824a9d16bddeb3e2eb0c6f184f707cecf577755160c0b2cdbe7 +size 3003648 diff --git a/checkpoint-600/pytorch_model_fsdp_0/__5_0.distcp b/checkpoint-600/pytorch_model_fsdp_0/__5_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..682ea018b1f26d7a9bce3caaf089fc90bf42a934 --- /dev/null +++ b/checkpoint-600/pytorch_model_fsdp_0/__5_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:475d9725a44f1b2e341e3c80d9d313f41f84ad325df0a7966fdcd4644e4cbf3d +size 3003648 diff --git a/checkpoint-600/pytorch_model_fsdp_0/__6_0.distcp b/checkpoint-600/pytorch_model_fsdp_0/__6_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..0f0cc9560a7286657457538dd7617441c54df369 --- /dev/null +++ b/checkpoint-600/pytorch_model_fsdp_0/__6_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58306a5b1bda996efe205711878dcff03cb33b4eae4a8a096ce6800564fc052d +size 3003648 diff --git a/checkpoint-600/pytorch_model_fsdp_0/__7_0.distcp b/checkpoint-600/pytorch_model_fsdp_0/__7_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..14b8ad3176735570cf7fb208f26c7644f094eabe --- /dev/null +++ b/checkpoint-600/pytorch_model_fsdp_0/__7_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d77de9241fba779c59fc441ad5bc10885bec851ea4304127acf79239e5b8fd6 +size 3003648 diff --git a/checkpoint-600/rng_state_0.pth b/checkpoint-600/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..c6fd6e0a521df4fcbb775eee743abac77c33e99f --- /dev/null +++ b/checkpoint-600/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a51c34116e74976603507ba08705aca7a521f5063e0d5280afc2eab68c4bf36 +size 15920 diff --git a/checkpoint-600/rng_state_1.pth b/checkpoint-600/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..91b5f7b3616eba14f6616b60b318f0a38e42ad39 --- /dev/null +++ b/checkpoint-600/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0e94a8e4fb8632c1a2bbfafc9954dea009a47a89164eaa6b4e9ad4738a61866 +size 15984 diff --git a/checkpoint-600/rng_state_2.pth b/checkpoint-600/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..f61758714f0af0a51664275e8c183b1723ee985c --- /dev/null +++ b/checkpoint-600/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:581c40d10e64ee3415fa1edb81e4df35bcb7d2cccace07218e909d4a0cd39670 +size 15984 diff --git a/checkpoint-600/rng_state_3.pth b/checkpoint-600/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..49c24790df1059ce6d97cc85aa0bbafbbe5bef44 --- /dev/null +++ b/checkpoint-600/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a5c5fbd89816b8ec2f69e0f019ea2ae70ee8c86981f8d64e3a701572708e36e +size 15984 diff --git a/checkpoint-600/rng_state_4.pth b/checkpoint-600/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..caa359520023e2247cb8bb95d569e113ecb91360 --- /dev/null +++ b/checkpoint-600/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a6a729343aef347ba34536cc6a4fb56f898d13aad7a247e651a1c36796fef1e4 +size 15984 diff --git a/checkpoint-600/rng_state_5.pth b/checkpoint-600/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..dec6b5929d6f20adfb88bf325d934067adb7ee8e --- /dev/null +++ b/checkpoint-600/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:889462d35b322a22d2431ed49ed60a340bdbfc709dabefd05137283df1487d2e +size 15984 diff --git a/checkpoint-600/rng_state_6.pth b/checkpoint-600/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..b819bbb64675781cd44fea373d7fa2b4890a7bd3 --- /dev/null +++ b/checkpoint-600/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06b701aac9cf3c38cff274fe22ba3612c50d5a982e2585585497f2ca2d0a4729 +size 15984 diff --git a/checkpoint-600/rng_state_7.pth b/checkpoint-600/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..397efd8a009b165f197504e261e6eeb57f633301 --- /dev/null +++ b/checkpoint-600/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30f35a94dc3a8dc4b2ae30697f159aedf54d139373b80885849c908f85376243 +size 15984 diff --git a/checkpoint-600/scheduler.pt b/checkpoint-600/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..1fa9750d60fe6a490c749218369d6d200b7b6ffc --- /dev/null +++ b/checkpoint-600/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0fdf508bbbc1f82a3989be901b295050953f0512d0841595d9dae3ad6c0a857 +size 1064 diff --git a/checkpoint-600/trainer_state.json b/checkpoint-600/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..942d44a0d21c995002c5b9e0212387ccbc90a561 --- /dev/null +++ b/checkpoint-600/trainer_state.json @@ -0,0 +1,250 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.3, + "eval_steps": 100, + "global_step": 600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0125, + "grad_norm": 0.14176353812217712, + "learning_rate": 0.0004, + "loss": 1.1525, + "step": 25 + }, + { + "epoch": 0.025, + "grad_norm": 0.1460508555173874, + "learning_rate": 0.0004998852503731983, + "loss": 1.047, + "step": 50 + }, + { + "epoch": 0.0375, + "grad_norm": 0.2368021011352539, + "learning_rate": 0.0004993848168027977, + "loss": 0.8529, + "step": 75 + }, + { + "epoch": 0.05, + "grad_norm": 0.14488168060779572, + "learning_rate": 0.0004984880506341147, + "loss": 0.9756, + "step": 100 + }, + { + "epoch": 0.05, + "eval_loss": 0.9470569491386414, + "eval_runtime": 845.9115, + "eval_samples_per_second": 1.297, + "eval_steps_per_second": 0.021, + "step": 100 + }, + { + "epoch": 0.0625, + "grad_norm": 0.1415175348520279, + "learning_rate": 0.0004971963770447935, + "loss": 0.9564, + "step": 125 + }, + { + "epoch": 0.075, + "grad_norm": 0.1751064509153366, + "learning_rate": 0.0004955118488155782, + "loss": 0.711, + "step": 150 + }, + { + "epoch": 0.0875, + "grad_norm": 0.1439943015575409, + "learning_rate": 0.0004934371430679492, + "loss": 0.9409, + "step": 175 + }, + { + "epoch": 0.1, + "grad_norm": 0.15947112441062927, + "learning_rate": 0.0004909755570095319, + "loss": 0.8979, + "step": 200 + }, + { + "epoch": 0.1, + "eval_loss": 0.9711233973503113, + "eval_runtime": 845.6433, + "eval_samples_per_second": 1.297, + "eval_steps_per_second": 0.021, + "step": 200 + }, + { + "epoch": 0.1125, + "grad_norm": 0.14685837924480438, + "learning_rate": 0.0004881310026940389, + "loss": 0.6376, + "step": 225 + }, + { + "epoch": 0.125, + "grad_norm": 0.15040776133537292, + "learning_rate": 0.0004849080008040734, + "loss": 0.927, + "step": 250 + }, + { + "epoch": 0.1375, + "grad_norm": 0.16087745130062103, + "learning_rate": 0.00048131167346667446, + "loss": 0.8456, + "step": 275 + }, + { + "epoch": 0.15, + "grad_norm": 0.15025638043880463, + "learning_rate": 0.00047734773611302284, + "loss": 0.6029, + "step": 300 + }, + { + "epoch": 0.15, + "eval_loss": 1.0056413412094116, + "eval_runtime": 848.447, + "eval_samples_per_second": 1.293, + "eval_steps_per_second": 0.021, + "step": 300 + }, + { + "epoch": 0.1625, + "grad_norm": 0.15893957018852234, + "learning_rate": 0.0004730224883952422, + "loss": 0.9036, + "step": 325 + }, + { + "epoch": 0.175, + "grad_norm": 0.1535714715719223, + "learning_rate": 0.0004683428041747334, + "loss": 0.828, + "step": 350 + }, + { + "epoch": 0.1875, + "grad_norm": 0.1718970686197281, + "learning_rate": 0.0004633161205979517, + "loss": 0.5944, + "step": 375 + }, + { + "epoch": 0.2, + "grad_norm": 0.17664535343647003, + "learning_rate": 0.0004579504262769877, + "loss": 0.8654, + "step": 400 + }, + { + "epoch": 0.2, + "eval_loss": 1.0151112079620361, + "eval_runtime": 839.7482, + "eval_samples_per_second": 1.306, + "eval_steps_per_second": 0.021, + "step": 400 + }, + { + "epoch": 0.2125, + "grad_norm": 0.17122088372707367, + "learning_rate": 0.0004522542485937369, + "loss": 0.8078, + "step": 425 + }, + { + "epoch": 0.225, + "grad_norm": 0.19420970976352692, + "learning_rate": 0.00044623664014783386, + "loss": 0.5735, + "step": 450 + }, + { + "epoch": 0.2375, + "grad_norm": 0.18166953325271606, + "learning_rate": 0.00043990716436988924, + "loss": 0.8604, + "step": 475 + }, + { + "epoch": 0.25, + "grad_norm": 0.1502382755279541, + "learning_rate": 0.0004332758803228925, + "loss": 0.7673, + "step": 500 + }, + { + "epoch": 0.25, + "eval_loss": 1.0454745292663574, + "eval_runtime": 841.3951, + "eval_samples_per_second": 1.304, + "eval_steps_per_second": 0.021, + "step": 500 + }, + { + "epoch": 0.2625, + "grad_norm": 0.16875839233398438, + "learning_rate": 0.00042635332671593575, + "loss": 0.5882, + "step": 525 + }, + { + "epoch": 0.275, + "grad_norm": 0.16070039570331573, + "learning_rate": 0.00041915050515566445, + "loss": 0.8175, + "step": 550 + }, + { + "epoch": 0.2875, + "grad_norm": 0.16584184765815735, + "learning_rate": 0.00041167886266207167, + "loss": 0.7795, + "step": 575 + }, + { + "epoch": 0.3, + "grad_norm": 0.15345798432826996, + "learning_rate": 0.0004039502734764241, + "loss": 0.7331, + "step": 600 + }, + { + "epoch": 0.3, + "eval_loss": 1.0627943277359009, + "eval_runtime": 849.4944, + "eval_samples_per_second": 1.291, + "eval_steps_per_second": 0.021, + "step": 600 + } + ], + "logging_steps": 25, + "max_steps": 2000, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6.047589260171674e+16, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-700/optimizer_0/.metadata b/checkpoint-700/optimizer_0/.metadata new file mode 100644 index 0000000000000000000000000000000000000000..3edcb25f0fc5fd0014159e7b8d35716de0fed5b7 --- /dev/null +++ b/checkpoint-700/optimizer_0/.metadata @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:606f756a799bc782a25edca688ec5888f4e10ce8b80f8ebf577cd6bab18ab942 +size 869361 diff --git a/checkpoint-700/optimizer_0/__0_0.distcp b/checkpoint-700/optimizer_0/__0_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..b3a56b6f976949bda9df07f45d5b683861489ac6 --- /dev/null +++ b/checkpoint-700/optimizer_0/__0_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa33c6f1692891f242eeeecb8b226cbada28e37532a82f93e365d60b2d409e43 +size 6008476 diff --git a/checkpoint-700/optimizer_0/__1_0.distcp b/checkpoint-700/optimizer_0/__1_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..6951ccb45b64f1330b40ce96021f8c0b493253b3 --- /dev/null +++ b/checkpoint-700/optimizer_0/__1_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:afa51cd98a484548558e8cca30e0f8876285dbb9aaa947bcc1737668af9fa3e1 +size 6041200 diff --git a/checkpoint-700/optimizer_0/__2_0.distcp b/checkpoint-700/optimizer_0/__2_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..b25a4a0fedff69bb05e806a85e4169959f23bb2a --- /dev/null +++ b/checkpoint-700/optimizer_0/__2_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be3f7dbf0386bbbfcb0e066824fc704992b4334888dc6175bfa022a0e8e62f14 +size 6041200 diff --git a/checkpoint-700/optimizer_0/__3_0.distcp b/checkpoint-700/optimizer_0/__3_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..309f1ceae298dbff7a215ba437c4e05d9f566b91 --- /dev/null +++ b/checkpoint-700/optimizer_0/__3_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd1ac01729d19f3c1d6f4366a61b4236fde5d51ee912808ec9ab6949eac1da3c +size 6043476 diff --git a/checkpoint-700/optimizer_0/__4_0.distcp b/checkpoint-700/optimizer_0/__4_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..926c19507576d4f33b03c57a6533f890a5e0506c --- /dev/null +++ b/checkpoint-700/optimizer_0/__4_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23493c73829d03271341dc06f13101d9a1406b21d89bf33c41849a02640f0af5 +size 6057364 diff --git a/checkpoint-700/optimizer_0/__5_0.distcp b/checkpoint-700/optimizer_0/__5_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..f8a61b4f6c686c1bad8b17c529bcf083aa32307a --- /dev/null +++ b/checkpoint-700/optimizer_0/__5_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb7ced8b5ec2f46c96fba6be3c304a0dfdc374921def92d7b9b3401d4099c17d +size 6042612 diff --git a/checkpoint-700/optimizer_0/__6_0.distcp b/checkpoint-700/optimizer_0/__6_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..839859d16008ad619a9fc4e7d70fb5ea9509e5d8 --- /dev/null +++ b/checkpoint-700/optimizer_0/__6_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54b9bc1a8bc163ba24d5244509b63d02e6ec62bf5a5a64cd300d41a6ebde55fe +size 6042612 diff --git a/checkpoint-700/optimizer_0/__7_0.distcp b/checkpoint-700/optimizer_0/__7_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..030fccf29442b69b3e3d24f2bc6ceee698da0be8 --- /dev/null +++ b/checkpoint-700/optimizer_0/__7_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60e5d5014fb5b364a7c1f0ba485c355a32df393f2cb14e2823783a22b42f6afd +size 6042612 diff --git a/checkpoint-700/pytorch_model_fsdp_0/.metadata b/checkpoint-700/pytorch_model_fsdp_0/.metadata new file mode 100644 index 0000000000000000000000000000000000000000..9fa096cb42f28b3b493602024ff9ab059d177734 --- /dev/null +++ b/checkpoint-700/pytorch_model_fsdp_0/.metadata @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ebddfa90ed31a508d45f4a44c73c006f93d4541cadd510381ee73532703c11a8 +size 339851 diff --git a/checkpoint-700/pytorch_model_fsdp_0/__0_0.distcp b/checkpoint-700/pytorch_model_fsdp_0/__0_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..437394a3575d3dbbd5a2ed9cb9e0ea2e5fe51bb8 --- /dev/null +++ b/checkpoint-700/pytorch_model_fsdp_0/__0_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38a693471c1c5cc6d409771fafce84e234c4cb332a42b87dc5e332ddca02d37f +size 3003648 diff --git a/checkpoint-700/pytorch_model_fsdp_0/__1_0.distcp b/checkpoint-700/pytorch_model_fsdp_0/__1_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..8638ce4779cf782cb2d02b9ceb3031b6a3ab6e18 --- /dev/null +++ b/checkpoint-700/pytorch_model_fsdp_0/__1_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad1266e382868e46d8e2469113247a86c1d457d11fddc23d59bc20f9e618bad5 +size 3003648 diff --git a/checkpoint-700/pytorch_model_fsdp_0/__2_0.distcp b/checkpoint-700/pytorch_model_fsdp_0/__2_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..3237dca6a20c26451149429dddb49e9bc33f9af3 --- /dev/null +++ b/checkpoint-700/pytorch_model_fsdp_0/__2_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ada046f1e4223e14cf2871aafcff282377bd0d1a33423fbc23b8c198d1f4f5b +size 3003648 diff --git a/checkpoint-700/pytorch_model_fsdp_0/__3_0.distcp b/checkpoint-700/pytorch_model_fsdp_0/__3_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..85cc61dc5d8a285c654ea243c45090c2f0df368d --- /dev/null +++ b/checkpoint-700/pytorch_model_fsdp_0/__3_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd005d80d061f212699e0e630878d313e4ebdd3f1ed46b5b125b89ee9f22d76d +size 3003648 diff --git a/checkpoint-700/pytorch_model_fsdp_0/__4_0.distcp b/checkpoint-700/pytorch_model_fsdp_0/__4_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..d0828ac37f64cc846c7b2eff079a46e1dc579430 --- /dev/null +++ b/checkpoint-700/pytorch_model_fsdp_0/__4_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:409e0637704ee16b36c2a2e377f06deb186e9d156d7d83f93587f8f75fc20bc3 +size 3003648 diff --git a/checkpoint-700/pytorch_model_fsdp_0/__5_0.distcp b/checkpoint-700/pytorch_model_fsdp_0/__5_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..16e6c98ba875a82b2d007cda2de696efab77fb89 --- /dev/null +++ b/checkpoint-700/pytorch_model_fsdp_0/__5_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:518a3072d0422f6af6cccee18fa51bd058df72ecd1b934fe1b247359edca016f +size 3003648 diff --git a/checkpoint-700/pytorch_model_fsdp_0/__6_0.distcp b/checkpoint-700/pytorch_model_fsdp_0/__6_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..069a7b423828583f1fcb17c235c0809b22636ba4 --- /dev/null +++ b/checkpoint-700/pytorch_model_fsdp_0/__6_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c29b6f92cd6e5a0ba40c4e5420b439758a4e6dc53306135ca7cfae82f76a5425 +size 3003648 diff --git a/checkpoint-700/pytorch_model_fsdp_0/__7_0.distcp b/checkpoint-700/pytorch_model_fsdp_0/__7_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..506c7a32cbdbd61fae53f1357229b6a9a15758c7 --- /dev/null +++ b/checkpoint-700/pytorch_model_fsdp_0/__7_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7879b02e5d31af05198f85c1edbaa8ae350aa701eff64f1b792c6fbe7f3fd96f +size 3003648 diff --git a/checkpoint-700/rng_state_0.pth b/checkpoint-700/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..5de689d9fd1a61dc6709321a6aa331d6e005be36 --- /dev/null +++ b/checkpoint-700/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:85c87cd53dec4801e4dcb6d4bb9548ff38aa0045659cdd86a18577ca7fcd9a3e +size 15920 diff --git a/checkpoint-700/rng_state_1.pth b/checkpoint-700/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..48342dcb0785b7c96ce94a1e65ad7ee7a0b93dbc --- /dev/null +++ b/checkpoint-700/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14f6fcbe44fc6f1a64ef2e4229329192390d155590e99d4c4819c2d087a49fe7 +size 15984 diff --git a/checkpoint-700/rng_state_2.pth b/checkpoint-700/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..4dc040c521ab500222e26dbc5e64f11d4660c409 --- /dev/null +++ b/checkpoint-700/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8eb1ee5c68161bb5c64f0b872e58bb8fc2ec4b387d82190a48748f4bb20aceb2 +size 15984 diff --git a/checkpoint-700/rng_state_3.pth b/checkpoint-700/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..644a69426ebb1f4d81e7e519fff7817ec70f4c0c --- /dev/null +++ b/checkpoint-700/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5331cf0a58fb23632a83b360557d02d5fc89b8a8c2f4e8abeb139ab0cec5acb6 +size 15984 diff --git a/checkpoint-700/rng_state_4.pth b/checkpoint-700/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..5750a227925088c8490d2e426cc9b2ec8040e296 --- /dev/null +++ b/checkpoint-700/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3056627d13df37bbea46087f62d9c8e5eed572e8607868c71a241cfe13de1ac6 +size 15984 diff --git a/checkpoint-700/rng_state_5.pth b/checkpoint-700/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..48a0d5f5be44dab6078d368b51b38930d64f1475 --- /dev/null +++ b/checkpoint-700/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3d44298040b1507a5b96aef6d7f8aa7e27fb07ba806caeaf7d2308f4bbff608 +size 15984 diff --git a/checkpoint-700/rng_state_6.pth b/checkpoint-700/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..1aa7e59b4e0f8ab340bac71838c6bd072bfa78ca --- /dev/null +++ b/checkpoint-700/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bdc7c28fd2bad2438655e4c93594ac44c548ed1530b724512ffdda3a05c275d2 +size 15984 diff --git a/checkpoint-700/rng_state_7.pth b/checkpoint-700/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..794e5a4bcac7ac5857121ab7fc543d81364d4cb2 --- /dev/null +++ b/checkpoint-700/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54038d56bc152013057e4b82ee86ab0a30d67fd1bf3f57326f8342e3b27946bb +size 15984 diff --git a/checkpoint-700/scheduler.pt b/checkpoint-700/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..8cb42d4426a8299f4a6337041b5f31cab6f971c5 --- /dev/null +++ b/checkpoint-700/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6345684fba5839b30835cdc28a18266062e5d7b4c59bdd0ceab08c6e41c7a958 +size 1064 diff --git a/checkpoint-700/trainer_state.json b/checkpoint-700/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..3faf255c3976288481d57b20e54a6a3c4f9526ff --- /dev/null +++ b/checkpoint-700/trainer_state.json @@ -0,0 +1,286 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.35, + "eval_steps": 100, + "global_step": 700, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0125, + "grad_norm": 0.14176353812217712, + "learning_rate": 0.0004, + "loss": 1.1525, + "step": 25 + }, + { + "epoch": 0.025, + "grad_norm": 0.1460508555173874, + "learning_rate": 0.0004998852503731983, + "loss": 1.047, + "step": 50 + }, + { + "epoch": 0.0375, + "grad_norm": 0.2368021011352539, + "learning_rate": 0.0004993848168027977, + "loss": 0.8529, + "step": 75 + }, + { + "epoch": 0.05, + "grad_norm": 0.14488168060779572, + "learning_rate": 0.0004984880506341147, + "loss": 0.9756, + "step": 100 + }, + { + "epoch": 0.05, + "eval_loss": 0.9470569491386414, + "eval_runtime": 845.9115, + "eval_samples_per_second": 1.297, + "eval_steps_per_second": 0.021, + "step": 100 + }, + { + "epoch": 0.0625, + "grad_norm": 0.1415175348520279, + "learning_rate": 0.0004971963770447935, + "loss": 0.9564, + "step": 125 + }, + { + "epoch": 0.075, + "grad_norm": 0.1751064509153366, + "learning_rate": 0.0004955118488155782, + "loss": 0.711, + "step": 150 + }, + { + "epoch": 0.0875, + "grad_norm": 0.1439943015575409, + "learning_rate": 0.0004934371430679492, + "loss": 0.9409, + "step": 175 + }, + { + "epoch": 0.1, + "grad_norm": 0.15947112441062927, + "learning_rate": 0.0004909755570095319, + "loss": 0.8979, + "step": 200 + }, + { + "epoch": 0.1, + "eval_loss": 0.9711233973503113, + "eval_runtime": 845.6433, + "eval_samples_per_second": 1.297, + "eval_steps_per_second": 0.021, + "step": 200 + }, + { + "epoch": 0.1125, + "grad_norm": 0.14685837924480438, + "learning_rate": 0.0004881310026940389, + "loss": 0.6376, + "step": 225 + }, + { + "epoch": 0.125, + "grad_norm": 0.15040776133537292, + "learning_rate": 0.0004849080008040734, + "loss": 0.927, + "step": 250 + }, + { + "epoch": 0.1375, + "grad_norm": 0.16087745130062103, + "learning_rate": 0.00048131167346667446, + "loss": 0.8456, + "step": 275 + }, + { + "epoch": 0.15, + "grad_norm": 0.15025638043880463, + "learning_rate": 0.00047734773611302284, + "loss": 0.6029, + "step": 300 + }, + { + "epoch": 0.15, + "eval_loss": 1.0056413412094116, + "eval_runtime": 848.447, + "eval_samples_per_second": 1.293, + "eval_steps_per_second": 0.021, + "step": 300 + }, + { + "epoch": 0.1625, + "grad_norm": 0.15893957018852234, + "learning_rate": 0.0004730224883952422, + "loss": 0.9036, + "step": 325 + }, + { + "epoch": 0.175, + "grad_norm": 0.1535714715719223, + "learning_rate": 0.0004683428041747334, + "loss": 0.828, + "step": 350 + }, + { + "epoch": 0.1875, + "grad_norm": 0.1718970686197281, + "learning_rate": 0.0004633161205979517, + "loss": 0.5944, + "step": 375 + }, + { + "epoch": 0.2, + "grad_norm": 0.17664535343647003, + "learning_rate": 0.0004579504262769877, + "loss": 0.8654, + "step": 400 + }, + { + "epoch": 0.2, + "eval_loss": 1.0151112079620361, + "eval_runtime": 839.7482, + "eval_samples_per_second": 1.306, + "eval_steps_per_second": 0.021, + "step": 400 + }, + { + "epoch": 0.2125, + "grad_norm": 0.17122088372707367, + "learning_rate": 0.0004522542485937369, + "loss": 0.8078, + "step": 425 + }, + { + "epoch": 0.225, + "grad_norm": 0.19420970976352692, + "learning_rate": 0.00044623664014783386, + "loss": 0.5735, + "step": 450 + }, + { + "epoch": 0.2375, + "grad_norm": 0.18166953325271606, + "learning_rate": 0.00043990716436988924, + "loss": 0.8604, + "step": 475 + }, + { + "epoch": 0.25, + "grad_norm": 0.1502382755279541, + "learning_rate": 0.0004332758803228925, + "loss": 0.7673, + "step": 500 + }, + { + "epoch": 0.25, + "eval_loss": 1.0454745292663574, + "eval_runtime": 841.3951, + "eval_samples_per_second": 1.304, + "eval_steps_per_second": 0.021, + "step": 500 + }, + { + "epoch": 0.2625, + "grad_norm": 0.16875839233398438, + "learning_rate": 0.00042635332671593575, + "loss": 0.5882, + "step": 525 + }, + { + "epoch": 0.275, + "grad_norm": 0.16070039570331573, + "learning_rate": 0.00041915050515566445, + "loss": 0.8175, + "step": 550 + }, + { + "epoch": 0.2875, + "grad_norm": 0.16584184765815735, + "learning_rate": 0.00041167886266207167, + "loss": 0.7795, + "step": 575 + }, + { + "epoch": 0.3, + "grad_norm": 0.15345798432826996, + "learning_rate": 0.0004039502734764241, + "loss": 0.7331, + "step": 600 + }, + { + "epoch": 0.3, + "eval_loss": 1.0627943277359009, + "eval_runtime": 849.4944, + "eval_samples_per_second": 1.291, + "eval_steps_per_second": 0.021, + "step": 600 + }, + { + "epoch": 0.3125, + "grad_norm": 0.16241031885147095, + "learning_rate": 0.0003959770201902294, + "loss": 0.7436, + "step": 625 + }, + { + "epoch": 0.325, + "grad_norm": 0.1531112939119339, + "learning_rate": 0.0003877717742252371, + "loss": 0.6345, + "step": 650 + }, + { + "epoch": 0.3375, + "grad_norm": 0.15790140628814697, + "learning_rate": 0.00037934757569549495, + "loss": 0.7351, + "step": 675 + }, + { + "epoch": 0.35, + "grad_norm": 0.1822807490825653, + "learning_rate": 0.00037071781268346345, + "loss": 0.745, + "step": 700 + }, + { + "epoch": 0.35, + "eval_loss": 1.07315993309021, + "eval_runtime": 837.5276, + "eval_samples_per_second": 1.31, + "eval_steps_per_second": 0.021, + "step": 700 + } + ], + "logging_steps": 25, + "max_steps": 2000, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 7.055520803533619e+16, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-800/optimizer_0/.metadata b/checkpoint-800/optimizer_0/.metadata new file mode 100644 index 0000000000000000000000000000000000000000..ce5d7e337937471412c70fe8156ef634a70ba8a7 --- /dev/null +++ b/checkpoint-800/optimizer_0/.metadata @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ec2476bc171fe95c2b95ad4fb077ff388ddb6a2a3348e173bd33207d258858e +size 869361 diff --git a/checkpoint-800/optimizer_0/__0_0.distcp b/checkpoint-800/optimizer_0/__0_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..ea74bb554765ab5522abf2988b8488413afa573b --- /dev/null +++ b/checkpoint-800/optimizer_0/__0_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff653a3c06465cb982f27e30171d98cf0740c0bf1104f1d1b3256a3e20c9007f +size 6008476 diff --git a/checkpoint-800/optimizer_0/__1_0.distcp b/checkpoint-800/optimizer_0/__1_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..4461f1115d50d222ff7a5fe3fb5485658222c86b --- /dev/null +++ b/checkpoint-800/optimizer_0/__1_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:677897065ba4c71fb1e71d2791b6654753f4c6b483e9d1e2fd6b31db51b1705a +size 6041200 diff --git a/checkpoint-800/optimizer_0/__2_0.distcp b/checkpoint-800/optimizer_0/__2_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..188085f5490ca1a76bae8c29393728118a54dd64 --- /dev/null +++ b/checkpoint-800/optimizer_0/__2_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:636361253cfc5e97f05fa4e6d1fe5f0610440c10b26facdb43b9ba35d6fcc7c4 +size 6041200 diff --git a/checkpoint-800/optimizer_0/__3_0.distcp b/checkpoint-800/optimizer_0/__3_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..c37022260ebf0e521bf4a9358d1c798de4152941 --- /dev/null +++ b/checkpoint-800/optimizer_0/__3_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63a7a807962c5634a5fef558432426e4951fa07fbbf04844fdbdaeac9f988fe8 +size 6043476 diff --git a/checkpoint-800/optimizer_0/__4_0.distcp b/checkpoint-800/optimizer_0/__4_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..6a6560cc18d128f5bd3673d34039ef1f29f8b2fb --- /dev/null +++ b/checkpoint-800/optimizer_0/__4_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c09a9a67c235b9e490e235f224f1336e12b8ef5dd50bf725f73c0a7e643617a +size 6057364 diff --git a/checkpoint-800/optimizer_0/__5_0.distcp b/checkpoint-800/optimizer_0/__5_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..db1c88200ead7c9d620744af0c51924d41442de6 --- /dev/null +++ b/checkpoint-800/optimizer_0/__5_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5be6a5114a79ec6ee31fd3ef152ac51d9486cca40ac254185a19b3b823e29568 +size 6042612 diff --git a/checkpoint-800/optimizer_0/__6_0.distcp b/checkpoint-800/optimizer_0/__6_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..630415d229bc51f47b6bc85ef8d6e8409ab3a9a0 --- /dev/null +++ b/checkpoint-800/optimizer_0/__6_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5c0350fecc7b24d9b6e7b45bd9268cf22a66b6eb64862b63980c9a3302ce41e +size 6042612 diff --git a/checkpoint-800/optimizer_0/__7_0.distcp b/checkpoint-800/optimizer_0/__7_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..b57523f06204d69b459294fc8977519b12195c65 --- /dev/null +++ b/checkpoint-800/optimizer_0/__7_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:94a11062a54c2170fd7ed392854bbdcfcac62d70c8039190999f80e554679b40 +size 6042612 diff --git a/checkpoint-800/pytorch_model_fsdp_0/.metadata b/checkpoint-800/pytorch_model_fsdp_0/.metadata new file mode 100644 index 0000000000000000000000000000000000000000..f2c86408d66d41476643d955209d5330b2a486fb --- /dev/null +++ b/checkpoint-800/pytorch_model_fsdp_0/.metadata @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:134f8be5bc9563037ccbde32dd1351a8b5f4bced3476dd2e0019dc1654b42b02 +size 339851 diff --git a/checkpoint-800/pytorch_model_fsdp_0/__0_0.distcp b/checkpoint-800/pytorch_model_fsdp_0/__0_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..60f94c99442408624c7cbf04b4259f2d840515c5 --- /dev/null +++ b/checkpoint-800/pytorch_model_fsdp_0/__0_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a2ddf89aea89e6ce51433313fe01ee39c5b697747c7db2a1610d67a0c3a926b +size 3003648 diff --git a/checkpoint-800/pytorch_model_fsdp_0/__1_0.distcp b/checkpoint-800/pytorch_model_fsdp_0/__1_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..bfb9294cd2e154bfd3c315b6da4fd38a1273a9f0 --- /dev/null +++ b/checkpoint-800/pytorch_model_fsdp_0/__1_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5961beec90d8c4db471765bf434057141a4e5dbe12169a1874717e0939915b1 +size 3003648 diff --git a/checkpoint-800/pytorch_model_fsdp_0/__2_0.distcp b/checkpoint-800/pytorch_model_fsdp_0/__2_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..f90c7ee39e982ee8749d548f148452739d2829e7 --- /dev/null +++ b/checkpoint-800/pytorch_model_fsdp_0/__2_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe78197e3f8397ac42a6244694bd7fe0adbff8149c31debf047b8bd9240a47a4 +size 3003648 diff --git a/checkpoint-800/pytorch_model_fsdp_0/__3_0.distcp b/checkpoint-800/pytorch_model_fsdp_0/__3_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..70d7c09f40c24c5e2397e1b2574ed69df4fe8056 --- /dev/null +++ b/checkpoint-800/pytorch_model_fsdp_0/__3_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27774b160d3fcbac1720579682189ef7505dc13c58b704db4980f43a6e7fc5e9 +size 3003648 diff --git a/checkpoint-800/pytorch_model_fsdp_0/__4_0.distcp b/checkpoint-800/pytorch_model_fsdp_0/__4_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..c0c4d51509b1a53d8e76fd0fb97464c693154a6b --- /dev/null +++ b/checkpoint-800/pytorch_model_fsdp_0/__4_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9065730d6049bb30f429421f51834024da41eaac751677fbf01c51d4586b2afa +size 3003648 diff --git a/checkpoint-800/pytorch_model_fsdp_0/__5_0.distcp b/checkpoint-800/pytorch_model_fsdp_0/__5_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..50db1a2acc2b35d7711d8d50bcb6e731a381dcf7 --- /dev/null +++ b/checkpoint-800/pytorch_model_fsdp_0/__5_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2505e0fc6e61c7c3539c4ef41b43c9ae048e48c96e38c602f9dee38f91b7e1c8 +size 3003648 diff --git a/checkpoint-800/pytorch_model_fsdp_0/__6_0.distcp b/checkpoint-800/pytorch_model_fsdp_0/__6_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..a6c6911eb7026b055c5b8b02d92a1c5b11fdbcfc --- /dev/null +++ b/checkpoint-800/pytorch_model_fsdp_0/__6_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0c2bb3a2c46373db290e94c6e7f4cb9442ac310ef9b45cc4e2911c4e2823967 +size 3003648 diff --git a/checkpoint-800/pytorch_model_fsdp_0/__7_0.distcp b/checkpoint-800/pytorch_model_fsdp_0/__7_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..a315820bfc1c0a12ae29c2b37a8b3af1eb872568 --- /dev/null +++ b/checkpoint-800/pytorch_model_fsdp_0/__7_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:798bb5bddee72ac5671880f83e7005a14c8259b847f61c223714bc4fe6e0507d +size 3003648 diff --git a/checkpoint-800/rng_state_0.pth b/checkpoint-800/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..60f6acd1c374668f2a5587e26990b9bd686873b3 --- /dev/null +++ b/checkpoint-800/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da38903189d77ebca677952019ddffb7dc5e4dff9e4e4d1d2a62ff346e14cd1e +size 15920 diff --git a/checkpoint-800/rng_state_1.pth b/checkpoint-800/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..cad0c6517f208136366ea009ff0ec32fcfadb9cb --- /dev/null +++ b/checkpoint-800/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d7e4cf0e0adc364a659de9420fa4d12fb279f876e679a4b79e7a01fb98c8580 +size 15984 diff --git a/checkpoint-800/rng_state_2.pth b/checkpoint-800/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..dcbe0a0928a33931cc06580007b1dcebba9ea56f --- /dev/null +++ b/checkpoint-800/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f02eefc8db7444d78f255ccaa290083f51941b3a85ff2920cad0bdeaf64ec20 +size 15984 diff --git a/checkpoint-800/rng_state_3.pth b/checkpoint-800/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..61ef41217922d31e2e68b3deefa6b100ff7d69d3 --- /dev/null +++ b/checkpoint-800/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b85bbe008ff0367b8be7c5b22ebec4e6c57a6c257812f2923dd9db1590d8fbb +size 15984 diff --git a/checkpoint-800/rng_state_4.pth b/checkpoint-800/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..33d1fcde67e6ebc100b7d8763871261985f2621a --- /dev/null +++ b/checkpoint-800/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d693a5e5e3cf8ad4afea3c62093fd15960c5ef4039e21b8699187772ed46244b +size 15984 diff --git a/checkpoint-800/rng_state_5.pth b/checkpoint-800/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..e7324bdd0fcc8b76ecacd006e8ebacb6783d10d3 --- /dev/null +++ b/checkpoint-800/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e89d03c65eb3124c5f266341e1acfe9513d5e84d6a484888bcec59658a101d6 +size 15984 diff --git a/checkpoint-800/rng_state_6.pth b/checkpoint-800/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..aee6d6df3a75f3d97cd7a9a0974b47206c729386 --- /dev/null +++ b/checkpoint-800/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2bdfa8d967dfcbcd71a8bafeac0ad93d130db3f163ed82de33d0ae3e01f671d5 +size 15984 diff --git a/checkpoint-800/rng_state_7.pth b/checkpoint-800/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..5fe000e2503db4be1cfcf332723aed922265e3e8 --- /dev/null +++ b/checkpoint-800/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a2c691d6dc153751df913aa6b27aabff61d809c06b4ef74d0b96634fe71f71c1 +size 15984 diff --git a/checkpoint-800/scheduler.pt b/checkpoint-800/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..a08acd119a7aab21ab3547c87f67a43d79bfe92d --- /dev/null +++ b/checkpoint-800/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5707b44605991a4e2707814032fbd5e2ffa2f78529edb47686673bff4f2f267b +size 1064 diff --git a/checkpoint-800/trainer_state.json b/checkpoint-800/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..cafb85757f604d21e85ab64c4785611116f33066 --- /dev/null +++ b/checkpoint-800/trainer_state.json @@ -0,0 +1,322 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.4, + "eval_steps": 100, + "global_step": 800, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0125, + "grad_norm": 0.14176353812217712, + "learning_rate": 0.0004, + "loss": 1.1525, + "step": 25 + }, + { + "epoch": 0.025, + "grad_norm": 0.1460508555173874, + "learning_rate": 0.0004998852503731983, + "loss": 1.047, + "step": 50 + }, + { + "epoch": 0.0375, + "grad_norm": 0.2368021011352539, + "learning_rate": 0.0004993848168027977, + "loss": 0.8529, + "step": 75 + }, + { + "epoch": 0.05, + "grad_norm": 0.14488168060779572, + "learning_rate": 0.0004984880506341147, + "loss": 0.9756, + "step": 100 + }, + { + "epoch": 0.05, + "eval_loss": 0.9470569491386414, + "eval_runtime": 845.9115, + "eval_samples_per_second": 1.297, + "eval_steps_per_second": 0.021, + "step": 100 + }, + { + "epoch": 0.0625, + "grad_norm": 0.1415175348520279, + "learning_rate": 0.0004971963770447935, + "loss": 0.9564, + "step": 125 + }, + { + "epoch": 0.075, + "grad_norm": 0.1751064509153366, + "learning_rate": 0.0004955118488155782, + "loss": 0.711, + "step": 150 + }, + { + "epoch": 0.0875, + "grad_norm": 0.1439943015575409, + "learning_rate": 0.0004934371430679492, + "loss": 0.9409, + "step": 175 + }, + { + "epoch": 0.1, + "grad_norm": 0.15947112441062927, + "learning_rate": 0.0004909755570095319, + "loss": 0.8979, + "step": 200 + }, + { + "epoch": 0.1, + "eval_loss": 0.9711233973503113, + "eval_runtime": 845.6433, + "eval_samples_per_second": 1.297, + "eval_steps_per_second": 0.021, + "step": 200 + }, + { + "epoch": 0.1125, + "grad_norm": 0.14685837924480438, + "learning_rate": 0.0004881310026940389, + "loss": 0.6376, + "step": 225 + }, + { + "epoch": 0.125, + "grad_norm": 0.15040776133537292, + "learning_rate": 0.0004849080008040734, + "loss": 0.927, + "step": 250 + }, + { + "epoch": 0.1375, + "grad_norm": 0.16087745130062103, + "learning_rate": 0.00048131167346667446, + "loss": 0.8456, + "step": 275 + }, + { + "epoch": 0.15, + "grad_norm": 0.15025638043880463, + "learning_rate": 0.00047734773611302284, + "loss": 0.6029, + "step": 300 + }, + { + "epoch": 0.15, + "eval_loss": 1.0056413412094116, + "eval_runtime": 848.447, + "eval_samples_per_second": 1.293, + "eval_steps_per_second": 0.021, + "step": 300 + }, + { + "epoch": 0.1625, + "grad_norm": 0.15893957018852234, + "learning_rate": 0.0004730224883952422, + "loss": 0.9036, + "step": 325 + }, + { + "epoch": 0.175, + "grad_norm": 0.1535714715719223, + "learning_rate": 0.0004683428041747334, + "loss": 0.828, + "step": 350 + }, + { + "epoch": 0.1875, + "grad_norm": 0.1718970686197281, + "learning_rate": 0.0004633161205979517, + "loss": 0.5944, + "step": 375 + }, + { + "epoch": 0.2, + "grad_norm": 0.17664535343647003, + "learning_rate": 0.0004579504262769877, + "loss": 0.8654, + "step": 400 + }, + { + "epoch": 0.2, + "eval_loss": 1.0151112079620361, + "eval_runtime": 839.7482, + "eval_samples_per_second": 1.306, + "eval_steps_per_second": 0.021, + "step": 400 + }, + { + "epoch": 0.2125, + "grad_norm": 0.17122088372707367, + "learning_rate": 0.0004522542485937369, + "loss": 0.8078, + "step": 425 + }, + { + "epoch": 0.225, + "grad_norm": 0.19420970976352692, + "learning_rate": 0.00044623664014783386, + "loss": 0.5735, + "step": 450 + }, + { + "epoch": 0.2375, + "grad_norm": 0.18166953325271606, + "learning_rate": 0.00043990716436988924, + "loss": 0.8604, + "step": 475 + }, + { + "epoch": 0.25, + "grad_norm": 0.1502382755279541, + "learning_rate": 0.0004332758803228925, + "loss": 0.7673, + "step": 500 + }, + { + "epoch": 0.25, + "eval_loss": 1.0454745292663574, + "eval_runtime": 841.3951, + "eval_samples_per_second": 1.304, + "eval_steps_per_second": 0.021, + "step": 500 + }, + { + "epoch": 0.2625, + "grad_norm": 0.16875839233398438, + "learning_rate": 0.00042635332671593575, + "loss": 0.5882, + "step": 525 + }, + { + "epoch": 0.275, + "grad_norm": 0.16070039570331573, + "learning_rate": 0.00041915050515566445, + "loss": 0.8175, + "step": 550 + }, + { + "epoch": 0.2875, + "grad_norm": 0.16584184765815735, + "learning_rate": 0.00041167886266207167, + "loss": 0.7795, + "step": 575 + }, + { + "epoch": 0.3, + "grad_norm": 0.15345798432826996, + "learning_rate": 0.0004039502734764241, + "loss": 0.7331, + "step": 600 + }, + { + "epoch": 0.3, + "eval_loss": 1.0627943277359009, + "eval_runtime": 849.4944, + "eval_samples_per_second": 1.291, + "eval_steps_per_second": 0.021, + "step": 600 + }, + { + "epoch": 0.3125, + "grad_norm": 0.16241031885147095, + "learning_rate": 0.0003959770201902294, + "loss": 0.7436, + "step": 625 + }, + { + "epoch": 0.325, + "grad_norm": 0.1531112939119339, + "learning_rate": 0.0003877717742252371, + "loss": 0.6345, + "step": 650 + }, + { + "epoch": 0.3375, + "grad_norm": 0.15790140628814697, + "learning_rate": 0.00037934757569549495, + "loss": 0.7351, + "step": 675 + }, + { + "epoch": 0.35, + "grad_norm": 0.1822807490825653, + "learning_rate": 0.00037071781268346345, + "loss": 0.745, + "step": 700 + }, + { + "epoch": 0.35, + "eval_loss": 1.07315993309021, + "eval_runtime": 837.5276, + "eval_samples_per_second": 1.31, + "eval_steps_per_second": 0.021, + "step": 700 + }, + { + "epoch": 0.3625, + "grad_norm": 0.15983282029628754, + "learning_rate": 0.00036189619996312495, + "loss": 0.5972, + "step": 725 + }, + { + "epoch": 0.375, + "grad_norm": 0.18202389776706696, + "learning_rate": 0.00035289675720390174, + "loss": 0.759, + "step": 750 + }, + { + "epoch": 0.3875, + "grad_norm": 0.16057106852531433, + "learning_rate": 0.00034373378669002105, + "loss": 0.7358, + "step": 775 + }, + { + "epoch": 0.4, + "grad_norm": 0.1680625081062317, + "learning_rate": 0.00033442185059073706, + "loss": 0.5636, + "step": 800 + }, + { + "epoch": 0.4, + "eval_loss": 1.0932456254959106, + "eval_runtime": 840.1331, + "eval_samples_per_second": 1.306, + "eval_steps_per_second": 0.021, + "step": 800 + } + ], + "logging_steps": 25, + "max_steps": 2000, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 8.063452346895565e+16, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-900/optimizer_0/.metadata b/checkpoint-900/optimizer_0/.metadata new file mode 100644 index 0000000000000000000000000000000000000000..eb3658995bda24aae40571e0a2d8d803b0367e08 --- /dev/null +++ b/checkpoint-900/optimizer_0/.metadata @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dbaa47ae2bf92dccf80560af82e198097e278995a4d33dd8e104fb67783768c4 +size 869361 diff --git a/checkpoint-900/optimizer_0/__0_0.distcp b/checkpoint-900/optimizer_0/__0_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..618b93ba37bc7e88f85ec326a690cacbca442ef7 --- /dev/null +++ b/checkpoint-900/optimizer_0/__0_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0093ae3ec95a2b888d2363731af0175db98122277787a475f1fd48fedf0ee6e2 +size 6008476 diff --git a/checkpoint-900/optimizer_0/__1_0.distcp b/checkpoint-900/optimizer_0/__1_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..6e83402cc2745b410295652d044858b69fb02344 --- /dev/null +++ b/checkpoint-900/optimizer_0/__1_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28ddc323f50aabfa06bd269d458c7460a74228653b048a453809130e7a1f86da +size 6041200 diff --git a/checkpoint-900/optimizer_0/__2_0.distcp b/checkpoint-900/optimizer_0/__2_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..f5c72bc1e740f0eb130166cc210a062b151cd88f --- /dev/null +++ b/checkpoint-900/optimizer_0/__2_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2fd838bcee0fd614da3c6a5931b247d6972ae9a76ad231642cd91a6e04f1bbd6 +size 6041200 diff --git a/checkpoint-900/optimizer_0/__3_0.distcp b/checkpoint-900/optimizer_0/__3_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..6b929e06ea218a036cfce148a877d1c5dd75f24e --- /dev/null +++ b/checkpoint-900/optimizer_0/__3_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a1d1305899acca84fc5bcae449beac4c5e0c955dfb99b23bb368e5c633793a8 +size 6043476 diff --git a/checkpoint-900/optimizer_0/__4_0.distcp b/checkpoint-900/optimizer_0/__4_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..69d638ccc635274dd2225613b65c5708130983e1 --- /dev/null +++ b/checkpoint-900/optimizer_0/__4_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95e637b53651802ad2a4082ea66097a197e62d974d32746c26fcfb2b7a108ab6 +size 6057364 diff --git a/checkpoint-900/optimizer_0/__5_0.distcp b/checkpoint-900/optimizer_0/__5_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..47c2b367363cbe12b0223dee23530af8ac2af52f --- /dev/null +++ b/checkpoint-900/optimizer_0/__5_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:622da15d670a25912919548c848b4a098a79ff996c1fe7fa7f083ac1c0ec9f70 +size 6042612 diff --git a/checkpoint-900/optimizer_0/__6_0.distcp b/checkpoint-900/optimizer_0/__6_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..4b0438c4e77f55618795d99c1b12823e281c45a8 --- /dev/null +++ b/checkpoint-900/optimizer_0/__6_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d3ad44367c6935f5f3bc07d928fc5f58ec3b998e088e812756d49bd4d4d2b72 +size 6042612 diff --git a/checkpoint-900/optimizer_0/__7_0.distcp b/checkpoint-900/optimizer_0/__7_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..fd5dcdb3fbeada29ef1c53ac7b637ef713a4e2fd --- /dev/null +++ b/checkpoint-900/optimizer_0/__7_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f342eb553d9da275381c1c24a5669309455409f94ecbbaa926f454b4780c16ad +size 6042612 diff --git a/checkpoint-900/pytorch_model_fsdp_0/.metadata b/checkpoint-900/pytorch_model_fsdp_0/.metadata new file mode 100644 index 0000000000000000000000000000000000000000..dffa7390ca220bd46debdbeb781ff274862e1d24 --- /dev/null +++ b/checkpoint-900/pytorch_model_fsdp_0/.metadata @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9190ddfcf1e844d555cc23be7b793dac21abdbaa76c40a427926789b02d44910 +size 339851 diff --git a/checkpoint-900/pytorch_model_fsdp_0/__0_0.distcp b/checkpoint-900/pytorch_model_fsdp_0/__0_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..7d68a17ccbc4359993619627f4446324b417904e --- /dev/null +++ b/checkpoint-900/pytorch_model_fsdp_0/__0_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b850378ed10e2aa42825412fe68032cae0c52e9ccb0c2d577d77acc313019ac +size 3003648 diff --git a/checkpoint-900/pytorch_model_fsdp_0/__1_0.distcp b/checkpoint-900/pytorch_model_fsdp_0/__1_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..36b5f4f248556b08e1812cc9ce2b47f65bd6fe7b --- /dev/null +++ b/checkpoint-900/pytorch_model_fsdp_0/__1_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4e837781a1eda240513907fc3a6163d05bba1974ea055458e9ba8076f3179ee +size 3003648 diff --git a/checkpoint-900/pytorch_model_fsdp_0/__2_0.distcp b/checkpoint-900/pytorch_model_fsdp_0/__2_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..2447eaec8e5ccbfe94087f2cc407736a71e71acb --- /dev/null +++ b/checkpoint-900/pytorch_model_fsdp_0/__2_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e76961aff78207f32c1170656e28b45b4b4382eb0e162b80202d0a0984b15b3 +size 3003648 diff --git a/checkpoint-900/pytorch_model_fsdp_0/__3_0.distcp b/checkpoint-900/pytorch_model_fsdp_0/__3_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..ca3055170491be3a23a10c12527c4c51582a70be --- /dev/null +++ b/checkpoint-900/pytorch_model_fsdp_0/__3_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:10dfafdeefb1eeca1ca0f10eba4f9d7afee6202030d9c4fe542176de6980288f +size 3003648 diff --git a/checkpoint-900/pytorch_model_fsdp_0/__4_0.distcp b/checkpoint-900/pytorch_model_fsdp_0/__4_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..9eef89d2738f6c864e50c7585004b7de1930c93a --- /dev/null +++ b/checkpoint-900/pytorch_model_fsdp_0/__4_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:93d78fba4d537ed7979cf5c64fececbfe3710b99cc7ad454de5cc8618b0714d5 +size 3003648 diff --git a/checkpoint-900/pytorch_model_fsdp_0/__5_0.distcp b/checkpoint-900/pytorch_model_fsdp_0/__5_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..8ac3d30bd3816585021533f590363bbf6b6dcc6c --- /dev/null +++ b/checkpoint-900/pytorch_model_fsdp_0/__5_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:874c9e0a8fcfe4f3022185757d4c3584f8bbfee2d9f0081dd71e4618cf516c13 +size 3003648 diff --git a/checkpoint-900/pytorch_model_fsdp_0/__6_0.distcp b/checkpoint-900/pytorch_model_fsdp_0/__6_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..4be4e71f8e3c1a8a64dfa9493f3e86ec30c1483a --- /dev/null +++ b/checkpoint-900/pytorch_model_fsdp_0/__6_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:210ee36ad75763128721b6155fa1a5952c4f529ecb6f156c50b4ea40093e8c72 +size 3003648 diff --git a/checkpoint-900/pytorch_model_fsdp_0/__7_0.distcp b/checkpoint-900/pytorch_model_fsdp_0/__7_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..99cf4047ffbf6efb63fa99ee039b25c459ae4745 --- /dev/null +++ b/checkpoint-900/pytorch_model_fsdp_0/__7_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:57dcea19646c49f36c8b5803aeebca8d708c740231eb4726928c9cf6e8c62c73 +size 3003648 diff --git a/checkpoint-900/rng_state_0.pth b/checkpoint-900/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..6f7212620384ca896ddbb8c23d9b0098e49cd3ff --- /dev/null +++ b/checkpoint-900/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d13a36e628236695a972ee9fb76f88cc36c00608604d5bd263206cb5124dbd7d +size 15920 diff --git a/checkpoint-900/rng_state_1.pth b/checkpoint-900/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..4f150aed2990427a652f468bc7413a6d9482ff81 --- /dev/null +++ b/checkpoint-900/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23f82f54b09d8f2dac91585825ecf36127e36b821dbab5ed25796ec90d0e9b04 +size 15984 diff --git a/checkpoint-900/rng_state_2.pth b/checkpoint-900/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..08cbb03242a69ade96d61402b2b294aa0f01232e --- /dev/null +++ b/checkpoint-900/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:138709c99223895b208c586f871a225b0faf3972db74e752eb5533badae97672 +size 15984 diff --git a/checkpoint-900/rng_state_3.pth b/checkpoint-900/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..cd7a3edee7d843e261b8f2d0704079bf9aa33a96 --- /dev/null +++ b/checkpoint-900/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:356d53601f7cbb7bd3c2b24fa4b30c939a248284d345d26fe1b313b58687aea7 +size 15984 diff --git a/checkpoint-900/rng_state_4.pth b/checkpoint-900/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..8936b7750cd6a493f9a495f396ad3616522a5a25 --- /dev/null +++ b/checkpoint-900/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:29f832fb5ca9e51ef7d884042fbdebee0a99641db20294d78e0bbbd1eb8faca1 +size 15984 diff --git a/checkpoint-900/rng_state_5.pth b/checkpoint-900/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..9a4a53e31d212b6f2550bae6e5c38d65c6b72b5e --- /dev/null +++ b/checkpoint-900/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:762540bbafcd78de93499144446247bf3b263115df9227761d32e784f965a8d5 +size 15984 diff --git a/checkpoint-900/rng_state_6.pth b/checkpoint-900/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..996605c16fe19ccb4add64768e7a35890431b5e4 --- /dev/null +++ b/checkpoint-900/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a06a2dfb2b1b635be1df9ea42d970b61333bd2b01e51d9e7dbf3873784b30e00 +size 15984 diff --git a/checkpoint-900/rng_state_7.pth b/checkpoint-900/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..24fcd3eb7ec7ad8e89ae99d0e36b21b1fa6b28b8 --- /dev/null +++ b/checkpoint-900/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf2b8ee4118d412fb54eaa07e2a5280ba0272c643737804cc19a5ab5c30bc01b +size 15984 diff --git a/checkpoint-900/scheduler.pt b/checkpoint-900/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..72a700f2d08b7ec27a4b1bf9093382c22731b5de --- /dev/null +++ b/checkpoint-900/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b9752ff9fe4b64d860340a91f85fd2a22d00770203412a2f6bf01b73aa0f846 +size 1064 diff --git a/checkpoint-900/trainer_state.json b/checkpoint-900/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..91abea50eb3f712dba0870a66487b9ec7ebd1593 --- /dev/null +++ b/checkpoint-900/trainer_state.json @@ -0,0 +1,358 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.45, + "eval_steps": 100, + "global_step": 900, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0125, + "grad_norm": 0.14176353812217712, + "learning_rate": 0.0004, + "loss": 1.1525, + "step": 25 + }, + { + "epoch": 0.025, + "grad_norm": 0.1460508555173874, + "learning_rate": 0.0004998852503731983, + "loss": 1.047, + "step": 50 + }, + { + "epoch": 0.0375, + "grad_norm": 0.2368021011352539, + "learning_rate": 0.0004993848168027977, + "loss": 0.8529, + "step": 75 + }, + { + "epoch": 0.05, + "grad_norm": 0.14488168060779572, + "learning_rate": 0.0004984880506341147, + "loss": 0.9756, + "step": 100 + }, + { + "epoch": 0.05, + "eval_loss": 0.9470569491386414, + "eval_runtime": 845.9115, + "eval_samples_per_second": 1.297, + "eval_steps_per_second": 0.021, + "step": 100 + }, + { + "epoch": 0.0625, + "grad_norm": 0.1415175348520279, + "learning_rate": 0.0004971963770447935, + "loss": 0.9564, + "step": 125 + }, + { + "epoch": 0.075, + "grad_norm": 0.1751064509153366, + "learning_rate": 0.0004955118488155782, + "loss": 0.711, + "step": 150 + }, + { + "epoch": 0.0875, + "grad_norm": 0.1439943015575409, + "learning_rate": 0.0004934371430679492, + "loss": 0.9409, + "step": 175 + }, + { + "epoch": 0.1, + "grad_norm": 0.15947112441062927, + "learning_rate": 0.0004909755570095319, + "loss": 0.8979, + "step": 200 + }, + { + "epoch": 0.1, + "eval_loss": 0.9711233973503113, + "eval_runtime": 845.6433, + "eval_samples_per_second": 1.297, + "eval_steps_per_second": 0.021, + "step": 200 + }, + { + "epoch": 0.1125, + "grad_norm": 0.14685837924480438, + "learning_rate": 0.0004881310026940389, + "loss": 0.6376, + "step": 225 + }, + { + "epoch": 0.125, + "grad_norm": 0.15040776133537292, + "learning_rate": 0.0004849080008040734, + "loss": 0.927, + "step": 250 + }, + { + "epoch": 0.1375, + "grad_norm": 0.16087745130062103, + "learning_rate": 0.00048131167346667446, + "loss": 0.8456, + "step": 275 + }, + { + "epoch": 0.15, + "grad_norm": 0.15025638043880463, + "learning_rate": 0.00047734773611302284, + "loss": 0.6029, + "step": 300 + }, + { + "epoch": 0.15, + "eval_loss": 1.0056413412094116, + "eval_runtime": 848.447, + "eval_samples_per_second": 1.293, + "eval_steps_per_second": 0.021, + "step": 300 + }, + { + "epoch": 0.1625, + "grad_norm": 0.15893957018852234, + "learning_rate": 0.0004730224883952422, + "loss": 0.9036, + "step": 325 + }, + { + "epoch": 0.175, + "grad_norm": 0.1535714715719223, + "learning_rate": 0.0004683428041747334, + "loss": 0.828, + "step": 350 + }, + { + "epoch": 0.1875, + "grad_norm": 0.1718970686197281, + "learning_rate": 0.0004633161205979517, + "loss": 0.5944, + "step": 375 + }, + { + "epoch": 0.2, + "grad_norm": 0.17664535343647003, + "learning_rate": 0.0004579504262769877, + "loss": 0.8654, + "step": 400 + }, + { + "epoch": 0.2, + "eval_loss": 1.0151112079620361, + "eval_runtime": 839.7482, + "eval_samples_per_second": 1.306, + "eval_steps_per_second": 0.021, + "step": 400 + }, + { + "epoch": 0.2125, + "grad_norm": 0.17122088372707367, + "learning_rate": 0.0004522542485937369, + "loss": 0.8078, + "step": 425 + }, + { + "epoch": 0.225, + "grad_norm": 0.19420970976352692, + "learning_rate": 0.00044623664014783386, + "loss": 0.5735, + "step": 450 + }, + { + "epoch": 0.2375, + "grad_norm": 0.18166953325271606, + "learning_rate": 0.00043990716436988924, + "loss": 0.8604, + "step": 475 + }, + { + "epoch": 0.25, + "grad_norm": 0.1502382755279541, + "learning_rate": 0.0004332758803228925, + "loss": 0.7673, + "step": 500 + }, + { + "epoch": 0.25, + "eval_loss": 1.0454745292663574, + "eval_runtime": 841.3951, + "eval_samples_per_second": 1.304, + "eval_steps_per_second": 0.021, + "step": 500 + }, + { + "epoch": 0.2625, + "grad_norm": 0.16875839233398438, + "learning_rate": 0.00042635332671593575, + "loss": 0.5882, + "step": 525 + }, + { + "epoch": 0.275, + "grad_norm": 0.16070039570331573, + "learning_rate": 0.00041915050515566445, + "loss": 0.8175, + "step": 550 + }, + { + "epoch": 0.2875, + "grad_norm": 0.16584184765815735, + "learning_rate": 0.00041167886266207167, + "loss": 0.7795, + "step": 575 + }, + { + "epoch": 0.3, + "grad_norm": 0.15345798432826996, + "learning_rate": 0.0004039502734764241, + "loss": 0.7331, + "step": 600 + }, + { + "epoch": 0.3, + "eval_loss": 1.0627943277359009, + "eval_runtime": 849.4944, + "eval_samples_per_second": 1.291, + "eval_steps_per_second": 0.021, + "step": 600 + }, + { + "epoch": 0.3125, + "grad_norm": 0.16241031885147095, + "learning_rate": 0.0003959770201902294, + "loss": 0.7436, + "step": 625 + }, + { + "epoch": 0.325, + "grad_norm": 0.1531112939119339, + "learning_rate": 0.0003877717742252371, + "loss": 0.6345, + "step": 650 + }, + { + "epoch": 0.3375, + "grad_norm": 0.15790140628814697, + "learning_rate": 0.00037934757569549495, + "loss": 0.7351, + "step": 675 + }, + { + "epoch": 0.35, + "grad_norm": 0.1822807490825653, + "learning_rate": 0.00037071781268346345, + "loss": 0.745, + "step": 700 + }, + { + "epoch": 0.35, + "eval_loss": 1.07315993309021, + "eval_runtime": 837.5276, + "eval_samples_per_second": 1.31, + "eval_steps_per_second": 0.021, + "step": 700 + }, + { + "epoch": 0.3625, + "grad_norm": 0.15983282029628754, + "learning_rate": 0.00036189619996312495, + "loss": 0.5972, + "step": 725 + }, + { + "epoch": 0.375, + "grad_norm": 0.18202389776706696, + "learning_rate": 0.00035289675720390174, + "loss": 0.759, + "step": 750 + }, + { + "epoch": 0.3875, + "grad_norm": 0.16057106852531433, + "learning_rate": 0.00034373378669002105, + "loss": 0.7358, + "step": 775 + }, + { + "epoch": 0.4, + "grad_norm": 0.1680625081062317, + "learning_rate": 0.00033442185059073706, + "loss": 0.5636, + "step": 800 + }, + { + "epoch": 0.4, + "eval_loss": 1.0932456254959106, + "eval_runtime": 840.1331, + "eval_samples_per_second": 1.306, + "eval_steps_per_second": 0.021, + "step": 800 + }, + { + "epoch": 0.4125, + "grad_norm": 0.15613198280334473, + "learning_rate": 0.00032497574781753367, + "loss": 0.7596, + "step": 825 + }, + { + "epoch": 0.425, + "grad_norm": 0.1628854125738144, + "learning_rate": 0.000315410490505086, + "loss": 0.7282, + "step": 850 + }, + { + "epoch": 0.4375, + "grad_norm": 0.16740979254245758, + "learning_rate": 0.0003057412801533589, + "loss": 0.5325, + "step": 875 + }, + { + "epoch": 0.45, + "grad_norm": 0.1634828895330429, + "learning_rate": 0.0002959834834687587, + "loss": 0.778, + "step": 900 + }, + { + "epoch": 0.45, + "eval_loss": 1.0952215194702148, + "eval_runtime": 836.473, + "eval_samples_per_second": 1.311, + "eval_steps_per_second": 0.022, + "step": 900 + } + ], + "logging_steps": 25, + "max_steps": 2000, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 9.07138389025751e+16, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..9b9572142403aa03b5e8c5935734c7d378bea8b3 --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:475a61313f272f84af344f68e4b3293485fac16812ba9a9d293a6d45cefc5cbd +size 5304