diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..4f21c7103f970cbddafe69773523ef20ce0db5ea 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,291 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +checkpoint-100/optimizer_0/.metadata filter=lfs diff=lfs merge=lfs -text +checkpoint-100/optimizer_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-100/optimizer_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-100/optimizer_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-100/optimizer_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-100/optimizer_0/__4_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-100/optimizer_0/__5_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-100/optimizer_0/__6_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-100/optimizer_0/__7_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-100/pytorch_model_fsdp_0/.metadata filter=lfs diff=lfs merge=lfs -text +checkpoint-100/pytorch_model_fsdp_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-100/pytorch_model_fsdp_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-100/pytorch_model_fsdp_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-100/pytorch_model_fsdp_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-100/pytorch_model_fsdp_0/__4_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-100/pytorch_model_fsdp_0/__5_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-100/pytorch_model_fsdp_0/__6_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-100/pytorch_model_fsdp_0/__7_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1000/optimizer_0/.metadata filter=lfs diff=lfs merge=lfs -text +checkpoint-1000/optimizer_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1000/optimizer_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1000/optimizer_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1000/optimizer_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1000/optimizer_0/__4_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1000/optimizer_0/__5_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1000/optimizer_0/__6_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1000/optimizer_0/__7_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1000/pytorch_model_fsdp_0/.metadata filter=lfs diff=lfs merge=lfs -text +checkpoint-1000/pytorch_model_fsdp_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1000/pytorch_model_fsdp_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1000/pytorch_model_fsdp_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1000/pytorch_model_fsdp_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1000/pytorch_model_fsdp_0/__4_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1000/pytorch_model_fsdp_0/__5_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1000/pytorch_model_fsdp_0/__6_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1000/pytorch_model_fsdp_0/__7_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1100/optimizer_0/.metadata filter=lfs diff=lfs merge=lfs -text +checkpoint-1100/optimizer_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1100/optimizer_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1100/optimizer_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1100/optimizer_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1100/optimizer_0/__4_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1100/optimizer_0/__5_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1100/optimizer_0/__6_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1100/optimizer_0/__7_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1100/pytorch_model_fsdp_0/.metadata filter=lfs diff=lfs merge=lfs -text +checkpoint-1100/pytorch_model_fsdp_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1100/pytorch_model_fsdp_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1100/pytorch_model_fsdp_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1100/pytorch_model_fsdp_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1100/pytorch_model_fsdp_0/__4_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1100/pytorch_model_fsdp_0/__5_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1100/pytorch_model_fsdp_0/__6_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1100/pytorch_model_fsdp_0/__7_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1200/optimizer_0/.metadata filter=lfs diff=lfs merge=lfs -text +checkpoint-1200/optimizer_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1200/optimizer_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1200/optimizer_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1200/optimizer_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1200/optimizer_0/__4_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1200/optimizer_0/__5_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1200/optimizer_0/__6_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1200/optimizer_0/__7_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1200/pytorch_model_fsdp_0/.metadata filter=lfs diff=lfs merge=lfs -text +checkpoint-1200/pytorch_model_fsdp_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1200/pytorch_model_fsdp_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1200/pytorch_model_fsdp_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1200/pytorch_model_fsdp_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1200/pytorch_model_fsdp_0/__4_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1200/pytorch_model_fsdp_0/__5_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1200/pytorch_model_fsdp_0/__6_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1200/pytorch_model_fsdp_0/__7_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1300/optimizer_0/.metadata filter=lfs diff=lfs merge=lfs -text +checkpoint-1300/optimizer_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1300/optimizer_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1300/optimizer_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1300/optimizer_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1300/optimizer_0/__4_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1300/optimizer_0/__5_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1300/optimizer_0/__6_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1300/optimizer_0/__7_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1300/pytorch_model_fsdp_0/.metadata filter=lfs diff=lfs merge=lfs -text +checkpoint-1300/pytorch_model_fsdp_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1300/pytorch_model_fsdp_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1300/pytorch_model_fsdp_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1300/pytorch_model_fsdp_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1300/pytorch_model_fsdp_0/__4_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1300/pytorch_model_fsdp_0/__5_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1300/pytorch_model_fsdp_0/__6_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1300/pytorch_model_fsdp_0/__7_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1400/optimizer_0/.metadata filter=lfs diff=lfs merge=lfs -text +checkpoint-1400/optimizer_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1400/optimizer_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1400/optimizer_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1400/optimizer_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1400/optimizer_0/__4_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1400/optimizer_0/__5_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1400/optimizer_0/__6_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1400/optimizer_0/__7_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1400/pytorch_model_fsdp_0/.metadata filter=lfs diff=lfs merge=lfs -text +checkpoint-1400/pytorch_model_fsdp_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1400/pytorch_model_fsdp_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1400/pytorch_model_fsdp_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1400/pytorch_model_fsdp_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1400/pytorch_model_fsdp_0/__4_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1400/pytorch_model_fsdp_0/__5_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1400/pytorch_model_fsdp_0/__6_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1400/pytorch_model_fsdp_0/__7_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1500/optimizer_0/.metadata filter=lfs diff=lfs merge=lfs -text +checkpoint-1500/optimizer_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1500/optimizer_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1500/optimizer_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1500/optimizer_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1500/optimizer_0/__4_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1500/optimizer_0/__5_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1500/optimizer_0/__6_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1500/optimizer_0/__7_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1500/pytorch_model_fsdp_0/.metadata filter=lfs diff=lfs merge=lfs -text +checkpoint-1500/pytorch_model_fsdp_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1500/pytorch_model_fsdp_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1500/pytorch_model_fsdp_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1500/pytorch_model_fsdp_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1500/pytorch_model_fsdp_0/__4_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1500/pytorch_model_fsdp_0/__5_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1500/pytorch_model_fsdp_0/__6_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1500/pytorch_model_fsdp_0/__7_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1600/optimizer_0/.metadata filter=lfs diff=lfs merge=lfs -text +checkpoint-1600/optimizer_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1600/optimizer_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1600/optimizer_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1600/optimizer_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1600/optimizer_0/__4_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1600/optimizer_0/__5_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1600/optimizer_0/__6_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1600/optimizer_0/__7_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1600/pytorch_model_fsdp_0/.metadata filter=lfs diff=lfs merge=lfs -text +checkpoint-1600/pytorch_model_fsdp_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1600/pytorch_model_fsdp_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1600/pytorch_model_fsdp_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1600/pytorch_model_fsdp_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1600/pytorch_model_fsdp_0/__4_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1600/pytorch_model_fsdp_0/__5_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1600/pytorch_model_fsdp_0/__6_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-1600/pytorch_model_fsdp_0/__7_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-200/optimizer_0/.metadata filter=lfs diff=lfs merge=lfs -text +checkpoint-200/optimizer_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-200/optimizer_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-200/optimizer_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-200/optimizer_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-200/optimizer_0/__4_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-200/optimizer_0/__5_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-200/optimizer_0/__6_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-200/optimizer_0/__7_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-200/pytorch_model_fsdp_0/.metadata filter=lfs diff=lfs merge=lfs -text +checkpoint-200/pytorch_model_fsdp_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-200/pytorch_model_fsdp_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-200/pytorch_model_fsdp_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-200/pytorch_model_fsdp_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-200/pytorch_model_fsdp_0/__4_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-200/pytorch_model_fsdp_0/__5_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-200/pytorch_model_fsdp_0/__6_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-200/pytorch_model_fsdp_0/__7_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-300/optimizer_0/.metadata filter=lfs diff=lfs merge=lfs -text +checkpoint-300/optimizer_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-300/optimizer_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-300/optimizer_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-300/optimizer_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-300/optimizer_0/__4_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-300/optimizer_0/__5_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-300/optimizer_0/__6_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-300/optimizer_0/__7_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-300/pytorch_model_fsdp_0/.metadata filter=lfs diff=lfs merge=lfs -text +checkpoint-300/pytorch_model_fsdp_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-300/pytorch_model_fsdp_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-300/pytorch_model_fsdp_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-300/pytorch_model_fsdp_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-300/pytorch_model_fsdp_0/__4_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-300/pytorch_model_fsdp_0/__5_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-300/pytorch_model_fsdp_0/__6_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-300/pytorch_model_fsdp_0/__7_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-400/optimizer_0/.metadata filter=lfs diff=lfs merge=lfs -text +checkpoint-400/optimizer_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-400/optimizer_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-400/optimizer_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-400/optimizer_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-400/optimizer_0/__4_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-400/optimizer_0/__5_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-400/optimizer_0/__6_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-400/optimizer_0/__7_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-400/pytorch_model_fsdp_0/.metadata filter=lfs diff=lfs merge=lfs -text +checkpoint-400/pytorch_model_fsdp_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-400/pytorch_model_fsdp_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-400/pytorch_model_fsdp_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-400/pytorch_model_fsdp_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-400/pytorch_model_fsdp_0/__4_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-400/pytorch_model_fsdp_0/__5_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-400/pytorch_model_fsdp_0/__6_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-400/pytorch_model_fsdp_0/__7_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-500/optimizer_0/.metadata filter=lfs diff=lfs merge=lfs -text +checkpoint-500/optimizer_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-500/optimizer_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-500/optimizer_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-500/optimizer_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-500/optimizer_0/__4_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-500/optimizer_0/__5_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-500/optimizer_0/__6_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-500/optimizer_0/__7_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-500/pytorch_model_fsdp_0/.metadata filter=lfs diff=lfs merge=lfs -text +checkpoint-500/pytorch_model_fsdp_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-500/pytorch_model_fsdp_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-500/pytorch_model_fsdp_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-500/pytorch_model_fsdp_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-500/pytorch_model_fsdp_0/__4_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-500/pytorch_model_fsdp_0/__5_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-500/pytorch_model_fsdp_0/__6_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-500/pytorch_model_fsdp_0/__7_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-600/optimizer_0/.metadata filter=lfs diff=lfs merge=lfs -text +checkpoint-600/optimizer_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-600/optimizer_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-600/optimizer_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-600/optimizer_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-600/optimizer_0/__4_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-600/optimizer_0/__5_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-600/optimizer_0/__6_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-600/optimizer_0/__7_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-600/pytorch_model_fsdp_0/.metadata filter=lfs diff=lfs merge=lfs -text +checkpoint-600/pytorch_model_fsdp_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-600/pytorch_model_fsdp_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-600/pytorch_model_fsdp_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-600/pytorch_model_fsdp_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-600/pytorch_model_fsdp_0/__4_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-600/pytorch_model_fsdp_0/__5_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-600/pytorch_model_fsdp_0/__6_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-600/pytorch_model_fsdp_0/__7_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-700/optimizer_0/.metadata filter=lfs diff=lfs merge=lfs -text +checkpoint-700/optimizer_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-700/optimizer_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-700/optimizer_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-700/optimizer_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-700/optimizer_0/__4_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-700/optimizer_0/__5_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-700/optimizer_0/__6_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-700/optimizer_0/__7_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-700/pytorch_model_fsdp_0/.metadata filter=lfs diff=lfs merge=lfs -text +checkpoint-700/pytorch_model_fsdp_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-700/pytorch_model_fsdp_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-700/pytorch_model_fsdp_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-700/pytorch_model_fsdp_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-700/pytorch_model_fsdp_0/__4_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-700/pytorch_model_fsdp_0/__5_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-700/pytorch_model_fsdp_0/__6_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-700/pytorch_model_fsdp_0/__7_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-800/optimizer_0/.metadata filter=lfs diff=lfs merge=lfs -text +checkpoint-800/optimizer_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-800/optimizer_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-800/optimizer_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-800/optimizer_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-800/optimizer_0/__4_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-800/optimizer_0/__5_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-800/optimizer_0/__6_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-800/optimizer_0/__7_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-800/pytorch_model_fsdp_0/.metadata filter=lfs diff=lfs merge=lfs -text +checkpoint-800/pytorch_model_fsdp_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-800/pytorch_model_fsdp_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-800/pytorch_model_fsdp_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-800/pytorch_model_fsdp_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-800/pytorch_model_fsdp_0/__4_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-800/pytorch_model_fsdp_0/__5_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-800/pytorch_model_fsdp_0/__6_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-800/pytorch_model_fsdp_0/__7_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-900/optimizer_0/.metadata filter=lfs diff=lfs merge=lfs -text +checkpoint-900/optimizer_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-900/optimizer_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-900/optimizer_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-900/optimizer_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-900/optimizer_0/__4_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-900/optimizer_0/__5_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-900/optimizer_0/__6_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-900/optimizer_0/__7_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-900/pytorch_model_fsdp_0/.metadata filter=lfs diff=lfs merge=lfs -text +checkpoint-900/pytorch_model_fsdp_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-900/pytorch_model_fsdp_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-900/pytorch_model_fsdp_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-900/pytorch_model_fsdp_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-900/pytorch_model_fsdp_0/__4_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-900/pytorch_model_fsdp_0/__5_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-900/pytorch_model_fsdp_0/__6_0.distcp filter=lfs diff=lfs merge=lfs -text +checkpoint-900/pytorch_model_fsdp_0/__7_0.distcp filter=lfs diff=lfs merge=lfs -text diff --git a/checkpoint-100/optimizer_0/.metadata b/checkpoint-100/optimizer_0/.metadata new file mode 100644 index 0000000000000000000000000000000000000000..57cf2f4024e992e578763bcad9a00ea9b89377a3 --- /dev/null +++ b/checkpoint-100/optimizer_0/.metadata @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0edc8d2dfeb47a02ebf14c13ae8e8bce201146e0bc5a503c06dc5c3badd9be10 +size 438489 diff --git a/checkpoint-100/optimizer_0/__0_0.distcp b/checkpoint-100/optimizer_0/__0_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..f776e767d9601178c3904af1dbcde854a0f35439 --- /dev/null +++ b/checkpoint-100/optimizer_0/__0_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d1c28d3f689644a47e74d4f1ee5b7dc5cc044565b1315bff8e8a2aeefe9f413 +size 2980252 diff --git a/checkpoint-100/optimizer_0/__1_0.distcp b/checkpoint-100/optimizer_0/__1_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..ec125f13ef5755cb96220f3ca79337bded2653fd --- /dev/null +++ b/checkpoint-100/optimizer_0/__1_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:030628a66629a76363540bb2818b91de7f253b19406440a21717419a62ab3717 +size 2997320 diff --git a/checkpoint-100/optimizer_0/__2_0.distcp b/checkpoint-100/optimizer_0/__2_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..4daf82590d909c3d9ba2674379005d824ca6efb9 --- /dev/null +++ b/checkpoint-100/optimizer_0/__2_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a9391201a9da96a4fdbab7b6a4cf47168091bd0dd1c44867765ccc1d3e73476 +size 2997320 diff --git a/checkpoint-100/optimizer_0/__3_0.distcp b/checkpoint-100/optimizer_0/__3_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..6a273094b455bbf75106ba83bec7df4ac3a9dd11 --- /dev/null +++ b/checkpoint-100/optimizer_0/__3_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b9fc15f10515d2e4ee6b71f505f05d1a7bea77c1dba9499fe4eb2072451c7d6d +size 2997320 diff --git a/checkpoint-100/optimizer_0/__4_0.distcp b/checkpoint-100/optimizer_0/__4_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..4b40388f00f5c26654fbe3ee26c648eabf45f17f --- /dev/null +++ b/checkpoint-100/optimizer_0/__4_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d4916eaf545376ba9d5f9f57f76790687595cb6801bf059b0993682353a07e1 +size 2997320 diff --git a/checkpoint-100/optimizer_0/__5_0.distcp b/checkpoint-100/optimizer_0/__5_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..94affcaaa39e5fd748c4128d22006c8cd7605d6e --- /dev/null +++ b/checkpoint-100/optimizer_0/__5_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f939ef56e2f24636e7e6a98bb1c60ab4d6b639f6c7ff2d28b3829debe688ed93 +size 2999596 diff --git a/checkpoint-100/optimizer_0/__6_0.distcp b/checkpoint-100/optimizer_0/__6_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..d83278a018cb17bec29e309d5b759a501277b6be --- /dev/null +++ b/checkpoint-100/optimizer_0/__6_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3865669be6d30c5583dd026a7ecba0ea760ae9771b16578f04bef63370b91163 +size 2998732 diff --git a/checkpoint-100/optimizer_0/__7_0.distcp b/checkpoint-100/optimizer_0/__7_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..559da9d129a4885cc1927ab966a32aeb8a37965c --- /dev/null +++ b/checkpoint-100/optimizer_0/__7_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6fd2f18ecfb8649864f2037130c72ee56e478582b36b01305fe71bb7afc22e14 +size 3005708 diff --git a/checkpoint-100/pytorch_model_fsdp_0/.metadata b/checkpoint-100/pytorch_model_fsdp_0/.metadata new file mode 100644 index 0000000000000000000000000000000000000000..0968a37d48b943f526f19533e6b242e5cf7ea829 --- /dev/null +++ b/checkpoint-100/pytorch_model_fsdp_0/.metadata @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:68cfc031ba4ee5e17d54df02e4118bf33d20b307af59f234ca2e2343fa219903 +size 170758 diff --git a/checkpoint-100/pytorch_model_fsdp_0/__0_0.distcp b/checkpoint-100/pytorch_model_fsdp_0/__0_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..5e86ab1e8abb5964f426b54f45d572a9731b2a4f --- /dev/null +++ b/checkpoint-100/pytorch_model_fsdp_0/__0_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1c031aaa4d9850d16cd4fd5b0d84e9577b3d28cc0ee02f5d3e5b600380a7ad8 +size 1489536 diff --git a/checkpoint-100/pytorch_model_fsdp_0/__1_0.distcp b/checkpoint-100/pytorch_model_fsdp_0/__1_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..b7dddb0dbb31c0b71f1eea407133224e73fbfcc0 --- /dev/null +++ b/checkpoint-100/pytorch_model_fsdp_0/__1_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ec61051deaf3fa23eb94f1031f932e14bdce8cd6bcdc1fca81330b5d598e77d +size 1489536 diff --git a/checkpoint-100/pytorch_model_fsdp_0/__2_0.distcp b/checkpoint-100/pytorch_model_fsdp_0/__2_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..278d2f1ca40fdaf9178bd0e8e80fb5e797d36afb --- /dev/null +++ b/checkpoint-100/pytorch_model_fsdp_0/__2_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30d8685570a1f594d1e21439e41cebd424ca6492cf61e263820d15e7a0745ddc +size 1489536 diff --git a/checkpoint-100/pytorch_model_fsdp_0/__3_0.distcp b/checkpoint-100/pytorch_model_fsdp_0/__3_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..7cb475020155b8c2ee6c0f27ca87a84fd0f746b5 --- /dev/null +++ b/checkpoint-100/pytorch_model_fsdp_0/__3_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ca77920177e57b80c4563dad6c5d896feb2765409bd5c898525431bba4553e4 +size 1489536 diff --git a/checkpoint-100/pytorch_model_fsdp_0/__4_0.distcp b/checkpoint-100/pytorch_model_fsdp_0/__4_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..bdbfdfc5deea5dc7b67cfe6880b6b379c3a58e64 --- /dev/null +++ b/checkpoint-100/pytorch_model_fsdp_0/__4_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:938b895764f35aa99aa6c7aff0eae60410fc3881558def1bf6ddfd6a293e6219 +size 1489536 diff --git a/checkpoint-100/pytorch_model_fsdp_0/__5_0.distcp b/checkpoint-100/pytorch_model_fsdp_0/__5_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..2cfcef9303ce035e9ca04425190ac41f84ad7639 --- /dev/null +++ b/checkpoint-100/pytorch_model_fsdp_0/__5_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b1fb9b9484c9ce197cb68b5700a7090a302cef24375116457588188830d5aff1 +size 1489536 diff --git a/checkpoint-100/pytorch_model_fsdp_0/__6_0.distcp b/checkpoint-100/pytorch_model_fsdp_0/__6_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..03bd22b09dffc589b19f1e59f0a4f35d2441f5b5 --- /dev/null +++ b/checkpoint-100/pytorch_model_fsdp_0/__6_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52a2ed377073d705368c5982737897b617055ea31e7890f447142c68c57ad89b +size 1489536 diff --git a/checkpoint-100/pytorch_model_fsdp_0/__7_0.distcp b/checkpoint-100/pytorch_model_fsdp_0/__7_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..4d4f42a047dd6ec873c1b22d44d912dbcfffbc3c --- /dev/null +++ b/checkpoint-100/pytorch_model_fsdp_0/__7_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2f02548ea862d75fd09dcf614ff8b5a6c75347e7603902808fffc6f60f1fc8d +size 1489536 diff --git a/checkpoint-100/rng_state_0.pth b/checkpoint-100/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..f736bfe532d7e99bd98a6a32866d848a21a02d76 --- /dev/null +++ b/checkpoint-100/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e67ac3c4038beb665d2cc4bd735b6f05977897a2757187e8c7c8e6b89fa4ad3d +size 15920 diff --git a/checkpoint-100/rng_state_1.pth b/checkpoint-100/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..7b46f01f4effba938049f318258b3987bffd06aa --- /dev/null +++ b/checkpoint-100/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:219fc23b677e769ae4d4806c12e3df4ff2b78c28b311847bec2ecb99a35a51e6 +size 15984 diff --git a/checkpoint-100/rng_state_2.pth b/checkpoint-100/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..5e5963a1697f39b58d0a25802e72eac30bae3e81 --- /dev/null +++ b/checkpoint-100/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a352b51cb68b5d8818a7a28f74f32dffb1095d7e281ff4022f0e365fe98a8ee0 +size 15984 diff --git a/checkpoint-100/rng_state_3.pth b/checkpoint-100/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..a84335319e6e42b032d1d6da793e5377d412cf81 --- /dev/null +++ b/checkpoint-100/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df9e8f9785d0d17d62f1ba8a141384aa7a6438a53ecad5f21d877594c31b45d0 +size 15984 diff --git a/checkpoint-100/rng_state_4.pth b/checkpoint-100/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..1a0ed8248d472d9e71bbf034a061371cbb346355 --- /dev/null +++ b/checkpoint-100/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a770deea02d60eea9348a15bd08ac4f95c99d6b5b113eb31bc2fa7631dba1988 +size 15984 diff --git a/checkpoint-100/rng_state_5.pth b/checkpoint-100/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..74a809578c3cc5e6c60f1c6d9e0defa9447cc719 --- /dev/null +++ b/checkpoint-100/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:72b60fb34596e1922ede5c5a2479b5b98033b4b988dc300df633c12f5e255755 +size 15984 diff --git a/checkpoint-100/rng_state_6.pth b/checkpoint-100/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..84a106e46cc9752aacfad3281bd314777c83b985 --- /dev/null +++ b/checkpoint-100/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:84274411027a8eb72ed1179f8209a875b6f4101ac7c2790eaef04102df49af52 +size 15984 diff --git a/checkpoint-100/rng_state_7.pth b/checkpoint-100/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..10798d00a6ae647b8d8bdb74a97955808519e065 --- /dev/null +++ b/checkpoint-100/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9bbc89ce90c5e7e9f362eb48f3be0a6f39aee82e598c876d2d126ef971bbbfdc +size 15984 diff --git a/checkpoint-100/scheduler.pt b/checkpoint-100/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..18a25b44ce07bc51cbcafce5586c7593482826a5 --- /dev/null +++ b/checkpoint-100/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b84ab1237abc7bd4d31945126355c5b6d9e26cb338d88dae9fd60030b2e1fb3 +size 1064 diff --git a/checkpoint-100/trainer_state.json b/checkpoint-100/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..2c9aa1faa69fc7712fb5189ae62492d38fbd2fbe --- /dev/null +++ b/checkpoint-100/trainer_state.json @@ -0,0 +1,70 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.05, + "eval_steps": 100, + "global_step": 100, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0125, + "grad_norm": 0.08136817067861557, + "learning_rate": 0.0004, + "loss": 1.158, + "step": 25 + }, + { + "epoch": 0.025, + "grad_norm": 0.0653829574584961, + "learning_rate": 0.0004998852503731983, + "loss": 1.0957, + "step": 50 + }, + { + "epoch": 0.0375, + "grad_norm": 0.11592712253332138, + "learning_rate": 0.0004993848168027977, + "loss": 0.9276, + "step": 75 + }, + { + "epoch": 0.05, + "grad_norm": 0.08926476538181305, + "learning_rate": 0.0004984880506341147, + "loss": 1.0337, + "step": 100 + }, + { + "epoch": 0.05, + "eval_loss": 0.9111642837524414, + "eval_runtime": 843.8058, + "eval_samples_per_second": 1.3, + "eval_steps_per_second": 0.021, + "step": 100 + } + ], + "logging_steps": 25, + "max_steps": 2000, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.0051773955833856e+16, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1000/optimizer_0/.metadata b/checkpoint-1000/optimizer_0/.metadata new file mode 100644 index 0000000000000000000000000000000000000000..2adbe5bb889ae8d02c14716946a1f4af4583c391 --- /dev/null +++ b/checkpoint-1000/optimizer_0/.metadata @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5adc2f86584578d5b0fc9d27ea5dbfbb771072ee212717f7238e9996fd706b9 +size 438490 diff --git a/checkpoint-1000/optimizer_0/__0_0.distcp b/checkpoint-1000/optimizer_0/__0_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..20b85e7e4aa21d9b739595d8801fb676cd5cf683 --- /dev/null +++ b/checkpoint-1000/optimizer_0/__0_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c6d7394a57419b8165633d1023718e8463d05d7c238b55a2e7d2678f252c6749 +size 2980252 diff --git a/checkpoint-1000/optimizer_0/__1_0.distcp b/checkpoint-1000/optimizer_0/__1_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..4417dd9154440828a2b54562d70f6fa67a4c710b --- /dev/null +++ b/checkpoint-1000/optimizer_0/__1_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98819476dcc0947c7b0953b16a3a43dbb12f43c67dcd556c063b49171d1c069f +size 2997320 diff --git a/checkpoint-1000/optimizer_0/__2_0.distcp b/checkpoint-1000/optimizer_0/__2_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..fc17d20b67424819aacc4c60853953a9e65f7072 --- /dev/null +++ b/checkpoint-1000/optimizer_0/__2_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7a1bb1ba5070ed8f6dfb63045e29b095ccdef856f78679e7b19b4ce0087b9f8 +size 2997320 diff --git a/checkpoint-1000/optimizer_0/__3_0.distcp b/checkpoint-1000/optimizer_0/__3_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..3bef0f3e5fbc143611a1e6769a72d3133ccceb21 --- /dev/null +++ b/checkpoint-1000/optimizer_0/__3_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7db8406b0d798fb108ace791356028126de7ad08f6b4eb0739c7cc8e9701cdf6 +size 2997320 diff --git a/checkpoint-1000/optimizer_0/__4_0.distcp b/checkpoint-1000/optimizer_0/__4_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..7a3a145746bd868f76f0286228f732e4fea4e4a1 --- /dev/null +++ b/checkpoint-1000/optimizer_0/__4_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63686f6bd6c50897e03d760b4f72958355d7496fb960c9f340850296f38d35b2 +size 2997320 diff --git a/checkpoint-1000/optimizer_0/__5_0.distcp b/checkpoint-1000/optimizer_0/__5_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..c48a4fb255f2f6570f1231d90df9916ba79a7f6a --- /dev/null +++ b/checkpoint-1000/optimizer_0/__5_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6799641756030e3574650b44f2547e20e7d31b645cc25858a4411e1be2910925 +size 2999596 diff --git a/checkpoint-1000/optimizer_0/__6_0.distcp b/checkpoint-1000/optimizer_0/__6_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..1deacb9a115b5ca505df69c4a305bb9bb793a89b --- /dev/null +++ b/checkpoint-1000/optimizer_0/__6_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7bdb585bfac57da8fec672146a2806ef304b10a4f6825ba24dfa84caaa05d598 +size 2998732 diff --git a/checkpoint-1000/optimizer_0/__7_0.distcp b/checkpoint-1000/optimizer_0/__7_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..5e8152381b318ff6aeb0b8a2ceeac576c5e8355b --- /dev/null +++ b/checkpoint-1000/optimizer_0/__7_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eecf6e24578cc6ea92838e932b80d8dbddc7485f60cf5abf3cf041f8ea8982ba +size 3005708 diff --git a/checkpoint-1000/pytorch_model_fsdp_0/.metadata b/checkpoint-1000/pytorch_model_fsdp_0/.metadata new file mode 100644 index 0000000000000000000000000000000000000000..a58d06e86f5b9065e1ea17e9794ef989b881a637 --- /dev/null +++ b/checkpoint-1000/pytorch_model_fsdp_0/.metadata @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22dae21ec1ab3c79cfa84c69284e9d1d5c53be899d859a084832b641e53c0c58 +size 170759 diff --git a/checkpoint-1000/pytorch_model_fsdp_0/__0_0.distcp b/checkpoint-1000/pytorch_model_fsdp_0/__0_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..4bc0b5566bcbc723464c2f5bc4bbef6db0611b7f --- /dev/null +++ b/checkpoint-1000/pytorch_model_fsdp_0/__0_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45fd5c265162921ffc10ba501bcf7301af17e682d1e7782bd851d7123a9ff52f +size 1489536 diff --git a/checkpoint-1000/pytorch_model_fsdp_0/__1_0.distcp b/checkpoint-1000/pytorch_model_fsdp_0/__1_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..67a365b2778cf56d56ec84a9866d40eed18551e6 --- /dev/null +++ b/checkpoint-1000/pytorch_model_fsdp_0/__1_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc340604862635e9a74d8a8ad2ab86843cae36b1ee0346e5ab344793aaa415a3 +size 1489536 diff --git a/checkpoint-1000/pytorch_model_fsdp_0/__2_0.distcp b/checkpoint-1000/pytorch_model_fsdp_0/__2_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..bfe426967978c38e67ebb26ce874139cd936c4f7 --- /dev/null +++ b/checkpoint-1000/pytorch_model_fsdp_0/__2_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4e5135c37126c1dd364cda7c9a85dc035ff01e694ed0826916f50cfb8cbafe9 +size 1489536 diff --git a/checkpoint-1000/pytorch_model_fsdp_0/__3_0.distcp b/checkpoint-1000/pytorch_model_fsdp_0/__3_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..7039b5942cc3183f46608a5999d0aefe74b562d2 --- /dev/null +++ b/checkpoint-1000/pytorch_model_fsdp_0/__3_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d429e58b5d0c3e5b18959b3586a288702734cc26e8c70a9bbf05d6e3db8c88cb +size 1489536 diff --git a/checkpoint-1000/pytorch_model_fsdp_0/__4_0.distcp b/checkpoint-1000/pytorch_model_fsdp_0/__4_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..ff6960411b6f8b727b436a258f9a6b9a222d82c8 --- /dev/null +++ b/checkpoint-1000/pytorch_model_fsdp_0/__4_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d953897b18811b23f76e97d2db97a0bf63f3d733026b90325e1fa1e1b4140816 +size 1489536 diff --git a/checkpoint-1000/pytorch_model_fsdp_0/__5_0.distcp b/checkpoint-1000/pytorch_model_fsdp_0/__5_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..44ba6c72e965b46bcedafa85cf21362a89663221 --- /dev/null +++ b/checkpoint-1000/pytorch_model_fsdp_0/__5_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b52f1b67d2571358eb10fed0ea265b7ba227f5d1ae555bbdd77309941eb6a82f +size 1489536 diff --git a/checkpoint-1000/pytorch_model_fsdp_0/__6_0.distcp b/checkpoint-1000/pytorch_model_fsdp_0/__6_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..a2c153b815af8c851d0a32e2be9b87dd17f0fe8d --- /dev/null +++ b/checkpoint-1000/pytorch_model_fsdp_0/__6_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad5f2b67364597e2fe0e020f7a9251138cf95238cd02209fa386fedce5c7d894 +size 1489536 diff --git a/checkpoint-1000/pytorch_model_fsdp_0/__7_0.distcp b/checkpoint-1000/pytorch_model_fsdp_0/__7_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..3d15bc415ad20f926200703000daf42f35d0358a --- /dev/null +++ b/checkpoint-1000/pytorch_model_fsdp_0/__7_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:605e2ea118ae0cece4191888ba5264eb3241fcb31217526ef80d208ed4aa85be +size 1489536 diff --git a/checkpoint-1000/rng_state_0.pth b/checkpoint-1000/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..d5af0d362a3113d75dda7637fde01a29169fb8ef --- /dev/null +++ b/checkpoint-1000/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0cce6fd095e8164b6174af806d5b65f1592b912a16965a6ac33d77e523c8ae2a +size 15920 diff --git a/checkpoint-1000/rng_state_1.pth b/checkpoint-1000/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..3452b4c6342c5827692e58dda66dc3088e599489 --- /dev/null +++ b/checkpoint-1000/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c89ca7da8c752e198c07a80618c28fafea39abe5f5e38d625a1d96b586893f6e +size 15984 diff --git a/checkpoint-1000/rng_state_2.pth b/checkpoint-1000/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..404a7e06dec824b6e49e724be0a89e3a76291d21 --- /dev/null +++ b/checkpoint-1000/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26a64aa0a7bd0e9443e2c11a9e1b32b905f251349e940dd3776471dd51dc9441 +size 15984 diff --git a/checkpoint-1000/rng_state_3.pth b/checkpoint-1000/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..501b011b332818b8f0fa85551fc7e8679c367117 --- /dev/null +++ b/checkpoint-1000/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a594654588fc00a315de06ecb649724c8831626a965fe1794770a5720439d77 +size 15984 diff --git a/checkpoint-1000/rng_state_4.pth b/checkpoint-1000/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..cd7be1f793b16cca6404cb79e494ff102b8c2ffe --- /dev/null +++ b/checkpoint-1000/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f05c534760702d2b502b038f225706d0fc2398437c12ed59dc6afeacd0f91fdb +size 15984 diff --git a/checkpoint-1000/rng_state_5.pth b/checkpoint-1000/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..659abf7b203527bae8d883d7408581f8a26efaa1 --- /dev/null +++ b/checkpoint-1000/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5642030f3e712a115127de41444ed858a7a4cd47e591eb5d813c9053141d0ee8 +size 15984 diff --git a/checkpoint-1000/rng_state_6.pth b/checkpoint-1000/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..502636b8c422c55e279d34385c87e89e89b2c774 --- /dev/null +++ b/checkpoint-1000/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e95998bcb1c519354ca7d81b1fc52e904512e12dac25fcb5d083773e08e027ec +size 15984 diff --git a/checkpoint-1000/rng_state_7.pth b/checkpoint-1000/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..c1ec569e924cf8b598e112a5174e2f903d038527 --- /dev/null +++ b/checkpoint-1000/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ac0d9420ffc0691423064caa05e83bae45e9091902d0a644809c8e2535119b7 +size 15984 diff --git a/checkpoint-1000/scheduler.pt b/checkpoint-1000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..e37d0618b489fc19085acd1bbe069567c7c8447d --- /dev/null +++ b/checkpoint-1000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b23d844ad7ae2eb6c7cbba3f70be2436823b11da6591df71ddcc7059f5593c4 +size 1064 diff --git a/checkpoint-1000/trainer_state.json b/checkpoint-1000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..2c2fa5eb96e122b1eddbfcb9754d656451052e26 --- /dev/null +++ b/checkpoint-1000/trainer_state.json @@ -0,0 +1,394 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.5, + "eval_steps": 100, + "global_step": 1000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0125, + "grad_norm": 0.08136817067861557, + "learning_rate": 0.0004, + "loss": 1.158, + "step": 25 + }, + { + "epoch": 0.025, + "grad_norm": 0.0653829574584961, + "learning_rate": 0.0004998852503731983, + "loss": 1.0957, + "step": 50 + }, + { + "epoch": 0.0375, + "grad_norm": 0.11592712253332138, + "learning_rate": 0.0004993848168027977, + "loss": 0.9276, + "step": 75 + }, + { + "epoch": 0.05, + "grad_norm": 0.08926476538181305, + "learning_rate": 0.0004984880506341147, + "loss": 1.0337, + "step": 100 + }, + { + "epoch": 0.05, + "eval_loss": 0.9111642837524414, + "eval_runtime": 843.8058, + "eval_samples_per_second": 1.3, + "eval_steps_per_second": 0.021, + "step": 100 + }, + { + "epoch": 0.0625, + "grad_norm": 0.08632034063339233, + "learning_rate": 0.0004971963770447935, + "loss": 1.0219, + "step": 125 + }, + { + "epoch": 0.075, + "grad_norm": 0.12949417531490326, + "learning_rate": 0.0004955118488155782, + "loss": 0.784, + "step": 150 + }, + { + "epoch": 0.0875, + "grad_norm": 0.09222520887851715, + "learning_rate": 0.0004934371430679492, + "loss": 1.0, + "step": 175 + }, + { + "epoch": 0.1, + "grad_norm": 0.10075929015874863, + "learning_rate": 0.0004909755570095319, + "loss": 0.9617, + "step": 200 + }, + { + "epoch": 0.1, + "eval_loss": 0.9380430579185486, + "eval_runtime": 856.156, + "eval_samples_per_second": 1.281, + "eval_steps_per_second": 0.021, + "step": 200 + }, + { + "epoch": 0.1125, + "grad_norm": 0.09567111730575562, + "learning_rate": 0.0004881310026940389, + "loss": 0.7051, + "step": 225 + }, + { + "epoch": 0.125, + "grad_norm": 0.09723825007677078, + "learning_rate": 0.0004849080008040734, + "loss": 0.9906, + "step": 250 + }, + { + "epoch": 0.1375, + "grad_norm": 0.09430444985628128, + "learning_rate": 0.00048131167346667446, + "loss": 0.9113, + "step": 275 + }, + { + "epoch": 0.15, + "grad_norm": 0.09439756721258163, + "learning_rate": 0.00047734773611302284, + "loss": 0.674, + "step": 300 + }, + { + "epoch": 0.15, + "eval_loss": 0.9544369578361511, + "eval_runtime": 850.9461, + "eval_samples_per_second": 1.289, + "eval_steps_per_second": 0.021, + "step": 300 + }, + { + "epoch": 0.1625, + "grad_norm": 0.10961435735225677, + "learning_rate": 0.0004730224883952422, + "loss": 0.9701, + "step": 325 + }, + { + "epoch": 0.175, + "grad_norm": 0.09628895670175552, + "learning_rate": 0.0004683428041747334, + "loss": 0.8976, + "step": 350 + }, + { + "epoch": 0.1875, + "grad_norm": 0.11085063964128494, + "learning_rate": 0.0004633161205979517, + "loss": 0.6683, + "step": 375 + }, + { + "epoch": 0.2, + "grad_norm": 0.10140710324048996, + "learning_rate": 0.0004579504262769877, + "loss": 0.9373, + "step": 400 + }, + { + "epoch": 0.2, + "eval_loss": 0.968166172504425, + "eval_runtime": 853.0258, + "eval_samples_per_second": 1.286, + "eval_steps_per_second": 0.021, + "step": 400 + }, + { + "epoch": 0.2125, + "grad_norm": 0.13868793845176697, + "learning_rate": 0.0004522542485937369, + "loss": 0.8822, + "step": 425 + }, + { + "epoch": 0.225, + "grad_norm": 0.11853040754795074, + "learning_rate": 0.00044623664014783386, + "loss": 0.6483, + "step": 450 + }, + { + "epoch": 0.2375, + "grad_norm": 0.11668186634778976, + "learning_rate": 0.00043990716436988924, + "loss": 0.9374, + "step": 475 + }, + { + "epoch": 0.25, + "grad_norm": 0.11911585181951523, + "learning_rate": 0.0004332758803228925, + "loss": 0.8434, + "step": 500 + }, + { + "epoch": 0.25, + "eval_loss": 0.9753186702728271, + "eval_runtime": 851.2812, + "eval_samples_per_second": 1.289, + "eval_steps_per_second": 0.021, + "step": 500 + }, + { + "epoch": 0.2625, + "grad_norm": 0.09807440638542175, + "learning_rate": 0.00042635332671593575, + "loss": 0.6661, + "step": 525 + }, + { + "epoch": 0.275, + "grad_norm": 0.09675773978233337, + "learning_rate": 0.00041915050515566445, + "loss": 0.8999, + "step": 550 + }, + { + "epoch": 0.2875, + "grad_norm": 0.0979737937450409, + "learning_rate": 0.00041167886266207167, + "loss": 0.8616, + "step": 575 + }, + { + "epoch": 0.3, + "grad_norm": 0.08990875631570816, + "learning_rate": 0.0004039502734764241, + "loss": 0.8167, + "step": 600 + }, + { + "epoch": 0.3, + "eval_loss": 0.9926204681396484, + "eval_runtime": 851.6484, + "eval_samples_per_second": 1.288, + "eval_steps_per_second": 0.021, + "step": 600 + }, + { + "epoch": 0.3125, + "grad_norm": 0.09505137801170349, + "learning_rate": 0.0003959770201902294, + "loss": 0.8191, + "step": 625 + }, + { + "epoch": 0.325, + "grad_norm": 0.09105059504508972, + "learning_rate": 0.0003877717742252371, + "loss": 0.7203, + "step": 650 + }, + { + "epoch": 0.3375, + "grad_norm": 0.09274734556674957, + "learning_rate": 0.00037934757569549495, + "loss": 0.8154, + "step": 675 + }, + { + "epoch": 0.35, + "grad_norm": 0.10026416182518005, + "learning_rate": 0.00037071781268346345, + "loss": 0.8336, + "step": 700 + }, + { + "epoch": 0.35, + "eval_loss": 0.9959912896156311, + "eval_runtime": 849.7662, + "eval_samples_per_second": 1.291, + "eval_steps_per_second": 0.021, + "step": 700 + }, + { + "epoch": 0.3625, + "grad_norm": 0.09517039358615875, + "learning_rate": 0.00036189619996312495, + "loss": 0.6845, + "step": 725 + }, + { + "epoch": 0.375, + "grad_norm": 0.11264201998710632, + "learning_rate": 0.00035289675720390174, + "loss": 0.8445, + "step": 750 + }, + { + "epoch": 0.3875, + "grad_norm": 0.0979958102107048, + "learning_rate": 0.00034373378669002105, + "loss": 0.8269, + "step": 775 + }, + { + "epoch": 0.4, + "grad_norm": 0.10223093628883362, + "learning_rate": 0.00033442185059073706, + "loss": 0.6517, + "step": 800 + }, + { + "epoch": 0.4, + "eval_loss": 1.006990671157837, + "eval_runtime": 845.1184, + "eval_samples_per_second": 1.298, + "eval_steps_per_second": 0.021, + "step": 800 + }, + { + "epoch": 0.4125, + "grad_norm": 0.09464213252067566, + "learning_rate": 0.00032497574781753367, + "loss": 0.8455, + "step": 825 + }, + { + "epoch": 0.425, + "grad_norm": 0.10187377035617828, + "learning_rate": 0.000315410490505086, + "loss": 0.8217, + "step": 850 + }, + { + "epoch": 0.4375, + "grad_norm": 0.09793733805418015, + "learning_rate": 0.0003057412801533589, + "loss": 0.6218, + "step": 875 + }, + { + "epoch": 0.45, + "grad_norm": 0.09902466833591461, + "learning_rate": 0.0002959834834687587, + "loss": 0.869, + "step": 900 + }, + { + "epoch": 0.45, + "eval_loss": 1.0097707509994507, + "eval_runtime": 847.3981, + "eval_samples_per_second": 1.295, + "eval_steps_per_second": 0.021, + "step": 900 + }, + { + "epoch": 0.4625, + "grad_norm": 0.10546339303255081, + "learning_rate": 0.00028615260794273236, + "loss": 0.8204, + "step": 925 + }, + { + "epoch": 0.475, + "grad_norm": 0.10600671917200089, + "learning_rate": 0.00027626427720662416, + "loss": 0.5917, + "step": 950 + }, + { + "epoch": 0.4875, + "grad_norm": 0.09627766162157059, + "learning_rate": 0.00026633420620195917, + "loss": 0.8667, + "step": 975 + }, + { + "epoch": 0.5, + "grad_norm": 0.09961821138858795, + "learning_rate": 0.00025637817620561263, + "loss": 0.8215, + "step": 1000 + }, + { + "epoch": 0.5, + "eval_loss": 1.023415446281433, + "eval_runtime": 844.006, + "eval_samples_per_second": 1.3, + "eval_steps_per_second": 0.021, + "step": 1000 + } + ], + "logging_steps": 25, + "max_steps": 2000, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.0051773955833856e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1100/optimizer_0/.metadata b/checkpoint-1100/optimizer_0/.metadata new file mode 100644 index 0000000000000000000000000000000000000000..e8074e71b61880d205f41b23d5d73c25f96c2d7e --- /dev/null +++ b/checkpoint-1100/optimizer_0/.metadata @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8260767f9e396da564bdaf23b851bc21f57ae9c4004c7696ca0a338169603204 +size 438490 diff --git a/checkpoint-1100/optimizer_0/__0_0.distcp b/checkpoint-1100/optimizer_0/__0_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..6c76c156b24c2486f5be7c3f4698e6debe1c2eaf --- /dev/null +++ b/checkpoint-1100/optimizer_0/__0_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff14fce62eaf8b74323e4d56ddb08d83960164ae54d2db0c760f02390fe3bcec +size 2980252 diff --git a/checkpoint-1100/optimizer_0/__1_0.distcp b/checkpoint-1100/optimizer_0/__1_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..74e6fd1d82132c4efd7b682f3f471ab4c0d679ca --- /dev/null +++ b/checkpoint-1100/optimizer_0/__1_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:21e6429fe715c37a20246da36160888f7efecc123158100e36bd13d5d6b38b47 +size 2997320 diff --git a/checkpoint-1100/optimizer_0/__2_0.distcp b/checkpoint-1100/optimizer_0/__2_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..34afc274930cc424265b21ab4b25d21f04008681 --- /dev/null +++ b/checkpoint-1100/optimizer_0/__2_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8627cf183c4218c1bb664018ca78cd56824a510b3cae1612e2424afd3debd05 +size 2997320 diff --git a/checkpoint-1100/optimizer_0/__3_0.distcp b/checkpoint-1100/optimizer_0/__3_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..658474fc8851abacb890d6081830601d6924b06f --- /dev/null +++ b/checkpoint-1100/optimizer_0/__3_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:77fb37e592a4adc657285ab3db2af313a7a9c5674d73953c66a0d47e9605d37a +size 2997320 diff --git a/checkpoint-1100/optimizer_0/__4_0.distcp b/checkpoint-1100/optimizer_0/__4_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..3837184970ee0bd3f600e266747298e2cf07841b --- /dev/null +++ b/checkpoint-1100/optimizer_0/__4_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:447f3769fdca87290b45d737da5f667769a1264552ea156475885cd2e3b2ff5c +size 2997320 diff --git a/checkpoint-1100/optimizer_0/__5_0.distcp b/checkpoint-1100/optimizer_0/__5_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..9f666c9815ca6b72c55a0ba17d6ead8a452b4f94 --- /dev/null +++ b/checkpoint-1100/optimizer_0/__5_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee4f5a6ff1a641293044eadca3e09386ad9b6aa02c3025e7a51109466a26bef8 +size 2999596 diff --git a/checkpoint-1100/optimizer_0/__6_0.distcp b/checkpoint-1100/optimizer_0/__6_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..ce854351ebeca3acfe6ca07d46a91156220786fe --- /dev/null +++ b/checkpoint-1100/optimizer_0/__6_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00ef3a97927b1aedf0929d94bb6cee3f348ec4c714cc57a5fe96fef8afddc32e +size 2998732 diff --git a/checkpoint-1100/optimizer_0/__7_0.distcp b/checkpoint-1100/optimizer_0/__7_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..880d275d7fee9f786b450ffe6dd0f906e2a00ad3 --- /dev/null +++ b/checkpoint-1100/optimizer_0/__7_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:20b2c05b95117d769e0fb61404e8285e77f18cf3767f77c93758b81eb03a5f0b +size 3005708 diff --git a/checkpoint-1100/pytorch_model_fsdp_0/.metadata b/checkpoint-1100/pytorch_model_fsdp_0/.metadata new file mode 100644 index 0000000000000000000000000000000000000000..4315a3da4a88752d80940235e9b8f7ae66296554 --- /dev/null +++ b/checkpoint-1100/pytorch_model_fsdp_0/.metadata @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c016866653273cf57a16d15f53d35f0182115803750cca68469e1648b2ffd513 +size 170759 diff --git a/checkpoint-1100/pytorch_model_fsdp_0/__0_0.distcp b/checkpoint-1100/pytorch_model_fsdp_0/__0_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..0fd0f035dafc413c034b5cd855e59fc2e98a6598 --- /dev/null +++ b/checkpoint-1100/pytorch_model_fsdp_0/__0_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c1331270ba425ccc54b88010838d91f32b48a74aa289fae151b36ecf7e0a726 +size 1489536 diff --git a/checkpoint-1100/pytorch_model_fsdp_0/__1_0.distcp b/checkpoint-1100/pytorch_model_fsdp_0/__1_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..e64651e6fd18a1c9f8c21b33cb93d2e8ec0949a5 --- /dev/null +++ b/checkpoint-1100/pytorch_model_fsdp_0/__1_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:56ad1392881a938a8e6c172e8856f8dde27bd4657045ccc6a9c64f1424bc4dbb +size 1489536 diff --git a/checkpoint-1100/pytorch_model_fsdp_0/__2_0.distcp b/checkpoint-1100/pytorch_model_fsdp_0/__2_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..a9d7550dadbfaf7e82c7b74af63ad1bb7e19cc93 --- /dev/null +++ b/checkpoint-1100/pytorch_model_fsdp_0/__2_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff4f5be1ea88ae5ca8c84457cf2c37f42c9640cb641dc148071a891d3ff79ff8 +size 1489536 diff --git a/checkpoint-1100/pytorch_model_fsdp_0/__3_0.distcp b/checkpoint-1100/pytorch_model_fsdp_0/__3_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..dd9ad63dd7222fa52b54ff048ef4542e15dbb81c --- /dev/null +++ b/checkpoint-1100/pytorch_model_fsdp_0/__3_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:730d0b90c6e0b76cf4c09b4f33b69736298ae127fe4e5298e50307c5aec9bfe6 +size 1489536 diff --git a/checkpoint-1100/pytorch_model_fsdp_0/__4_0.distcp b/checkpoint-1100/pytorch_model_fsdp_0/__4_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..d0b294289ad263062eeaaca3b91175289f7b8c00 --- /dev/null +++ b/checkpoint-1100/pytorch_model_fsdp_0/__4_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b828c0b6768efe7462d6d8e062ee567f23ee7c0c41a5e8bc67a44ebd339024a0 +size 1489536 diff --git a/checkpoint-1100/pytorch_model_fsdp_0/__5_0.distcp b/checkpoint-1100/pytorch_model_fsdp_0/__5_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..bffbcf5135491468c9bff1f459d3d7d9e46890a3 --- /dev/null +++ b/checkpoint-1100/pytorch_model_fsdp_0/__5_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3ccc02990702cdd7346162169f36ad14e9240f7d651f816e8574a3620b4aa25 +size 1489536 diff --git a/checkpoint-1100/pytorch_model_fsdp_0/__6_0.distcp b/checkpoint-1100/pytorch_model_fsdp_0/__6_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..4f2f3c0cc460e54b5a129e19fda8b38f5ecc8942 --- /dev/null +++ b/checkpoint-1100/pytorch_model_fsdp_0/__6_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b35047afd2554ca95c9482c71e76e46698c14e740377f8e45d1122c59b39bdf +size 1489536 diff --git a/checkpoint-1100/pytorch_model_fsdp_0/__7_0.distcp b/checkpoint-1100/pytorch_model_fsdp_0/__7_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..667897733daf8030d791339473cdc3f2e61ff2b7 --- /dev/null +++ b/checkpoint-1100/pytorch_model_fsdp_0/__7_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8be766a86142956fa7294986ac7e5e18003ade41f423c477cf93a8991dd6be1a +size 1489536 diff --git a/checkpoint-1100/rng_state_0.pth b/checkpoint-1100/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..228f1ad5a1b03d68979173cf35d1c2903949b4b0 --- /dev/null +++ b/checkpoint-1100/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:10951c05f9fb192e43c36f7be898ee80966c186349da1034b098ec9159a5ec9b +size 15920 diff --git a/checkpoint-1100/rng_state_1.pth b/checkpoint-1100/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..079ab70e3eb64f8877a09c067ba6f6841daf2e84 --- /dev/null +++ b/checkpoint-1100/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41b525dff0f684213798d62adb1acac1209a73873811052adb7c1ab57cebef53 +size 15984 diff --git a/checkpoint-1100/rng_state_2.pth b/checkpoint-1100/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..6987c37178156f21a931cf21098a254a4a2d339a --- /dev/null +++ b/checkpoint-1100/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af6ec805c024be58842b00ba146913cc39f31735ead84ce44b5bc8288671b8c3 +size 15984 diff --git a/checkpoint-1100/rng_state_3.pth b/checkpoint-1100/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..3a5fb7aa094abdadccec49f4e3a0cb7cf7671110 --- /dev/null +++ b/checkpoint-1100/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0bd93f2f6c415f784da2333f3cb1d21155a8fd7d1eec27a52408206334a2aee8 +size 15984 diff --git a/checkpoint-1100/rng_state_4.pth b/checkpoint-1100/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..d995c17a722803f0fe0facd7d913119c6d0440a1 --- /dev/null +++ b/checkpoint-1100/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5a6c06f83feb658a3a9dae7756b28bbc7f946e746dbbf609ae7b29aade1ab39 +size 15984 diff --git a/checkpoint-1100/rng_state_5.pth b/checkpoint-1100/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..fe2ed8c7907681ecbd60f9ad528dfc8fdcc0dfe8 --- /dev/null +++ b/checkpoint-1100/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9848dabf5d790c0b63a0c06050b93ff06a12f036d5a22192fa1c0ea1eea577cd +size 15984 diff --git a/checkpoint-1100/rng_state_6.pth b/checkpoint-1100/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..0e2b6906b013eae0f780865ac0113031b64f0d3d --- /dev/null +++ b/checkpoint-1100/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0cd49cf108d0c9d7f33194257dd81143851b474af0a01f8ac96dd71a1d515195 +size 15984 diff --git a/checkpoint-1100/rng_state_7.pth b/checkpoint-1100/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..bf68cc860287179a359ac7d204b102b9ea9be48b --- /dev/null +++ b/checkpoint-1100/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5dfed987246f4749dbec2c7d8e35618cdf9dc4d6bb56c4a6fc8b18b57228705b +size 15984 diff --git a/checkpoint-1100/scheduler.pt b/checkpoint-1100/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..784ad2399b5919ecf3eccc367c9fa04fbfab4619 --- /dev/null +++ b/checkpoint-1100/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2594e962e3980706571eb12f2ed27e8aed3b5e373484af50799e77cad68ebb48 +size 1064 diff --git a/checkpoint-1100/trainer_state.json b/checkpoint-1100/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..bd99475109cde21a08769439011f481fc162f624 --- /dev/null +++ b/checkpoint-1100/trainer_state.json @@ -0,0 +1,430 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.55, + "eval_steps": 100, + "global_step": 1100, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0125, + "grad_norm": 0.08136817067861557, + "learning_rate": 0.0004, + "loss": 1.158, + "step": 25 + }, + { + "epoch": 0.025, + "grad_norm": 0.0653829574584961, + "learning_rate": 0.0004998852503731983, + "loss": 1.0957, + "step": 50 + }, + { + "epoch": 0.0375, + "grad_norm": 0.11592712253332138, + "learning_rate": 0.0004993848168027977, + "loss": 0.9276, + "step": 75 + }, + { + "epoch": 0.05, + "grad_norm": 0.08926476538181305, + "learning_rate": 0.0004984880506341147, + "loss": 1.0337, + "step": 100 + }, + { + "epoch": 0.05, + "eval_loss": 0.9111642837524414, + "eval_runtime": 843.8058, + "eval_samples_per_second": 1.3, + "eval_steps_per_second": 0.021, + "step": 100 + }, + { + "epoch": 0.0625, + "grad_norm": 0.08632034063339233, + "learning_rate": 0.0004971963770447935, + "loss": 1.0219, + "step": 125 + }, + { + "epoch": 0.075, + "grad_norm": 0.12949417531490326, + "learning_rate": 0.0004955118488155782, + "loss": 0.784, + "step": 150 + }, + { + "epoch": 0.0875, + "grad_norm": 0.09222520887851715, + "learning_rate": 0.0004934371430679492, + "loss": 1.0, + "step": 175 + }, + { + "epoch": 0.1, + "grad_norm": 0.10075929015874863, + "learning_rate": 0.0004909755570095319, + "loss": 0.9617, + "step": 200 + }, + { + "epoch": 0.1, + "eval_loss": 0.9380430579185486, + "eval_runtime": 856.156, + "eval_samples_per_second": 1.281, + "eval_steps_per_second": 0.021, + "step": 200 + }, + { + "epoch": 0.1125, + "grad_norm": 0.09567111730575562, + "learning_rate": 0.0004881310026940389, + "loss": 0.7051, + "step": 225 + }, + { + "epoch": 0.125, + "grad_norm": 0.09723825007677078, + "learning_rate": 0.0004849080008040734, + "loss": 0.9906, + "step": 250 + }, + { + "epoch": 0.1375, + "grad_norm": 0.09430444985628128, + "learning_rate": 0.00048131167346667446, + "loss": 0.9113, + "step": 275 + }, + { + "epoch": 0.15, + "grad_norm": 0.09439756721258163, + "learning_rate": 0.00047734773611302284, + "loss": 0.674, + "step": 300 + }, + { + "epoch": 0.15, + "eval_loss": 0.9544369578361511, + "eval_runtime": 850.9461, + "eval_samples_per_second": 1.289, + "eval_steps_per_second": 0.021, + "step": 300 + }, + { + "epoch": 0.1625, + "grad_norm": 0.10961435735225677, + "learning_rate": 0.0004730224883952422, + "loss": 0.9701, + "step": 325 + }, + { + "epoch": 0.175, + "grad_norm": 0.09628895670175552, + "learning_rate": 0.0004683428041747334, + "loss": 0.8976, + "step": 350 + }, + { + "epoch": 0.1875, + "grad_norm": 0.11085063964128494, + "learning_rate": 0.0004633161205979517, + "loss": 0.6683, + "step": 375 + }, + { + "epoch": 0.2, + "grad_norm": 0.10140710324048996, + "learning_rate": 0.0004579504262769877, + "loss": 0.9373, + "step": 400 + }, + { + "epoch": 0.2, + "eval_loss": 0.968166172504425, + "eval_runtime": 853.0258, + "eval_samples_per_second": 1.286, + "eval_steps_per_second": 0.021, + "step": 400 + }, + { + "epoch": 0.2125, + "grad_norm": 0.13868793845176697, + "learning_rate": 0.0004522542485937369, + "loss": 0.8822, + "step": 425 + }, + { + "epoch": 0.225, + "grad_norm": 0.11853040754795074, + "learning_rate": 0.00044623664014783386, + "loss": 0.6483, + "step": 450 + }, + { + "epoch": 0.2375, + "grad_norm": 0.11668186634778976, + "learning_rate": 0.00043990716436988924, + "loss": 0.9374, + "step": 475 + }, + { + "epoch": 0.25, + "grad_norm": 0.11911585181951523, + "learning_rate": 0.0004332758803228925, + "loss": 0.8434, + "step": 500 + }, + { + "epoch": 0.25, + "eval_loss": 0.9753186702728271, + "eval_runtime": 851.2812, + "eval_samples_per_second": 1.289, + "eval_steps_per_second": 0.021, + "step": 500 + }, + { + "epoch": 0.2625, + "grad_norm": 0.09807440638542175, + "learning_rate": 0.00042635332671593575, + "loss": 0.6661, + "step": 525 + }, + { + "epoch": 0.275, + "grad_norm": 0.09675773978233337, + "learning_rate": 0.00041915050515566445, + "loss": 0.8999, + "step": 550 + }, + { + "epoch": 0.2875, + "grad_norm": 0.0979737937450409, + "learning_rate": 0.00041167886266207167, + "loss": 0.8616, + "step": 575 + }, + { + "epoch": 0.3, + "grad_norm": 0.08990875631570816, + "learning_rate": 0.0004039502734764241, + "loss": 0.8167, + "step": 600 + }, + { + "epoch": 0.3, + "eval_loss": 0.9926204681396484, + "eval_runtime": 851.6484, + "eval_samples_per_second": 1.288, + "eval_steps_per_second": 0.021, + "step": 600 + }, + { + "epoch": 0.3125, + "grad_norm": 0.09505137801170349, + "learning_rate": 0.0003959770201902294, + "loss": 0.8191, + "step": 625 + }, + { + "epoch": 0.325, + "grad_norm": 0.09105059504508972, + "learning_rate": 0.0003877717742252371, + "loss": 0.7203, + "step": 650 + }, + { + "epoch": 0.3375, + "grad_norm": 0.09274734556674957, + "learning_rate": 0.00037934757569549495, + "loss": 0.8154, + "step": 675 + }, + { + "epoch": 0.35, + "grad_norm": 0.10026416182518005, + "learning_rate": 0.00037071781268346345, + "loss": 0.8336, + "step": 700 + }, + { + "epoch": 0.35, + "eval_loss": 0.9959912896156311, + "eval_runtime": 849.7662, + "eval_samples_per_second": 1.291, + "eval_steps_per_second": 0.021, + "step": 700 + }, + { + "epoch": 0.3625, + "grad_norm": 0.09517039358615875, + "learning_rate": 0.00036189619996312495, + "loss": 0.6845, + "step": 725 + }, + { + "epoch": 0.375, + "grad_norm": 0.11264201998710632, + "learning_rate": 0.00035289675720390174, + "loss": 0.8445, + "step": 750 + }, + { + "epoch": 0.3875, + "grad_norm": 0.0979958102107048, + "learning_rate": 0.00034373378669002105, + "loss": 0.8269, + "step": 775 + }, + { + "epoch": 0.4, + "grad_norm": 0.10223093628883362, + "learning_rate": 0.00033442185059073706, + "loss": 0.6517, + "step": 800 + }, + { + "epoch": 0.4, + "eval_loss": 1.006990671157837, + "eval_runtime": 845.1184, + "eval_samples_per_second": 1.298, + "eval_steps_per_second": 0.021, + "step": 800 + }, + { + "epoch": 0.4125, + "grad_norm": 0.09464213252067566, + "learning_rate": 0.00032497574781753367, + "loss": 0.8455, + "step": 825 + }, + { + "epoch": 0.425, + "grad_norm": 0.10187377035617828, + "learning_rate": 0.000315410490505086, + "loss": 0.8217, + "step": 850 + }, + { + "epoch": 0.4375, + "grad_norm": 0.09793733805418015, + "learning_rate": 0.0003057412801533589, + "loss": 0.6218, + "step": 875 + }, + { + "epoch": 0.45, + "grad_norm": 0.09902466833591461, + "learning_rate": 0.0002959834834687587, + "loss": 0.869, + "step": 900 + }, + { + "epoch": 0.45, + "eval_loss": 1.0097707509994507, + "eval_runtime": 847.3981, + "eval_samples_per_second": 1.295, + "eval_steps_per_second": 0.021, + "step": 900 + }, + { + "epoch": 0.4625, + "grad_norm": 0.10546339303255081, + "learning_rate": 0.00028615260794273236, + "loss": 0.8204, + "step": 925 + }, + { + "epoch": 0.475, + "grad_norm": 0.10600671917200089, + "learning_rate": 0.00027626427720662416, + "loss": 0.5917, + "step": 950 + }, + { + "epoch": 0.4875, + "grad_norm": 0.09627766162157059, + "learning_rate": 0.00026633420620195917, + "loss": 0.8667, + "step": 975 + }, + { + "epoch": 0.5, + "grad_norm": 0.09961821138858795, + "learning_rate": 0.00025637817620561263, + "loss": 0.8215, + "step": 1000 + }, + { + "epoch": 0.5, + "eval_loss": 1.023415446281433, + "eval_runtime": 844.006, + "eval_samples_per_second": 1.3, + "eval_steps_per_second": 0.021, + "step": 1000 + }, + { + "epoch": 0.5125, + "grad_norm": 0.09977111220359802, + "learning_rate": 0.0002464120097495559, + "loss": 0.5829, + "step": 1025 + }, + { + "epoch": 0.525, + "grad_norm": 0.10276953876018524, + "learning_rate": 0.00023645154547503855, + "loss": 0.8857, + "step": 1050 + }, + { + "epoch": 0.5375, + "grad_norm": 0.10077586770057678, + "learning_rate": 0.00022651261296116894, + "loss": 0.8018, + "step": 1075 + }, + { + "epoch": 0.55, + "grad_norm": 0.09652134776115417, + "learning_rate": 0.00021661100756789666, + "loss": 0.561, + "step": 1100 + }, + { + "epoch": 0.55, + "eval_loss": 1.0231441259384155, + "eval_runtime": 845.0138, + "eval_samples_per_second": 1.298, + "eval_steps_per_second": 0.021, + "step": 1100 + } + ], + "logging_steps": 25, + "max_steps": 2000, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.1056951351417242e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1200/.ipynb_checkpoints/trainer_state-checkpoint.json b/checkpoint-1200/.ipynb_checkpoints/trainer_state-checkpoint.json new file mode 100644 index 0000000000000000000000000000000000000000..dbce72ba82c466a395d21d8e43944fb697a73dea --- /dev/null +++ b/checkpoint-1200/.ipynb_checkpoints/trainer_state-checkpoint.json @@ -0,0 +1,466 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.6, + "eval_steps": 100, + "global_step": 1200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0125, + "grad_norm": 0.08136817067861557, + "learning_rate": 0.0004, + "loss": 1.158, + "step": 25 + }, + { + "epoch": 0.025, + "grad_norm": 0.0653829574584961, + "learning_rate": 0.0004998852503731983, + "loss": 1.0957, + "step": 50 + }, + { + "epoch": 0.0375, + "grad_norm": 0.11592712253332138, + "learning_rate": 0.0004993848168027977, + "loss": 0.9276, + "step": 75 + }, + { + "epoch": 0.05, + "grad_norm": 0.08926476538181305, + "learning_rate": 0.0004984880506341147, + "loss": 1.0337, + "step": 100 + }, + { + "epoch": 0.05, + "eval_loss": 0.9111642837524414, + "eval_runtime": 843.8058, + "eval_samples_per_second": 1.3, + "eval_steps_per_second": 0.021, + "step": 100 + }, + { + "epoch": 0.0625, + "grad_norm": 0.08632034063339233, + "learning_rate": 0.0004971963770447935, + "loss": 1.0219, + "step": 125 + }, + { + "epoch": 0.075, + "grad_norm": 0.12949417531490326, + "learning_rate": 0.0004955118488155782, + "loss": 0.784, + "step": 150 + }, + { + "epoch": 0.0875, + "grad_norm": 0.09222520887851715, + "learning_rate": 0.0004934371430679492, + "loss": 1.0, + "step": 175 + }, + { + "epoch": 0.1, + "grad_norm": 0.10075929015874863, + "learning_rate": 0.0004909755570095319, + "loss": 0.9617, + "step": 200 + }, + { + "epoch": 0.1, + "eval_loss": 0.9380430579185486, + "eval_runtime": 856.156, + "eval_samples_per_second": 1.281, + "eval_steps_per_second": 0.021, + "step": 200 + }, + { + "epoch": 0.1125, + "grad_norm": 0.09567111730575562, + "learning_rate": 0.0004881310026940389, + "loss": 0.7051, + "step": 225 + }, + { + "epoch": 0.125, + "grad_norm": 0.09723825007677078, + "learning_rate": 0.0004849080008040734, + "loss": 0.9906, + "step": 250 + }, + { + "epoch": 0.1375, + "grad_norm": 0.09430444985628128, + "learning_rate": 0.00048131167346667446, + "loss": 0.9113, + "step": 275 + }, + { + "epoch": 0.15, + "grad_norm": 0.09439756721258163, + "learning_rate": 0.00047734773611302284, + "loss": 0.674, + "step": 300 + }, + { + "epoch": 0.15, + "eval_loss": 0.9544369578361511, + "eval_runtime": 850.9461, + "eval_samples_per_second": 1.289, + "eval_steps_per_second": 0.021, + "step": 300 + }, + { + "epoch": 0.1625, + "grad_norm": 0.10961435735225677, + "learning_rate": 0.0004730224883952422, + "loss": 0.9701, + "step": 325 + }, + { + "epoch": 0.175, + "grad_norm": 0.09628895670175552, + "learning_rate": 0.0004683428041747334, + "loss": 0.8976, + "step": 350 + }, + { + "epoch": 0.1875, + "grad_norm": 0.11085063964128494, + "learning_rate": 0.0004633161205979517, + "loss": 0.6683, + "step": 375 + }, + { + "epoch": 0.2, + "grad_norm": 0.10140710324048996, + "learning_rate": 0.0004579504262769877, + "loss": 0.9373, + "step": 400 + }, + { + "epoch": 0.2, + "eval_loss": 0.968166172504425, + "eval_runtime": 853.0258, + "eval_samples_per_second": 1.286, + "eval_steps_per_second": 0.021, + "step": 400 + }, + { + "epoch": 0.2125, + "grad_norm": 0.13868793845176697, + "learning_rate": 0.0004522542485937369, + "loss": 0.8822, + "step": 425 + }, + { + "epoch": 0.225, + "grad_norm": 0.11853040754795074, + "learning_rate": 0.00044623664014783386, + "loss": 0.6483, + "step": 450 + }, + { + "epoch": 0.2375, + "grad_norm": 0.11668186634778976, + "learning_rate": 0.00043990716436988924, + "loss": 0.9374, + "step": 475 + }, + { + "epoch": 0.25, + "grad_norm": 0.11911585181951523, + "learning_rate": 0.0004332758803228925, + "loss": 0.8434, + "step": 500 + }, + { + "epoch": 0.25, + "eval_loss": 0.9753186702728271, + "eval_runtime": 851.2812, + "eval_samples_per_second": 1.289, + "eval_steps_per_second": 0.021, + "step": 500 + }, + { + "epoch": 0.2625, + "grad_norm": 0.09807440638542175, + "learning_rate": 0.00042635332671593575, + "loss": 0.6661, + "step": 525 + }, + { + "epoch": 0.275, + "grad_norm": 0.09675773978233337, + "learning_rate": 0.00041915050515566445, + "loss": 0.8999, + "step": 550 + }, + { + "epoch": 0.2875, + "grad_norm": 0.0979737937450409, + "learning_rate": 0.00041167886266207167, + "loss": 0.8616, + "step": 575 + }, + { + "epoch": 0.3, + "grad_norm": 0.08990875631570816, + "learning_rate": 0.0004039502734764241, + "loss": 0.8167, + "step": 600 + }, + { + "epoch": 0.3, + "eval_loss": 0.9926204681396484, + "eval_runtime": 851.6484, + "eval_samples_per_second": 1.288, + "eval_steps_per_second": 0.021, + "step": 600 + }, + { + "epoch": 0.3125, + "grad_norm": 0.09505137801170349, + "learning_rate": 0.0003959770201902294, + "loss": 0.8191, + "step": 625 + }, + { + "epoch": 0.325, + "grad_norm": 0.09105059504508972, + "learning_rate": 0.0003877717742252371, + "loss": 0.7203, + "step": 650 + }, + { + "epoch": 0.3375, + "grad_norm": 0.09274734556674957, + "learning_rate": 0.00037934757569549495, + "loss": 0.8154, + "step": 675 + }, + { + "epoch": 0.35, + "grad_norm": 0.10026416182518005, + "learning_rate": 0.00037071781268346345, + "loss": 0.8336, + "step": 700 + }, + { + "epoch": 0.35, + "eval_loss": 0.9959912896156311, + "eval_runtime": 849.7662, + "eval_samples_per_second": 1.291, + "eval_steps_per_second": 0.021, + "step": 700 + }, + { + "epoch": 0.3625, + "grad_norm": 0.09517039358615875, + "learning_rate": 0.00036189619996312495, + "loss": 0.6845, + "step": 725 + }, + { + "epoch": 0.375, + "grad_norm": 0.11264201998710632, + "learning_rate": 0.00035289675720390174, + "loss": 0.8445, + "step": 750 + }, + { + "epoch": 0.3875, + "grad_norm": 0.0979958102107048, + "learning_rate": 0.00034373378669002105, + "loss": 0.8269, + "step": 775 + }, + { + "epoch": 0.4, + "grad_norm": 0.10223093628883362, + "learning_rate": 0.00033442185059073706, + "loss": 0.6517, + "step": 800 + }, + { + "epoch": 0.4, + "eval_loss": 1.006990671157837, + "eval_runtime": 845.1184, + "eval_samples_per_second": 1.298, + "eval_steps_per_second": 0.021, + "step": 800 + }, + { + "epoch": 0.4125, + "grad_norm": 0.09464213252067566, + "learning_rate": 0.00032497574781753367, + "loss": 0.8455, + "step": 825 + }, + { + "epoch": 0.425, + "grad_norm": 0.10187377035617828, + "learning_rate": 0.000315410490505086, + "loss": 0.8217, + "step": 850 + }, + { + "epoch": 0.4375, + "grad_norm": 0.09793733805418015, + "learning_rate": 0.0003057412801533589, + "loss": 0.6218, + "step": 875 + }, + { + "epoch": 0.45, + "grad_norm": 0.09902466833591461, + "learning_rate": 0.0002959834834687587, + "loss": 0.869, + "step": 900 + }, + { + "epoch": 0.45, + "eval_loss": 1.0097707509994507, + "eval_runtime": 847.3981, + "eval_samples_per_second": 1.295, + "eval_steps_per_second": 0.021, + "step": 900 + }, + { + "epoch": 0.4625, + "grad_norm": 0.10546339303255081, + "learning_rate": 0.00028615260794273236, + "loss": 0.8204, + "step": 925 + }, + { + "epoch": 0.475, + "grad_norm": 0.10600671917200089, + "learning_rate": 0.00027626427720662416, + "loss": 0.5917, + "step": 950 + }, + { + "epoch": 0.4875, + "grad_norm": 0.09627766162157059, + "learning_rate": 0.00026633420620195917, + "loss": 0.8667, + "step": 975 + }, + { + "epoch": 0.5, + "grad_norm": 0.09961821138858795, + "learning_rate": 0.00025637817620561263, + "loss": 0.8215, + "step": 1000 + }, + { + "epoch": 0.5, + "eval_loss": 1.023415446281433, + "eval_runtime": 844.006, + "eval_samples_per_second": 1.3, + "eval_steps_per_second": 0.021, + "step": 1000 + }, + { + "epoch": 0.5125, + "grad_norm": 0.09977111220359802, + "learning_rate": 0.0002464120097495559, + "loss": 0.5829, + "step": 1025 + }, + { + "epoch": 0.525, + "grad_norm": 0.10276953876018524, + "learning_rate": 0.00023645154547503855, + "loss": 0.8857, + "step": 1050 + }, + { + "epoch": 0.5375, + "grad_norm": 0.10077586770057678, + "learning_rate": 0.00022651261296116894, + "loss": 0.8018, + "step": 1075 + }, + { + "epoch": 0.55, + "grad_norm": 0.09652134776115417, + "learning_rate": 0.00021661100756789666, + "loss": 0.561, + "step": 1100 + }, + { + "epoch": 0.55, + "eval_loss": 1.0231441259384155, + "eval_runtime": 845.0138, + "eval_samples_per_second": 1.298, + "eval_steps_per_second": 0.021, + "step": 1100 + }, + { + "epoch": 0.5625, + "grad_norm": 0.0990302637219429, + "learning_rate": 0.00020676246533337764, + "loss": 0.9092, + "step": 1125 + }, + { + "epoch": 0.575, + "grad_norm": 0.11152709275484085, + "learning_rate": 0.00019698263796561526, + "loss": 0.8159, + "step": 1150 + }, + { + "epoch": 0.5875, + "grad_norm": 0.10092920064926147, + "learning_rate": 0.00018728706796812333, + "loss": 0.7329, + "step": 1175 + }, + { + "epoch": 0.6, + "grad_norm": 0.10138432681560516, + "learning_rate": 0.00017769116393914037, + "loss": 0.7873, + "step": 1200 + }, + { + "epoch": 0.6, + "eval_loss": 1.0180704593658447, + "eval_runtime": 843.0775, + "eval_samples_per_second": 1.301, + "eval_steps_per_second": 0.021, + "step": 1200 + } + ], + "logging_steps": 25, + "max_steps": 2000, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.2062128747000627e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1200/optimizer_0/.metadata b/checkpoint-1200/optimizer_0/.metadata new file mode 100644 index 0000000000000000000000000000000000000000..edbfd3f0e57fc91b2199a31f56cca0b6f084f654 --- /dev/null +++ b/checkpoint-1200/optimizer_0/.metadata @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e9a2ee32b246cf5a180111f7efde0c2de7380311cd81d186b2485dcc722b0fd +size 438490 diff --git a/checkpoint-1200/optimizer_0/__0_0.distcp b/checkpoint-1200/optimizer_0/__0_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..82d92c54f7aabdc247b08260ef68d5e8460e0ce7 --- /dev/null +++ b/checkpoint-1200/optimizer_0/__0_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd721c40828469ff84ee799afc6f326fa0d094882db4cdc55e078a62f5d614e1 +size 2980252 diff --git a/checkpoint-1200/optimizer_0/__1_0.distcp b/checkpoint-1200/optimizer_0/__1_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..30d00857a554e79344be0281fa2d814a62b050fb --- /dev/null +++ b/checkpoint-1200/optimizer_0/__1_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:05c9286b38d8b90696740d67b481a641bee64c73670ee2affa95b3235b4e518b +size 2997320 diff --git a/checkpoint-1200/optimizer_0/__2_0.distcp b/checkpoint-1200/optimizer_0/__2_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..8c50379c23fb4f2107d0ee7d774844ea140d5fbb --- /dev/null +++ b/checkpoint-1200/optimizer_0/__2_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b548ba21900df79df73e08e4cd6839055fce21f9b905ad0a7203433772066325 +size 2997320 diff --git a/checkpoint-1200/optimizer_0/__3_0.distcp b/checkpoint-1200/optimizer_0/__3_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..daffd96e2274da13050e84b97f32e12c2705c46b --- /dev/null +++ b/checkpoint-1200/optimizer_0/__3_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5ede1f2cdf00ba1369ef659369672e5cefd1692f3d04eac4057f3a31520bb0c +size 2997320 diff --git a/checkpoint-1200/optimizer_0/__4_0.distcp b/checkpoint-1200/optimizer_0/__4_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..97f9b6eb7af44e76bacd7bdd3720aa1951520e2a --- /dev/null +++ b/checkpoint-1200/optimizer_0/__4_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c399010b611856ad53302d7f388e10296c782788af9a4c053e2c131aef24eb8 +size 2997320 diff --git a/checkpoint-1200/optimizer_0/__5_0.distcp b/checkpoint-1200/optimizer_0/__5_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..64a4a51f23d4d6b696ff184a05aeb6acdff84581 --- /dev/null +++ b/checkpoint-1200/optimizer_0/__5_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c00ddc0ad3e0c2ecd4e32f430c7968922a3df88a66c79c3090d5745c43b3b3e +size 2999596 diff --git a/checkpoint-1200/optimizer_0/__6_0.distcp b/checkpoint-1200/optimizer_0/__6_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..6aa62078cd29ec4cdea8dfe49ae8a1a661e3e082 --- /dev/null +++ b/checkpoint-1200/optimizer_0/__6_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ae6c22e5e6c359644c646457bd441cf6a2243c9c56d82ab6d30722b2c35ad2f +size 2998732 diff --git a/checkpoint-1200/optimizer_0/__7_0.distcp b/checkpoint-1200/optimizer_0/__7_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..bf02a9263f04ab7964e73937111b24c965305b05 --- /dev/null +++ b/checkpoint-1200/optimizer_0/__7_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:138b800773ad59bd13b28046bc9e1b508fb626d35cf5f6357038fe1ece3f3bbf +size 3005708 diff --git a/checkpoint-1200/pytorch_model_fsdp_0/.metadata b/checkpoint-1200/pytorch_model_fsdp_0/.metadata new file mode 100644 index 0000000000000000000000000000000000000000..22edb1721802e671d10af76365064bbb0b9e43e1 --- /dev/null +++ b/checkpoint-1200/pytorch_model_fsdp_0/.metadata @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ac5065aeca34062b595abbf8ecbce74c73fa1aeb849972e80678176992aa8f4 +size 170759 diff --git a/checkpoint-1200/pytorch_model_fsdp_0/__0_0.distcp b/checkpoint-1200/pytorch_model_fsdp_0/__0_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..4ca847bbfe67aabc95734c9906a10dc69ea3d8f7 --- /dev/null +++ b/checkpoint-1200/pytorch_model_fsdp_0/__0_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81a6246f18f4abb3ab59c3ad204602a9e3ddea3d8259b20e4f58f7244d3f6a0d +size 1489536 diff --git a/checkpoint-1200/pytorch_model_fsdp_0/__1_0.distcp b/checkpoint-1200/pytorch_model_fsdp_0/__1_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..8eac4ea93ac3fd1eae3baf4fef8bf93bcfde1434 --- /dev/null +++ b/checkpoint-1200/pytorch_model_fsdp_0/__1_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4145088d42c674b857875d089f975387810eeb7e76266880d2786b6f01ab9136 +size 1489536 diff --git a/checkpoint-1200/pytorch_model_fsdp_0/__2_0.distcp b/checkpoint-1200/pytorch_model_fsdp_0/__2_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..f17cc0e1886b16477cd37a4acfdad61666b4cd0e --- /dev/null +++ b/checkpoint-1200/pytorch_model_fsdp_0/__2_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5aeec38bcd522229818d06eb38d2489469a0b7945bea3df629a183b14c02964a +size 1489536 diff --git a/checkpoint-1200/pytorch_model_fsdp_0/__3_0.distcp b/checkpoint-1200/pytorch_model_fsdp_0/__3_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..c3a90df7fd4a4efe58ac7f76e80654634990e492 --- /dev/null +++ b/checkpoint-1200/pytorch_model_fsdp_0/__3_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62b479da846c97d4e7007df2ee6f87b9f2652856d7bf79ca2f5674d36b8b9cd5 +size 1489536 diff --git a/checkpoint-1200/pytorch_model_fsdp_0/__4_0.distcp b/checkpoint-1200/pytorch_model_fsdp_0/__4_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..0f3d42e228d3a469ccc5221ec9d5ef9845e364d4 --- /dev/null +++ b/checkpoint-1200/pytorch_model_fsdp_0/__4_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35819c549fb1ff190ad8b98f80b1e6ef9cb88ea999009ea567b23a3b992450fa +size 1489536 diff --git a/checkpoint-1200/pytorch_model_fsdp_0/__5_0.distcp b/checkpoint-1200/pytorch_model_fsdp_0/__5_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..445f9f345e22912d13bf17401f51d2e75748ada8 --- /dev/null +++ b/checkpoint-1200/pytorch_model_fsdp_0/__5_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bae1837c00f205ce27c4282716c92b6aab936ffe9b2bb21a799c8ade45619a82 +size 1489536 diff --git a/checkpoint-1200/pytorch_model_fsdp_0/__6_0.distcp b/checkpoint-1200/pytorch_model_fsdp_0/__6_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..189f15fc061b16bcf1e5a365f5df59f2e9a62136 --- /dev/null +++ b/checkpoint-1200/pytorch_model_fsdp_0/__6_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47ea5583a01f084697dcc6169db5551303f035711e70b3ff254df458bbec60ee +size 1489536 diff --git a/checkpoint-1200/pytorch_model_fsdp_0/__7_0.distcp b/checkpoint-1200/pytorch_model_fsdp_0/__7_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..9061c8a83b4ce7f6fd8c2fbaf1397e9e2389b743 --- /dev/null +++ b/checkpoint-1200/pytorch_model_fsdp_0/__7_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f944b510044f732e782064d04694a9e196ac7e6cd8d0db966dc34cf33273625 +size 1489536 diff --git a/checkpoint-1200/rng_state_0.pth b/checkpoint-1200/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..5f189e74ec3c2b32bdf53731d15eeabdb45c473c --- /dev/null +++ b/checkpoint-1200/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:84c26b2a8acc7af904f88833cb6aa2007f56b758cc2bb09f4af6a136dcf2254e +size 15920 diff --git a/checkpoint-1200/rng_state_1.pth b/checkpoint-1200/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..e7569c9da2848265f26dc13884b4d74d5c78a6d1 --- /dev/null +++ b/checkpoint-1200/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:158efb2c453739224cd1a02c979b41052987fa4da6c1ca00610e5d806809e0b8 +size 15984 diff --git a/checkpoint-1200/rng_state_2.pth b/checkpoint-1200/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..96ddbce766334aeca24bedb3ad7523b5858cf5be --- /dev/null +++ b/checkpoint-1200/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e69bd054b5237b13d22eff3a3128acb49fbeda87aa873c3641f4221fa18abfb +size 15984 diff --git a/checkpoint-1200/rng_state_3.pth b/checkpoint-1200/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..bd11ab4ad1207e0e4d739bbf2e662fc3ae7ad75b --- /dev/null +++ b/checkpoint-1200/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01ca716fbae1882edf4fce6f1a9b9ee51f9bcdd08bceaec254a0906850aa5f3a +size 15984 diff --git a/checkpoint-1200/rng_state_4.pth b/checkpoint-1200/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..133fdff1da712ef7cbf2f6acd126fa45a65dfec5 --- /dev/null +++ b/checkpoint-1200/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:baca997feb34a76ea1c234c3cba01504c1ec11987d3b0b60d72a72245855c8b5 +size 15984 diff --git a/checkpoint-1200/rng_state_5.pth b/checkpoint-1200/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..c20e2ebff6f82adba59a56ee75a0f79d8f92695c --- /dev/null +++ b/checkpoint-1200/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1105b7213ffc67ccbe98e30fca965f1bea43cb45c136d5c8d7bdddc87ef6cde +size 15984 diff --git a/checkpoint-1200/rng_state_6.pth b/checkpoint-1200/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..30c01500e59bca4e78787572747633ce30a0ff44 --- /dev/null +++ b/checkpoint-1200/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4502eeb033ad80488d0ce1a43b8dcabb5e0552838a1afaa44f198f1fa8519580 +size 15984 diff --git a/checkpoint-1200/rng_state_7.pth b/checkpoint-1200/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..296cab46339e243450eba8853d7e69a5d544069d --- /dev/null +++ b/checkpoint-1200/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:364b0c128330c69d4648979aa306cf1cdc8e9a164f74b3a408e4fe68d4f6da7b +size 15984 diff --git a/checkpoint-1200/scheduler.pt b/checkpoint-1200/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..eb1b33cc62da996510524591d7252c6e3ea166f0 --- /dev/null +++ b/checkpoint-1200/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b015bc498e5b4ffb7ed88672aba64b1aba2e32e94c4926ce4107ea8baf36834c +size 1064 diff --git a/checkpoint-1200/trainer_state.json b/checkpoint-1200/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..dbce72ba82c466a395d21d8e43944fb697a73dea --- /dev/null +++ b/checkpoint-1200/trainer_state.json @@ -0,0 +1,466 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.6, + "eval_steps": 100, + "global_step": 1200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0125, + "grad_norm": 0.08136817067861557, + "learning_rate": 0.0004, + "loss": 1.158, + "step": 25 + }, + { + "epoch": 0.025, + "grad_norm": 0.0653829574584961, + "learning_rate": 0.0004998852503731983, + "loss": 1.0957, + "step": 50 + }, + { + "epoch": 0.0375, + "grad_norm": 0.11592712253332138, + "learning_rate": 0.0004993848168027977, + "loss": 0.9276, + "step": 75 + }, + { + "epoch": 0.05, + "grad_norm": 0.08926476538181305, + "learning_rate": 0.0004984880506341147, + "loss": 1.0337, + "step": 100 + }, + { + "epoch": 0.05, + "eval_loss": 0.9111642837524414, + "eval_runtime": 843.8058, + "eval_samples_per_second": 1.3, + "eval_steps_per_second": 0.021, + "step": 100 + }, + { + "epoch": 0.0625, + "grad_norm": 0.08632034063339233, + "learning_rate": 0.0004971963770447935, + "loss": 1.0219, + "step": 125 + }, + { + "epoch": 0.075, + "grad_norm": 0.12949417531490326, + "learning_rate": 0.0004955118488155782, + "loss": 0.784, + "step": 150 + }, + { + "epoch": 0.0875, + "grad_norm": 0.09222520887851715, + "learning_rate": 0.0004934371430679492, + "loss": 1.0, + "step": 175 + }, + { + "epoch": 0.1, + "grad_norm": 0.10075929015874863, + "learning_rate": 0.0004909755570095319, + "loss": 0.9617, + "step": 200 + }, + { + "epoch": 0.1, + "eval_loss": 0.9380430579185486, + "eval_runtime": 856.156, + "eval_samples_per_second": 1.281, + "eval_steps_per_second": 0.021, + "step": 200 + }, + { + "epoch": 0.1125, + "grad_norm": 0.09567111730575562, + "learning_rate": 0.0004881310026940389, + "loss": 0.7051, + "step": 225 + }, + { + "epoch": 0.125, + "grad_norm": 0.09723825007677078, + "learning_rate": 0.0004849080008040734, + "loss": 0.9906, + "step": 250 + }, + { + "epoch": 0.1375, + "grad_norm": 0.09430444985628128, + "learning_rate": 0.00048131167346667446, + "loss": 0.9113, + "step": 275 + }, + { + "epoch": 0.15, + "grad_norm": 0.09439756721258163, + "learning_rate": 0.00047734773611302284, + "loss": 0.674, + "step": 300 + }, + { + "epoch": 0.15, + "eval_loss": 0.9544369578361511, + "eval_runtime": 850.9461, + "eval_samples_per_second": 1.289, + "eval_steps_per_second": 0.021, + "step": 300 + }, + { + "epoch": 0.1625, + "grad_norm": 0.10961435735225677, + "learning_rate": 0.0004730224883952422, + "loss": 0.9701, + "step": 325 + }, + { + "epoch": 0.175, + "grad_norm": 0.09628895670175552, + "learning_rate": 0.0004683428041747334, + "loss": 0.8976, + "step": 350 + }, + { + "epoch": 0.1875, + "grad_norm": 0.11085063964128494, + "learning_rate": 0.0004633161205979517, + "loss": 0.6683, + "step": 375 + }, + { + "epoch": 0.2, + "grad_norm": 0.10140710324048996, + "learning_rate": 0.0004579504262769877, + "loss": 0.9373, + "step": 400 + }, + { + "epoch": 0.2, + "eval_loss": 0.968166172504425, + "eval_runtime": 853.0258, + "eval_samples_per_second": 1.286, + "eval_steps_per_second": 0.021, + "step": 400 + }, + { + "epoch": 0.2125, + "grad_norm": 0.13868793845176697, + "learning_rate": 0.0004522542485937369, + "loss": 0.8822, + "step": 425 + }, + { + "epoch": 0.225, + "grad_norm": 0.11853040754795074, + "learning_rate": 0.00044623664014783386, + "loss": 0.6483, + "step": 450 + }, + { + "epoch": 0.2375, + "grad_norm": 0.11668186634778976, + "learning_rate": 0.00043990716436988924, + "loss": 0.9374, + "step": 475 + }, + { + "epoch": 0.25, + "grad_norm": 0.11911585181951523, + "learning_rate": 0.0004332758803228925, + "loss": 0.8434, + "step": 500 + }, + { + "epoch": 0.25, + "eval_loss": 0.9753186702728271, + "eval_runtime": 851.2812, + "eval_samples_per_second": 1.289, + "eval_steps_per_second": 0.021, + "step": 500 + }, + { + "epoch": 0.2625, + "grad_norm": 0.09807440638542175, + "learning_rate": 0.00042635332671593575, + "loss": 0.6661, + "step": 525 + }, + { + "epoch": 0.275, + "grad_norm": 0.09675773978233337, + "learning_rate": 0.00041915050515566445, + "loss": 0.8999, + "step": 550 + }, + { + "epoch": 0.2875, + "grad_norm": 0.0979737937450409, + "learning_rate": 0.00041167886266207167, + "loss": 0.8616, + "step": 575 + }, + { + "epoch": 0.3, + "grad_norm": 0.08990875631570816, + "learning_rate": 0.0004039502734764241, + "loss": 0.8167, + "step": 600 + }, + { + "epoch": 0.3, + "eval_loss": 0.9926204681396484, + "eval_runtime": 851.6484, + "eval_samples_per_second": 1.288, + "eval_steps_per_second": 0.021, + "step": 600 + }, + { + "epoch": 0.3125, + "grad_norm": 0.09505137801170349, + "learning_rate": 0.0003959770201902294, + "loss": 0.8191, + "step": 625 + }, + { + "epoch": 0.325, + "grad_norm": 0.09105059504508972, + "learning_rate": 0.0003877717742252371, + "loss": 0.7203, + "step": 650 + }, + { + "epoch": 0.3375, + "grad_norm": 0.09274734556674957, + "learning_rate": 0.00037934757569549495, + "loss": 0.8154, + "step": 675 + }, + { + "epoch": 0.35, + "grad_norm": 0.10026416182518005, + "learning_rate": 0.00037071781268346345, + "loss": 0.8336, + "step": 700 + }, + { + "epoch": 0.35, + "eval_loss": 0.9959912896156311, + "eval_runtime": 849.7662, + "eval_samples_per_second": 1.291, + "eval_steps_per_second": 0.021, + "step": 700 + }, + { + "epoch": 0.3625, + "grad_norm": 0.09517039358615875, + "learning_rate": 0.00036189619996312495, + "loss": 0.6845, + "step": 725 + }, + { + "epoch": 0.375, + "grad_norm": 0.11264201998710632, + "learning_rate": 0.00035289675720390174, + "loss": 0.8445, + "step": 750 + }, + { + "epoch": 0.3875, + "grad_norm": 0.0979958102107048, + "learning_rate": 0.00034373378669002105, + "loss": 0.8269, + "step": 775 + }, + { + "epoch": 0.4, + "grad_norm": 0.10223093628883362, + "learning_rate": 0.00033442185059073706, + "loss": 0.6517, + "step": 800 + }, + { + "epoch": 0.4, + "eval_loss": 1.006990671157837, + "eval_runtime": 845.1184, + "eval_samples_per_second": 1.298, + "eval_steps_per_second": 0.021, + "step": 800 + }, + { + "epoch": 0.4125, + "grad_norm": 0.09464213252067566, + "learning_rate": 0.00032497574781753367, + "loss": 0.8455, + "step": 825 + }, + { + "epoch": 0.425, + "grad_norm": 0.10187377035617828, + "learning_rate": 0.000315410490505086, + "loss": 0.8217, + "step": 850 + }, + { + "epoch": 0.4375, + "grad_norm": 0.09793733805418015, + "learning_rate": 0.0003057412801533589, + "loss": 0.6218, + "step": 875 + }, + { + "epoch": 0.45, + "grad_norm": 0.09902466833591461, + "learning_rate": 0.0002959834834687587, + "loss": 0.869, + "step": 900 + }, + { + "epoch": 0.45, + "eval_loss": 1.0097707509994507, + "eval_runtime": 847.3981, + "eval_samples_per_second": 1.295, + "eval_steps_per_second": 0.021, + "step": 900 + }, + { + "epoch": 0.4625, + "grad_norm": 0.10546339303255081, + "learning_rate": 0.00028615260794273236, + "loss": 0.8204, + "step": 925 + }, + { + "epoch": 0.475, + "grad_norm": 0.10600671917200089, + "learning_rate": 0.00027626427720662416, + "loss": 0.5917, + "step": 950 + }, + { + "epoch": 0.4875, + "grad_norm": 0.09627766162157059, + "learning_rate": 0.00026633420620195917, + "loss": 0.8667, + "step": 975 + }, + { + "epoch": 0.5, + "grad_norm": 0.09961821138858795, + "learning_rate": 0.00025637817620561263, + "loss": 0.8215, + "step": 1000 + }, + { + "epoch": 0.5, + "eval_loss": 1.023415446281433, + "eval_runtime": 844.006, + "eval_samples_per_second": 1.3, + "eval_steps_per_second": 0.021, + "step": 1000 + }, + { + "epoch": 0.5125, + "grad_norm": 0.09977111220359802, + "learning_rate": 0.0002464120097495559, + "loss": 0.5829, + "step": 1025 + }, + { + "epoch": 0.525, + "grad_norm": 0.10276953876018524, + "learning_rate": 0.00023645154547503855, + "loss": 0.8857, + "step": 1050 + }, + { + "epoch": 0.5375, + "grad_norm": 0.10077586770057678, + "learning_rate": 0.00022651261296116894, + "loss": 0.8018, + "step": 1075 + }, + { + "epoch": 0.55, + "grad_norm": 0.09652134776115417, + "learning_rate": 0.00021661100756789666, + "loss": 0.561, + "step": 1100 + }, + { + "epoch": 0.55, + "eval_loss": 1.0231441259384155, + "eval_runtime": 845.0138, + "eval_samples_per_second": 1.298, + "eval_steps_per_second": 0.021, + "step": 1100 + }, + { + "epoch": 0.5625, + "grad_norm": 0.0990302637219429, + "learning_rate": 0.00020676246533337764, + "loss": 0.9092, + "step": 1125 + }, + { + "epoch": 0.575, + "grad_norm": 0.11152709275484085, + "learning_rate": 0.00019698263796561526, + "loss": 0.8159, + "step": 1150 + }, + { + "epoch": 0.5875, + "grad_norm": 0.10092920064926147, + "learning_rate": 0.00018728706796812333, + "loss": 0.7329, + "step": 1175 + }, + { + "epoch": 0.6, + "grad_norm": 0.10138432681560516, + "learning_rate": 0.00017769116393914037, + "loss": 0.7873, + "step": 1200 + }, + { + "epoch": 0.6, + "eval_loss": 1.0180704593658447, + "eval_runtime": 843.0775, + "eval_samples_per_second": 1.301, + "eval_steps_per_second": 0.021, + "step": 1200 + } + ], + "logging_steps": 25, + "max_steps": 2000, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.2062128747000627e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1300/optimizer_0/.metadata b/checkpoint-1300/optimizer_0/.metadata new file mode 100644 index 0000000000000000000000000000000000000000..60756d8188847a901d44b19bd1addafe3dbf9a88 --- /dev/null +++ b/checkpoint-1300/optimizer_0/.metadata @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:74c4dcd81ddda294b4a2eb4cb1b28262fad97054c913e2c0668e53651d4d8d94 +size 438490 diff --git a/checkpoint-1300/optimizer_0/__0_0.distcp b/checkpoint-1300/optimizer_0/__0_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..8dc77f81f50590d5caad56d7449d8a6a492470b7 --- /dev/null +++ b/checkpoint-1300/optimizer_0/__0_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bfb17021a3c5c3ed2a1a31f9d7d4c5f9627f7a8f1b1efe006bc7b03538f8f239 +size 2980252 diff --git a/checkpoint-1300/optimizer_0/__1_0.distcp b/checkpoint-1300/optimizer_0/__1_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..c896faec1c49e9373d9367a87534608a4acf9a03 --- /dev/null +++ b/checkpoint-1300/optimizer_0/__1_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58290c5c42281045a3b43e91aaf4951ac05dbf3de1931b64f5781f33908ee14f +size 2997320 diff --git a/checkpoint-1300/optimizer_0/__2_0.distcp b/checkpoint-1300/optimizer_0/__2_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..745d8c09275ba81d17f756188cdccc3347c66be7 --- /dev/null +++ b/checkpoint-1300/optimizer_0/__2_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d9bfe038805cdf24ba5875039df401f16c4390965fbd64277aef0db4e087951 +size 2997320 diff --git a/checkpoint-1300/optimizer_0/__3_0.distcp b/checkpoint-1300/optimizer_0/__3_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..9a1c62662f8b03640dc5268a314ad7ae2a09d582 --- /dev/null +++ b/checkpoint-1300/optimizer_0/__3_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26c620d3e2e244cf5f8c95e2289db483a78b00639c839d880b92887de583db61 +size 2997320 diff --git a/checkpoint-1300/optimizer_0/__4_0.distcp b/checkpoint-1300/optimizer_0/__4_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..c56500269af1dbb17b0a946beaaa0af367cafa76 --- /dev/null +++ b/checkpoint-1300/optimizer_0/__4_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47106295f366702d6f8cb12b264f6aab517da5ed8b76f98ca02ad64a987a9519 +size 2997320 diff --git a/checkpoint-1300/optimizer_0/__5_0.distcp b/checkpoint-1300/optimizer_0/__5_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..9c21f17544b7ab2000615dee861e58896b9405f4 --- /dev/null +++ b/checkpoint-1300/optimizer_0/__5_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e6cf614f12af9b6385e8df54e6ff63e493bc03c11683c10097f7643915ba3e7 +size 2999596 diff --git a/checkpoint-1300/optimizer_0/__6_0.distcp b/checkpoint-1300/optimizer_0/__6_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..2a7b4e50d91ea4d51e3605bc1b92261832db1f81 --- /dev/null +++ b/checkpoint-1300/optimizer_0/__6_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d66fe82de755980e2afe320f5a96328c44d4d31ff2a1837252a00d2427e8642 +size 2998732 diff --git a/checkpoint-1300/optimizer_0/__7_0.distcp b/checkpoint-1300/optimizer_0/__7_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..7fc8ebf46fe1344aed9f35455b5ea220e4e49d20 --- /dev/null +++ b/checkpoint-1300/optimizer_0/__7_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc7ee5eb17ae35be2a0c9dd5e57b66324f4a284306b54f3d82dca1694687b18b +size 3005708 diff --git a/checkpoint-1300/pytorch_model_fsdp_0/.metadata b/checkpoint-1300/pytorch_model_fsdp_0/.metadata new file mode 100644 index 0000000000000000000000000000000000000000..aa6fbf9f4d78e548b9a41b2f9b92a3831689320e --- /dev/null +++ b/checkpoint-1300/pytorch_model_fsdp_0/.metadata @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f2d2b3d1b1221d6099b407d317005ea851f9e9f4dd20fda3a16021f2246666c +size 170759 diff --git a/checkpoint-1300/pytorch_model_fsdp_0/__0_0.distcp b/checkpoint-1300/pytorch_model_fsdp_0/__0_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..614c919b452f305f352f0b0bf04661d883e4a177 --- /dev/null +++ b/checkpoint-1300/pytorch_model_fsdp_0/__0_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35492df053d8b97625eb4233d8630269396ccfc88fcc6f76efcfc7d3a691201b +size 1489536 diff --git a/checkpoint-1300/pytorch_model_fsdp_0/__1_0.distcp b/checkpoint-1300/pytorch_model_fsdp_0/__1_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..0d8b8090e0558faf85a166466701d341df718815 --- /dev/null +++ b/checkpoint-1300/pytorch_model_fsdp_0/__1_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7caaca88545431639f92fabfa73f82ba8ab46029bf8d0931a050697e537a641f +size 1489536 diff --git a/checkpoint-1300/pytorch_model_fsdp_0/__2_0.distcp b/checkpoint-1300/pytorch_model_fsdp_0/__2_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..7d91fe3311bf1867c4dd2dc385f580e8b8ba555e --- /dev/null +++ b/checkpoint-1300/pytorch_model_fsdp_0/__2_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e6971bce5b1c3958c01b376a7f641d8f7d90455ddcd1888411cb5d116cd97ffa +size 1489536 diff --git a/checkpoint-1300/pytorch_model_fsdp_0/__3_0.distcp b/checkpoint-1300/pytorch_model_fsdp_0/__3_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..e6b35da0c075afc65536a23e5fb5e302a0f7b053 --- /dev/null +++ b/checkpoint-1300/pytorch_model_fsdp_0/__3_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff513dd4ba58bb0a2adb282cb368833b11d03fe25475aad38689b976e22240e0 +size 1489536 diff --git a/checkpoint-1300/pytorch_model_fsdp_0/__4_0.distcp b/checkpoint-1300/pytorch_model_fsdp_0/__4_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..23e76ced0dfdf6c6def530f668c06548d2519588 --- /dev/null +++ b/checkpoint-1300/pytorch_model_fsdp_0/__4_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:09e067636fb39fef96cf6de1a7869507329e22e24a188bd741b17ef9605d182a +size 1489536 diff --git a/checkpoint-1300/pytorch_model_fsdp_0/__5_0.distcp b/checkpoint-1300/pytorch_model_fsdp_0/__5_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..492828ddf53993f5cb0dcd9f2086ace7499b2b92 --- /dev/null +++ b/checkpoint-1300/pytorch_model_fsdp_0/__5_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:851300b2e96189d1648d2fa0f8aa14f6ddcc066ee4520869f376329390945bce +size 1489536 diff --git a/checkpoint-1300/pytorch_model_fsdp_0/__6_0.distcp b/checkpoint-1300/pytorch_model_fsdp_0/__6_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..0d575271e68750ff285c122aa9e9ef6bd06ea259 --- /dev/null +++ b/checkpoint-1300/pytorch_model_fsdp_0/__6_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80b9fcaf2b6760b5adfde2edd5fd918a87b14e02250f3af0aa0caa20979651b2 +size 1489536 diff --git a/checkpoint-1300/pytorch_model_fsdp_0/__7_0.distcp b/checkpoint-1300/pytorch_model_fsdp_0/__7_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..1b1b0d0eb0fd104eeee7e176daa9e299aaaab4a4 --- /dev/null +++ b/checkpoint-1300/pytorch_model_fsdp_0/__7_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:070e4df414cb383d5fd56fb0d60b16bafe8b9db6ef83ad27b924b7f5b4f36971 +size 1489536 diff --git a/checkpoint-1300/rng_state_0.pth b/checkpoint-1300/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..02683b94819a040ad40ce62df07cbf9f1df7ac07 --- /dev/null +++ b/checkpoint-1300/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7fd89795021dd0160bb820ad9e658cd1b0d80e3405b507e1c81edf6001bc8ca +size 15920 diff --git a/checkpoint-1300/rng_state_1.pth b/checkpoint-1300/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..2f2d417794539025b8ef9394635570b28a840a69 --- /dev/null +++ b/checkpoint-1300/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59c14448ee646c5307c2110c2dbd86e370f0560e1b3ea0772e54e8789b3405d6 +size 15984 diff --git a/checkpoint-1300/rng_state_2.pth b/checkpoint-1300/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..83a36ac16625c9b64cfdbc6cad05b7f4c7fa3422 --- /dev/null +++ b/checkpoint-1300/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5be62bd202108de11264c1be2eb9abd7dc33d1b2edb627ba15cd11e3cb6250d +size 15984 diff --git a/checkpoint-1300/rng_state_3.pth b/checkpoint-1300/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..efed9169b167ba01c0f8245575a19958f3143771 --- /dev/null +++ b/checkpoint-1300/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d1f4f8d1e91666e894d50d7e2886591715ba36ed4a759c2ea2acd4a2145bc0a1 +size 15984 diff --git a/checkpoint-1300/rng_state_4.pth b/checkpoint-1300/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..f6924c1ba4ca473f751b2d2c970bb6a03fba8a6e --- /dev/null +++ b/checkpoint-1300/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41e84b003216499d66cd69ab0951adaa3bcbda1e67fd3962cc82600206da2c25 +size 15984 diff --git a/checkpoint-1300/rng_state_5.pth b/checkpoint-1300/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..50fedbed6bae3628716cf7e83cf39732524b200b --- /dev/null +++ b/checkpoint-1300/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d07b16fc7185bf60589c6dc98ebb7edb5b3e9a7ecf3a0cb1a83bfbc60ed674c2 +size 15984 diff --git a/checkpoint-1300/rng_state_6.pth b/checkpoint-1300/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..f76d98ab45c53e6a766641c614b055d3fd180f40 --- /dev/null +++ b/checkpoint-1300/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45b6bda4561656d04ad5a1edb957a0fd798a9af4aac1623510b9407bc589f070 +size 15984 diff --git a/checkpoint-1300/rng_state_7.pth b/checkpoint-1300/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..6a446cf921419ddb5cd7b23d29c622635cedce4a --- /dev/null +++ b/checkpoint-1300/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:088bf86c6b6f9482925a6c46f7a5976920adeb27963828cdc042e3e4328e7bff +size 15984 diff --git a/checkpoint-1300/scheduler.pt b/checkpoint-1300/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..72652d1ca1d20c565375320a634597caa64d6264 --- /dev/null +++ b/checkpoint-1300/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da7bd64925acbd9bc3dbd1a44a27b1aa523daf766a871a6ffb2ba33b7fc1ea02 +size 1064 diff --git a/checkpoint-1300/trainer_state.json b/checkpoint-1300/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..e34b5a46fc369e7ab30088cba3faa84b92d949b4 --- /dev/null +++ b/checkpoint-1300/trainer_state.json @@ -0,0 +1,502 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.65, + "eval_steps": 100, + "global_step": 1300, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0125, + "grad_norm": 0.08136817067861557, + "learning_rate": 0.0004, + "loss": 1.158, + "step": 25 + }, + { + "epoch": 0.025, + "grad_norm": 0.0653829574584961, + "learning_rate": 0.0004998852503731983, + "loss": 1.0957, + "step": 50 + }, + { + "epoch": 0.0375, + "grad_norm": 0.11592712253332138, + "learning_rate": 0.0004993848168027977, + "loss": 0.9276, + "step": 75 + }, + { + "epoch": 0.05, + "grad_norm": 0.08926476538181305, + "learning_rate": 0.0004984880506341147, + "loss": 1.0337, + "step": 100 + }, + { + "epoch": 0.05, + "eval_loss": 0.9111642837524414, + "eval_runtime": 843.8058, + "eval_samples_per_second": 1.3, + "eval_steps_per_second": 0.021, + "step": 100 + }, + { + "epoch": 0.0625, + "grad_norm": 0.08632034063339233, + "learning_rate": 0.0004971963770447935, + "loss": 1.0219, + "step": 125 + }, + { + "epoch": 0.075, + "grad_norm": 0.12949417531490326, + "learning_rate": 0.0004955118488155782, + "loss": 0.784, + "step": 150 + }, + { + "epoch": 0.0875, + "grad_norm": 0.09222520887851715, + "learning_rate": 0.0004934371430679492, + "loss": 1.0, + "step": 175 + }, + { + "epoch": 0.1, + "grad_norm": 0.10075929015874863, + "learning_rate": 0.0004909755570095319, + "loss": 0.9617, + "step": 200 + }, + { + "epoch": 0.1, + "eval_loss": 0.9380430579185486, + "eval_runtime": 856.156, + "eval_samples_per_second": 1.281, + "eval_steps_per_second": 0.021, + "step": 200 + }, + { + "epoch": 0.1125, + "grad_norm": 0.09567111730575562, + "learning_rate": 0.0004881310026940389, + "loss": 0.7051, + "step": 225 + }, + { + "epoch": 0.125, + "grad_norm": 0.09723825007677078, + "learning_rate": 0.0004849080008040734, + "loss": 0.9906, + "step": 250 + }, + { + "epoch": 0.1375, + "grad_norm": 0.09430444985628128, + "learning_rate": 0.00048131167346667446, + "loss": 0.9113, + "step": 275 + }, + { + "epoch": 0.15, + "grad_norm": 0.09439756721258163, + "learning_rate": 0.00047734773611302284, + "loss": 0.674, + "step": 300 + }, + { + "epoch": 0.15, + "eval_loss": 0.9544369578361511, + "eval_runtime": 850.9461, + "eval_samples_per_second": 1.289, + "eval_steps_per_second": 0.021, + "step": 300 + }, + { + "epoch": 0.1625, + "grad_norm": 0.10961435735225677, + "learning_rate": 0.0004730224883952422, + "loss": 0.9701, + "step": 325 + }, + { + "epoch": 0.175, + "grad_norm": 0.09628895670175552, + "learning_rate": 0.0004683428041747334, + "loss": 0.8976, + "step": 350 + }, + { + "epoch": 0.1875, + "grad_norm": 0.11085063964128494, + "learning_rate": 0.0004633161205979517, + "loss": 0.6683, + "step": 375 + }, + { + "epoch": 0.2, + "grad_norm": 0.10140710324048996, + "learning_rate": 0.0004579504262769877, + "loss": 0.9373, + "step": 400 + }, + { + "epoch": 0.2, + "eval_loss": 0.968166172504425, + "eval_runtime": 853.0258, + "eval_samples_per_second": 1.286, + "eval_steps_per_second": 0.021, + "step": 400 + }, + { + "epoch": 0.2125, + "grad_norm": 0.13868793845176697, + "learning_rate": 0.0004522542485937369, + "loss": 0.8822, + "step": 425 + }, + { + "epoch": 0.225, + "grad_norm": 0.11853040754795074, + "learning_rate": 0.00044623664014783386, + "loss": 0.6483, + "step": 450 + }, + { + "epoch": 0.2375, + "grad_norm": 0.11668186634778976, + "learning_rate": 0.00043990716436988924, + "loss": 0.9374, + "step": 475 + }, + { + "epoch": 0.25, + "grad_norm": 0.11911585181951523, + "learning_rate": 0.0004332758803228925, + "loss": 0.8434, + "step": 500 + }, + { + "epoch": 0.25, + "eval_loss": 0.9753186702728271, + "eval_runtime": 851.2812, + "eval_samples_per_second": 1.289, + "eval_steps_per_second": 0.021, + "step": 500 + }, + { + "epoch": 0.2625, + "grad_norm": 0.09807440638542175, + "learning_rate": 0.00042635332671593575, + "loss": 0.6661, + "step": 525 + }, + { + "epoch": 0.275, + "grad_norm": 0.09675773978233337, + "learning_rate": 0.00041915050515566445, + "loss": 0.8999, + "step": 550 + }, + { + "epoch": 0.2875, + "grad_norm": 0.0979737937450409, + "learning_rate": 0.00041167886266207167, + "loss": 0.8616, + "step": 575 + }, + { + "epoch": 0.3, + "grad_norm": 0.08990875631570816, + "learning_rate": 0.0004039502734764241, + "loss": 0.8167, + "step": 600 + }, + { + "epoch": 0.3, + "eval_loss": 0.9926204681396484, + "eval_runtime": 851.6484, + "eval_samples_per_second": 1.288, + "eval_steps_per_second": 0.021, + "step": 600 + }, + { + "epoch": 0.3125, + "grad_norm": 0.09505137801170349, + "learning_rate": 0.0003959770201902294, + "loss": 0.8191, + "step": 625 + }, + { + "epoch": 0.325, + "grad_norm": 0.09105059504508972, + "learning_rate": 0.0003877717742252371, + "loss": 0.7203, + "step": 650 + }, + { + "epoch": 0.3375, + "grad_norm": 0.09274734556674957, + "learning_rate": 0.00037934757569549495, + "loss": 0.8154, + "step": 675 + }, + { + "epoch": 0.35, + "grad_norm": 0.10026416182518005, + "learning_rate": 0.00037071781268346345, + "loss": 0.8336, + "step": 700 + }, + { + "epoch": 0.35, + "eval_loss": 0.9959912896156311, + "eval_runtime": 849.7662, + "eval_samples_per_second": 1.291, + "eval_steps_per_second": 0.021, + "step": 700 + }, + { + "epoch": 0.3625, + "grad_norm": 0.09517039358615875, + "learning_rate": 0.00036189619996312495, + "loss": 0.6845, + "step": 725 + }, + { + "epoch": 0.375, + "grad_norm": 0.11264201998710632, + "learning_rate": 0.00035289675720390174, + "loss": 0.8445, + "step": 750 + }, + { + "epoch": 0.3875, + "grad_norm": 0.0979958102107048, + "learning_rate": 0.00034373378669002105, + "loss": 0.8269, + "step": 775 + }, + { + "epoch": 0.4, + "grad_norm": 0.10223093628883362, + "learning_rate": 0.00033442185059073706, + "loss": 0.6517, + "step": 800 + }, + { + "epoch": 0.4, + "eval_loss": 1.006990671157837, + "eval_runtime": 845.1184, + "eval_samples_per_second": 1.298, + "eval_steps_per_second": 0.021, + "step": 800 + }, + { + "epoch": 0.4125, + "grad_norm": 0.09464213252067566, + "learning_rate": 0.00032497574781753367, + "loss": 0.8455, + "step": 825 + }, + { + "epoch": 0.425, + "grad_norm": 0.10187377035617828, + "learning_rate": 0.000315410490505086, + "loss": 0.8217, + "step": 850 + }, + { + "epoch": 0.4375, + "grad_norm": 0.09793733805418015, + "learning_rate": 0.0003057412801533589, + "loss": 0.6218, + "step": 875 + }, + { + "epoch": 0.45, + "grad_norm": 0.09902466833591461, + "learning_rate": 0.0002959834834687587, + "loss": 0.869, + "step": 900 + }, + { + "epoch": 0.45, + "eval_loss": 1.0097707509994507, + "eval_runtime": 847.3981, + "eval_samples_per_second": 1.295, + "eval_steps_per_second": 0.021, + "step": 900 + }, + { + "epoch": 0.4625, + "grad_norm": 0.10546339303255081, + "learning_rate": 0.00028615260794273236, + "loss": 0.8204, + "step": 925 + }, + { + "epoch": 0.475, + "grad_norm": 0.10600671917200089, + "learning_rate": 0.00027626427720662416, + "loss": 0.5917, + "step": 950 + }, + { + "epoch": 0.4875, + "grad_norm": 0.09627766162157059, + "learning_rate": 0.00026633420620195917, + "loss": 0.8667, + "step": 975 + }, + { + "epoch": 0.5, + "grad_norm": 0.09961821138858795, + "learning_rate": 0.00025637817620561263, + "loss": 0.8215, + "step": 1000 + }, + { + "epoch": 0.5, + "eval_loss": 1.023415446281433, + "eval_runtime": 844.006, + "eval_samples_per_second": 1.3, + "eval_steps_per_second": 0.021, + "step": 1000 + }, + { + "epoch": 0.5125, + "grad_norm": 0.09977111220359802, + "learning_rate": 0.0002464120097495559, + "loss": 0.5829, + "step": 1025 + }, + { + "epoch": 0.525, + "grad_norm": 0.10276953876018524, + "learning_rate": 0.00023645154547503855, + "loss": 0.8857, + "step": 1050 + }, + { + "epoch": 0.5375, + "grad_norm": 0.10077586770057678, + "learning_rate": 0.00022651261296116894, + "loss": 0.8018, + "step": 1075 + }, + { + "epoch": 0.55, + "grad_norm": 0.09652134776115417, + "learning_rate": 0.00021661100756789666, + "loss": 0.561, + "step": 1100 + }, + { + "epoch": 0.55, + "eval_loss": 1.0231441259384155, + "eval_runtime": 845.0138, + "eval_samples_per_second": 1.298, + "eval_steps_per_second": 0.021, + "step": 1100 + }, + { + "epoch": 0.5625, + "grad_norm": 0.0990302637219429, + "learning_rate": 0.00020676246533337764, + "loss": 0.9092, + "step": 1125 + }, + { + "epoch": 0.575, + "grad_norm": 0.11152709275484085, + "learning_rate": 0.00019698263796561526, + "loss": 0.8159, + "step": 1150 + }, + { + "epoch": 0.5875, + "grad_norm": 0.10092920064926147, + "learning_rate": 0.00018728706796812333, + "loss": 0.7329, + "step": 1175 + }, + { + "epoch": 0.6, + "grad_norm": 0.10138432681560516, + "learning_rate": 0.00017769116393914037, + "loss": 0.7873, + "step": 1200 + }, + { + "epoch": 0.6, + "eval_loss": 1.0180704593658447, + "eval_runtime": 843.0775, + "eval_samples_per_second": 1.301, + "eval_steps_per_second": 0.021, + "step": 1200 + }, + { + "epoch": 0.6125, + "grad_norm": 0.10279645770788193, + "learning_rate": 0.00016821017608365264, + "loss": 0.7186, + "step": 1225 + }, + { + "epoch": 0.625, + "grad_norm": 0.10766831040382385, + "learning_rate": 0.00015885917197714112, + "loss": 0.7201, + "step": 1250 + }, + { + "epoch": 0.6375, + "grad_norm": 0.10177863389253616, + "learning_rate": 0.00014965301261957238, + "loss": 0.8009, + "step": 1275 + }, + { + "epoch": 0.65, + "grad_norm": 0.0983508974313736, + "learning_rate": 0.00014060632881768558, + "loss": 0.7023, + "step": 1300 + }, + { + "epoch": 0.65, + "eval_loss": 1.0266674757003784, + "eval_runtime": 854.7884, + "eval_samples_per_second": 1.283, + "eval_steps_per_second": 0.021, + "step": 1300 + } + ], + "logging_steps": 25, + "max_steps": 2000, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.3067306142584013e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1400/optimizer_0/.metadata b/checkpoint-1400/optimizer_0/.metadata new file mode 100644 index 0000000000000000000000000000000000000000..3fb1a5d3b73e11086ea0488652864d1d1f5d820e --- /dev/null +++ b/checkpoint-1400/optimizer_0/.metadata @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a551edc0dfcb509174fb5c0bf99d017292fa35d04badff62180fe76499773e3a +size 438490 diff --git a/checkpoint-1400/optimizer_0/__0_0.distcp b/checkpoint-1400/optimizer_0/__0_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..9beb92294ccd1b9808debdab3311a21885ba3047 --- /dev/null +++ b/checkpoint-1400/optimizer_0/__0_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46508f90dab6ea58188488395f896883f5deb91cec2c2c12cca990662e1795f2 +size 2980252 diff --git a/checkpoint-1400/optimizer_0/__1_0.distcp b/checkpoint-1400/optimizer_0/__1_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..93539dff9e6eb2508540c2ec73d0810b83b3bb38 --- /dev/null +++ b/checkpoint-1400/optimizer_0/__1_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e6d403f9fa684294b79013723c3dd2a57f40940d83a83123676c3cf38aa9b0c8 +size 2997320 diff --git a/checkpoint-1400/optimizer_0/__2_0.distcp b/checkpoint-1400/optimizer_0/__2_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..8138e2dac0230c864884616203756c41bfda87bb --- /dev/null +++ b/checkpoint-1400/optimizer_0/__2_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f855865525213e253b9543f8392f5064f9e3701eaf2acef2f6b50459f080a081 +size 2997320 diff --git a/checkpoint-1400/optimizer_0/__3_0.distcp b/checkpoint-1400/optimizer_0/__3_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..84bd06ec7e9a8a413ffa4907bf98bdef823e7914 --- /dev/null +++ b/checkpoint-1400/optimizer_0/__3_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f005cae79ac2bb53fbb10c5c4d70dc4a394f0b774ed10424e8451dea17ab40e8 +size 2997320 diff --git a/checkpoint-1400/optimizer_0/__4_0.distcp b/checkpoint-1400/optimizer_0/__4_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..4f344467f876e4485243381fd4d34a238a11e906 --- /dev/null +++ b/checkpoint-1400/optimizer_0/__4_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:53ddd36e7d7768c94baec33cf0d3c35f8726777b792a37f88d53b9b391eeb433 +size 2997320 diff --git a/checkpoint-1400/optimizer_0/__5_0.distcp b/checkpoint-1400/optimizer_0/__5_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..53373882ccdaa7693364b830955f69dd132a852d --- /dev/null +++ b/checkpoint-1400/optimizer_0/__5_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a2332a2b0d18c65e5a896ca211078763fde81c40683a1fefa390b6f3597daf6a +size 2999596 diff --git a/checkpoint-1400/optimizer_0/__6_0.distcp b/checkpoint-1400/optimizer_0/__6_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..48990b2050e5c8f2ffe138435b63388923185ca9 --- /dev/null +++ b/checkpoint-1400/optimizer_0/__6_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c9b95ed239d87e6a75e344da816baf23d69bcc9ef5bf9331b0280a5e0042ceb +size 2998732 diff --git a/checkpoint-1400/optimizer_0/__7_0.distcp b/checkpoint-1400/optimizer_0/__7_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..a7fe628b5168612055c3a209b2c31e7f23d5fee3 --- /dev/null +++ b/checkpoint-1400/optimizer_0/__7_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:896982ce07a5742c7135ee372a5a79f2aa2558ad671a12f0327768cb9cbc73fe +size 3005708 diff --git a/checkpoint-1400/pytorch_model_fsdp_0/.metadata b/checkpoint-1400/pytorch_model_fsdp_0/.metadata new file mode 100644 index 0000000000000000000000000000000000000000..c541bd867896fb4aebbc4419020a877eab644c0e --- /dev/null +++ b/checkpoint-1400/pytorch_model_fsdp_0/.metadata @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60af4e180f0e665084619e2c54aaad9ccc1836f9da0a2f829ec2d8cb6ec92c2a +size 170759 diff --git a/checkpoint-1400/pytorch_model_fsdp_0/__0_0.distcp b/checkpoint-1400/pytorch_model_fsdp_0/__0_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..2e317d9c5d8059f6c39149eae5332fb9ace3d9fb --- /dev/null +++ b/checkpoint-1400/pytorch_model_fsdp_0/__0_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:36270c9946c8419d3cafe037dd6ff959f31d4f83d41383a50893869730e66ef6 +size 1489536 diff --git a/checkpoint-1400/pytorch_model_fsdp_0/__1_0.distcp b/checkpoint-1400/pytorch_model_fsdp_0/__1_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..71feaceff05d30031d9e311810b7df8301bcdb73 --- /dev/null +++ b/checkpoint-1400/pytorch_model_fsdp_0/__1_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f08d2fe94f06bf864d5034ee0604288dab23776f1979091d41512fd3c3d1ab9 +size 1489536 diff --git a/checkpoint-1400/pytorch_model_fsdp_0/__2_0.distcp b/checkpoint-1400/pytorch_model_fsdp_0/__2_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..ebd4c52e9f08ca9d2be66fa257bcf1e395b9ff61 --- /dev/null +++ b/checkpoint-1400/pytorch_model_fsdp_0/__2_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8bc151e1e7e2514c7d3bceeebfa100293fb3daf5a1ec96a4f6905ab8abf906a +size 1489536 diff --git a/checkpoint-1400/pytorch_model_fsdp_0/__3_0.distcp b/checkpoint-1400/pytorch_model_fsdp_0/__3_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..4d958b0307f867386835af4755c487c436e1bc05 --- /dev/null +++ b/checkpoint-1400/pytorch_model_fsdp_0/__3_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac95a24ae8fac08021d02a4d3352d8eb293aa82f1bd84cc86ca0265683a6d27d +size 1489536 diff --git a/checkpoint-1400/pytorch_model_fsdp_0/__4_0.distcp b/checkpoint-1400/pytorch_model_fsdp_0/__4_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..3385e6da132518e288969c3adb19ab1c0d69edf1 --- /dev/null +++ b/checkpoint-1400/pytorch_model_fsdp_0/__4_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:17a70c6570fcce6a27bfc2c96e06836e2ba2fc8137fbb816d944ee1e6b518deb +size 1489536 diff --git a/checkpoint-1400/pytorch_model_fsdp_0/__5_0.distcp b/checkpoint-1400/pytorch_model_fsdp_0/__5_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..b679d38a515f80cacec0cf4ae5397ac668e3ef8c --- /dev/null +++ b/checkpoint-1400/pytorch_model_fsdp_0/__5_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5687792214c1d89542ca6af543c3c42acce9d4e91ac657184f68e6a11ee1027f +size 1489536 diff --git a/checkpoint-1400/pytorch_model_fsdp_0/__6_0.distcp b/checkpoint-1400/pytorch_model_fsdp_0/__6_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..61d3c2e86ffe00258efa27bbf4c53048ab7ffb89 --- /dev/null +++ b/checkpoint-1400/pytorch_model_fsdp_0/__6_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e29577839cacf305d6132412593b27f09963f6a8004fe14f29c95700e61089e +size 1489536 diff --git a/checkpoint-1400/pytorch_model_fsdp_0/__7_0.distcp b/checkpoint-1400/pytorch_model_fsdp_0/__7_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..95575c8b596d97492a7c4ae411d464f5d3887280 --- /dev/null +++ b/checkpoint-1400/pytorch_model_fsdp_0/__7_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34e5e4495022bda54948f92b7c2fc4beeb5cbb3f36400c6861dffacb5f7ead46 +size 1489536 diff --git a/checkpoint-1400/rng_state_0.pth b/checkpoint-1400/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..227ec93c3e8ef56f4b4c2cca828cef131af39a71 --- /dev/null +++ b/checkpoint-1400/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d42a2eedba7a4091055a96ccd3dd4da2eaca6fbbe25f1c2e80ab817e8886a5de +size 15920 diff --git a/checkpoint-1400/rng_state_1.pth b/checkpoint-1400/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..de560c5e245753e21ecc7f31871114a4704ce933 --- /dev/null +++ b/checkpoint-1400/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9179322cd15281e69c1b0d5a3f50657fc311319f087cacb8b2e5938e7eb07e9a +size 15984 diff --git a/checkpoint-1400/rng_state_2.pth b/checkpoint-1400/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..2b105f23b205018fb489a337ae2d5b3f7d705293 --- /dev/null +++ b/checkpoint-1400/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:575be61fed0627afbdff204b0816dae4e338f50bd0d2049ecaaf1655573b6da8 +size 15984 diff --git a/checkpoint-1400/rng_state_3.pth b/checkpoint-1400/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..e6015106f13742c4787da603dabde3c77d68a3d8 --- /dev/null +++ b/checkpoint-1400/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78eba4696cda3ab2ac997a1652955234d9cd351ca320a61eebda0d811d802485 +size 15984 diff --git a/checkpoint-1400/rng_state_4.pth b/checkpoint-1400/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..34a4632a882aa5d234a0ee74fc465997f9dc55d6 --- /dev/null +++ b/checkpoint-1400/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:079ee4a128bab7511bd7c5d4c741b16adf9d7557be80143b70136278e26989b6 +size 15984 diff --git a/checkpoint-1400/rng_state_5.pth b/checkpoint-1400/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..9ebabe22764141052f308a11d43efdb26f55c746 --- /dev/null +++ b/checkpoint-1400/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af7a288fd74a41f90a133314e24ec753386013d31f41ff53c5be06f970265382 +size 15984 diff --git a/checkpoint-1400/rng_state_6.pth b/checkpoint-1400/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..291457d8a66ca684b3e5b40da23d5d44f68da956 --- /dev/null +++ b/checkpoint-1400/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d86ea638f561758fd19b01eae63675ca617641f68ebc2017b5613bf3f2cf71ff +size 15984 diff --git a/checkpoint-1400/rng_state_7.pth b/checkpoint-1400/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..289a19856c5fa6f64baecb063315516f8ad2ca77 --- /dev/null +++ b/checkpoint-1400/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19b9e9a5b7eb9ee987ef2641478c13a247070b27f40b93bd75415ecff0952a25 +size 15984 diff --git a/checkpoint-1400/scheduler.pt b/checkpoint-1400/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..a0a26edbd6707caad9939b100160a29bf32569fa --- /dev/null +++ b/checkpoint-1400/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f00dc98dfb48cefd72fc26922748e64c046f326e2d0dc623af08b2bb7f66af78 +size 1064 diff --git a/checkpoint-1400/trainer_state.json b/checkpoint-1400/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..2710ebdac91a4f3b484498405b61ea49a90a6cd9 --- /dev/null +++ b/checkpoint-1400/trainer_state.json @@ -0,0 +1,538 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.7, + "eval_steps": 100, + "global_step": 1400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0125, + "grad_norm": 0.08136817067861557, + "learning_rate": 0.0004, + "loss": 1.158, + "step": 25 + }, + { + "epoch": 0.025, + "grad_norm": 0.0653829574584961, + "learning_rate": 0.0004998852503731983, + "loss": 1.0957, + "step": 50 + }, + { + "epoch": 0.0375, + "grad_norm": 0.11592712253332138, + "learning_rate": 0.0004993848168027977, + "loss": 0.9276, + "step": 75 + }, + { + "epoch": 0.05, + "grad_norm": 0.08926476538181305, + "learning_rate": 0.0004984880506341147, + "loss": 1.0337, + "step": 100 + }, + { + "epoch": 0.05, + "eval_loss": 0.9111642837524414, + "eval_runtime": 843.8058, + "eval_samples_per_second": 1.3, + "eval_steps_per_second": 0.021, + "step": 100 + }, + { + "epoch": 0.0625, + "grad_norm": 0.08632034063339233, + "learning_rate": 0.0004971963770447935, + "loss": 1.0219, + "step": 125 + }, + { + "epoch": 0.075, + "grad_norm": 0.12949417531490326, + "learning_rate": 0.0004955118488155782, + "loss": 0.784, + "step": 150 + }, + { + "epoch": 0.0875, + "grad_norm": 0.09222520887851715, + "learning_rate": 0.0004934371430679492, + "loss": 1.0, + "step": 175 + }, + { + "epoch": 0.1, + "grad_norm": 0.10075929015874863, + "learning_rate": 0.0004909755570095319, + "loss": 0.9617, + "step": 200 + }, + { + "epoch": 0.1, + "eval_loss": 0.9380430579185486, + "eval_runtime": 856.156, + "eval_samples_per_second": 1.281, + "eval_steps_per_second": 0.021, + "step": 200 + }, + { + "epoch": 0.1125, + "grad_norm": 0.09567111730575562, + "learning_rate": 0.0004881310026940389, + "loss": 0.7051, + "step": 225 + }, + { + "epoch": 0.125, + "grad_norm": 0.09723825007677078, + "learning_rate": 0.0004849080008040734, + "loss": 0.9906, + "step": 250 + }, + { + "epoch": 0.1375, + "grad_norm": 0.09430444985628128, + "learning_rate": 0.00048131167346667446, + "loss": 0.9113, + "step": 275 + }, + { + "epoch": 0.15, + "grad_norm": 0.09439756721258163, + "learning_rate": 0.00047734773611302284, + "loss": 0.674, + "step": 300 + }, + { + "epoch": 0.15, + "eval_loss": 0.9544369578361511, + "eval_runtime": 850.9461, + "eval_samples_per_second": 1.289, + "eval_steps_per_second": 0.021, + "step": 300 + }, + { + "epoch": 0.1625, + "grad_norm": 0.10961435735225677, + "learning_rate": 0.0004730224883952422, + "loss": 0.9701, + "step": 325 + }, + { + "epoch": 0.175, + "grad_norm": 0.09628895670175552, + "learning_rate": 0.0004683428041747334, + "loss": 0.8976, + "step": 350 + }, + { + "epoch": 0.1875, + "grad_norm": 0.11085063964128494, + "learning_rate": 0.0004633161205979517, + "loss": 0.6683, + "step": 375 + }, + { + "epoch": 0.2, + "grad_norm": 0.10140710324048996, + "learning_rate": 0.0004579504262769877, + "loss": 0.9373, + "step": 400 + }, + { + "epoch": 0.2, + "eval_loss": 0.968166172504425, + "eval_runtime": 853.0258, + "eval_samples_per_second": 1.286, + "eval_steps_per_second": 0.021, + "step": 400 + }, + { + "epoch": 0.2125, + "grad_norm": 0.13868793845176697, + "learning_rate": 0.0004522542485937369, + "loss": 0.8822, + "step": 425 + }, + { + "epoch": 0.225, + "grad_norm": 0.11853040754795074, + "learning_rate": 0.00044623664014783386, + "loss": 0.6483, + "step": 450 + }, + { + "epoch": 0.2375, + "grad_norm": 0.11668186634778976, + "learning_rate": 0.00043990716436988924, + "loss": 0.9374, + "step": 475 + }, + { + "epoch": 0.25, + "grad_norm": 0.11911585181951523, + "learning_rate": 0.0004332758803228925, + "loss": 0.8434, + "step": 500 + }, + { + "epoch": 0.25, + "eval_loss": 0.9753186702728271, + "eval_runtime": 851.2812, + "eval_samples_per_second": 1.289, + "eval_steps_per_second": 0.021, + "step": 500 + }, + { + "epoch": 0.2625, + "grad_norm": 0.09807440638542175, + "learning_rate": 0.00042635332671593575, + "loss": 0.6661, + "step": 525 + }, + { + "epoch": 0.275, + "grad_norm": 0.09675773978233337, + "learning_rate": 0.00041915050515566445, + "loss": 0.8999, + "step": 550 + }, + { + "epoch": 0.2875, + "grad_norm": 0.0979737937450409, + "learning_rate": 0.00041167886266207167, + "loss": 0.8616, + "step": 575 + }, + { + "epoch": 0.3, + "grad_norm": 0.08990875631570816, + "learning_rate": 0.0004039502734764241, + "loss": 0.8167, + "step": 600 + }, + { + "epoch": 0.3, + "eval_loss": 0.9926204681396484, + "eval_runtime": 851.6484, + "eval_samples_per_second": 1.288, + "eval_steps_per_second": 0.021, + "step": 600 + }, + { + "epoch": 0.3125, + "grad_norm": 0.09505137801170349, + "learning_rate": 0.0003959770201902294, + "loss": 0.8191, + "step": 625 + }, + { + "epoch": 0.325, + "grad_norm": 0.09105059504508972, + "learning_rate": 0.0003877717742252371, + "loss": 0.7203, + "step": 650 + }, + { + "epoch": 0.3375, + "grad_norm": 0.09274734556674957, + "learning_rate": 0.00037934757569549495, + "loss": 0.8154, + "step": 675 + }, + { + "epoch": 0.35, + "grad_norm": 0.10026416182518005, + "learning_rate": 0.00037071781268346345, + "loss": 0.8336, + "step": 700 + }, + { + "epoch": 0.35, + "eval_loss": 0.9959912896156311, + "eval_runtime": 849.7662, + "eval_samples_per_second": 1.291, + "eval_steps_per_second": 0.021, + "step": 700 + }, + { + "epoch": 0.3625, + "grad_norm": 0.09517039358615875, + "learning_rate": 0.00036189619996312495, + "loss": 0.6845, + "step": 725 + }, + { + "epoch": 0.375, + "grad_norm": 0.11264201998710632, + "learning_rate": 0.00035289675720390174, + "loss": 0.8445, + "step": 750 + }, + { + "epoch": 0.3875, + "grad_norm": 0.0979958102107048, + "learning_rate": 0.00034373378669002105, + "loss": 0.8269, + "step": 775 + }, + { + "epoch": 0.4, + "grad_norm": 0.10223093628883362, + "learning_rate": 0.00033442185059073706, + "loss": 0.6517, + "step": 800 + }, + { + "epoch": 0.4, + "eval_loss": 1.006990671157837, + "eval_runtime": 845.1184, + "eval_samples_per_second": 1.298, + "eval_steps_per_second": 0.021, + "step": 800 + }, + { + "epoch": 0.4125, + "grad_norm": 0.09464213252067566, + "learning_rate": 0.00032497574781753367, + "loss": 0.8455, + "step": 825 + }, + { + "epoch": 0.425, + "grad_norm": 0.10187377035617828, + "learning_rate": 0.000315410490505086, + "loss": 0.8217, + "step": 850 + }, + { + "epoch": 0.4375, + "grad_norm": 0.09793733805418015, + "learning_rate": 0.0003057412801533589, + "loss": 0.6218, + "step": 875 + }, + { + "epoch": 0.45, + "grad_norm": 0.09902466833591461, + "learning_rate": 0.0002959834834687587, + "loss": 0.869, + "step": 900 + }, + { + "epoch": 0.45, + "eval_loss": 1.0097707509994507, + "eval_runtime": 847.3981, + "eval_samples_per_second": 1.295, + "eval_steps_per_second": 0.021, + "step": 900 + }, + { + "epoch": 0.4625, + "grad_norm": 0.10546339303255081, + "learning_rate": 0.00028615260794273236, + "loss": 0.8204, + "step": 925 + }, + { + "epoch": 0.475, + "grad_norm": 0.10600671917200089, + "learning_rate": 0.00027626427720662416, + "loss": 0.5917, + "step": 950 + }, + { + "epoch": 0.4875, + "grad_norm": 0.09627766162157059, + "learning_rate": 0.00026633420620195917, + "loss": 0.8667, + "step": 975 + }, + { + "epoch": 0.5, + "grad_norm": 0.09961821138858795, + "learning_rate": 0.00025637817620561263, + "loss": 0.8215, + "step": 1000 + }, + { + "epoch": 0.5, + "eval_loss": 1.023415446281433, + "eval_runtime": 844.006, + "eval_samples_per_second": 1.3, + "eval_steps_per_second": 0.021, + "step": 1000 + }, + { + "epoch": 0.5125, + "grad_norm": 0.09977111220359802, + "learning_rate": 0.0002464120097495559, + "loss": 0.5829, + "step": 1025 + }, + { + "epoch": 0.525, + "grad_norm": 0.10276953876018524, + "learning_rate": 0.00023645154547503855, + "loss": 0.8857, + "step": 1050 + }, + { + "epoch": 0.5375, + "grad_norm": 0.10077586770057678, + "learning_rate": 0.00022651261296116894, + "loss": 0.8018, + "step": 1075 + }, + { + "epoch": 0.55, + "grad_norm": 0.09652134776115417, + "learning_rate": 0.00021661100756789666, + "loss": 0.561, + "step": 1100 + }, + { + "epoch": 0.55, + "eval_loss": 1.0231441259384155, + "eval_runtime": 845.0138, + "eval_samples_per_second": 1.298, + "eval_steps_per_second": 0.021, + "step": 1100 + }, + { + "epoch": 0.5625, + "grad_norm": 0.0990302637219429, + "learning_rate": 0.00020676246533337764, + "loss": 0.9092, + "step": 1125 + }, + { + "epoch": 0.575, + "grad_norm": 0.11152709275484085, + "learning_rate": 0.00019698263796561526, + "loss": 0.8159, + "step": 1150 + }, + { + "epoch": 0.5875, + "grad_norm": 0.10092920064926147, + "learning_rate": 0.00018728706796812333, + "loss": 0.7329, + "step": 1175 + }, + { + "epoch": 0.6, + "grad_norm": 0.10138432681560516, + "learning_rate": 0.00017769116393914037, + "loss": 0.7873, + "step": 1200 + }, + { + "epoch": 0.6, + "eval_loss": 1.0180704593658447, + "eval_runtime": 843.0775, + "eval_samples_per_second": 1.301, + "eval_steps_per_second": 0.021, + "step": 1200 + }, + { + "epoch": 0.6125, + "grad_norm": 0.10279645770788193, + "learning_rate": 0.00016821017608365264, + "loss": 0.7186, + "step": 1225 + }, + { + "epoch": 0.625, + "grad_norm": 0.10766831040382385, + "learning_rate": 0.00015885917197714112, + "loss": 0.7201, + "step": 1250 + }, + { + "epoch": 0.6375, + "grad_norm": 0.10177863389253616, + "learning_rate": 0.00014965301261957238, + "loss": 0.8009, + "step": 1275 + }, + { + "epoch": 0.65, + "grad_norm": 0.0983508974313736, + "learning_rate": 0.00014060632881768558, + "loss": 0.7023, + "step": 1300 + }, + { + "epoch": 0.65, + "eval_loss": 1.0266674757003784, + "eval_runtime": 854.7884, + "eval_samples_per_second": 1.283, + "eval_steps_per_second": 0.021, + "step": 1300 + }, + { + "epoch": 0.6625, + "grad_norm": 0.10579918324947357, + "learning_rate": 0.00013173349793311424, + "loss": 0.7624, + "step": 1325 + }, + { + "epoch": 0.675, + "grad_norm": 0.10350169986486435, + "learning_rate": 0.0001230486210332916, + "loss": 0.7857, + "step": 1350 + }, + { + "epoch": 0.6875, + "grad_norm": 0.10701841115951538, + "learning_rate": 0.00011456550048145536, + "loss": 0.6771, + "step": 1375 + }, + { + "epoch": 0.7, + "grad_norm": 0.10333641618490219, + "learning_rate": 0.00010629761800136473, + "loss": 0.7669, + "step": 1400 + }, + { + "epoch": 0.7, + "eval_loss": 1.0311139822006226, + "eval_runtime": 850.5438, + "eval_samples_per_second": 1.29, + "eval_steps_per_second": 0.021, + "step": 1400 + } + ], + "logging_steps": 25, + "max_steps": 2000, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.4072483538167398e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1500/optimizer_0/.metadata b/checkpoint-1500/optimizer_0/.metadata new file mode 100644 index 0000000000000000000000000000000000000000..099952cf2a685c5cde1ef0948a8a50ee2df16323 --- /dev/null +++ b/checkpoint-1500/optimizer_0/.metadata @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1be551ea45e7adff15d56ec1b228f69111c643574fde72564b1d6df3a45030b4 +size 438490 diff --git a/checkpoint-1500/optimizer_0/__0_0.distcp b/checkpoint-1500/optimizer_0/__0_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..f56702621322b2bd5a748d6afe199300b4bddfd4 --- /dev/null +++ b/checkpoint-1500/optimizer_0/__0_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:43f45e20c359b21c9307b2303131ca522a30a86907cd7373fda715ba7139a74f +size 2980252 diff --git a/checkpoint-1500/optimizer_0/__1_0.distcp b/checkpoint-1500/optimizer_0/__1_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..3a71ecceaef6eca2ff7a95e5b5f7e3c70d92aec7 --- /dev/null +++ b/checkpoint-1500/optimizer_0/__1_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1dc7cd55c1364d5bd7b5703d8053f6b883040ef702706980103d993357ab7cc +size 2997320 diff --git a/checkpoint-1500/optimizer_0/__2_0.distcp b/checkpoint-1500/optimizer_0/__2_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..1c246f08e566211688b6dbdb87b2583786da0f19 --- /dev/null +++ b/checkpoint-1500/optimizer_0/__2_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7fc2c1b100125b809b2a68391ee5d53a8adad96e520a40bcacd7ab9753a082a +size 2997320 diff --git a/checkpoint-1500/optimizer_0/__3_0.distcp b/checkpoint-1500/optimizer_0/__3_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..7dd84a12cce8da30d5476bf537784a142d4dc7d4 --- /dev/null +++ b/checkpoint-1500/optimizer_0/__3_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b3ac97225b8de92580ffbe057ef6e4582cb6aafb027cb42c3b003c703e3d497 +size 2997320 diff --git a/checkpoint-1500/optimizer_0/__4_0.distcp b/checkpoint-1500/optimizer_0/__4_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..42275577e00437186bc2a3f797d3446d5f853b34 --- /dev/null +++ b/checkpoint-1500/optimizer_0/__4_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:570cbb77993a5e16284f5f273371789518584955b19308c193ace92f35b375e2 +size 2997320 diff --git a/checkpoint-1500/optimizer_0/__5_0.distcp b/checkpoint-1500/optimizer_0/__5_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..68f8907295e9f5307ff4a5f51b9e34d7b3d1208c --- /dev/null +++ b/checkpoint-1500/optimizer_0/__5_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a15692f42d5991e9941618763a83b47c8488ace6d7c32ba0f84b445cd3a99fdd +size 2999596 diff --git a/checkpoint-1500/optimizer_0/__6_0.distcp b/checkpoint-1500/optimizer_0/__6_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..4e84f885f96f82a3d2bafcad774cd881ca0f5aa1 --- /dev/null +++ b/checkpoint-1500/optimizer_0/__6_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e47d2946386974298458ef5b129bf4e5925a3e16ba9f0dbfdc715b23e0cc5e33 +size 2998732 diff --git a/checkpoint-1500/optimizer_0/__7_0.distcp b/checkpoint-1500/optimizer_0/__7_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..d6811fc6a7faf0f5ba5f8b8ab638173b9eb1e932 --- /dev/null +++ b/checkpoint-1500/optimizer_0/__7_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aae99cc88cf8808b0ec7cf7f1241cf3a2c17be60c52c2b4dc02d7290474f174b +size 3005708 diff --git a/checkpoint-1500/pytorch_model_fsdp_0/.metadata b/checkpoint-1500/pytorch_model_fsdp_0/.metadata new file mode 100644 index 0000000000000000000000000000000000000000..4c82995604e30948483a9487ab1c919134e80ee5 --- /dev/null +++ b/checkpoint-1500/pytorch_model_fsdp_0/.metadata @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41d5945e8c0aaf16391ac07f8a361f3843afc819f93fb121ecb014bf9fd5e632 +size 170759 diff --git a/checkpoint-1500/pytorch_model_fsdp_0/__0_0.distcp b/checkpoint-1500/pytorch_model_fsdp_0/__0_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..d7a24681126c62cdb7785d6e5386b329a255092f --- /dev/null +++ b/checkpoint-1500/pytorch_model_fsdp_0/__0_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f871b0fa15d376ad52a2ddd27ebb91a18edbaac1982d7b143eb89449a4fcf898 +size 1489536 diff --git a/checkpoint-1500/pytorch_model_fsdp_0/__1_0.distcp b/checkpoint-1500/pytorch_model_fsdp_0/__1_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..02f1dd71044269bf10b123d99033c1c404615edd --- /dev/null +++ b/checkpoint-1500/pytorch_model_fsdp_0/__1_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b7e6b9d3a48701a8f859b31a2d60e3bea74368ef6efe6518174d5e7a4c46c09 +size 1489536 diff --git a/checkpoint-1500/pytorch_model_fsdp_0/__2_0.distcp b/checkpoint-1500/pytorch_model_fsdp_0/__2_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..f02805a678ea6cfe6cca729bddd494b6312adc9d --- /dev/null +++ b/checkpoint-1500/pytorch_model_fsdp_0/__2_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ce5bd44f7bf5c55ce2ba53f77d7d680a28d7f510a037590b3040e572bd5481d +size 1489536 diff --git a/checkpoint-1500/pytorch_model_fsdp_0/__3_0.distcp b/checkpoint-1500/pytorch_model_fsdp_0/__3_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..bf15b79139182876efd3ea1e90516d9f9833580e --- /dev/null +++ b/checkpoint-1500/pytorch_model_fsdp_0/__3_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:001eea2fe01514bed3b9561c82a01ca51df104e57fe26680138bedc6eff4ace8 +size 1489536 diff --git a/checkpoint-1500/pytorch_model_fsdp_0/__4_0.distcp b/checkpoint-1500/pytorch_model_fsdp_0/__4_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..9ce9752d72df64fc5b750bf4e7613ba092e1bc4c --- /dev/null +++ b/checkpoint-1500/pytorch_model_fsdp_0/__4_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bcd6094330b0c7f552a5c48e0142402269439d60d2a3fff87461ffb42ce13462 +size 1489536 diff --git a/checkpoint-1500/pytorch_model_fsdp_0/__5_0.distcp b/checkpoint-1500/pytorch_model_fsdp_0/__5_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..327ce990688520a1fb7bba3d3f0f8884f8ec99dd --- /dev/null +++ b/checkpoint-1500/pytorch_model_fsdp_0/__5_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5373579dfab8ff7d7c2504f333b4d983f128fa8335b3f0169ab00cb2c10d70f +size 1489536 diff --git a/checkpoint-1500/pytorch_model_fsdp_0/__6_0.distcp b/checkpoint-1500/pytorch_model_fsdp_0/__6_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..a97f745511d6f685b395406abb9ac1067f7e847d --- /dev/null +++ b/checkpoint-1500/pytorch_model_fsdp_0/__6_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6494ba3ca383bf8715f59ddeeb10dac558fc0948ad06ec52907188fc879d32f +size 1489536 diff --git a/checkpoint-1500/pytorch_model_fsdp_0/__7_0.distcp b/checkpoint-1500/pytorch_model_fsdp_0/__7_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..8f9597e16d7ccf3d401cec21cc0f9799e8e9fb77 --- /dev/null +++ b/checkpoint-1500/pytorch_model_fsdp_0/__7_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9845f9acdd648fabaf75af161a563a0427924b8516925b97b2bca168bcac8bf2 +size 1489536 diff --git a/checkpoint-1500/rng_state_0.pth b/checkpoint-1500/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..e54be6e459745bf20a83cb622e1c86cc10ce8522 --- /dev/null +++ b/checkpoint-1500/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4433e68a7ecfbf84d5b59193fed5be299b3c6bd9661c1b1b3d68a8ab696604cb +size 15920 diff --git a/checkpoint-1500/rng_state_1.pth b/checkpoint-1500/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..6777150f01befaef1d07adc26bb66fa963a27b17 --- /dev/null +++ b/checkpoint-1500/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d39875d8cadf9928a3b808d6ef72fe52aae4e255cd7cd4ad5e8e242f7fd2c7fc +size 15984 diff --git a/checkpoint-1500/rng_state_2.pth b/checkpoint-1500/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..b153c00669542a7713a55886f7e0b38fbfa8b6c3 --- /dev/null +++ b/checkpoint-1500/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9cbd45a2db6acfac5caa9d80d63ece12503b0f63fbb7f6b3b0b69084bbef4738 +size 15984 diff --git a/checkpoint-1500/rng_state_3.pth b/checkpoint-1500/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..7d30286c8d6304d83a82b86df629f6de262036a3 --- /dev/null +++ b/checkpoint-1500/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c38724293e33d297e0505ec90e3d2ef0c7a688bdc8ebac4eda63333054d3cf9 +size 15984 diff --git a/checkpoint-1500/rng_state_4.pth b/checkpoint-1500/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..e63dc44d1acdf8dbd85132331ef3498347f00867 --- /dev/null +++ b/checkpoint-1500/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa36cca0956be05ecb2b267f55f966b640b7231db368e3dd74d8f88fbc57f27a +size 15984 diff --git a/checkpoint-1500/rng_state_5.pth b/checkpoint-1500/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..81c44d72b83ef081e691da62e0cedc7da1972ce0 --- /dev/null +++ b/checkpoint-1500/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fac3b399bca51abee30dd72b7fbd3555191fac9a8c694b9b8f6c0c1bd78ab9db +size 15984 diff --git a/checkpoint-1500/rng_state_6.pth b/checkpoint-1500/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..20d4883dd34f64393221674ff342513408662cdc --- /dev/null +++ b/checkpoint-1500/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:edfbdf6345e32141c8b4bce93a6ff87c74295d078ebc83b6b99aa4e9a9d59619 +size 15984 diff --git a/checkpoint-1500/rng_state_7.pth b/checkpoint-1500/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..d982129db3bcab3f2d3ffcae82308af24371c19a --- /dev/null +++ b/checkpoint-1500/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e282b4b95e56ee90ebb39bc1215807db12589d83ab676f7523b66d0ddf085dee +size 15984 diff --git a/checkpoint-1500/scheduler.pt b/checkpoint-1500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..0e0b92a890ced6e2b96196734a43486800291efc --- /dev/null +++ b/checkpoint-1500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06e65cc13c66c2828cdb8f114dee592c488900c0a56fd072ff729fc38f989e26 +size 1064 diff --git a/checkpoint-1500/trainer_state.json b/checkpoint-1500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..ceaf4ffd0903c87968b9511be3565a7f5949df9a --- /dev/null +++ b/checkpoint-1500/trainer_state.json @@ -0,0 +1,574 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.75, + "eval_steps": 100, + "global_step": 1500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0125, + "grad_norm": 0.08136817067861557, + "learning_rate": 0.0004, + "loss": 1.158, + "step": 25 + }, + { + "epoch": 0.025, + "grad_norm": 0.0653829574584961, + "learning_rate": 0.0004998852503731983, + "loss": 1.0957, + "step": 50 + }, + { + "epoch": 0.0375, + "grad_norm": 0.11592712253332138, + "learning_rate": 0.0004993848168027977, + "loss": 0.9276, + "step": 75 + }, + { + "epoch": 0.05, + "grad_norm": 0.08926476538181305, + "learning_rate": 0.0004984880506341147, + "loss": 1.0337, + "step": 100 + }, + { + "epoch": 0.05, + "eval_loss": 0.9111642837524414, + "eval_runtime": 843.8058, + "eval_samples_per_second": 1.3, + "eval_steps_per_second": 0.021, + "step": 100 + }, + { + "epoch": 0.0625, + "grad_norm": 0.08632034063339233, + "learning_rate": 0.0004971963770447935, + "loss": 1.0219, + "step": 125 + }, + { + "epoch": 0.075, + "grad_norm": 0.12949417531490326, + "learning_rate": 0.0004955118488155782, + "loss": 0.784, + "step": 150 + }, + { + "epoch": 0.0875, + "grad_norm": 0.09222520887851715, + "learning_rate": 0.0004934371430679492, + "loss": 1.0, + "step": 175 + }, + { + "epoch": 0.1, + "grad_norm": 0.10075929015874863, + "learning_rate": 0.0004909755570095319, + "loss": 0.9617, + "step": 200 + }, + { + "epoch": 0.1, + "eval_loss": 0.9380430579185486, + "eval_runtime": 856.156, + "eval_samples_per_second": 1.281, + "eval_steps_per_second": 0.021, + "step": 200 + }, + { + "epoch": 0.1125, + "grad_norm": 0.09567111730575562, + "learning_rate": 0.0004881310026940389, + "loss": 0.7051, + "step": 225 + }, + { + "epoch": 0.125, + "grad_norm": 0.09723825007677078, + "learning_rate": 0.0004849080008040734, + "loss": 0.9906, + "step": 250 + }, + { + "epoch": 0.1375, + "grad_norm": 0.09430444985628128, + "learning_rate": 0.00048131167346667446, + "loss": 0.9113, + "step": 275 + }, + { + "epoch": 0.15, + "grad_norm": 0.09439756721258163, + "learning_rate": 0.00047734773611302284, + "loss": 0.674, + "step": 300 + }, + { + "epoch": 0.15, + "eval_loss": 0.9544369578361511, + "eval_runtime": 850.9461, + "eval_samples_per_second": 1.289, + "eval_steps_per_second": 0.021, + "step": 300 + }, + { + "epoch": 0.1625, + "grad_norm": 0.10961435735225677, + "learning_rate": 0.0004730224883952422, + "loss": 0.9701, + "step": 325 + }, + { + "epoch": 0.175, + "grad_norm": 0.09628895670175552, + "learning_rate": 0.0004683428041747334, + "loss": 0.8976, + "step": 350 + }, + { + "epoch": 0.1875, + "grad_norm": 0.11085063964128494, + "learning_rate": 0.0004633161205979517, + "loss": 0.6683, + "step": 375 + }, + { + "epoch": 0.2, + "grad_norm": 0.10140710324048996, + "learning_rate": 0.0004579504262769877, + "loss": 0.9373, + "step": 400 + }, + { + "epoch": 0.2, + "eval_loss": 0.968166172504425, + "eval_runtime": 853.0258, + "eval_samples_per_second": 1.286, + "eval_steps_per_second": 0.021, + "step": 400 + }, + { + "epoch": 0.2125, + "grad_norm": 0.13868793845176697, + "learning_rate": 0.0004522542485937369, + "loss": 0.8822, + "step": 425 + }, + { + "epoch": 0.225, + "grad_norm": 0.11853040754795074, + "learning_rate": 0.00044623664014783386, + "loss": 0.6483, + "step": 450 + }, + { + "epoch": 0.2375, + "grad_norm": 0.11668186634778976, + "learning_rate": 0.00043990716436988924, + "loss": 0.9374, + "step": 475 + }, + { + "epoch": 0.25, + "grad_norm": 0.11911585181951523, + "learning_rate": 0.0004332758803228925, + "loss": 0.8434, + "step": 500 + }, + { + "epoch": 0.25, + "eval_loss": 0.9753186702728271, + "eval_runtime": 851.2812, + "eval_samples_per_second": 1.289, + "eval_steps_per_second": 0.021, + "step": 500 + }, + { + "epoch": 0.2625, + "grad_norm": 0.09807440638542175, + "learning_rate": 0.00042635332671593575, + "loss": 0.6661, + "step": 525 + }, + { + "epoch": 0.275, + "grad_norm": 0.09675773978233337, + "learning_rate": 0.00041915050515566445, + "loss": 0.8999, + "step": 550 + }, + { + "epoch": 0.2875, + "grad_norm": 0.0979737937450409, + "learning_rate": 0.00041167886266207167, + "loss": 0.8616, + "step": 575 + }, + { + "epoch": 0.3, + "grad_norm": 0.08990875631570816, + "learning_rate": 0.0004039502734764241, + "loss": 0.8167, + "step": 600 + }, + { + "epoch": 0.3, + "eval_loss": 0.9926204681396484, + "eval_runtime": 851.6484, + "eval_samples_per_second": 1.288, + "eval_steps_per_second": 0.021, + "step": 600 + }, + { + "epoch": 0.3125, + "grad_norm": 0.09505137801170349, + "learning_rate": 0.0003959770201902294, + "loss": 0.8191, + "step": 625 + }, + { + "epoch": 0.325, + "grad_norm": 0.09105059504508972, + "learning_rate": 0.0003877717742252371, + "loss": 0.7203, + "step": 650 + }, + { + "epoch": 0.3375, + "grad_norm": 0.09274734556674957, + "learning_rate": 0.00037934757569549495, + "loss": 0.8154, + "step": 675 + }, + { + "epoch": 0.35, + "grad_norm": 0.10026416182518005, + "learning_rate": 0.00037071781268346345, + "loss": 0.8336, + "step": 700 + }, + { + "epoch": 0.35, + "eval_loss": 0.9959912896156311, + "eval_runtime": 849.7662, + "eval_samples_per_second": 1.291, + "eval_steps_per_second": 0.021, + "step": 700 + }, + { + "epoch": 0.3625, + "grad_norm": 0.09517039358615875, + "learning_rate": 0.00036189619996312495, + "loss": 0.6845, + "step": 725 + }, + { + "epoch": 0.375, + "grad_norm": 0.11264201998710632, + "learning_rate": 0.00035289675720390174, + "loss": 0.8445, + "step": 750 + }, + { + "epoch": 0.3875, + "grad_norm": 0.0979958102107048, + "learning_rate": 0.00034373378669002105, + "loss": 0.8269, + "step": 775 + }, + { + "epoch": 0.4, + "grad_norm": 0.10223093628883362, + "learning_rate": 0.00033442185059073706, + "loss": 0.6517, + "step": 800 + }, + { + "epoch": 0.4, + "eval_loss": 1.006990671157837, + "eval_runtime": 845.1184, + "eval_samples_per_second": 1.298, + "eval_steps_per_second": 0.021, + "step": 800 + }, + { + "epoch": 0.4125, + "grad_norm": 0.09464213252067566, + "learning_rate": 0.00032497574781753367, + "loss": 0.8455, + "step": 825 + }, + { + "epoch": 0.425, + "grad_norm": 0.10187377035617828, + "learning_rate": 0.000315410490505086, + "loss": 0.8217, + "step": 850 + }, + { + "epoch": 0.4375, + "grad_norm": 0.09793733805418015, + "learning_rate": 0.0003057412801533589, + "loss": 0.6218, + "step": 875 + }, + { + "epoch": 0.45, + "grad_norm": 0.09902466833591461, + "learning_rate": 0.0002959834834687587, + "loss": 0.869, + "step": 900 + }, + { + "epoch": 0.45, + "eval_loss": 1.0097707509994507, + "eval_runtime": 847.3981, + "eval_samples_per_second": 1.295, + "eval_steps_per_second": 0.021, + "step": 900 + }, + { + "epoch": 0.4625, + "grad_norm": 0.10546339303255081, + "learning_rate": 0.00028615260794273236, + "loss": 0.8204, + "step": 925 + }, + { + "epoch": 0.475, + "grad_norm": 0.10600671917200089, + "learning_rate": 0.00027626427720662416, + "loss": 0.5917, + "step": 950 + }, + { + "epoch": 0.4875, + "grad_norm": 0.09627766162157059, + "learning_rate": 0.00026633420620195917, + "loss": 0.8667, + "step": 975 + }, + { + "epoch": 0.5, + "grad_norm": 0.09961821138858795, + "learning_rate": 0.00025637817620561263, + "loss": 0.8215, + "step": 1000 + }, + { + "epoch": 0.5, + "eval_loss": 1.023415446281433, + "eval_runtime": 844.006, + "eval_samples_per_second": 1.3, + "eval_steps_per_second": 0.021, + "step": 1000 + }, + { + "epoch": 0.5125, + "grad_norm": 0.09977111220359802, + "learning_rate": 0.0002464120097495559, + "loss": 0.5829, + "step": 1025 + }, + { + "epoch": 0.525, + "grad_norm": 0.10276953876018524, + "learning_rate": 0.00023645154547503855, + "loss": 0.8857, + "step": 1050 + }, + { + "epoch": 0.5375, + "grad_norm": 0.10077586770057678, + "learning_rate": 0.00022651261296116894, + "loss": 0.8018, + "step": 1075 + }, + { + "epoch": 0.55, + "grad_norm": 0.09652134776115417, + "learning_rate": 0.00021661100756789666, + "loss": 0.561, + "step": 1100 + }, + { + "epoch": 0.55, + "eval_loss": 1.0231441259384155, + "eval_runtime": 845.0138, + "eval_samples_per_second": 1.298, + "eval_steps_per_second": 0.021, + "step": 1100 + }, + { + "epoch": 0.5625, + "grad_norm": 0.0990302637219429, + "learning_rate": 0.00020676246533337764, + "loss": 0.9092, + "step": 1125 + }, + { + "epoch": 0.575, + "grad_norm": 0.11152709275484085, + "learning_rate": 0.00019698263796561526, + "loss": 0.8159, + "step": 1150 + }, + { + "epoch": 0.5875, + "grad_norm": 0.10092920064926147, + "learning_rate": 0.00018728706796812333, + "loss": 0.7329, + "step": 1175 + }, + { + "epoch": 0.6, + "grad_norm": 0.10138432681560516, + "learning_rate": 0.00017769116393914037, + "loss": 0.7873, + "step": 1200 + }, + { + "epoch": 0.6, + "eval_loss": 1.0180704593658447, + "eval_runtime": 843.0775, + "eval_samples_per_second": 1.301, + "eval_steps_per_second": 0.021, + "step": 1200 + }, + { + "epoch": 0.6125, + "grad_norm": 0.10279645770788193, + "learning_rate": 0.00016821017608365264, + "loss": 0.7186, + "step": 1225 + }, + { + "epoch": 0.625, + "grad_norm": 0.10766831040382385, + "learning_rate": 0.00015885917197714112, + "loss": 0.7201, + "step": 1250 + }, + { + "epoch": 0.6375, + "grad_norm": 0.10177863389253616, + "learning_rate": 0.00014965301261957238, + "loss": 0.8009, + "step": 1275 + }, + { + "epoch": 0.65, + "grad_norm": 0.0983508974313736, + "learning_rate": 0.00014060632881768558, + "loss": 0.7023, + "step": 1300 + }, + { + "epoch": 0.65, + "eval_loss": 1.0266674757003784, + "eval_runtime": 854.7884, + "eval_samples_per_second": 1.283, + "eval_steps_per_second": 0.021, + "step": 1300 + }, + { + "epoch": 0.6625, + "grad_norm": 0.10579918324947357, + "learning_rate": 0.00013173349793311424, + "loss": 0.7624, + "step": 1325 + }, + { + "epoch": 0.675, + "grad_norm": 0.10350169986486435, + "learning_rate": 0.0001230486210332916, + "loss": 0.7857, + "step": 1350 + }, + { + "epoch": 0.6875, + "grad_norm": 0.10701841115951538, + "learning_rate": 0.00011456550048145536, + "loss": 0.6771, + "step": 1375 + }, + { + "epoch": 0.7, + "grad_norm": 0.10333641618490219, + "learning_rate": 0.00010629761800136473, + "loss": 0.7669, + "step": 1400 + }, + { + "epoch": 0.7, + "eval_loss": 1.0311139822006226, + "eval_runtime": 850.5438, + "eval_samples_per_second": 1.29, + "eval_steps_per_second": 0.021, + "step": 1400 + }, + { + "epoch": 0.7125, + "grad_norm": 0.10531915724277496, + "learning_rate": 9.82581132515907e-05, + "loss": 0.7869, + "step": 1425 + }, + { + "epoch": 0.725, + "grad_norm": 0.08779594302177429, + "learning_rate": 9.045976294343145e-05, + "loss": 0.6651, + "step": 1450 + }, + { + "epoch": 0.7375, + "grad_norm": 0.09893123060464859, + "learning_rate": 8.291496053563699e-05, + "loss": 0.7938, + "step": 1475 + }, + { + "epoch": 0.75, + "grad_norm": 0.09708540141582489, + "learning_rate": 7.563569653821565e-05, + "loss": 0.7873, + "step": 1500 + }, + { + "epoch": 0.75, + "eval_loss": 1.0259246826171875, + "eval_runtime": 843.8488, + "eval_samples_per_second": 1.3, + "eval_steps_per_second": 0.021, + "step": 1500 + } + ], + "logging_steps": 25, + "max_steps": 2000, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.5077660933750784e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1600/optimizer_0/.metadata b/checkpoint-1600/optimizer_0/.metadata new file mode 100644 index 0000000000000000000000000000000000000000..5933dbb2aa38d293eb3b98dff545566ad3d5cf73 --- /dev/null +++ b/checkpoint-1600/optimizer_0/.metadata @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:521f4f2a6fe63a810cad99bc137ac14725c202cbd8d0fac046e914b8fef50c3d +size 438490 diff --git a/checkpoint-1600/optimizer_0/__0_0.distcp b/checkpoint-1600/optimizer_0/__0_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..7355fc67b8e3dd202305afa6ad71d28cd3fa5dcd --- /dev/null +++ b/checkpoint-1600/optimizer_0/__0_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db07453499f60d0bffc6d271468a4f9f4a1e47c5fbbeed7963ae44e83b79cb44 +size 2980252 diff --git a/checkpoint-1600/optimizer_0/__1_0.distcp b/checkpoint-1600/optimizer_0/__1_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..3e66f978339c00e468b983a7500a70969c75b0a2 --- /dev/null +++ b/checkpoint-1600/optimizer_0/__1_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06771d7ba4c8f2393618ff668e6d70b112758a402b42732a66a8e5d2f6ce3a81 +size 2997320 diff --git a/checkpoint-1600/optimizer_0/__2_0.distcp b/checkpoint-1600/optimizer_0/__2_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..ab95846a1c1d5464023b31d6ecf0a9d6b000d26c --- /dev/null +++ b/checkpoint-1600/optimizer_0/__2_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af33d933d7f5b9585027a803ceae2abd233526f403c6bf2a72f851480b41cd1a +size 2997320 diff --git a/checkpoint-1600/optimizer_0/__3_0.distcp b/checkpoint-1600/optimizer_0/__3_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..b654f5a9a9bafdc864f781e87d9f21fd0d86c398 --- /dev/null +++ b/checkpoint-1600/optimizer_0/__3_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d60b10ad8ad06ed37df4e0d1c8b0758172f9f89dadaa924a5f27618431488b4c +size 2997320 diff --git a/checkpoint-1600/optimizer_0/__4_0.distcp b/checkpoint-1600/optimizer_0/__4_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..2165bdb1ae20872fc9a23ede6626cbe48982a0e1 --- /dev/null +++ b/checkpoint-1600/optimizer_0/__4_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1666d7f6230ce09ad2893bac048ee34c50b250818fa2723884ea352bba61755 +size 2997320 diff --git a/checkpoint-1600/optimizer_0/__5_0.distcp b/checkpoint-1600/optimizer_0/__5_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..d2bbe760df9722709d31b8ea0b13758f3a45e107 --- /dev/null +++ b/checkpoint-1600/optimizer_0/__5_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7a3f8abec3cb31c1baefffd8337d9717954a6e291442e66e89ab2221c6622b4 +size 2999596 diff --git a/checkpoint-1600/optimizer_0/__6_0.distcp b/checkpoint-1600/optimizer_0/__6_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..5debad4c6291555dba961eb55d54b4e012017d63 --- /dev/null +++ b/checkpoint-1600/optimizer_0/__6_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ad190174f343faf12df4315744a8e1f5cfaec58ddab91b8c9b9fdc3cfae418a +size 2998732 diff --git a/checkpoint-1600/optimizer_0/__7_0.distcp b/checkpoint-1600/optimizer_0/__7_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..4d5fcdf88329309abef37adc91141c7d02895df7 --- /dev/null +++ b/checkpoint-1600/optimizer_0/__7_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b26c09c2e0db384f59fff3a0eb0398eb8d8c6ebf4cb91c4540d5ad6b86a4562 +size 3005708 diff --git a/checkpoint-1600/pytorch_model_fsdp_0/.metadata b/checkpoint-1600/pytorch_model_fsdp_0/.metadata new file mode 100644 index 0000000000000000000000000000000000000000..4cf5e6c139b17783113d36a466ee7797cae8cc36 --- /dev/null +++ b/checkpoint-1600/pytorch_model_fsdp_0/.metadata @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47f4fec03b6921228be743ef91733bb2d3e1e4ae394eef13eb3edb11aff8ed02 +size 170759 diff --git a/checkpoint-1600/pytorch_model_fsdp_0/__0_0.distcp b/checkpoint-1600/pytorch_model_fsdp_0/__0_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..16659391c9322fd4c77e0610e43edeb91349546f --- /dev/null +++ b/checkpoint-1600/pytorch_model_fsdp_0/__0_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c682d340b027ab2be88209b93bcfdd8bac8f66c76f734591277fe8485f100da4 +size 1489536 diff --git a/checkpoint-1600/pytorch_model_fsdp_0/__1_0.distcp b/checkpoint-1600/pytorch_model_fsdp_0/__1_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..b7f39fa93c8d13bbd604ed0b451efb27469f61d6 --- /dev/null +++ b/checkpoint-1600/pytorch_model_fsdp_0/__1_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ebb43e3155dd35822962b8cf60c200aea65e1f16bd4e18dfd3aa40df2bf5c2ed +size 1489536 diff --git a/checkpoint-1600/pytorch_model_fsdp_0/__2_0.distcp b/checkpoint-1600/pytorch_model_fsdp_0/__2_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..200164e838a74963866e3e44648c081625b9d0b1 --- /dev/null +++ b/checkpoint-1600/pytorch_model_fsdp_0/__2_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:630168c36e90481500e34b59dd77951de91d40a04c8ea0a7d6fa7c24ef5b7932 +size 1489536 diff --git a/checkpoint-1600/pytorch_model_fsdp_0/__3_0.distcp b/checkpoint-1600/pytorch_model_fsdp_0/__3_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..b804a18b984108ae4eee5880f68b2c8636f759d7 --- /dev/null +++ b/checkpoint-1600/pytorch_model_fsdp_0/__3_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fcfdbc640a228f859b156fd9035afe399195824873f2f2614510b35acdca5c3d +size 1489536 diff --git a/checkpoint-1600/pytorch_model_fsdp_0/__4_0.distcp b/checkpoint-1600/pytorch_model_fsdp_0/__4_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..14a7fcf27180230f9845b0636718a1a2d66246bb --- /dev/null +++ b/checkpoint-1600/pytorch_model_fsdp_0/__4_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22390576ae45017daf438f07bfef3a6b0046139ad7b6aeaaa80a5c3906523df1 +size 1489536 diff --git a/checkpoint-1600/pytorch_model_fsdp_0/__5_0.distcp b/checkpoint-1600/pytorch_model_fsdp_0/__5_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..f879cc27ee895c116d52ec9f67077947fd8164c1 --- /dev/null +++ b/checkpoint-1600/pytorch_model_fsdp_0/__5_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51223e7672d8b13027604b388a243a8f9e48e6a6fc019a54509a77c2a490038b +size 1489536 diff --git a/checkpoint-1600/pytorch_model_fsdp_0/__6_0.distcp b/checkpoint-1600/pytorch_model_fsdp_0/__6_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..eb6513ca47920bbdb9f1bafe86a4ffbf4d419230 --- /dev/null +++ b/checkpoint-1600/pytorch_model_fsdp_0/__6_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b6de7c42e7ff16ee3eb267228330b8c42351a200867f54e7339abbb6c63667c +size 1489536 diff --git a/checkpoint-1600/pytorch_model_fsdp_0/__7_0.distcp b/checkpoint-1600/pytorch_model_fsdp_0/__7_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..faffb24275106ce5f99c944ef2f24b7bd011817d --- /dev/null +++ b/checkpoint-1600/pytorch_model_fsdp_0/__7_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0bfe6f9bf5feb320cde7a2ab2e989734ba10ca0394ef523f477cebf5aa1d30b +size 1489536 diff --git a/checkpoint-1600/rng_state_0.pth b/checkpoint-1600/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..17d1578450e189113f727a3a0632b3bc530af8ab --- /dev/null +++ b/checkpoint-1600/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5183506a2396df9160b89ce86aee871710b836d566fa74da4d65a9ae5ca85552 +size 15920 diff --git a/checkpoint-1600/rng_state_1.pth b/checkpoint-1600/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..bf041f9429507850844a465334e15bbc5f3d3e75 --- /dev/null +++ b/checkpoint-1600/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:faced7ec28436a2990a43a87872462ff8bcaa07f8aca5783ce1ef461c24a5279 +size 15984 diff --git a/checkpoint-1600/rng_state_2.pth b/checkpoint-1600/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..22398f7f5735ca48f9ce9163659699f7d974599e --- /dev/null +++ b/checkpoint-1600/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da04d74ce1ce614a1bfa4966f86cceb96723501a6333ccc0bb669f27e8c29bcf +size 15984 diff --git a/checkpoint-1600/rng_state_3.pth b/checkpoint-1600/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..ec5815d1a61ba4dacc884cd287e0b14f53a85485 --- /dev/null +++ b/checkpoint-1600/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c2da239fb7c10f993c363a19581ac96c62790624254fd05b4f9411a2c0f8280 +size 15984 diff --git a/checkpoint-1600/rng_state_4.pth b/checkpoint-1600/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..c28c849918f4a4c68f2ba6112492313ad5813fe1 --- /dev/null +++ b/checkpoint-1600/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78bc4edab805c57c2ead6d40ffcfc64de8e97b59e297c2a921893b92f5e9296d +size 15984 diff --git a/checkpoint-1600/rng_state_5.pth b/checkpoint-1600/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..8a6f7f47b3f861e06a4b8b62a48916656b67a3bb --- /dev/null +++ b/checkpoint-1600/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28a3d9fee605e6a474ee8a412dc14b0ab906a07cb21430029959b91a634fc6d7 +size 15984 diff --git a/checkpoint-1600/rng_state_6.pth b/checkpoint-1600/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..c5c29843a667219beeb22f4e97dd3ae688fa8fd2 --- /dev/null +++ b/checkpoint-1600/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe4530cf8642eadbeef6e8e3bb17cc51305d0d35a97ce2a8a6457153b532ffd3 +size 15984 diff --git a/checkpoint-1600/rng_state_7.pth b/checkpoint-1600/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..11a550e52cc0ee1bf4b1b0c8a4ec25843b59c9d9 --- /dev/null +++ b/checkpoint-1600/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3f587f7177f6428865c1ea5f5ca96d5cd2d4fe9f7b6b5b251babc5a37b75a0a +size 15984 diff --git a/checkpoint-1600/scheduler.pt b/checkpoint-1600/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..1b1256b470e8c7334822cf94f43e0000f8a16dd9 --- /dev/null +++ b/checkpoint-1600/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28d5f76e918fa0cbbbd298377811314b7fc9c9c89e747f720e3533cb0c69b09c +size 1064 diff --git a/checkpoint-1600/trainer_state.json b/checkpoint-1600/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..fe5afaf7c446311960ee24ef7ea25193ff0375dc --- /dev/null +++ b/checkpoint-1600/trainer_state.json @@ -0,0 +1,610 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.8, + "eval_steps": 100, + "global_step": 1600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0125, + "grad_norm": 0.08136817067861557, + "learning_rate": 0.0004, + "loss": 1.158, + "step": 25 + }, + { + "epoch": 0.025, + "grad_norm": 0.0653829574584961, + "learning_rate": 0.0004998852503731983, + "loss": 1.0957, + "step": 50 + }, + { + "epoch": 0.0375, + "grad_norm": 0.11592712253332138, + "learning_rate": 0.0004993848168027977, + "loss": 0.9276, + "step": 75 + }, + { + "epoch": 0.05, + "grad_norm": 0.08926476538181305, + "learning_rate": 0.0004984880506341147, + "loss": 1.0337, + "step": 100 + }, + { + "epoch": 0.05, + "eval_loss": 0.9111642837524414, + "eval_runtime": 843.8058, + "eval_samples_per_second": 1.3, + "eval_steps_per_second": 0.021, + "step": 100 + }, + { + "epoch": 0.0625, + "grad_norm": 0.08632034063339233, + "learning_rate": 0.0004971963770447935, + "loss": 1.0219, + "step": 125 + }, + { + "epoch": 0.075, + "grad_norm": 0.12949417531490326, + "learning_rate": 0.0004955118488155782, + "loss": 0.784, + "step": 150 + }, + { + "epoch": 0.0875, + "grad_norm": 0.09222520887851715, + "learning_rate": 0.0004934371430679492, + "loss": 1.0, + "step": 175 + }, + { + "epoch": 0.1, + "grad_norm": 0.10075929015874863, + "learning_rate": 0.0004909755570095319, + "loss": 0.9617, + "step": 200 + }, + { + "epoch": 0.1, + "eval_loss": 0.9380430579185486, + "eval_runtime": 856.156, + "eval_samples_per_second": 1.281, + "eval_steps_per_second": 0.021, + "step": 200 + }, + { + "epoch": 0.1125, + "grad_norm": 0.09567111730575562, + "learning_rate": 0.0004881310026940389, + "loss": 0.7051, + "step": 225 + }, + { + "epoch": 0.125, + "grad_norm": 0.09723825007677078, + "learning_rate": 0.0004849080008040734, + "loss": 0.9906, + "step": 250 + }, + { + "epoch": 0.1375, + "grad_norm": 0.09430444985628128, + "learning_rate": 0.00048131167346667446, + "loss": 0.9113, + "step": 275 + }, + { + "epoch": 0.15, + "grad_norm": 0.09439756721258163, + "learning_rate": 0.00047734773611302284, + "loss": 0.674, + "step": 300 + }, + { + "epoch": 0.15, + "eval_loss": 0.9544369578361511, + "eval_runtime": 850.9461, + "eval_samples_per_second": 1.289, + "eval_steps_per_second": 0.021, + "step": 300 + }, + { + "epoch": 0.1625, + "grad_norm": 0.10961435735225677, + "learning_rate": 0.0004730224883952422, + "loss": 0.9701, + "step": 325 + }, + { + "epoch": 0.175, + "grad_norm": 0.09628895670175552, + "learning_rate": 0.0004683428041747334, + "loss": 0.8976, + "step": 350 + }, + { + "epoch": 0.1875, + "grad_norm": 0.11085063964128494, + "learning_rate": 0.0004633161205979517, + "loss": 0.6683, + "step": 375 + }, + { + "epoch": 0.2, + "grad_norm": 0.10140710324048996, + "learning_rate": 0.0004579504262769877, + "loss": 0.9373, + "step": 400 + }, + { + "epoch": 0.2, + "eval_loss": 0.968166172504425, + "eval_runtime": 853.0258, + "eval_samples_per_second": 1.286, + "eval_steps_per_second": 0.021, + "step": 400 + }, + { + "epoch": 0.2125, + "grad_norm": 0.13868793845176697, + "learning_rate": 0.0004522542485937369, + "loss": 0.8822, + "step": 425 + }, + { + "epoch": 0.225, + "grad_norm": 0.11853040754795074, + "learning_rate": 0.00044623664014783386, + "loss": 0.6483, + "step": 450 + }, + { + "epoch": 0.2375, + "grad_norm": 0.11668186634778976, + "learning_rate": 0.00043990716436988924, + "loss": 0.9374, + "step": 475 + }, + { + "epoch": 0.25, + "grad_norm": 0.11911585181951523, + "learning_rate": 0.0004332758803228925, + "loss": 0.8434, + "step": 500 + }, + { + "epoch": 0.25, + "eval_loss": 0.9753186702728271, + "eval_runtime": 851.2812, + "eval_samples_per_second": 1.289, + "eval_steps_per_second": 0.021, + "step": 500 + }, + { + "epoch": 0.2625, + "grad_norm": 0.09807440638542175, + "learning_rate": 0.00042635332671593575, + "loss": 0.6661, + "step": 525 + }, + { + "epoch": 0.275, + "grad_norm": 0.09675773978233337, + "learning_rate": 0.00041915050515566445, + "loss": 0.8999, + "step": 550 + }, + { + "epoch": 0.2875, + "grad_norm": 0.0979737937450409, + "learning_rate": 0.00041167886266207167, + "loss": 0.8616, + "step": 575 + }, + { + "epoch": 0.3, + "grad_norm": 0.08990875631570816, + "learning_rate": 0.0004039502734764241, + "loss": 0.8167, + "step": 600 + }, + { + "epoch": 0.3, + "eval_loss": 0.9926204681396484, + "eval_runtime": 851.6484, + "eval_samples_per_second": 1.288, + "eval_steps_per_second": 0.021, + "step": 600 + }, + { + "epoch": 0.3125, + "grad_norm": 0.09505137801170349, + "learning_rate": 0.0003959770201902294, + "loss": 0.8191, + "step": 625 + }, + { + "epoch": 0.325, + "grad_norm": 0.09105059504508972, + "learning_rate": 0.0003877717742252371, + "loss": 0.7203, + "step": 650 + }, + { + "epoch": 0.3375, + "grad_norm": 0.09274734556674957, + "learning_rate": 0.00037934757569549495, + "loss": 0.8154, + "step": 675 + }, + { + "epoch": 0.35, + "grad_norm": 0.10026416182518005, + "learning_rate": 0.00037071781268346345, + "loss": 0.8336, + "step": 700 + }, + { + "epoch": 0.35, + "eval_loss": 0.9959912896156311, + "eval_runtime": 849.7662, + "eval_samples_per_second": 1.291, + "eval_steps_per_second": 0.021, + "step": 700 + }, + { + "epoch": 0.3625, + "grad_norm": 0.09517039358615875, + "learning_rate": 0.00036189619996312495, + "loss": 0.6845, + "step": 725 + }, + { + "epoch": 0.375, + "grad_norm": 0.11264201998710632, + "learning_rate": 0.00035289675720390174, + "loss": 0.8445, + "step": 750 + }, + { + "epoch": 0.3875, + "grad_norm": 0.0979958102107048, + "learning_rate": 0.00034373378669002105, + "loss": 0.8269, + "step": 775 + }, + { + "epoch": 0.4, + "grad_norm": 0.10223093628883362, + "learning_rate": 0.00033442185059073706, + "loss": 0.6517, + "step": 800 + }, + { + "epoch": 0.4, + "eval_loss": 1.006990671157837, + "eval_runtime": 845.1184, + "eval_samples_per_second": 1.298, + "eval_steps_per_second": 0.021, + "step": 800 + }, + { + "epoch": 0.4125, + "grad_norm": 0.09464213252067566, + "learning_rate": 0.00032497574781753367, + "loss": 0.8455, + "step": 825 + }, + { + "epoch": 0.425, + "grad_norm": 0.10187377035617828, + "learning_rate": 0.000315410490505086, + "loss": 0.8217, + "step": 850 + }, + { + "epoch": 0.4375, + "grad_norm": 0.09793733805418015, + "learning_rate": 0.0003057412801533589, + "loss": 0.6218, + "step": 875 + }, + { + "epoch": 0.45, + "grad_norm": 0.09902466833591461, + "learning_rate": 0.0002959834834687587, + "loss": 0.869, + "step": 900 + }, + { + "epoch": 0.45, + "eval_loss": 1.0097707509994507, + "eval_runtime": 847.3981, + "eval_samples_per_second": 1.295, + "eval_steps_per_second": 0.021, + "step": 900 + }, + { + "epoch": 0.4625, + "grad_norm": 0.10546339303255081, + "learning_rate": 0.00028615260794273236, + "loss": 0.8204, + "step": 925 + }, + { + "epoch": 0.475, + "grad_norm": 0.10600671917200089, + "learning_rate": 0.00027626427720662416, + "loss": 0.5917, + "step": 950 + }, + { + "epoch": 0.4875, + "grad_norm": 0.09627766162157059, + "learning_rate": 0.00026633420620195917, + "loss": 0.8667, + "step": 975 + }, + { + "epoch": 0.5, + "grad_norm": 0.09961821138858795, + "learning_rate": 0.00025637817620561263, + "loss": 0.8215, + "step": 1000 + }, + { + "epoch": 0.5, + "eval_loss": 1.023415446281433, + "eval_runtime": 844.006, + "eval_samples_per_second": 1.3, + "eval_steps_per_second": 0.021, + "step": 1000 + }, + { + "epoch": 0.5125, + "grad_norm": 0.09977111220359802, + "learning_rate": 0.0002464120097495559, + "loss": 0.5829, + "step": 1025 + }, + { + "epoch": 0.525, + "grad_norm": 0.10276953876018524, + "learning_rate": 0.00023645154547503855, + "loss": 0.8857, + "step": 1050 + }, + { + "epoch": 0.5375, + "grad_norm": 0.10077586770057678, + "learning_rate": 0.00022651261296116894, + "loss": 0.8018, + "step": 1075 + }, + { + "epoch": 0.55, + "grad_norm": 0.09652134776115417, + "learning_rate": 0.00021661100756789666, + "loss": 0.561, + "step": 1100 + }, + { + "epoch": 0.55, + "eval_loss": 1.0231441259384155, + "eval_runtime": 845.0138, + "eval_samples_per_second": 1.298, + "eval_steps_per_second": 0.021, + "step": 1100 + }, + { + "epoch": 0.5625, + "grad_norm": 0.0990302637219429, + "learning_rate": 0.00020676246533337764, + "loss": 0.9092, + "step": 1125 + }, + { + "epoch": 0.575, + "grad_norm": 0.11152709275484085, + "learning_rate": 0.00019698263796561526, + "loss": 0.8159, + "step": 1150 + }, + { + "epoch": 0.5875, + "grad_norm": 0.10092920064926147, + "learning_rate": 0.00018728706796812333, + "loss": 0.7329, + "step": 1175 + }, + { + "epoch": 0.6, + "grad_norm": 0.10138432681560516, + "learning_rate": 0.00017769116393914037, + "loss": 0.7873, + "step": 1200 + }, + { + "epoch": 0.6, + "eval_loss": 1.0180704593658447, + "eval_runtime": 843.0775, + "eval_samples_per_second": 1.301, + "eval_steps_per_second": 0.021, + "step": 1200 + }, + { + "epoch": 0.6125, + "grad_norm": 0.10279645770788193, + "learning_rate": 0.00016821017608365264, + "loss": 0.7186, + "step": 1225 + }, + { + "epoch": 0.625, + "grad_norm": 0.10766831040382385, + "learning_rate": 0.00015885917197714112, + "loss": 0.7201, + "step": 1250 + }, + { + "epoch": 0.6375, + "grad_norm": 0.10177863389253616, + "learning_rate": 0.00014965301261957238, + "loss": 0.8009, + "step": 1275 + }, + { + "epoch": 0.65, + "grad_norm": 0.0983508974313736, + "learning_rate": 0.00014060632881768558, + "loss": 0.7023, + "step": 1300 + }, + { + "epoch": 0.65, + "eval_loss": 1.0266674757003784, + "eval_runtime": 854.7884, + "eval_samples_per_second": 1.283, + "eval_steps_per_second": 0.021, + "step": 1300 + }, + { + "epoch": 0.6625, + "grad_norm": 0.10579918324947357, + "learning_rate": 0.00013173349793311424, + "loss": 0.7624, + "step": 1325 + }, + { + "epoch": 0.675, + "grad_norm": 0.10350169986486435, + "learning_rate": 0.0001230486210332916, + "loss": 0.7857, + "step": 1350 + }, + { + "epoch": 0.6875, + "grad_norm": 0.10701841115951538, + "learning_rate": 0.00011456550048145536, + "loss": 0.6771, + "step": 1375 + }, + { + "epoch": 0.7, + "grad_norm": 0.10333641618490219, + "learning_rate": 0.00010629761800136473, + "loss": 0.7669, + "step": 1400 + }, + { + "epoch": 0.7, + "eval_loss": 1.0311139822006226, + "eval_runtime": 850.5438, + "eval_samples_per_second": 1.29, + "eval_steps_per_second": 0.021, + "step": 1400 + }, + { + "epoch": 0.7125, + "grad_norm": 0.10531915724277496, + "learning_rate": 9.82581132515907e-05, + "loss": 0.7869, + "step": 1425 + }, + { + "epoch": 0.725, + "grad_norm": 0.08779594302177429, + "learning_rate": 9.045976294343145e-05, + "loss": 0.6651, + "step": 1450 + }, + { + "epoch": 0.7375, + "grad_norm": 0.09893123060464859, + "learning_rate": 8.291496053563699e-05, + "loss": 0.7938, + "step": 1475 + }, + { + "epoch": 0.75, + "grad_norm": 0.09708540141582489, + "learning_rate": 7.563569653821565e-05, + "loss": 0.7873, + "step": 1500 + }, + { + "epoch": 0.75, + "eval_loss": 1.0259246826171875, + "eval_runtime": 843.8488, + "eval_samples_per_second": 1.3, + "eval_steps_per_second": 0.021, + "step": 1500 + }, + { + "epoch": 0.7625, + "grad_norm": 0.1001739501953125, + "learning_rate": 6.863353945662288e-05, + "loss": 0.6202, + "step": 1525 + }, + { + "epoch": 0.775, + "grad_norm": 0.10724864155054092, + "learning_rate": 6.191961740661687e-05, + "loss": 0.8107, + "step": 1550 + }, + { + "epoch": 0.7875, + "grad_norm": 0.10225515067577362, + "learning_rate": 5.550460042899982e-05, + "loss": 0.8041, + "step": 1575 + }, + { + "epoch": 0.8, + "grad_norm": 0.09861259162425995, + "learning_rate": 4.9398683532350855e-05, + "loss": 0.5894, + "step": 1600 + }, + { + "epoch": 0.8, + "eval_loss": 1.0317810773849487, + "eval_runtime": 852.0414, + "eval_samples_per_second": 1.287, + "eval_steps_per_second": 0.021, + "step": 1600 + } + ], + "logging_steps": 25, + "max_steps": 2000, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.608283832933417e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-200/optimizer_0/.metadata b/checkpoint-200/optimizer_0/.metadata new file mode 100644 index 0000000000000000000000000000000000000000..0d9dcfd135c93333bf81e5c33e3bfd29a743087a --- /dev/null +++ b/checkpoint-200/optimizer_0/.metadata @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e9591ef68a9bb253d3b85d0c0026395e9a97eb31df144f62bc51ee67aa4fa73 +size 438489 diff --git a/checkpoint-200/optimizer_0/__0_0.distcp b/checkpoint-200/optimizer_0/__0_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..2f557fbcb85e760b089a91e8c22a766b76c09cb1 --- /dev/null +++ b/checkpoint-200/optimizer_0/__0_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7619661f165a3cc9dd37cfef1de28a05e8748fa45294bb9d9748ad7abc1b7583 +size 2980252 diff --git a/checkpoint-200/optimizer_0/__1_0.distcp b/checkpoint-200/optimizer_0/__1_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..7c25acf96c216b84d7cd71776b384016421138c5 --- /dev/null +++ b/checkpoint-200/optimizer_0/__1_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8f131147d3a428612363df05687fd4fe9f9a9465bc3ea9e50b72b455ea8739c +size 2997320 diff --git a/checkpoint-200/optimizer_0/__2_0.distcp b/checkpoint-200/optimizer_0/__2_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..554118a816c5dabe473545c0ab32d26389c5f281 --- /dev/null +++ b/checkpoint-200/optimizer_0/__2_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9219130eeeb1a6544bd55a7f2330c2e7c6dc7b0c985c5cf032f35c99b99b2089 +size 2997320 diff --git a/checkpoint-200/optimizer_0/__3_0.distcp b/checkpoint-200/optimizer_0/__3_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..88ccd37d7f05f7a47be3c8349fdd14e1aad5dd4b --- /dev/null +++ b/checkpoint-200/optimizer_0/__3_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a71b3955a7e44a2bcceb6e52d5c20504d13f484ccf5a2a906d66fec6d468f96 +size 2997320 diff --git a/checkpoint-200/optimizer_0/__4_0.distcp b/checkpoint-200/optimizer_0/__4_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..95993762430b401e4db260d6ed0140738e872c44 --- /dev/null +++ b/checkpoint-200/optimizer_0/__4_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da98f91fe54a1a9d463244744c8992c2d353831876e0097ca6e6fc29b0f6ee8f +size 2997320 diff --git a/checkpoint-200/optimizer_0/__5_0.distcp b/checkpoint-200/optimizer_0/__5_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..e369fa9c3d5ece04cd1d41cc865f0508894504f2 --- /dev/null +++ b/checkpoint-200/optimizer_0/__5_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1a8f5fdf2203a09df55bb42b2d23a21cab61a53cf319c030c053bfdfa87a135 +size 2999596 diff --git a/checkpoint-200/optimizer_0/__6_0.distcp b/checkpoint-200/optimizer_0/__6_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..1e39a2849bb287df6d73cb25ef4bcefcdec4174b --- /dev/null +++ b/checkpoint-200/optimizer_0/__6_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2747d35b4936d138a08f7348c4f9689b17520581cd0c8308f04b8df43490ff6d +size 2998732 diff --git a/checkpoint-200/optimizer_0/__7_0.distcp b/checkpoint-200/optimizer_0/__7_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..646d5de479ebbdcec885c66fe6d098885c06e278 --- /dev/null +++ b/checkpoint-200/optimizer_0/__7_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb71ad30ec8fac5364eda5d728810d3d93116432802e6e143a58a26fc6ce557e +size 3005708 diff --git a/checkpoint-200/pytorch_model_fsdp_0/.metadata b/checkpoint-200/pytorch_model_fsdp_0/.metadata new file mode 100644 index 0000000000000000000000000000000000000000..67a2c972a9b6d6e0774a718b7229ec90124028b2 --- /dev/null +++ b/checkpoint-200/pytorch_model_fsdp_0/.metadata @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c2463eca5e348561b22de1fb3177b9a6a78ee231a1c415fca3ab271b54d0d41 +size 170758 diff --git a/checkpoint-200/pytorch_model_fsdp_0/__0_0.distcp b/checkpoint-200/pytorch_model_fsdp_0/__0_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..ebe0c8a816c0e5b0eb7f170341dda60597976d37 --- /dev/null +++ b/checkpoint-200/pytorch_model_fsdp_0/__0_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a29fa3f289dfa6b506eef31c807fc2cf104775ed21052a9bff043895ba0c1d1f +size 1489536 diff --git a/checkpoint-200/pytorch_model_fsdp_0/__1_0.distcp b/checkpoint-200/pytorch_model_fsdp_0/__1_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..c296248658b1a6ab6114774782080e451b6ab0e7 --- /dev/null +++ b/checkpoint-200/pytorch_model_fsdp_0/__1_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f5839ff1418b12f117737ecefd887355d5d6855c344c97765412ce8c916604f +size 1489536 diff --git a/checkpoint-200/pytorch_model_fsdp_0/__2_0.distcp b/checkpoint-200/pytorch_model_fsdp_0/__2_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..154e650d7dbee554a47d0cfa8850ab4bc33fdc94 --- /dev/null +++ b/checkpoint-200/pytorch_model_fsdp_0/__2_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dfe810bbfe7c1ab41025eff75fdf7efc4fa5fc4d79a74d84ea24cfc77a0271c7 +size 1489536 diff --git a/checkpoint-200/pytorch_model_fsdp_0/__3_0.distcp b/checkpoint-200/pytorch_model_fsdp_0/__3_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..5f91ec7a708e2775655dff1b81acabf99cdb41d5 --- /dev/null +++ b/checkpoint-200/pytorch_model_fsdp_0/__3_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2138a54c90f177286b12cca7f170546e80067516bf48c573cc2e300239db33bb +size 1489536 diff --git a/checkpoint-200/pytorch_model_fsdp_0/__4_0.distcp b/checkpoint-200/pytorch_model_fsdp_0/__4_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..251bc7fd46d5c9d7285b59c2baecd8012e8816b0 --- /dev/null +++ b/checkpoint-200/pytorch_model_fsdp_0/__4_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5386cc64a52077a87dfd9e5382566e43fc566e6612af84552585f843eebf304b +size 1489536 diff --git a/checkpoint-200/pytorch_model_fsdp_0/__5_0.distcp b/checkpoint-200/pytorch_model_fsdp_0/__5_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..d0f57a52047257755309fec65d435e77fe72be18 --- /dev/null +++ b/checkpoint-200/pytorch_model_fsdp_0/__5_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:77acab20bfa57a3ac051975539f1d021d6c7d9404546cd96b12500e81e4a43e1 +size 1489536 diff --git a/checkpoint-200/pytorch_model_fsdp_0/__6_0.distcp b/checkpoint-200/pytorch_model_fsdp_0/__6_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..d197773fd9599f1ff4703bcd0978ad09596aab1b --- /dev/null +++ b/checkpoint-200/pytorch_model_fsdp_0/__6_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82ba164ba2e30920270b1ec9e8a9573576ce06482386c6bb415016b163efb49e +size 1489536 diff --git a/checkpoint-200/pytorch_model_fsdp_0/__7_0.distcp b/checkpoint-200/pytorch_model_fsdp_0/__7_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..3db91c0f6d1fbc0c84b68b41360222878deafe8c --- /dev/null +++ b/checkpoint-200/pytorch_model_fsdp_0/__7_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ac10ceffde2c21fee467739af2d215707cffafe3012f4efdcf1092061eabdc6 +size 1489536 diff --git a/checkpoint-200/rng_state_0.pth b/checkpoint-200/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..16b69078618b6c5c44fe97eb4a33f5d7f2c2b6d1 --- /dev/null +++ b/checkpoint-200/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0639e2524b3606de92cf704efe87f4f42e6b531536716338096cdcb997c8f523 +size 15984 diff --git a/checkpoint-200/rng_state_1.pth b/checkpoint-200/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..d9c62011595a852aae341a8454b41adf3693e94d --- /dev/null +++ b/checkpoint-200/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf7f49d9db6183dc24e6704956551cd47c1f5a209075611fe04ca451437a895e +size 15984 diff --git a/checkpoint-200/rng_state_2.pth b/checkpoint-200/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..46f7047665cf1a3970a3b63ed2c2a0c96af8ab3a --- /dev/null +++ b/checkpoint-200/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e46e59be97e565494bebb1430b0c9995dec568fb4b79287f2dff3dbf2730430a +size 15984 diff --git a/checkpoint-200/rng_state_3.pth b/checkpoint-200/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..d987862d727896ff71382bdfdae9b0bcdd01daf2 --- /dev/null +++ b/checkpoint-200/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc0bb1325f8952acda6b977e4e52f785ac1892d58ddb3f31f0d60ae566525666 +size 15984 diff --git a/checkpoint-200/rng_state_4.pth b/checkpoint-200/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..8b36f99e0e9ebc014c97a490a15ccedf89960d6a --- /dev/null +++ b/checkpoint-200/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:338387d0d962d6ea549d773166e09059d382f352f9b68a7f8f49f176fdb24478 +size 15984 diff --git a/checkpoint-200/rng_state_5.pth b/checkpoint-200/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..2ce5464236860b95d6c73e13afc55f7c87f56249 --- /dev/null +++ b/checkpoint-200/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:57875a0f71eaf92d41a7e95ed7a6061e2351c52735063fb8199f0a2528b42b27 +size 15984 diff --git a/checkpoint-200/rng_state_6.pth b/checkpoint-200/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..9a5976d4ad4a4ee1c3a7350e5dc3e92d9d9b63a3 --- /dev/null +++ b/checkpoint-200/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8634b4b2a740673155b568734eb1f609f0037798280a25133c4a979cfbf6c1c2 +size 15984 diff --git a/checkpoint-200/rng_state_7.pth b/checkpoint-200/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..db7f177e1377fe2219ab10be57b5cdcd399980c8 --- /dev/null +++ b/checkpoint-200/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab80e51cd15becf6304340fc463ec3aa562bffed9ca0ae82e20eacbd1641316e +size 15984 diff --git a/checkpoint-200/scheduler.pt b/checkpoint-200/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..85540d266d7ef6a9fd6cc6c6a50cee279aee5c43 --- /dev/null +++ b/checkpoint-200/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1407dea8b779520dd0c3e208f8c82d3dffd12c0548e2e910a4f9aca30c2908c +size 1064 diff --git a/checkpoint-200/trainer_state.json b/checkpoint-200/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..76f7be30f6478821e403c9408c35776ce5d68473 --- /dev/null +++ b/checkpoint-200/trainer_state.json @@ -0,0 +1,106 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.1, + "eval_steps": 100, + "global_step": 200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0125, + "grad_norm": 0.08136817067861557, + "learning_rate": 0.0004, + "loss": 1.158, + "step": 25 + }, + { + "epoch": 0.025, + "grad_norm": 0.0653829574584961, + "learning_rate": 0.0004998852503731983, + "loss": 1.0957, + "step": 50 + }, + { + "epoch": 0.0375, + "grad_norm": 0.11592712253332138, + "learning_rate": 0.0004993848168027977, + "loss": 0.9276, + "step": 75 + }, + { + "epoch": 0.05, + "grad_norm": 0.08926476538181305, + "learning_rate": 0.0004984880506341147, + "loss": 1.0337, + "step": 100 + }, + { + "epoch": 0.05, + "eval_loss": 0.9111642837524414, + "eval_runtime": 843.8058, + "eval_samples_per_second": 1.3, + "eval_steps_per_second": 0.021, + "step": 100 + }, + { + "epoch": 0.0625, + "grad_norm": 0.08632034063339233, + "learning_rate": 0.0004971963770447935, + "loss": 1.0219, + "step": 125 + }, + { + "epoch": 0.075, + "grad_norm": 0.12949417531490326, + "learning_rate": 0.0004955118488155782, + "loss": 0.784, + "step": 150 + }, + { + "epoch": 0.0875, + "grad_norm": 0.09222520887851715, + "learning_rate": 0.0004934371430679492, + "loss": 1.0, + "step": 175 + }, + { + "epoch": 0.1, + "grad_norm": 0.10075929015874863, + "learning_rate": 0.0004909755570095319, + "loss": 0.9617, + "step": 200 + }, + { + "epoch": 0.1, + "eval_loss": 0.9380430579185486, + "eval_runtime": 856.156, + "eval_samples_per_second": 1.281, + "eval_steps_per_second": 0.021, + "step": 200 + } + ], + "logging_steps": 25, + "max_steps": 2000, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.010354791166771e+16, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-300/optimizer_0/.metadata b/checkpoint-300/optimizer_0/.metadata new file mode 100644 index 0000000000000000000000000000000000000000..8f2a8f8c7e54812f369e5294fef048a85b5acd3a --- /dev/null +++ b/checkpoint-300/optimizer_0/.metadata @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee746b0e8d31a227e0cb0a6b64032d2b182f1031413813440980dab0d78c1f3d +size 438489 diff --git a/checkpoint-300/optimizer_0/__0_0.distcp b/checkpoint-300/optimizer_0/__0_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..97fbe2afbf3767da38ad4d76979045517f7fe40d --- /dev/null +++ b/checkpoint-300/optimizer_0/__0_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7274653582f3698d91197d8fa999298534cf37d17ae950ac2d881482e0fc83f8 +size 2980252 diff --git a/checkpoint-300/optimizer_0/__1_0.distcp b/checkpoint-300/optimizer_0/__1_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..0570fc8b28eb2dc7f8c7f22fd9aa5ce5af4109f9 --- /dev/null +++ b/checkpoint-300/optimizer_0/__1_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79edfec751633eab8f8c631bf03c47d054ebd400f0bbf04e705098d5f309a7a6 +size 2997320 diff --git a/checkpoint-300/optimizer_0/__2_0.distcp b/checkpoint-300/optimizer_0/__2_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..82f6f1ea3ba31cb699a6b5a51584bf83034ca5ed --- /dev/null +++ b/checkpoint-300/optimizer_0/__2_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a81014e62f5dc0e7c78a24b05a466e2ff82c25e35822abf588092d899de90c0 +size 2997320 diff --git a/checkpoint-300/optimizer_0/__3_0.distcp b/checkpoint-300/optimizer_0/__3_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..f6622c2c7f1fef173163526bf66bff905994d513 --- /dev/null +++ b/checkpoint-300/optimizer_0/__3_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3768d7cdf222e845bcd67d6dd9735773d90f49f4bd39a0fd4b828f328dc0f529 +size 2997320 diff --git a/checkpoint-300/optimizer_0/__4_0.distcp b/checkpoint-300/optimizer_0/__4_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..7f62c84398a86f30b4a1e3190fda3737ec9d94a3 --- /dev/null +++ b/checkpoint-300/optimizer_0/__4_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9562d371bae6bddb1d97c81d6635f979014b1c40843b684ec2aa1b30cc2b3847 +size 2997320 diff --git a/checkpoint-300/optimizer_0/__5_0.distcp b/checkpoint-300/optimizer_0/__5_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..1375a774fcf38e4aaa351629c3d0dcd0fb61dfa4 --- /dev/null +++ b/checkpoint-300/optimizer_0/__5_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f26321c86c6baf59de6498e1284b5cc1344c2c0500ffb13fbd59db8e6d2ff13 +size 2999596 diff --git a/checkpoint-300/optimizer_0/__6_0.distcp b/checkpoint-300/optimizer_0/__6_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..4e4d0f848d6a584fb0de8999ea88f0cd55d6cd3f --- /dev/null +++ b/checkpoint-300/optimizer_0/__6_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e04df38cfc832e84f4f9511464c0d9f4a036911da441a4efb6175b734f79930 +size 2998732 diff --git a/checkpoint-300/optimizer_0/__7_0.distcp b/checkpoint-300/optimizer_0/__7_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..e545fb6c0cff22a12b7b56a779a0c1af0018df64 --- /dev/null +++ b/checkpoint-300/optimizer_0/__7_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cec2950e3f1a334a3cd4947813e685d63c48f0259226e2b7538576d7d7a1027b +size 3005708 diff --git a/checkpoint-300/pytorch_model_fsdp_0/.metadata b/checkpoint-300/pytorch_model_fsdp_0/.metadata new file mode 100644 index 0000000000000000000000000000000000000000..0e3f91d9b1383f9ccf5332aad07ce5c021feba2d --- /dev/null +++ b/checkpoint-300/pytorch_model_fsdp_0/.metadata @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:130c0520b89a3a0c5f857d1e3cecd2f865b71cf1ce86894774880d6cdf849cf6 +size 170758 diff --git a/checkpoint-300/pytorch_model_fsdp_0/__0_0.distcp b/checkpoint-300/pytorch_model_fsdp_0/__0_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..821065bf9bb416b9ee0fd435fa7492637979d7d3 --- /dev/null +++ b/checkpoint-300/pytorch_model_fsdp_0/__0_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:57cfeb465a071ddd02c81e4a07cc0b0dd0a81c4113f274ae572f1e426fd1cb69 +size 1489536 diff --git a/checkpoint-300/pytorch_model_fsdp_0/__1_0.distcp b/checkpoint-300/pytorch_model_fsdp_0/__1_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..31292c84e09c576e8f8b5cd6b1ddf6bde6ef0deb --- /dev/null +++ b/checkpoint-300/pytorch_model_fsdp_0/__1_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c4509c9581724e058b51a78c65a168525c53fb6916039d9048b75a866863e35f +size 1489536 diff --git a/checkpoint-300/pytorch_model_fsdp_0/__2_0.distcp b/checkpoint-300/pytorch_model_fsdp_0/__2_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..de63583a8cad47c36fc6ec4affc43d6147d9ed3f --- /dev/null +++ b/checkpoint-300/pytorch_model_fsdp_0/__2_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b38250c3bb1b201de319e0cc6479eb159954b5796094e4a0a19b6c17fdb03851 +size 1489536 diff --git a/checkpoint-300/pytorch_model_fsdp_0/__3_0.distcp b/checkpoint-300/pytorch_model_fsdp_0/__3_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..4ef5b8c4201bad9d927c53af6a0efeed3850e5bd --- /dev/null +++ b/checkpoint-300/pytorch_model_fsdp_0/__3_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c64e1db75af9b35f18452994748ebcd90856ac4137856dd66a5bc258d6175eda +size 1489536 diff --git a/checkpoint-300/pytorch_model_fsdp_0/__4_0.distcp b/checkpoint-300/pytorch_model_fsdp_0/__4_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..c9af9bd0202b941f2737c01cffe2f4c467a84196 --- /dev/null +++ b/checkpoint-300/pytorch_model_fsdp_0/__4_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e577d3e37aee2cb8c4b9d0d0553ad2ce852c8243507674b59e5327109821d5b6 +size 1489536 diff --git a/checkpoint-300/pytorch_model_fsdp_0/__5_0.distcp b/checkpoint-300/pytorch_model_fsdp_0/__5_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..56cf88d1a95b310d161745719fb52b5be997e3ad --- /dev/null +++ b/checkpoint-300/pytorch_model_fsdp_0/__5_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca551680c9818af1c0f68f32f09d630f247ba84081a02e61ba6e94351c2d02e2 +size 1489536 diff --git a/checkpoint-300/pytorch_model_fsdp_0/__6_0.distcp b/checkpoint-300/pytorch_model_fsdp_0/__6_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..cc73ef299c7cfd6abfff2fc032fc932d2e616a77 --- /dev/null +++ b/checkpoint-300/pytorch_model_fsdp_0/__6_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a1f2d557d1d85b63e94d90aeb4d09d1816eb61d93fbda603b3af1c26b7ae29e +size 1489536 diff --git a/checkpoint-300/pytorch_model_fsdp_0/__7_0.distcp b/checkpoint-300/pytorch_model_fsdp_0/__7_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..f49385e696e67962aa28b45fb2760a1a8f23043b --- /dev/null +++ b/checkpoint-300/pytorch_model_fsdp_0/__7_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:05dc3dd0769386b10e8301771c24b897c55b83a95a84870d6a49027f8e85d7b0 +size 1489536 diff --git a/checkpoint-300/rng_state_0.pth b/checkpoint-300/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..44607170276938b9d9b86f56ebe7a61418d7dca6 --- /dev/null +++ b/checkpoint-300/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78635405a75ff05020872a238d2d5fdc4ef85c11a5aa732aaf9e33f0fb8e6585 +size 15984 diff --git a/checkpoint-300/rng_state_1.pth b/checkpoint-300/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..473886890d701a7a85e4c02958a316903a2765c9 --- /dev/null +++ b/checkpoint-300/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c7d526848276a402f7bc1beae10b45087e901b93154c2c78f477e274d809f9c3 +size 15984 diff --git a/checkpoint-300/rng_state_2.pth b/checkpoint-300/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..1dc41557c2eca229a3d3952e02a32fe1fd75b3b0 --- /dev/null +++ b/checkpoint-300/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5996a515b3450a0364280c9b9bb73f90207e512975241b5f61ef7321ae5cc30f +size 15984 diff --git a/checkpoint-300/rng_state_3.pth b/checkpoint-300/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..4e5e3074fc1fd124389bcdb4a23a0011c7746d3a --- /dev/null +++ b/checkpoint-300/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:137e0d50809da2617d6e0ed6c57dd8d60eadf6c2b4ebd9ed853378846c8f6bf5 +size 15984 diff --git a/checkpoint-300/rng_state_4.pth b/checkpoint-300/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..132e2a133b99fe66dab9093da203aa0f0e3bbdfd --- /dev/null +++ b/checkpoint-300/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b268a58ef386239793155fd0899c5233e71fa56e6fd3e57ea0dd3bda7e952e1e +size 15984 diff --git a/checkpoint-300/rng_state_5.pth b/checkpoint-300/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..af26fca67e87daa5f7dd8707e6996ff055ae6e92 --- /dev/null +++ b/checkpoint-300/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d11709cd2016c543dd6d19b0f04760f961a2ef1cbd3d0c35107ad9a94b95665 +size 15984 diff --git a/checkpoint-300/rng_state_6.pth b/checkpoint-300/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..559f3f58001a0620b426704ba864225367e421f5 --- /dev/null +++ b/checkpoint-300/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3097153c203c8aac2363aa681f5a0996960cf90714eb5b681f7757df49e9a4e +size 15984 diff --git a/checkpoint-300/rng_state_7.pth b/checkpoint-300/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..ca7376cc0fb27fbffd6603b001029e46bf790277 --- /dev/null +++ b/checkpoint-300/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bbe6c7b052c669c6b10fcc44b899680eb1044abdbb7045fb7926eab1e7d098c4 +size 15984 diff --git a/checkpoint-300/scheduler.pt b/checkpoint-300/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..e4a593887a192de425ef90f1dd945f6d4a63561b --- /dev/null +++ b/checkpoint-300/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8948ab0329c2a6866caa5cd565decc3046a34770df93f963374ee2edaa3e1fbf +size 1064 diff --git a/checkpoint-300/trainer_state.json b/checkpoint-300/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..6c44958dc834012070a33c64d369cbbfb5cb99a3 --- /dev/null +++ b/checkpoint-300/trainer_state.json @@ -0,0 +1,142 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.15, + "eval_steps": 100, + "global_step": 300, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0125, + "grad_norm": 0.08136817067861557, + "learning_rate": 0.0004, + "loss": 1.158, + "step": 25 + }, + { + "epoch": 0.025, + "grad_norm": 0.0653829574584961, + "learning_rate": 0.0004998852503731983, + "loss": 1.0957, + "step": 50 + }, + { + "epoch": 0.0375, + "grad_norm": 0.11592712253332138, + "learning_rate": 0.0004993848168027977, + "loss": 0.9276, + "step": 75 + }, + { + "epoch": 0.05, + "grad_norm": 0.08926476538181305, + "learning_rate": 0.0004984880506341147, + "loss": 1.0337, + "step": 100 + }, + { + "epoch": 0.05, + "eval_loss": 0.9111642837524414, + "eval_runtime": 843.8058, + "eval_samples_per_second": 1.3, + "eval_steps_per_second": 0.021, + "step": 100 + }, + { + "epoch": 0.0625, + "grad_norm": 0.08632034063339233, + "learning_rate": 0.0004971963770447935, + "loss": 1.0219, + "step": 125 + }, + { + "epoch": 0.075, + "grad_norm": 0.12949417531490326, + "learning_rate": 0.0004955118488155782, + "loss": 0.784, + "step": 150 + }, + { + "epoch": 0.0875, + "grad_norm": 0.09222520887851715, + "learning_rate": 0.0004934371430679492, + "loss": 1.0, + "step": 175 + }, + { + "epoch": 0.1, + "grad_norm": 0.10075929015874863, + "learning_rate": 0.0004909755570095319, + "loss": 0.9617, + "step": 200 + }, + { + "epoch": 0.1, + "eval_loss": 0.9380430579185486, + "eval_runtime": 856.156, + "eval_samples_per_second": 1.281, + "eval_steps_per_second": 0.021, + "step": 200 + }, + { + "epoch": 0.1125, + "grad_norm": 0.09567111730575562, + "learning_rate": 0.0004881310026940389, + "loss": 0.7051, + "step": 225 + }, + { + "epoch": 0.125, + "grad_norm": 0.09723825007677078, + "learning_rate": 0.0004849080008040734, + "loss": 0.9906, + "step": 250 + }, + { + "epoch": 0.1375, + "grad_norm": 0.09430444985628128, + "learning_rate": 0.00048131167346667446, + "loss": 0.9113, + "step": 275 + }, + { + "epoch": 0.15, + "grad_norm": 0.09439756721258163, + "learning_rate": 0.00047734773611302284, + "loss": 0.674, + "step": 300 + }, + { + "epoch": 0.15, + "eval_loss": 0.9544369578361511, + "eval_runtime": 850.9461, + "eval_samples_per_second": 1.289, + "eval_steps_per_second": 0.021, + "step": 300 + } + ], + "logging_steps": 25, + "max_steps": 2000, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.015532186750157e+16, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-400/optimizer_0/.metadata b/checkpoint-400/optimizer_0/.metadata new file mode 100644 index 0000000000000000000000000000000000000000..bb43d1003f26eae2e31758c94ce321da47da9a96 --- /dev/null +++ b/checkpoint-400/optimizer_0/.metadata @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3a437650d6231d016d6817f471a45fd08a7d17c24638f4828fd39faec0c07bc +size 438489 diff --git a/checkpoint-400/optimizer_0/__0_0.distcp b/checkpoint-400/optimizer_0/__0_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..33d98801320a772c2403ae077af4ae63b79f7faf --- /dev/null +++ b/checkpoint-400/optimizer_0/__0_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d20cfe218e8897d69a534faba1ad09a03e9b0b89d820b57f58a457bd2c44e72 +size 2980252 diff --git a/checkpoint-400/optimizer_0/__1_0.distcp b/checkpoint-400/optimizer_0/__1_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..a8601dc1d37626893b376835e80ac9f949b8cdca --- /dev/null +++ b/checkpoint-400/optimizer_0/__1_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac63794fe4e90dda80589e30f2d018018a8f276f732a6e5c6cad7e0ddc8090c0 +size 2997320 diff --git a/checkpoint-400/optimizer_0/__2_0.distcp b/checkpoint-400/optimizer_0/__2_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..bfda243cd3701b1d62cf927d5c16d5b04534b966 --- /dev/null +++ b/checkpoint-400/optimizer_0/__2_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f06fe3e81b4d1300858bebd9c8900e0b66ee7590dea2c127c48796948ca5e2b +size 2997320 diff --git a/checkpoint-400/optimizer_0/__3_0.distcp b/checkpoint-400/optimizer_0/__3_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..41bd0631bab64a8783aa451b10d50f33a6b096ab --- /dev/null +++ b/checkpoint-400/optimizer_0/__3_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8071b04d29fa06a16beeee3b30c4367d68bfb028be2519444ed06f0501e2a5d6 +size 2997320 diff --git a/checkpoint-400/optimizer_0/__4_0.distcp b/checkpoint-400/optimizer_0/__4_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..7e860ab4080ad52b4a72e324a7747895026d9a4b --- /dev/null +++ b/checkpoint-400/optimizer_0/__4_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e1e8463a9de644e71fb14f050436195c9a159a5d1ee09dadd0aae79f27483f8 +size 2997320 diff --git a/checkpoint-400/optimizer_0/__5_0.distcp b/checkpoint-400/optimizer_0/__5_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..5df2c5c77c2e3085b42426e1c4fba222c111f58c --- /dev/null +++ b/checkpoint-400/optimizer_0/__5_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:042a4437b99f00acd750e086580f0fd323ddf15d7fc689ae59f74772d9921516 +size 2999596 diff --git a/checkpoint-400/optimizer_0/__6_0.distcp b/checkpoint-400/optimizer_0/__6_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..d38a3bba3d4ede150479d495b13c7ddca00bd8ea --- /dev/null +++ b/checkpoint-400/optimizer_0/__6_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7ba0e4b613a912bf22d547f0a3efbc83c31e76b5a6e3e552100c5cd69746487 +size 2998732 diff --git a/checkpoint-400/optimizer_0/__7_0.distcp b/checkpoint-400/optimizer_0/__7_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..11e07a6d94dda9a083387900d731b5c432b5cd68 --- /dev/null +++ b/checkpoint-400/optimizer_0/__7_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9afddfc5905f1c21e71cea2f222c014144e4fcc96d0c65d581b4092896c397f0 +size 3005708 diff --git a/checkpoint-400/pytorch_model_fsdp_0/.metadata b/checkpoint-400/pytorch_model_fsdp_0/.metadata new file mode 100644 index 0000000000000000000000000000000000000000..f508534179ca40333e2ed69198a77b03f76e00c2 --- /dev/null +++ b/checkpoint-400/pytorch_model_fsdp_0/.metadata @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02033daed9b272b82d875c40e5935442c779ef507a6bb0e37e5ddee686760c13 +size 170758 diff --git a/checkpoint-400/pytorch_model_fsdp_0/__0_0.distcp b/checkpoint-400/pytorch_model_fsdp_0/__0_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..fb68bffab4df481383777568729950d9be5ae8c5 --- /dev/null +++ b/checkpoint-400/pytorch_model_fsdp_0/__0_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2cfdb889fd5e3829be89f8709e3f1c270cbddaf9ecae645d34fa5df8fe4101a5 +size 1489536 diff --git a/checkpoint-400/pytorch_model_fsdp_0/__1_0.distcp b/checkpoint-400/pytorch_model_fsdp_0/__1_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..d733f5dedc8e777db2bbc14d24856b2221b9aab7 --- /dev/null +++ b/checkpoint-400/pytorch_model_fsdp_0/__1_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb1abccae5b32a8d5db4fbad6371c7ec94ff4fb1e8c4abfb8f86b9b1473e3375 +size 1489536 diff --git a/checkpoint-400/pytorch_model_fsdp_0/__2_0.distcp b/checkpoint-400/pytorch_model_fsdp_0/__2_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..6c57bf0e235a4f8b16eb48f646e97685cbb72141 --- /dev/null +++ b/checkpoint-400/pytorch_model_fsdp_0/__2_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d6712164789573a89b709be1856b60b9ea0cc06db948b49e6f62b4fefaa107f +size 1489536 diff --git a/checkpoint-400/pytorch_model_fsdp_0/__3_0.distcp b/checkpoint-400/pytorch_model_fsdp_0/__3_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..93b2afc268c154e77447d421cc74314b41adb766 --- /dev/null +++ b/checkpoint-400/pytorch_model_fsdp_0/__3_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef9290eb1463c5f5c17564f4b7dfb4b50da554f858ca0d196118172f99d4a9de +size 1489536 diff --git a/checkpoint-400/pytorch_model_fsdp_0/__4_0.distcp b/checkpoint-400/pytorch_model_fsdp_0/__4_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..1aacbfea027710387483c3b6657c1536d8165b95 --- /dev/null +++ b/checkpoint-400/pytorch_model_fsdp_0/__4_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c47fb04a7e8c0b4fadecc0fae28c9ed190a4cfffbbd6e99d6167f078250e9b50 +size 1489536 diff --git a/checkpoint-400/pytorch_model_fsdp_0/__5_0.distcp b/checkpoint-400/pytorch_model_fsdp_0/__5_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..53568ef2acc1d72606cd3fbc2a31df54599d8d05 --- /dev/null +++ b/checkpoint-400/pytorch_model_fsdp_0/__5_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97df96c3775dc854a79140cb69abaf175b9be92d176f5c7c3abce080f6ef29ea +size 1489536 diff --git a/checkpoint-400/pytorch_model_fsdp_0/__6_0.distcp b/checkpoint-400/pytorch_model_fsdp_0/__6_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..b496b4281042c54ecba8a8168188134de6e4ff6a --- /dev/null +++ b/checkpoint-400/pytorch_model_fsdp_0/__6_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2755450e4c3a7de983bc0b4af78ddee067782a7c0a754477920cc3f8c10a1b70 +size 1489536 diff --git a/checkpoint-400/pytorch_model_fsdp_0/__7_0.distcp b/checkpoint-400/pytorch_model_fsdp_0/__7_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..5cccfe1041712b13b36d70885804b06745bc850d --- /dev/null +++ b/checkpoint-400/pytorch_model_fsdp_0/__7_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fdb5c8c25b2a86903178423ca52e31c90d3e3d9d8f97a8f4ceeac77c3baf34ba +size 1489536 diff --git a/checkpoint-400/rng_state_0.pth b/checkpoint-400/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..08ab431ed3d6023acb46afeb060b2bbc4e17e7d2 --- /dev/null +++ b/checkpoint-400/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:21b8976a1d18f6b03f88945806f1ce2916bbfcae48e4b272dded3a6d29242251 +size 15984 diff --git a/checkpoint-400/rng_state_1.pth b/checkpoint-400/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..30e500c3cb2e2ad2da307bf4f23bfe0292318b56 --- /dev/null +++ b/checkpoint-400/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:098d6bac0efc38fc3b20a23d1fe696b2b1bd80001f27e26e9d2e80d6a9bc914d +size 15984 diff --git a/checkpoint-400/rng_state_2.pth b/checkpoint-400/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..cbc4f544882477314dc3c6e8813da4d593171e88 --- /dev/null +++ b/checkpoint-400/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:757130cd10dd14fcc61c8e47b0423982e9f6232c009e802da8d3016005900192 +size 15984 diff --git a/checkpoint-400/rng_state_3.pth b/checkpoint-400/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..7ed2ecae26799f84eababd04ff7802674b0035ea --- /dev/null +++ b/checkpoint-400/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:72943d33e58e0f9960fe52b4d961c4aa6906a68c34e17129e0d5333b787e1208 +size 15984 diff --git a/checkpoint-400/rng_state_4.pth b/checkpoint-400/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..9efa819df6d036dbd8ded3420ebc7e7fa05234a2 --- /dev/null +++ b/checkpoint-400/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d714ece876a944d0cb591472fd0e3d59fddad92b2c4898c12934c1b75a918bbf +size 15984 diff --git a/checkpoint-400/rng_state_5.pth b/checkpoint-400/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..0365c590a24c3ca2baac2143a3b28f34fbdacea2 --- /dev/null +++ b/checkpoint-400/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e459ac04a35241d3f7e7c709e75d01bc47307c6667df5b234448443eecc478e +size 15984 diff --git a/checkpoint-400/rng_state_6.pth b/checkpoint-400/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..351d3794dfa0fb7bb29c699429a91608befe4d83 --- /dev/null +++ b/checkpoint-400/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41d2bff8d2d1aae1314c6be02255f8b63c4963be07c252a8777856f2fd3694c6 +size 15984 diff --git a/checkpoint-400/rng_state_7.pth b/checkpoint-400/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..1b3127672c15442038bad73eb467448946889382 --- /dev/null +++ b/checkpoint-400/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9789da649a1c984e33a7981799e36bbe7bf52a545da2459c7682edf25c418cfb +size 15984 diff --git a/checkpoint-400/scheduler.pt b/checkpoint-400/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..b235b58bb083cf95264c44d97a93c27205471afb --- /dev/null +++ b/checkpoint-400/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b56d2c6bd1c2aa83c05c7033b3af13e0206ee25c12092046a84dc057490c52d +size 1064 diff --git a/checkpoint-400/trainer_state.json b/checkpoint-400/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..0f30081abcce6eb24d2ecf0f9d21a77e04089eb9 --- /dev/null +++ b/checkpoint-400/trainer_state.json @@ -0,0 +1,178 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.2, + "eval_steps": 100, + "global_step": 400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0125, + "grad_norm": 0.08136817067861557, + "learning_rate": 0.0004, + "loss": 1.158, + "step": 25 + }, + { + "epoch": 0.025, + "grad_norm": 0.0653829574584961, + "learning_rate": 0.0004998852503731983, + "loss": 1.0957, + "step": 50 + }, + { + "epoch": 0.0375, + "grad_norm": 0.11592712253332138, + "learning_rate": 0.0004993848168027977, + "loss": 0.9276, + "step": 75 + }, + { + "epoch": 0.05, + "grad_norm": 0.08926476538181305, + "learning_rate": 0.0004984880506341147, + "loss": 1.0337, + "step": 100 + }, + { + "epoch": 0.05, + "eval_loss": 0.9111642837524414, + "eval_runtime": 843.8058, + "eval_samples_per_second": 1.3, + "eval_steps_per_second": 0.021, + "step": 100 + }, + { + "epoch": 0.0625, + "grad_norm": 0.08632034063339233, + "learning_rate": 0.0004971963770447935, + "loss": 1.0219, + "step": 125 + }, + { + "epoch": 0.075, + "grad_norm": 0.12949417531490326, + "learning_rate": 0.0004955118488155782, + "loss": 0.784, + "step": 150 + }, + { + "epoch": 0.0875, + "grad_norm": 0.09222520887851715, + "learning_rate": 0.0004934371430679492, + "loss": 1.0, + "step": 175 + }, + { + "epoch": 0.1, + "grad_norm": 0.10075929015874863, + "learning_rate": 0.0004909755570095319, + "loss": 0.9617, + "step": 200 + }, + { + "epoch": 0.1, + "eval_loss": 0.9380430579185486, + "eval_runtime": 856.156, + "eval_samples_per_second": 1.281, + "eval_steps_per_second": 0.021, + "step": 200 + }, + { + "epoch": 0.1125, + "grad_norm": 0.09567111730575562, + "learning_rate": 0.0004881310026940389, + "loss": 0.7051, + "step": 225 + }, + { + "epoch": 0.125, + "grad_norm": 0.09723825007677078, + "learning_rate": 0.0004849080008040734, + "loss": 0.9906, + "step": 250 + }, + { + "epoch": 0.1375, + "grad_norm": 0.09430444985628128, + "learning_rate": 0.00048131167346667446, + "loss": 0.9113, + "step": 275 + }, + { + "epoch": 0.15, + "grad_norm": 0.09439756721258163, + "learning_rate": 0.00047734773611302284, + "loss": 0.674, + "step": 300 + }, + { + "epoch": 0.15, + "eval_loss": 0.9544369578361511, + "eval_runtime": 850.9461, + "eval_samples_per_second": 1.289, + "eval_steps_per_second": 0.021, + "step": 300 + }, + { + "epoch": 0.1625, + "grad_norm": 0.10961435735225677, + "learning_rate": 0.0004730224883952422, + "loss": 0.9701, + "step": 325 + }, + { + "epoch": 0.175, + "grad_norm": 0.09628895670175552, + "learning_rate": 0.0004683428041747334, + "loss": 0.8976, + "step": 350 + }, + { + "epoch": 0.1875, + "grad_norm": 0.11085063964128494, + "learning_rate": 0.0004633161205979517, + "loss": 0.6683, + "step": 375 + }, + { + "epoch": 0.2, + "grad_norm": 0.10140710324048996, + "learning_rate": 0.0004579504262769877, + "loss": 0.9373, + "step": 400 + }, + { + "epoch": 0.2, + "eval_loss": 0.968166172504425, + "eval_runtime": 853.0258, + "eval_samples_per_second": 1.286, + "eval_steps_per_second": 0.021, + "step": 400 + } + ], + "logging_steps": 25, + "max_steps": 2000, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.020709582333542e+16, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-500/.ipynb_checkpoints/trainer_state-checkpoint.json b/checkpoint-500/.ipynb_checkpoints/trainer_state-checkpoint.json new file mode 100644 index 0000000000000000000000000000000000000000..55950ec9758889c3cb932ad0d350e874a7ea771e --- /dev/null +++ b/checkpoint-500/.ipynb_checkpoints/trainer_state-checkpoint.json @@ -0,0 +1,214 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.25, + "eval_steps": 100, + "global_step": 500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0125, + "grad_norm": 0.08136817067861557, + "learning_rate": 0.0004, + "loss": 1.158, + "step": 25 + }, + { + "epoch": 0.025, + "grad_norm": 0.0653829574584961, + "learning_rate": 0.0004998852503731983, + "loss": 1.0957, + "step": 50 + }, + { + "epoch": 0.0375, + "grad_norm": 0.11592712253332138, + "learning_rate": 0.0004993848168027977, + "loss": 0.9276, + "step": 75 + }, + { + "epoch": 0.05, + "grad_norm": 0.08926476538181305, + "learning_rate": 0.0004984880506341147, + "loss": 1.0337, + "step": 100 + }, + { + "epoch": 0.05, + "eval_loss": 0.9111642837524414, + "eval_runtime": 843.8058, + "eval_samples_per_second": 1.3, + "eval_steps_per_second": 0.021, + "step": 100 + }, + { + "epoch": 0.0625, + "grad_norm": 0.08632034063339233, + "learning_rate": 0.0004971963770447935, + "loss": 1.0219, + "step": 125 + }, + { + "epoch": 0.075, + "grad_norm": 0.12949417531490326, + "learning_rate": 0.0004955118488155782, + "loss": 0.784, + "step": 150 + }, + { + "epoch": 0.0875, + "grad_norm": 0.09222520887851715, + "learning_rate": 0.0004934371430679492, + "loss": 1.0, + "step": 175 + }, + { + "epoch": 0.1, + "grad_norm": 0.10075929015874863, + "learning_rate": 0.0004909755570095319, + "loss": 0.9617, + "step": 200 + }, + { + "epoch": 0.1, + "eval_loss": 0.9380430579185486, + "eval_runtime": 856.156, + "eval_samples_per_second": 1.281, + "eval_steps_per_second": 0.021, + "step": 200 + }, + { + "epoch": 0.1125, + "grad_norm": 0.09567111730575562, + "learning_rate": 0.0004881310026940389, + "loss": 0.7051, + "step": 225 + }, + { + "epoch": 0.125, + "grad_norm": 0.09723825007677078, + "learning_rate": 0.0004849080008040734, + "loss": 0.9906, + "step": 250 + }, + { + "epoch": 0.1375, + "grad_norm": 0.09430444985628128, + "learning_rate": 0.00048131167346667446, + "loss": 0.9113, + "step": 275 + }, + { + "epoch": 0.15, + "grad_norm": 0.09439756721258163, + "learning_rate": 0.00047734773611302284, + "loss": 0.674, + "step": 300 + }, + { + "epoch": 0.15, + "eval_loss": 0.9544369578361511, + "eval_runtime": 850.9461, + "eval_samples_per_second": 1.289, + "eval_steps_per_second": 0.021, + "step": 300 + }, + { + "epoch": 0.1625, + "grad_norm": 0.10961435735225677, + "learning_rate": 0.0004730224883952422, + "loss": 0.9701, + "step": 325 + }, + { + "epoch": 0.175, + "grad_norm": 0.09628895670175552, + "learning_rate": 0.0004683428041747334, + "loss": 0.8976, + "step": 350 + }, + { + "epoch": 0.1875, + "grad_norm": 0.11085063964128494, + "learning_rate": 0.0004633161205979517, + "loss": 0.6683, + "step": 375 + }, + { + "epoch": 0.2, + "grad_norm": 0.10140710324048996, + "learning_rate": 0.0004579504262769877, + "loss": 0.9373, + "step": 400 + }, + { + "epoch": 0.2, + "eval_loss": 0.968166172504425, + "eval_runtime": 853.0258, + "eval_samples_per_second": 1.286, + "eval_steps_per_second": 0.021, + "step": 400 + }, + { + "epoch": 0.2125, + "grad_norm": 0.13868793845176697, + "learning_rate": 0.0004522542485937369, + "loss": 0.8822, + "step": 425 + }, + { + "epoch": 0.225, + "grad_norm": 0.11853040754795074, + "learning_rate": 0.00044623664014783386, + "loss": 0.6483, + "step": 450 + }, + { + "epoch": 0.2375, + "grad_norm": 0.11668186634778976, + "learning_rate": 0.00043990716436988924, + "loss": 0.9374, + "step": 475 + }, + { + "epoch": 0.25, + "grad_norm": 0.11911585181951523, + "learning_rate": 0.0004332758803228925, + "loss": 0.8434, + "step": 500 + }, + { + "epoch": 0.25, + "eval_loss": 0.9753186702728271, + "eval_runtime": 851.2812, + "eval_samples_per_second": 1.289, + "eval_steps_per_second": 0.021, + "step": 500 + } + ], + "logging_steps": 25, + "max_steps": 2000, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.025886977916928e+16, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-500/optimizer_0/.metadata b/checkpoint-500/optimizer_0/.metadata new file mode 100644 index 0000000000000000000000000000000000000000..0ac929b33edeee182bf383ed1d00428cafcbbfcd --- /dev/null +++ b/checkpoint-500/optimizer_0/.metadata @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3f4411b63aab5388a548dbd3215d9bc9f664a46f12d7de28c0ae5a98461895d +size 438489 diff --git a/checkpoint-500/optimizer_0/__0_0.distcp b/checkpoint-500/optimizer_0/__0_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..22d8d77f3b277eb592b2a8f12badddd7f92b2411 --- /dev/null +++ b/checkpoint-500/optimizer_0/__0_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fab8ff2570f356008f2becb49fbf2c66933c70864d7885cbd0f40c60bf248393 +size 2980252 diff --git a/checkpoint-500/optimizer_0/__1_0.distcp b/checkpoint-500/optimizer_0/__1_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..310fcae5f9f0d4ba6316bf3005af1ef3368c8500 --- /dev/null +++ b/checkpoint-500/optimizer_0/__1_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a393b1e3d010dcd949cb37e2ae3edb4f9bdd629b7f0bbecf72793b6650a21b9 +size 2997320 diff --git a/checkpoint-500/optimizer_0/__2_0.distcp b/checkpoint-500/optimizer_0/__2_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..be7c2246e7ed49035c09571eb1c99d1a7d386c00 --- /dev/null +++ b/checkpoint-500/optimizer_0/__2_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:971773d3a25be5075bb9f4cb2b118317c54be03bdaa13b6b2d4c895b14caabce +size 2997320 diff --git a/checkpoint-500/optimizer_0/__3_0.distcp b/checkpoint-500/optimizer_0/__3_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..221fedcde5c8f134cd5789926cf0c604adfa44a7 --- /dev/null +++ b/checkpoint-500/optimizer_0/__3_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66aea25912e47844592ecbe6287a7a781b0b499766d1b537a73fa15780a7c9e7 +size 2997320 diff --git a/checkpoint-500/optimizer_0/__4_0.distcp b/checkpoint-500/optimizer_0/__4_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..5f76701c3dcb0fb2cd1e7151e222690aea33f010 --- /dev/null +++ b/checkpoint-500/optimizer_0/__4_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e4a7f8d857a7d4349db6a049cb6fc2c474d52a3c5fd7e42370dded10898514ce +size 2997320 diff --git a/checkpoint-500/optimizer_0/__5_0.distcp b/checkpoint-500/optimizer_0/__5_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..3b9c70bd7921492c514803c41f80333281f1a12c --- /dev/null +++ b/checkpoint-500/optimizer_0/__5_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:29d3515e058f9eb2410399ebc29ca585645053e9f2ff4a66312ebfae95fa011a +size 2999596 diff --git a/checkpoint-500/optimizer_0/__6_0.distcp b/checkpoint-500/optimizer_0/__6_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..cb2ad9167dabf257d324415dc23d08bb575aea85 --- /dev/null +++ b/checkpoint-500/optimizer_0/__6_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5696f1b5f9a1d594d08288690ae2b8b45306694cf6daccc5a94725bbafb7b7b +size 2998732 diff --git a/checkpoint-500/optimizer_0/__7_0.distcp b/checkpoint-500/optimizer_0/__7_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..872d9e6a4a7729868423a66545477bae030dd240 --- /dev/null +++ b/checkpoint-500/optimizer_0/__7_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca311a0cea3b9a286f312be7037c1ae0239717e8560fbf674b449bee28d16792 +size 3005708 diff --git a/checkpoint-500/pytorch_model_fsdp_0/.metadata b/checkpoint-500/pytorch_model_fsdp_0/.metadata new file mode 100644 index 0000000000000000000000000000000000000000..5fe068b7cd9f2366fc12aac8055f3d72134fdc1f --- /dev/null +++ b/checkpoint-500/pytorch_model_fsdp_0/.metadata @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba004726ea703694a6e3f5571e453a0f1f0408145d2654fbde4e64a01f58618c +size 170758 diff --git a/checkpoint-500/pytorch_model_fsdp_0/__0_0.distcp b/checkpoint-500/pytorch_model_fsdp_0/__0_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..fd2d306cc84873d09d97a666cb50c03f1393ea0f --- /dev/null +++ b/checkpoint-500/pytorch_model_fsdp_0/__0_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4fdb20988724f6b3ff347a7269738199ac379f1dcd5f5bae2219285e866c1de +size 1489536 diff --git a/checkpoint-500/pytorch_model_fsdp_0/__1_0.distcp b/checkpoint-500/pytorch_model_fsdp_0/__1_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..1264b907a6069660f1dfd1fa5703cf2379d58c1f --- /dev/null +++ b/checkpoint-500/pytorch_model_fsdp_0/__1_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:afdf6dbeebb1d0aaf82e020e340e90902a5d42e4978d63995e16f6b1d244f480 +size 1489536 diff --git a/checkpoint-500/pytorch_model_fsdp_0/__2_0.distcp b/checkpoint-500/pytorch_model_fsdp_0/__2_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..9e999a30565f83d825d7b4af02500962b853652b --- /dev/null +++ b/checkpoint-500/pytorch_model_fsdp_0/__2_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e716b1a785d38406f72ffa2ceef333719d2be2bd7f9184629a4cca7caeb28c4 +size 1489536 diff --git a/checkpoint-500/pytorch_model_fsdp_0/__3_0.distcp b/checkpoint-500/pytorch_model_fsdp_0/__3_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..69f20fbbebe0995b798f53bde5303e56c55f5d1d --- /dev/null +++ b/checkpoint-500/pytorch_model_fsdp_0/__3_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:691c109c8de43fd73f44691d130edf086369aab773b9429c406334d15fb22c93 +size 1489536 diff --git a/checkpoint-500/pytorch_model_fsdp_0/__4_0.distcp b/checkpoint-500/pytorch_model_fsdp_0/__4_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..9c352c8e5b032b8a0fee43ab3a6c60f0623c2975 --- /dev/null +++ b/checkpoint-500/pytorch_model_fsdp_0/__4_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:85fb58a65486de80d0cf792dc36e20987202b5636b741b106134c62748724d39 +size 1489536 diff --git a/checkpoint-500/pytorch_model_fsdp_0/__5_0.distcp b/checkpoint-500/pytorch_model_fsdp_0/__5_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..29a0c66335da236507e4bde8d6b8b4353cecc40c --- /dev/null +++ b/checkpoint-500/pytorch_model_fsdp_0/__5_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90552a05ee88bce5ad86595a2071afe44eabf163f5e59c59efe5e49465ff751b +size 1489536 diff --git a/checkpoint-500/pytorch_model_fsdp_0/__6_0.distcp b/checkpoint-500/pytorch_model_fsdp_0/__6_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..d1e33114a59da4671f17b68f7dec00c8fe94b332 --- /dev/null +++ b/checkpoint-500/pytorch_model_fsdp_0/__6_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee60941729246313425685f8c9bbe699baf20ccd3d47d563ea88aa53f76255c4 +size 1489536 diff --git a/checkpoint-500/pytorch_model_fsdp_0/__7_0.distcp b/checkpoint-500/pytorch_model_fsdp_0/__7_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..8f00277599a991d36faf829db2f278efb303accd --- /dev/null +++ b/checkpoint-500/pytorch_model_fsdp_0/__7_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:225feb88fae13bd6389ea1213ef1386b6b80dfb7807f249645828fa500df7106 +size 1489536 diff --git a/checkpoint-500/rng_state_0.pth b/checkpoint-500/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..699403fdac20c1753b0c0ae0b94a6414158f3257 --- /dev/null +++ b/checkpoint-500/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f620b269378694edf449ca02b2bfeac974979b2e2447e87f07029a0fed826d7 +size 15920 diff --git a/checkpoint-500/rng_state_1.pth b/checkpoint-500/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..050d63a5485b5258fd9c948c66f3b4e3794c4d90 --- /dev/null +++ b/checkpoint-500/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f9cb03eee71de2be3538288b2f1981537922a835272dad01c4069bfdca300bc +size 15984 diff --git a/checkpoint-500/rng_state_2.pth b/checkpoint-500/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..efa423c8fcfb306b8e66e50cb301a6c2ab76ffb7 --- /dev/null +++ b/checkpoint-500/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83178132bb595ebe8b479ea8f105f64cce3a1f9d8ed04b481c05928607ca1513 +size 15984 diff --git a/checkpoint-500/rng_state_3.pth b/checkpoint-500/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..8ac9ee3678df4eb7e0ca5c1ef371205c2d005cc5 --- /dev/null +++ b/checkpoint-500/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:abd3e2d5b57a6bd25638c48f3b5b207d746b7faa0a195b8bbad1b546064dd8f7 +size 15984 diff --git a/checkpoint-500/rng_state_4.pth b/checkpoint-500/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..34a9402a0db7efea3756caf4840667911b86a37b --- /dev/null +++ b/checkpoint-500/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:202a4071f09999f050e9768c1a3a518b6b3bff189c23039fe733f9f24291210f +size 15984 diff --git a/checkpoint-500/rng_state_5.pth b/checkpoint-500/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..d598e109dca963639e8e4f2dba04edaf08d7b9e6 --- /dev/null +++ b/checkpoint-500/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0212e64fa50a81a26672f3500b949ccc2f145f3fcf63b35bc77484ecf00c0c08 +size 15984 diff --git a/checkpoint-500/rng_state_6.pth b/checkpoint-500/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..19e1f04b247c5fc37a394707f3f802eca220c93d --- /dev/null +++ b/checkpoint-500/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:879c494ee06aa75a54e0e810621a4cb24f07dfba633a1043ab54566bc9cd7870 +size 15984 diff --git a/checkpoint-500/rng_state_7.pth b/checkpoint-500/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..4cbc6a61e7ad7add8a5412ad34a6085eb835717a --- /dev/null +++ b/checkpoint-500/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d230e0857cdd767606b7f9cc69f92c1f0b0c46e8929ac5cc39e3a522ede1c7bc +size 15984 diff --git a/checkpoint-500/scheduler.pt b/checkpoint-500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..6a7a8fc1beb3f842bafa7113ae4a09e73a121df7 --- /dev/null +++ b/checkpoint-500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc95850e72613f7a4a1684465dd4f848b800fdca2b51b79a9ac67435988b218d +size 1064 diff --git a/checkpoint-500/trainer_state.json b/checkpoint-500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..55950ec9758889c3cb932ad0d350e874a7ea771e --- /dev/null +++ b/checkpoint-500/trainer_state.json @@ -0,0 +1,214 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.25, + "eval_steps": 100, + "global_step": 500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0125, + "grad_norm": 0.08136817067861557, + "learning_rate": 0.0004, + "loss": 1.158, + "step": 25 + }, + { + "epoch": 0.025, + "grad_norm": 0.0653829574584961, + "learning_rate": 0.0004998852503731983, + "loss": 1.0957, + "step": 50 + }, + { + "epoch": 0.0375, + "grad_norm": 0.11592712253332138, + "learning_rate": 0.0004993848168027977, + "loss": 0.9276, + "step": 75 + }, + { + "epoch": 0.05, + "grad_norm": 0.08926476538181305, + "learning_rate": 0.0004984880506341147, + "loss": 1.0337, + "step": 100 + }, + { + "epoch": 0.05, + "eval_loss": 0.9111642837524414, + "eval_runtime": 843.8058, + "eval_samples_per_second": 1.3, + "eval_steps_per_second": 0.021, + "step": 100 + }, + { + "epoch": 0.0625, + "grad_norm": 0.08632034063339233, + "learning_rate": 0.0004971963770447935, + "loss": 1.0219, + "step": 125 + }, + { + "epoch": 0.075, + "grad_norm": 0.12949417531490326, + "learning_rate": 0.0004955118488155782, + "loss": 0.784, + "step": 150 + }, + { + "epoch": 0.0875, + "grad_norm": 0.09222520887851715, + "learning_rate": 0.0004934371430679492, + "loss": 1.0, + "step": 175 + }, + { + "epoch": 0.1, + "grad_norm": 0.10075929015874863, + "learning_rate": 0.0004909755570095319, + "loss": 0.9617, + "step": 200 + }, + { + "epoch": 0.1, + "eval_loss": 0.9380430579185486, + "eval_runtime": 856.156, + "eval_samples_per_second": 1.281, + "eval_steps_per_second": 0.021, + "step": 200 + }, + { + "epoch": 0.1125, + "grad_norm": 0.09567111730575562, + "learning_rate": 0.0004881310026940389, + "loss": 0.7051, + "step": 225 + }, + { + "epoch": 0.125, + "grad_norm": 0.09723825007677078, + "learning_rate": 0.0004849080008040734, + "loss": 0.9906, + "step": 250 + }, + { + "epoch": 0.1375, + "grad_norm": 0.09430444985628128, + "learning_rate": 0.00048131167346667446, + "loss": 0.9113, + "step": 275 + }, + { + "epoch": 0.15, + "grad_norm": 0.09439756721258163, + "learning_rate": 0.00047734773611302284, + "loss": 0.674, + "step": 300 + }, + { + "epoch": 0.15, + "eval_loss": 0.9544369578361511, + "eval_runtime": 850.9461, + "eval_samples_per_second": 1.289, + "eval_steps_per_second": 0.021, + "step": 300 + }, + { + "epoch": 0.1625, + "grad_norm": 0.10961435735225677, + "learning_rate": 0.0004730224883952422, + "loss": 0.9701, + "step": 325 + }, + { + "epoch": 0.175, + "grad_norm": 0.09628895670175552, + "learning_rate": 0.0004683428041747334, + "loss": 0.8976, + "step": 350 + }, + { + "epoch": 0.1875, + "grad_norm": 0.11085063964128494, + "learning_rate": 0.0004633161205979517, + "loss": 0.6683, + "step": 375 + }, + { + "epoch": 0.2, + "grad_norm": 0.10140710324048996, + "learning_rate": 0.0004579504262769877, + "loss": 0.9373, + "step": 400 + }, + { + "epoch": 0.2, + "eval_loss": 0.968166172504425, + "eval_runtime": 853.0258, + "eval_samples_per_second": 1.286, + "eval_steps_per_second": 0.021, + "step": 400 + }, + { + "epoch": 0.2125, + "grad_norm": 0.13868793845176697, + "learning_rate": 0.0004522542485937369, + "loss": 0.8822, + "step": 425 + }, + { + "epoch": 0.225, + "grad_norm": 0.11853040754795074, + "learning_rate": 0.00044623664014783386, + "loss": 0.6483, + "step": 450 + }, + { + "epoch": 0.2375, + "grad_norm": 0.11668186634778976, + "learning_rate": 0.00043990716436988924, + "loss": 0.9374, + "step": 475 + }, + { + "epoch": 0.25, + "grad_norm": 0.11911585181951523, + "learning_rate": 0.0004332758803228925, + "loss": 0.8434, + "step": 500 + }, + { + "epoch": 0.25, + "eval_loss": 0.9753186702728271, + "eval_runtime": 851.2812, + "eval_samples_per_second": 1.289, + "eval_steps_per_second": 0.021, + "step": 500 + } + ], + "logging_steps": 25, + "max_steps": 2000, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.025886977916928e+16, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-600/optimizer_0/.metadata b/checkpoint-600/optimizer_0/.metadata new file mode 100644 index 0000000000000000000000000000000000000000..b3fd2a9dbaf093f504f97434bc6f97aea629bc74 --- /dev/null +++ b/checkpoint-600/optimizer_0/.metadata @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ece73aa7beab61a52f2b665bb84048d3e3460f655058fd51097a3a400a6d4f9e +size 438489 diff --git a/checkpoint-600/optimizer_0/__0_0.distcp b/checkpoint-600/optimizer_0/__0_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..64971c4f3e5aac5aa7142fe65f000174e8b2a52b --- /dev/null +++ b/checkpoint-600/optimizer_0/__0_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3838e6e41981e55f62e5a4a35fc298099c633343080a5ae1e34468771b58bb11 +size 2980252 diff --git a/checkpoint-600/optimizer_0/__1_0.distcp b/checkpoint-600/optimizer_0/__1_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..fbc121c96ed33f81ea253db150d728802f9065ad --- /dev/null +++ b/checkpoint-600/optimizer_0/__1_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6546d519728e07c02c1452d82092cdff8f97aae75df49702185f3af769536a0a +size 2997320 diff --git a/checkpoint-600/optimizer_0/__2_0.distcp b/checkpoint-600/optimizer_0/__2_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..bf2410fa6423516fc43505d67216a40e1ea83c85 --- /dev/null +++ b/checkpoint-600/optimizer_0/__2_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4ee34fa06851d9cdc3034b0c7c361e6d9918991dc900fe221e392a8e30c93dd +size 2997320 diff --git a/checkpoint-600/optimizer_0/__3_0.distcp b/checkpoint-600/optimizer_0/__3_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..9f5d2beb308689013a6274a5e6e030f565c8d946 --- /dev/null +++ b/checkpoint-600/optimizer_0/__3_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b3ea4c24dccf3ba24720e209129067f9fa8244bfdea7880839cbcde8f946930 +size 2997320 diff --git a/checkpoint-600/optimizer_0/__4_0.distcp b/checkpoint-600/optimizer_0/__4_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..55c379bdced701e5826b2147f60fd2f94f03282c --- /dev/null +++ b/checkpoint-600/optimizer_0/__4_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a033f11651abec2ef4bae8b6bde8af1d361c7c183190fc10f41082b2b711de2 +size 2997320 diff --git a/checkpoint-600/optimizer_0/__5_0.distcp b/checkpoint-600/optimizer_0/__5_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..0c425da45a9a7c62c72bf6186e3ba6859ff43e69 --- /dev/null +++ b/checkpoint-600/optimizer_0/__5_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e4bc92ce967b97c1c061f4a4b32a14448748e7223396458de78c33f0fb149df7 +size 2999596 diff --git a/checkpoint-600/optimizer_0/__6_0.distcp b/checkpoint-600/optimizer_0/__6_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..52e5ebfccba892740ec4d639774f6558de34347f --- /dev/null +++ b/checkpoint-600/optimizer_0/__6_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:edbd895bfccec8e777f9405c3418c49925d14c76e631b3007a260e8520e88ea9 +size 2998732 diff --git a/checkpoint-600/optimizer_0/__7_0.distcp b/checkpoint-600/optimizer_0/__7_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..318791a420a71f973a11cc4f85d199159382a85b --- /dev/null +++ b/checkpoint-600/optimizer_0/__7_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b07b4814086666db74920a7f7e7679c9604312d05e6c538bb848788026e2f0d0 +size 3005708 diff --git a/checkpoint-600/pytorch_model_fsdp_0/.metadata b/checkpoint-600/pytorch_model_fsdp_0/.metadata new file mode 100644 index 0000000000000000000000000000000000000000..934b7902a4f1926324bc0a5d911e7fa525499f3f --- /dev/null +++ b/checkpoint-600/pytorch_model_fsdp_0/.metadata @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d550a97f13c41b077111e3a93ed2ce388b9c7927db8aec380df6f72918c67f88 +size 170758 diff --git a/checkpoint-600/pytorch_model_fsdp_0/__0_0.distcp b/checkpoint-600/pytorch_model_fsdp_0/__0_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..329097e89e10a9abefd5442e623e7bc28fa2f507 --- /dev/null +++ b/checkpoint-600/pytorch_model_fsdp_0/__0_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f018dae1fd13b519e829bef877906d1f083c2572b59f86c3ca7375d099c26439 +size 1489536 diff --git a/checkpoint-600/pytorch_model_fsdp_0/__1_0.distcp b/checkpoint-600/pytorch_model_fsdp_0/__1_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..114315f1f7103c6ce839b9e53c232e123e30b81d --- /dev/null +++ b/checkpoint-600/pytorch_model_fsdp_0/__1_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b084acb4fc30004dd5df1b10618062728a6a66077bdcde35f5f815c57f73d9c7 +size 1489536 diff --git a/checkpoint-600/pytorch_model_fsdp_0/__2_0.distcp b/checkpoint-600/pytorch_model_fsdp_0/__2_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..d7025c9853c65687cc2152b2146765298faab055 --- /dev/null +++ b/checkpoint-600/pytorch_model_fsdp_0/__2_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:798481e98c57c2cdd6802e878a98ea2438420b8778bdeb5e63314d7fb15cf0d3 +size 1489536 diff --git a/checkpoint-600/pytorch_model_fsdp_0/__3_0.distcp b/checkpoint-600/pytorch_model_fsdp_0/__3_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..27a2029dfc8606ebce7abc4ed52b155e9b078b24 --- /dev/null +++ b/checkpoint-600/pytorch_model_fsdp_0/__3_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6164c6f795ee3ae300e0f91a4100c471102b0c35f2812909e32d0a88cc575e75 +size 1489536 diff --git a/checkpoint-600/pytorch_model_fsdp_0/__4_0.distcp b/checkpoint-600/pytorch_model_fsdp_0/__4_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..24d89adbe4bcbcb9a0216c9856b6d2314416b70b --- /dev/null +++ b/checkpoint-600/pytorch_model_fsdp_0/__4_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90b51e89be0ddecbe5bd7211644ea5f219f20956b32d22ebbc0d7445d5c05fed +size 1489536 diff --git a/checkpoint-600/pytorch_model_fsdp_0/__5_0.distcp b/checkpoint-600/pytorch_model_fsdp_0/__5_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..59a3b7c660d8047f17bb172a69e8b9a6780db55c --- /dev/null +++ b/checkpoint-600/pytorch_model_fsdp_0/__5_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23cd893c722721a6cd5361b9334378b77c0be7e5f895eb73d25834abd99ed391 +size 1489536 diff --git a/checkpoint-600/pytorch_model_fsdp_0/__6_0.distcp b/checkpoint-600/pytorch_model_fsdp_0/__6_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..5d6ac9ce4a5e0bec97fb045745b49155dfae12b3 --- /dev/null +++ b/checkpoint-600/pytorch_model_fsdp_0/__6_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab9059e9d592054906e58baa967852de1247b2d8b7d6090248b9ca90b8538ceb +size 1489536 diff --git a/checkpoint-600/pytorch_model_fsdp_0/__7_0.distcp b/checkpoint-600/pytorch_model_fsdp_0/__7_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..f874e22f6cf9bea171a04669be55db4c47aab5fd --- /dev/null +++ b/checkpoint-600/pytorch_model_fsdp_0/__7_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65be731f546bd55891906a0ea01594ac0cd6aeced7adfae6d9abd123c0eb0b26 +size 1489536 diff --git a/checkpoint-600/rng_state_0.pth b/checkpoint-600/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..c6fd6e0a521df4fcbb775eee743abac77c33e99f --- /dev/null +++ b/checkpoint-600/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a51c34116e74976603507ba08705aca7a521f5063e0d5280afc2eab68c4bf36 +size 15920 diff --git a/checkpoint-600/rng_state_1.pth b/checkpoint-600/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..91b5f7b3616eba14f6616b60b318f0a38e42ad39 --- /dev/null +++ b/checkpoint-600/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0e94a8e4fb8632c1a2bbfafc9954dea009a47a89164eaa6b4e9ad4738a61866 +size 15984 diff --git a/checkpoint-600/rng_state_2.pth b/checkpoint-600/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..f61758714f0af0a51664275e8c183b1723ee985c --- /dev/null +++ b/checkpoint-600/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:581c40d10e64ee3415fa1edb81e4df35bcb7d2cccace07218e909d4a0cd39670 +size 15984 diff --git a/checkpoint-600/rng_state_3.pth b/checkpoint-600/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..49c24790df1059ce6d97cc85aa0bbafbbe5bef44 --- /dev/null +++ b/checkpoint-600/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a5c5fbd89816b8ec2f69e0f019ea2ae70ee8c86981f8d64e3a701572708e36e +size 15984 diff --git a/checkpoint-600/rng_state_4.pth b/checkpoint-600/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..caa359520023e2247cb8bb95d569e113ecb91360 --- /dev/null +++ b/checkpoint-600/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a6a729343aef347ba34536cc6a4fb56f898d13aad7a247e651a1c36796fef1e4 +size 15984 diff --git a/checkpoint-600/rng_state_5.pth b/checkpoint-600/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..dec6b5929d6f20adfb88bf325d934067adb7ee8e --- /dev/null +++ b/checkpoint-600/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:889462d35b322a22d2431ed49ed60a340bdbfc709dabefd05137283df1487d2e +size 15984 diff --git a/checkpoint-600/rng_state_6.pth b/checkpoint-600/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..b819bbb64675781cd44fea373d7fa2b4890a7bd3 --- /dev/null +++ b/checkpoint-600/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06b701aac9cf3c38cff274fe22ba3612c50d5a982e2585585497f2ca2d0a4729 +size 15984 diff --git a/checkpoint-600/rng_state_7.pth b/checkpoint-600/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..397efd8a009b165f197504e261e6eeb57f633301 --- /dev/null +++ b/checkpoint-600/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30f35a94dc3a8dc4b2ae30697f159aedf54d139373b80885849c908f85376243 +size 15984 diff --git a/checkpoint-600/scheduler.pt b/checkpoint-600/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..1fa9750d60fe6a490c749218369d6d200b7b6ffc --- /dev/null +++ b/checkpoint-600/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0fdf508bbbc1f82a3989be901b295050953f0512d0841595d9dae3ad6c0a857 +size 1064 diff --git a/checkpoint-600/trainer_state.json b/checkpoint-600/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..e91dbe9466715304a3d3bb5236b0c9bd3f4084d5 --- /dev/null +++ b/checkpoint-600/trainer_state.json @@ -0,0 +1,250 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.3, + "eval_steps": 100, + "global_step": 600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0125, + "grad_norm": 0.08136817067861557, + "learning_rate": 0.0004, + "loss": 1.158, + "step": 25 + }, + { + "epoch": 0.025, + "grad_norm": 0.0653829574584961, + "learning_rate": 0.0004998852503731983, + "loss": 1.0957, + "step": 50 + }, + { + "epoch": 0.0375, + "grad_norm": 0.11592712253332138, + "learning_rate": 0.0004993848168027977, + "loss": 0.9276, + "step": 75 + }, + { + "epoch": 0.05, + "grad_norm": 0.08926476538181305, + "learning_rate": 0.0004984880506341147, + "loss": 1.0337, + "step": 100 + }, + { + "epoch": 0.05, + "eval_loss": 0.9111642837524414, + "eval_runtime": 843.8058, + "eval_samples_per_second": 1.3, + "eval_steps_per_second": 0.021, + "step": 100 + }, + { + "epoch": 0.0625, + "grad_norm": 0.08632034063339233, + "learning_rate": 0.0004971963770447935, + "loss": 1.0219, + "step": 125 + }, + { + "epoch": 0.075, + "grad_norm": 0.12949417531490326, + "learning_rate": 0.0004955118488155782, + "loss": 0.784, + "step": 150 + }, + { + "epoch": 0.0875, + "grad_norm": 0.09222520887851715, + "learning_rate": 0.0004934371430679492, + "loss": 1.0, + "step": 175 + }, + { + "epoch": 0.1, + "grad_norm": 0.10075929015874863, + "learning_rate": 0.0004909755570095319, + "loss": 0.9617, + "step": 200 + }, + { + "epoch": 0.1, + "eval_loss": 0.9380430579185486, + "eval_runtime": 856.156, + "eval_samples_per_second": 1.281, + "eval_steps_per_second": 0.021, + "step": 200 + }, + { + "epoch": 0.1125, + "grad_norm": 0.09567111730575562, + "learning_rate": 0.0004881310026940389, + "loss": 0.7051, + "step": 225 + }, + { + "epoch": 0.125, + "grad_norm": 0.09723825007677078, + "learning_rate": 0.0004849080008040734, + "loss": 0.9906, + "step": 250 + }, + { + "epoch": 0.1375, + "grad_norm": 0.09430444985628128, + "learning_rate": 0.00048131167346667446, + "loss": 0.9113, + "step": 275 + }, + { + "epoch": 0.15, + "grad_norm": 0.09439756721258163, + "learning_rate": 0.00047734773611302284, + "loss": 0.674, + "step": 300 + }, + { + "epoch": 0.15, + "eval_loss": 0.9544369578361511, + "eval_runtime": 850.9461, + "eval_samples_per_second": 1.289, + "eval_steps_per_second": 0.021, + "step": 300 + }, + { + "epoch": 0.1625, + "grad_norm": 0.10961435735225677, + "learning_rate": 0.0004730224883952422, + "loss": 0.9701, + "step": 325 + }, + { + "epoch": 0.175, + "grad_norm": 0.09628895670175552, + "learning_rate": 0.0004683428041747334, + "loss": 0.8976, + "step": 350 + }, + { + "epoch": 0.1875, + "grad_norm": 0.11085063964128494, + "learning_rate": 0.0004633161205979517, + "loss": 0.6683, + "step": 375 + }, + { + "epoch": 0.2, + "grad_norm": 0.10140710324048996, + "learning_rate": 0.0004579504262769877, + "loss": 0.9373, + "step": 400 + }, + { + "epoch": 0.2, + "eval_loss": 0.968166172504425, + "eval_runtime": 853.0258, + "eval_samples_per_second": 1.286, + "eval_steps_per_second": 0.021, + "step": 400 + }, + { + "epoch": 0.2125, + "grad_norm": 0.13868793845176697, + "learning_rate": 0.0004522542485937369, + "loss": 0.8822, + "step": 425 + }, + { + "epoch": 0.225, + "grad_norm": 0.11853040754795074, + "learning_rate": 0.00044623664014783386, + "loss": 0.6483, + "step": 450 + }, + { + "epoch": 0.2375, + "grad_norm": 0.11668186634778976, + "learning_rate": 0.00043990716436988924, + "loss": 0.9374, + "step": 475 + }, + { + "epoch": 0.25, + "grad_norm": 0.11911585181951523, + "learning_rate": 0.0004332758803228925, + "loss": 0.8434, + "step": 500 + }, + { + "epoch": 0.25, + "eval_loss": 0.9753186702728271, + "eval_runtime": 851.2812, + "eval_samples_per_second": 1.289, + "eval_steps_per_second": 0.021, + "step": 500 + }, + { + "epoch": 0.2625, + "grad_norm": 0.09807440638542175, + "learning_rate": 0.00042635332671593575, + "loss": 0.6661, + "step": 525 + }, + { + "epoch": 0.275, + "grad_norm": 0.09675773978233337, + "learning_rate": 0.00041915050515566445, + "loss": 0.8999, + "step": 550 + }, + { + "epoch": 0.2875, + "grad_norm": 0.0979737937450409, + "learning_rate": 0.00041167886266207167, + "loss": 0.8616, + "step": 575 + }, + { + "epoch": 0.3, + "grad_norm": 0.08990875631570816, + "learning_rate": 0.0004039502734764241, + "loss": 0.8167, + "step": 600 + }, + { + "epoch": 0.3, + "eval_loss": 0.9926204681396484, + "eval_runtime": 851.6484, + "eval_samples_per_second": 1.288, + "eval_steps_per_second": 0.021, + "step": 600 + } + ], + "logging_steps": 25, + "max_steps": 2000, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6.031064373500314e+16, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-700/optimizer_0/.metadata b/checkpoint-700/optimizer_0/.metadata new file mode 100644 index 0000000000000000000000000000000000000000..5ba3a53661645494b1224bc091b20037dfb4adb3 --- /dev/null +++ b/checkpoint-700/optimizer_0/.metadata @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3942c396089b49bf98719b426daf606d3bdc3d8465c0553c2bd08fc11ccdd0c0 +size 438489 diff --git a/checkpoint-700/optimizer_0/__0_0.distcp b/checkpoint-700/optimizer_0/__0_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..82ad7a901b4f1c378f81251a3391d980e4e18bfb --- /dev/null +++ b/checkpoint-700/optimizer_0/__0_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf6d9b25a2f896007afbf0857ae5e8c01a738013cefc34371ffe716990044481 +size 2980252 diff --git a/checkpoint-700/optimizer_0/__1_0.distcp b/checkpoint-700/optimizer_0/__1_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..743066aad1af30b46c6257effcb7091e6290cad6 --- /dev/null +++ b/checkpoint-700/optimizer_0/__1_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e163306cfabacdb9bdc988d8d5542bec8bc671b96d7c5b129369fdf6e1b23206 +size 2997320 diff --git a/checkpoint-700/optimizer_0/__2_0.distcp b/checkpoint-700/optimizer_0/__2_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..205a69d7bdfaa154d744670a889ea5a32a22b672 --- /dev/null +++ b/checkpoint-700/optimizer_0/__2_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34256dd693e6ca9c99d61795a6114f0868703ba8567ed5d296172f0e14c31737 +size 2997320 diff --git a/checkpoint-700/optimizer_0/__3_0.distcp b/checkpoint-700/optimizer_0/__3_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..a4740f0e6d39c5bbb1ed9103bfb122ae97bbc403 --- /dev/null +++ b/checkpoint-700/optimizer_0/__3_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c2db808fd357c77433e2e2c6bd5f6f413747fa7e077bff649ec124af3fe37e20 +size 2997320 diff --git a/checkpoint-700/optimizer_0/__4_0.distcp b/checkpoint-700/optimizer_0/__4_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..c11ed0540ce7b857dc522ae2950c3a450577edc6 --- /dev/null +++ b/checkpoint-700/optimizer_0/__4_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f0af99ffc185113b060895ae247a6a4d0f435039d180c07c8f6ef518787d65fc +size 2997320 diff --git a/checkpoint-700/optimizer_0/__5_0.distcp b/checkpoint-700/optimizer_0/__5_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..2afbc116a53af713fc54bc72fa7bf12195cf8f90 --- /dev/null +++ b/checkpoint-700/optimizer_0/__5_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4d0cb586e32f5a1e35a0cd4939a14c1c89f08cf3782ed35eb57e4c8395b6457 +size 2999596 diff --git a/checkpoint-700/optimizer_0/__6_0.distcp b/checkpoint-700/optimizer_0/__6_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..b88e899b6a2481f2548314539148221bbf2726e0 --- /dev/null +++ b/checkpoint-700/optimizer_0/__6_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb1d6de861a624ed462320f3e134616ff713d86627170e796f62dcd6311b0b18 +size 2998732 diff --git a/checkpoint-700/optimizer_0/__7_0.distcp b/checkpoint-700/optimizer_0/__7_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..0bdeae678fed48874d437e50d8ae96284f21178b --- /dev/null +++ b/checkpoint-700/optimizer_0/__7_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:493af8b3bf5615936969beede61df79fad53955cea76963061063fdcb59a4cd5 +size 3005708 diff --git a/checkpoint-700/pytorch_model_fsdp_0/.metadata b/checkpoint-700/pytorch_model_fsdp_0/.metadata new file mode 100644 index 0000000000000000000000000000000000000000..38042f66d344605f26d1d51c43fc27852c89d4a4 --- /dev/null +++ b/checkpoint-700/pytorch_model_fsdp_0/.metadata @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58eb44eb6a067f06ccacf8e288aaeec50e816027a9e758d69c5eea082f8e593b +size 170758 diff --git a/checkpoint-700/pytorch_model_fsdp_0/__0_0.distcp b/checkpoint-700/pytorch_model_fsdp_0/__0_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..0529375cfbaf97cb18f1540ff67e39f7b4ac03e2 --- /dev/null +++ b/checkpoint-700/pytorch_model_fsdp_0/__0_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf7978d1740c0a12a7d1aae62b45d5e86162d31700123c54602175b640782638 +size 1489536 diff --git a/checkpoint-700/pytorch_model_fsdp_0/__1_0.distcp b/checkpoint-700/pytorch_model_fsdp_0/__1_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..533fdd75b43a189719ce2aca0ac8ce80b2354c39 --- /dev/null +++ b/checkpoint-700/pytorch_model_fsdp_0/__1_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8515a39ebc74eb37f716d78990c284bc892edb5eebb07c075b56f38bf89ef4fa +size 1489536 diff --git a/checkpoint-700/pytorch_model_fsdp_0/__2_0.distcp b/checkpoint-700/pytorch_model_fsdp_0/__2_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..59245885a96958fe157b83952b30510186d97ef8 --- /dev/null +++ b/checkpoint-700/pytorch_model_fsdp_0/__2_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1cbd7067f65a1a046f218b4ea108bc13a0f551bc5947326b72afe1f7c69d6660 +size 1489536 diff --git a/checkpoint-700/pytorch_model_fsdp_0/__3_0.distcp b/checkpoint-700/pytorch_model_fsdp_0/__3_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..530fed77166a4633fb312fca9da4e076df0bb23f --- /dev/null +++ b/checkpoint-700/pytorch_model_fsdp_0/__3_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:423d7640b367e2c401b2b6458acb14dbd80121af35c473e451eb361d85be1c99 +size 1489536 diff --git a/checkpoint-700/pytorch_model_fsdp_0/__4_0.distcp b/checkpoint-700/pytorch_model_fsdp_0/__4_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..d0dfd364bb0113dea86864822f94dc5e826f108e --- /dev/null +++ b/checkpoint-700/pytorch_model_fsdp_0/__4_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cbbe86ddc20d0fee9f57f34ec7160f17f61027c4fc065720abba8d2f296cc06c +size 1489536 diff --git a/checkpoint-700/pytorch_model_fsdp_0/__5_0.distcp b/checkpoint-700/pytorch_model_fsdp_0/__5_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..16195997b1ff954fa174ca243f62379f6a8dabd7 --- /dev/null +++ b/checkpoint-700/pytorch_model_fsdp_0/__5_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f674166aed32e8209b2df4de32c5be859c082d18db410505765481eb52ed972f +size 1489536 diff --git a/checkpoint-700/pytorch_model_fsdp_0/__6_0.distcp b/checkpoint-700/pytorch_model_fsdp_0/__6_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..dc414899945195ba1aa5779ae0eed02efc812b55 --- /dev/null +++ b/checkpoint-700/pytorch_model_fsdp_0/__6_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eedb330ea6714db28761855bbea1c3f7f671a615ea5d1a1e1aa879918d8726b3 +size 1489536 diff --git a/checkpoint-700/pytorch_model_fsdp_0/__7_0.distcp b/checkpoint-700/pytorch_model_fsdp_0/__7_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..b4fa34be5743570b30083d4f2c48704d9a9b015c --- /dev/null +++ b/checkpoint-700/pytorch_model_fsdp_0/__7_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:48007f2164c78a8bed10176560f5828c479dfc2747d9265064ec9c7d53dbf0f9 +size 1489536 diff --git a/checkpoint-700/rng_state_0.pth b/checkpoint-700/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..5de689d9fd1a61dc6709321a6aa331d6e005be36 --- /dev/null +++ b/checkpoint-700/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:85c87cd53dec4801e4dcb6d4bb9548ff38aa0045659cdd86a18577ca7fcd9a3e +size 15920 diff --git a/checkpoint-700/rng_state_1.pth b/checkpoint-700/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..48342dcb0785b7c96ce94a1e65ad7ee7a0b93dbc --- /dev/null +++ b/checkpoint-700/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14f6fcbe44fc6f1a64ef2e4229329192390d155590e99d4c4819c2d087a49fe7 +size 15984 diff --git a/checkpoint-700/rng_state_2.pth b/checkpoint-700/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..4dc040c521ab500222e26dbc5e64f11d4660c409 --- /dev/null +++ b/checkpoint-700/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8eb1ee5c68161bb5c64f0b872e58bb8fc2ec4b387d82190a48748f4bb20aceb2 +size 15984 diff --git a/checkpoint-700/rng_state_3.pth b/checkpoint-700/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..644a69426ebb1f4d81e7e519fff7817ec70f4c0c --- /dev/null +++ b/checkpoint-700/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5331cf0a58fb23632a83b360557d02d5fc89b8a8c2f4e8abeb139ab0cec5acb6 +size 15984 diff --git a/checkpoint-700/rng_state_4.pth b/checkpoint-700/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..5750a227925088c8490d2e426cc9b2ec8040e296 --- /dev/null +++ b/checkpoint-700/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3056627d13df37bbea46087f62d9c8e5eed572e8607868c71a241cfe13de1ac6 +size 15984 diff --git a/checkpoint-700/rng_state_5.pth b/checkpoint-700/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..48a0d5f5be44dab6078d368b51b38930d64f1475 --- /dev/null +++ b/checkpoint-700/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3d44298040b1507a5b96aef6d7f8aa7e27fb07ba806caeaf7d2308f4bbff608 +size 15984 diff --git a/checkpoint-700/rng_state_6.pth b/checkpoint-700/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..1aa7e59b4e0f8ab340bac71838c6bd072bfa78ca --- /dev/null +++ b/checkpoint-700/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bdc7c28fd2bad2438655e4c93594ac44c548ed1530b724512ffdda3a05c275d2 +size 15984 diff --git a/checkpoint-700/rng_state_7.pth b/checkpoint-700/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..794e5a4bcac7ac5857121ab7fc543d81364d4cb2 --- /dev/null +++ b/checkpoint-700/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54038d56bc152013057e4b82ee86ab0a30d67fd1bf3f57326f8342e3b27946bb +size 15984 diff --git a/checkpoint-700/scheduler.pt b/checkpoint-700/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..8cb42d4426a8299f4a6337041b5f31cab6f971c5 --- /dev/null +++ b/checkpoint-700/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6345684fba5839b30835cdc28a18266062e5d7b4c59bdd0ceab08c6e41c7a958 +size 1064 diff --git a/checkpoint-700/trainer_state.json b/checkpoint-700/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..ea038f25cb8985b1b29128f610396b1bcad98bc1 --- /dev/null +++ b/checkpoint-700/trainer_state.json @@ -0,0 +1,286 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.35, + "eval_steps": 100, + "global_step": 700, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0125, + "grad_norm": 0.08136817067861557, + "learning_rate": 0.0004, + "loss": 1.158, + "step": 25 + }, + { + "epoch": 0.025, + "grad_norm": 0.0653829574584961, + "learning_rate": 0.0004998852503731983, + "loss": 1.0957, + "step": 50 + }, + { + "epoch": 0.0375, + "grad_norm": 0.11592712253332138, + "learning_rate": 0.0004993848168027977, + "loss": 0.9276, + "step": 75 + }, + { + "epoch": 0.05, + "grad_norm": 0.08926476538181305, + "learning_rate": 0.0004984880506341147, + "loss": 1.0337, + "step": 100 + }, + { + "epoch": 0.05, + "eval_loss": 0.9111642837524414, + "eval_runtime": 843.8058, + "eval_samples_per_second": 1.3, + "eval_steps_per_second": 0.021, + "step": 100 + }, + { + "epoch": 0.0625, + "grad_norm": 0.08632034063339233, + "learning_rate": 0.0004971963770447935, + "loss": 1.0219, + "step": 125 + }, + { + "epoch": 0.075, + "grad_norm": 0.12949417531490326, + "learning_rate": 0.0004955118488155782, + "loss": 0.784, + "step": 150 + }, + { + "epoch": 0.0875, + "grad_norm": 0.09222520887851715, + "learning_rate": 0.0004934371430679492, + "loss": 1.0, + "step": 175 + }, + { + "epoch": 0.1, + "grad_norm": 0.10075929015874863, + "learning_rate": 0.0004909755570095319, + "loss": 0.9617, + "step": 200 + }, + { + "epoch": 0.1, + "eval_loss": 0.9380430579185486, + "eval_runtime": 856.156, + "eval_samples_per_second": 1.281, + "eval_steps_per_second": 0.021, + "step": 200 + }, + { + "epoch": 0.1125, + "grad_norm": 0.09567111730575562, + "learning_rate": 0.0004881310026940389, + "loss": 0.7051, + "step": 225 + }, + { + "epoch": 0.125, + "grad_norm": 0.09723825007677078, + "learning_rate": 0.0004849080008040734, + "loss": 0.9906, + "step": 250 + }, + { + "epoch": 0.1375, + "grad_norm": 0.09430444985628128, + "learning_rate": 0.00048131167346667446, + "loss": 0.9113, + "step": 275 + }, + { + "epoch": 0.15, + "grad_norm": 0.09439756721258163, + "learning_rate": 0.00047734773611302284, + "loss": 0.674, + "step": 300 + }, + { + "epoch": 0.15, + "eval_loss": 0.9544369578361511, + "eval_runtime": 850.9461, + "eval_samples_per_second": 1.289, + "eval_steps_per_second": 0.021, + "step": 300 + }, + { + "epoch": 0.1625, + "grad_norm": 0.10961435735225677, + "learning_rate": 0.0004730224883952422, + "loss": 0.9701, + "step": 325 + }, + { + "epoch": 0.175, + "grad_norm": 0.09628895670175552, + "learning_rate": 0.0004683428041747334, + "loss": 0.8976, + "step": 350 + }, + { + "epoch": 0.1875, + "grad_norm": 0.11085063964128494, + "learning_rate": 0.0004633161205979517, + "loss": 0.6683, + "step": 375 + }, + { + "epoch": 0.2, + "grad_norm": 0.10140710324048996, + "learning_rate": 0.0004579504262769877, + "loss": 0.9373, + "step": 400 + }, + { + "epoch": 0.2, + "eval_loss": 0.968166172504425, + "eval_runtime": 853.0258, + "eval_samples_per_second": 1.286, + "eval_steps_per_second": 0.021, + "step": 400 + }, + { + "epoch": 0.2125, + "grad_norm": 0.13868793845176697, + "learning_rate": 0.0004522542485937369, + "loss": 0.8822, + "step": 425 + }, + { + "epoch": 0.225, + "grad_norm": 0.11853040754795074, + "learning_rate": 0.00044623664014783386, + "loss": 0.6483, + "step": 450 + }, + { + "epoch": 0.2375, + "grad_norm": 0.11668186634778976, + "learning_rate": 0.00043990716436988924, + "loss": 0.9374, + "step": 475 + }, + { + "epoch": 0.25, + "grad_norm": 0.11911585181951523, + "learning_rate": 0.0004332758803228925, + "loss": 0.8434, + "step": 500 + }, + { + "epoch": 0.25, + "eval_loss": 0.9753186702728271, + "eval_runtime": 851.2812, + "eval_samples_per_second": 1.289, + "eval_steps_per_second": 0.021, + "step": 500 + }, + { + "epoch": 0.2625, + "grad_norm": 0.09807440638542175, + "learning_rate": 0.00042635332671593575, + "loss": 0.6661, + "step": 525 + }, + { + "epoch": 0.275, + "grad_norm": 0.09675773978233337, + "learning_rate": 0.00041915050515566445, + "loss": 0.8999, + "step": 550 + }, + { + "epoch": 0.2875, + "grad_norm": 0.0979737937450409, + "learning_rate": 0.00041167886266207167, + "loss": 0.8616, + "step": 575 + }, + { + "epoch": 0.3, + "grad_norm": 0.08990875631570816, + "learning_rate": 0.0004039502734764241, + "loss": 0.8167, + "step": 600 + }, + { + "epoch": 0.3, + "eval_loss": 0.9926204681396484, + "eval_runtime": 851.6484, + "eval_samples_per_second": 1.288, + "eval_steps_per_second": 0.021, + "step": 600 + }, + { + "epoch": 0.3125, + "grad_norm": 0.09505137801170349, + "learning_rate": 0.0003959770201902294, + "loss": 0.8191, + "step": 625 + }, + { + "epoch": 0.325, + "grad_norm": 0.09105059504508972, + "learning_rate": 0.0003877717742252371, + "loss": 0.7203, + "step": 650 + }, + { + "epoch": 0.3375, + "grad_norm": 0.09274734556674957, + "learning_rate": 0.00037934757569549495, + "loss": 0.8154, + "step": 675 + }, + { + "epoch": 0.35, + "grad_norm": 0.10026416182518005, + "learning_rate": 0.00037071781268346345, + "loss": 0.8336, + "step": 700 + }, + { + "epoch": 0.35, + "eval_loss": 0.9959912896156311, + "eval_runtime": 849.7662, + "eval_samples_per_second": 1.291, + "eval_steps_per_second": 0.021, + "step": 700 + } + ], + "logging_steps": 25, + "max_steps": 2000, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 7.036241769083699e+16, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-800/optimizer_0/.metadata b/checkpoint-800/optimizer_0/.metadata new file mode 100644 index 0000000000000000000000000000000000000000..2f516e460ff2121d057c640633ae50e514437e4e --- /dev/null +++ b/checkpoint-800/optimizer_0/.metadata @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63b9e0aa9158d193e3bbd35e97bd2729e4f2d9b6636a68f1dcae181919e57a5d +size 438489 diff --git a/checkpoint-800/optimizer_0/__0_0.distcp b/checkpoint-800/optimizer_0/__0_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..3fd1c3e8d278f25e5505343eded39ba9a8fa4463 --- /dev/null +++ b/checkpoint-800/optimizer_0/__0_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:635ca7e3a1b08279884ddbc7ee287cbe2b3975d73d1369a61b61fe75eed59c05 +size 2980252 diff --git a/checkpoint-800/optimizer_0/__1_0.distcp b/checkpoint-800/optimizer_0/__1_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..93528b7dbdc50f1bc745c83d406c43fa627af587 --- /dev/null +++ b/checkpoint-800/optimizer_0/__1_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cdd28a2d29a6bbbab586a4e1a5a493d5979325a1aa01bc6ca84a598eafec9316 +size 2997320 diff --git a/checkpoint-800/optimizer_0/__2_0.distcp b/checkpoint-800/optimizer_0/__2_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..c11db9cdb62d84fc1d04b4f2ba31a6b02c222e2f --- /dev/null +++ b/checkpoint-800/optimizer_0/__2_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db9eb00c3286cf6894801fadaf4670c25d3c1f71d0b4212451c0765f46ca3555 +size 2997320 diff --git a/checkpoint-800/optimizer_0/__3_0.distcp b/checkpoint-800/optimizer_0/__3_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..abfa9c00f2dab82f23df219928e91184348b30c8 --- /dev/null +++ b/checkpoint-800/optimizer_0/__3_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb0929df879f485b3e2920b4149c74263d34345396a200c9f233cadab0d4a659 +size 2997320 diff --git a/checkpoint-800/optimizer_0/__4_0.distcp b/checkpoint-800/optimizer_0/__4_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..ff1611bc68e5fa11a4fb1eff724175284fcb80ec --- /dev/null +++ b/checkpoint-800/optimizer_0/__4_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3533b8e8f924b2a942f1fb5a6129522ab596fb1d2bc0bc19448d64bbcd9328ea +size 2997320 diff --git a/checkpoint-800/optimizer_0/__5_0.distcp b/checkpoint-800/optimizer_0/__5_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..5faa11628e679b965f50be4d27426042922561cd --- /dev/null +++ b/checkpoint-800/optimizer_0/__5_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:262d3055f5ab952d91206b09b7cf551c996b2da4936700886e397bedfcfa1eff +size 2999596 diff --git a/checkpoint-800/optimizer_0/__6_0.distcp b/checkpoint-800/optimizer_0/__6_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..aa8f89829ec28fb2cbb84655612c8eebd308ccee --- /dev/null +++ b/checkpoint-800/optimizer_0/__6_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:587e16064a1b21e18122c7324066f149bc48ddfbff2c4179c4c02d288d344c62 +size 2998732 diff --git a/checkpoint-800/optimizer_0/__7_0.distcp b/checkpoint-800/optimizer_0/__7_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..816bb661161da7cb488adcaec8a62e109bb93b35 --- /dev/null +++ b/checkpoint-800/optimizer_0/__7_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:948e3503509a24e373fff03e8f8ef8bd80e1a620e847735f20b7f6d3ae903ecb +size 3005708 diff --git a/checkpoint-800/pytorch_model_fsdp_0/.metadata b/checkpoint-800/pytorch_model_fsdp_0/.metadata new file mode 100644 index 0000000000000000000000000000000000000000..977d5888f797cb72f061ad8959bcabcdbda4ba45 --- /dev/null +++ b/checkpoint-800/pytorch_model_fsdp_0/.metadata @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d26df5686b83343a9f3234f81f2d1295968d8cc55756529030f7a831c5ede4f7 +size 170758 diff --git a/checkpoint-800/pytorch_model_fsdp_0/__0_0.distcp b/checkpoint-800/pytorch_model_fsdp_0/__0_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..e67dd626dbc39e58656369e7ff303a88d6efb816 --- /dev/null +++ b/checkpoint-800/pytorch_model_fsdp_0/__0_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59df24836321b2a0f04ac28ed5bc4a1e12868b3c7edabb96c24ada84074a7d33 +size 1489536 diff --git a/checkpoint-800/pytorch_model_fsdp_0/__1_0.distcp b/checkpoint-800/pytorch_model_fsdp_0/__1_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..eea451bba613178142662ec70d43664857ec4234 --- /dev/null +++ b/checkpoint-800/pytorch_model_fsdp_0/__1_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e018d936518c47d25a34b3d5f40bce9b5184cdcff4978ceda5ecbd59eec0dfad +size 1489536 diff --git a/checkpoint-800/pytorch_model_fsdp_0/__2_0.distcp b/checkpoint-800/pytorch_model_fsdp_0/__2_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..8157de4bb88eb32e097eb37fa41ebc81fda105e9 --- /dev/null +++ b/checkpoint-800/pytorch_model_fsdp_0/__2_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e220357a2c67adc653d0a29d8711af5cb5bdd0a5fff1fac2e4c37af35549f24f +size 1489536 diff --git a/checkpoint-800/pytorch_model_fsdp_0/__3_0.distcp b/checkpoint-800/pytorch_model_fsdp_0/__3_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..a7025b0f663e5cdfdfb2eacf38ce2d70b4328589 --- /dev/null +++ b/checkpoint-800/pytorch_model_fsdp_0/__3_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6aa54ab46f4f4a3c6cd92ab47573849b6f727e385874fb337ecc45b0a5360b09 +size 1489536 diff --git a/checkpoint-800/pytorch_model_fsdp_0/__4_0.distcp b/checkpoint-800/pytorch_model_fsdp_0/__4_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..a654a5d3b3becd805d94a441706bdc3660d9da4e --- /dev/null +++ b/checkpoint-800/pytorch_model_fsdp_0/__4_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a82e37c4d8f314ef04cd40a8772dd1c18ce241b44cc5ad6380b82cd74acaff4e +size 1489536 diff --git a/checkpoint-800/pytorch_model_fsdp_0/__5_0.distcp b/checkpoint-800/pytorch_model_fsdp_0/__5_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..8e9292f6f3f980a63d662dfb0c734b5a8a77e829 --- /dev/null +++ b/checkpoint-800/pytorch_model_fsdp_0/__5_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5db2685a13b16248508b31cb9bc1805daae0cf3a0fd020d19abc87f2c5602c0a +size 1489536 diff --git a/checkpoint-800/pytorch_model_fsdp_0/__6_0.distcp b/checkpoint-800/pytorch_model_fsdp_0/__6_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..06ffdf8449d0ae94663755873941470694ff55e1 --- /dev/null +++ b/checkpoint-800/pytorch_model_fsdp_0/__6_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:389c46bef92177a54b8218e9a01b2bc4f0e4cbfc5905b73747d1ca7d6524ee29 +size 1489536 diff --git a/checkpoint-800/pytorch_model_fsdp_0/__7_0.distcp b/checkpoint-800/pytorch_model_fsdp_0/__7_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..d96a7f5f5e5e96ccb8eee0308a2c4e49f5924c3b --- /dev/null +++ b/checkpoint-800/pytorch_model_fsdp_0/__7_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59b09330803b1e641542a58cf0f1bfdd18fb7db2a821490ab2172ba90c2e2080 +size 1489536 diff --git a/checkpoint-800/rng_state_0.pth b/checkpoint-800/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..60f6acd1c374668f2a5587e26990b9bd686873b3 --- /dev/null +++ b/checkpoint-800/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da38903189d77ebca677952019ddffb7dc5e4dff9e4e4d1d2a62ff346e14cd1e +size 15920 diff --git a/checkpoint-800/rng_state_1.pth b/checkpoint-800/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..cad0c6517f208136366ea009ff0ec32fcfadb9cb --- /dev/null +++ b/checkpoint-800/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d7e4cf0e0adc364a659de9420fa4d12fb279f876e679a4b79e7a01fb98c8580 +size 15984 diff --git a/checkpoint-800/rng_state_2.pth b/checkpoint-800/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..dcbe0a0928a33931cc06580007b1dcebba9ea56f --- /dev/null +++ b/checkpoint-800/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f02eefc8db7444d78f255ccaa290083f51941b3a85ff2920cad0bdeaf64ec20 +size 15984 diff --git a/checkpoint-800/rng_state_3.pth b/checkpoint-800/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..61ef41217922d31e2e68b3deefa6b100ff7d69d3 --- /dev/null +++ b/checkpoint-800/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b85bbe008ff0367b8be7c5b22ebec4e6c57a6c257812f2923dd9db1590d8fbb +size 15984 diff --git a/checkpoint-800/rng_state_4.pth b/checkpoint-800/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..33d1fcde67e6ebc100b7d8763871261985f2621a --- /dev/null +++ b/checkpoint-800/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d693a5e5e3cf8ad4afea3c62093fd15960c5ef4039e21b8699187772ed46244b +size 15984 diff --git a/checkpoint-800/rng_state_5.pth b/checkpoint-800/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..e7324bdd0fcc8b76ecacd006e8ebacb6783d10d3 --- /dev/null +++ b/checkpoint-800/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e89d03c65eb3124c5f266341e1acfe9513d5e84d6a484888bcec59658a101d6 +size 15984 diff --git a/checkpoint-800/rng_state_6.pth b/checkpoint-800/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..aee6d6df3a75f3d97cd7a9a0974b47206c729386 --- /dev/null +++ b/checkpoint-800/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2bdfa8d967dfcbcd71a8bafeac0ad93d130db3f163ed82de33d0ae3e01f671d5 +size 15984 diff --git a/checkpoint-800/rng_state_7.pth b/checkpoint-800/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..5fe000e2503db4be1cfcf332723aed922265e3e8 --- /dev/null +++ b/checkpoint-800/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a2c691d6dc153751df913aa6b27aabff61d809c06b4ef74d0b96634fe71f71c1 +size 15984 diff --git a/checkpoint-800/scheduler.pt b/checkpoint-800/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..a08acd119a7aab21ab3547c87f67a43d79bfe92d --- /dev/null +++ b/checkpoint-800/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5707b44605991a4e2707814032fbd5e2ffa2f78529edb47686673bff4f2f267b +size 1064 diff --git a/checkpoint-800/trainer_state.json b/checkpoint-800/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..66541e567567003fbc521a0432b15a4ae91e9cfe --- /dev/null +++ b/checkpoint-800/trainer_state.json @@ -0,0 +1,322 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.4, + "eval_steps": 100, + "global_step": 800, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0125, + "grad_norm": 0.08136817067861557, + "learning_rate": 0.0004, + "loss": 1.158, + "step": 25 + }, + { + "epoch": 0.025, + "grad_norm": 0.0653829574584961, + "learning_rate": 0.0004998852503731983, + "loss": 1.0957, + "step": 50 + }, + { + "epoch": 0.0375, + "grad_norm": 0.11592712253332138, + "learning_rate": 0.0004993848168027977, + "loss": 0.9276, + "step": 75 + }, + { + "epoch": 0.05, + "grad_norm": 0.08926476538181305, + "learning_rate": 0.0004984880506341147, + "loss": 1.0337, + "step": 100 + }, + { + "epoch": 0.05, + "eval_loss": 0.9111642837524414, + "eval_runtime": 843.8058, + "eval_samples_per_second": 1.3, + "eval_steps_per_second": 0.021, + "step": 100 + }, + { + "epoch": 0.0625, + "grad_norm": 0.08632034063339233, + "learning_rate": 0.0004971963770447935, + "loss": 1.0219, + "step": 125 + }, + { + "epoch": 0.075, + "grad_norm": 0.12949417531490326, + "learning_rate": 0.0004955118488155782, + "loss": 0.784, + "step": 150 + }, + { + "epoch": 0.0875, + "grad_norm": 0.09222520887851715, + "learning_rate": 0.0004934371430679492, + "loss": 1.0, + "step": 175 + }, + { + "epoch": 0.1, + "grad_norm": 0.10075929015874863, + "learning_rate": 0.0004909755570095319, + "loss": 0.9617, + "step": 200 + }, + { + "epoch": 0.1, + "eval_loss": 0.9380430579185486, + "eval_runtime": 856.156, + "eval_samples_per_second": 1.281, + "eval_steps_per_second": 0.021, + "step": 200 + }, + { + "epoch": 0.1125, + "grad_norm": 0.09567111730575562, + "learning_rate": 0.0004881310026940389, + "loss": 0.7051, + "step": 225 + }, + { + "epoch": 0.125, + "grad_norm": 0.09723825007677078, + "learning_rate": 0.0004849080008040734, + "loss": 0.9906, + "step": 250 + }, + { + "epoch": 0.1375, + "grad_norm": 0.09430444985628128, + "learning_rate": 0.00048131167346667446, + "loss": 0.9113, + "step": 275 + }, + { + "epoch": 0.15, + "grad_norm": 0.09439756721258163, + "learning_rate": 0.00047734773611302284, + "loss": 0.674, + "step": 300 + }, + { + "epoch": 0.15, + "eval_loss": 0.9544369578361511, + "eval_runtime": 850.9461, + "eval_samples_per_second": 1.289, + "eval_steps_per_second": 0.021, + "step": 300 + }, + { + "epoch": 0.1625, + "grad_norm": 0.10961435735225677, + "learning_rate": 0.0004730224883952422, + "loss": 0.9701, + "step": 325 + }, + { + "epoch": 0.175, + "grad_norm": 0.09628895670175552, + "learning_rate": 0.0004683428041747334, + "loss": 0.8976, + "step": 350 + }, + { + "epoch": 0.1875, + "grad_norm": 0.11085063964128494, + "learning_rate": 0.0004633161205979517, + "loss": 0.6683, + "step": 375 + }, + { + "epoch": 0.2, + "grad_norm": 0.10140710324048996, + "learning_rate": 0.0004579504262769877, + "loss": 0.9373, + "step": 400 + }, + { + "epoch": 0.2, + "eval_loss": 0.968166172504425, + "eval_runtime": 853.0258, + "eval_samples_per_second": 1.286, + "eval_steps_per_second": 0.021, + "step": 400 + }, + { + "epoch": 0.2125, + "grad_norm": 0.13868793845176697, + "learning_rate": 0.0004522542485937369, + "loss": 0.8822, + "step": 425 + }, + { + "epoch": 0.225, + "grad_norm": 0.11853040754795074, + "learning_rate": 0.00044623664014783386, + "loss": 0.6483, + "step": 450 + }, + { + "epoch": 0.2375, + "grad_norm": 0.11668186634778976, + "learning_rate": 0.00043990716436988924, + "loss": 0.9374, + "step": 475 + }, + { + "epoch": 0.25, + "grad_norm": 0.11911585181951523, + "learning_rate": 0.0004332758803228925, + "loss": 0.8434, + "step": 500 + }, + { + "epoch": 0.25, + "eval_loss": 0.9753186702728271, + "eval_runtime": 851.2812, + "eval_samples_per_second": 1.289, + "eval_steps_per_second": 0.021, + "step": 500 + }, + { + "epoch": 0.2625, + "grad_norm": 0.09807440638542175, + "learning_rate": 0.00042635332671593575, + "loss": 0.6661, + "step": 525 + }, + { + "epoch": 0.275, + "grad_norm": 0.09675773978233337, + "learning_rate": 0.00041915050515566445, + "loss": 0.8999, + "step": 550 + }, + { + "epoch": 0.2875, + "grad_norm": 0.0979737937450409, + "learning_rate": 0.00041167886266207167, + "loss": 0.8616, + "step": 575 + }, + { + "epoch": 0.3, + "grad_norm": 0.08990875631570816, + "learning_rate": 0.0004039502734764241, + "loss": 0.8167, + "step": 600 + }, + { + "epoch": 0.3, + "eval_loss": 0.9926204681396484, + "eval_runtime": 851.6484, + "eval_samples_per_second": 1.288, + "eval_steps_per_second": 0.021, + "step": 600 + }, + { + "epoch": 0.3125, + "grad_norm": 0.09505137801170349, + "learning_rate": 0.0003959770201902294, + "loss": 0.8191, + "step": 625 + }, + { + "epoch": 0.325, + "grad_norm": 0.09105059504508972, + "learning_rate": 0.0003877717742252371, + "loss": 0.7203, + "step": 650 + }, + { + "epoch": 0.3375, + "grad_norm": 0.09274734556674957, + "learning_rate": 0.00037934757569549495, + "loss": 0.8154, + "step": 675 + }, + { + "epoch": 0.35, + "grad_norm": 0.10026416182518005, + "learning_rate": 0.00037071781268346345, + "loss": 0.8336, + "step": 700 + }, + { + "epoch": 0.35, + "eval_loss": 0.9959912896156311, + "eval_runtime": 849.7662, + "eval_samples_per_second": 1.291, + "eval_steps_per_second": 0.021, + "step": 700 + }, + { + "epoch": 0.3625, + "grad_norm": 0.09517039358615875, + "learning_rate": 0.00036189619996312495, + "loss": 0.6845, + "step": 725 + }, + { + "epoch": 0.375, + "grad_norm": 0.11264201998710632, + "learning_rate": 0.00035289675720390174, + "loss": 0.8445, + "step": 750 + }, + { + "epoch": 0.3875, + "grad_norm": 0.0979958102107048, + "learning_rate": 0.00034373378669002105, + "loss": 0.8269, + "step": 775 + }, + { + "epoch": 0.4, + "grad_norm": 0.10223093628883362, + "learning_rate": 0.00033442185059073706, + "loss": 0.6517, + "step": 800 + }, + { + "epoch": 0.4, + "eval_loss": 1.006990671157837, + "eval_runtime": 845.1184, + "eval_samples_per_second": 1.298, + "eval_steps_per_second": 0.021, + "step": 800 + } + ], + "logging_steps": 25, + "max_steps": 2000, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 8.041419164667085e+16, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-900/optimizer_0/.metadata b/checkpoint-900/optimizer_0/.metadata new file mode 100644 index 0000000000000000000000000000000000000000..6c84142f9ddb12cd7e02c26f3251a7334d002d00 --- /dev/null +++ b/checkpoint-900/optimizer_0/.metadata @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7dead39405d341f6af5760f32b569c0e5d14abbbc72124424249ed907d4150f8 +size 438489 diff --git a/checkpoint-900/optimizer_0/__0_0.distcp b/checkpoint-900/optimizer_0/__0_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..abee5282d3c13dcacdb8898fc6493be885441dfb --- /dev/null +++ b/checkpoint-900/optimizer_0/__0_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6913e9280681c30226bde20a0171fa65cf13a798b33fdeff0dc4c105f1aa115 +size 2980252 diff --git a/checkpoint-900/optimizer_0/__1_0.distcp b/checkpoint-900/optimizer_0/__1_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..b34c8ca3c2f448bc6ff6a87ce35a4a007e7e5dd6 --- /dev/null +++ b/checkpoint-900/optimizer_0/__1_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8dd6ffbb37559c84018da8d34d0e0a9d528bf1e566022863148867842e59510c +size 2997320 diff --git a/checkpoint-900/optimizer_0/__2_0.distcp b/checkpoint-900/optimizer_0/__2_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..7c366b17b8d74d9379d50a0854248b451765d271 --- /dev/null +++ b/checkpoint-900/optimizer_0/__2_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cec1d175e1ed3c4a3ae6483d031b01c5c8949c84fda559401a369a36f539c57d +size 2997320 diff --git a/checkpoint-900/optimizer_0/__3_0.distcp b/checkpoint-900/optimizer_0/__3_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..3535ecf93cc0384542139a584ee8c9660f3e1598 --- /dev/null +++ b/checkpoint-900/optimizer_0/__3_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c4bbfb171384834d30e9dcd9c5461b8c70a54541c739de618e80bc945b1c39fc +size 2997320 diff --git a/checkpoint-900/optimizer_0/__4_0.distcp b/checkpoint-900/optimizer_0/__4_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..11a1017c60af5effb1ae9721fc61901d4bd58858 --- /dev/null +++ b/checkpoint-900/optimizer_0/__4_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d7b48a3044abf1028d5df2f3b25cf96b893ad6f40631cca9bc8484e9817ea71 +size 2997320 diff --git a/checkpoint-900/optimizer_0/__5_0.distcp b/checkpoint-900/optimizer_0/__5_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..ef84ae36511902d8a65011af841b5dbbd4b2671f --- /dev/null +++ b/checkpoint-900/optimizer_0/__5_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66537d0192f5ecb4dbd4b45cf573ffe751e63d6c5383a78d1746d44aa2a27fc8 +size 2999596 diff --git a/checkpoint-900/optimizer_0/__6_0.distcp b/checkpoint-900/optimizer_0/__6_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..5bf14896fcb08bb472d7c7436c251167bcd92757 --- /dev/null +++ b/checkpoint-900/optimizer_0/__6_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8959be1c4b11a822c6ee930ea0983536d0ac0e9b98cea3d1580d0b53dc49013e +size 2998732 diff --git a/checkpoint-900/optimizer_0/__7_0.distcp b/checkpoint-900/optimizer_0/__7_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..76e719cfbca08932ddc4930c604a5cc94aecf18f --- /dev/null +++ b/checkpoint-900/optimizer_0/__7_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b06590a0b5b900aeb995cbf28f90bba721796c769cf0806d416bc11ffb743c79 +size 3005708 diff --git a/checkpoint-900/pytorch_model_fsdp_0/.metadata b/checkpoint-900/pytorch_model_fsdp_0/.metadata new file mode 100644 index 0000000000000000000000000000000000000000..684bb23f1a48a0bf7698c0d58ce95285250a147b --- /dev/null +++ b/checkpoint-900/pytorch_model_fsdp_0/.metadata @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7fe9aa32dddd98e11a0a9ac9bf7727c1cb9476a849befe9439c7cd569b4b39c3 +size 170758 diff --git a/checkpoint-900/pytorch_model_fsdp_0/__0_0.distcp b/checkpoint-900/pytorch_model_fsdp_0/__0_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..44df1077618a89413ee19ee4b1b64d1f4b88a1bb --- /dev/null +++ b/checkpoint-900/pytorch_model_fsdp_0/__0_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c449df687e31da4000e47dd714fd72c69c4dc8f6706d198383ea120c73de47b +size 1489536 diff --git a/checkpoint-900/pytorch_model_fsdp_0/__1_0.distcp b/checkpoint-900/pytorch_model_fsdp_0/__1_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..e31f3ca953a09903f9f9cc5f799128d6d5a1e95b --- /dev/null +++ b/checkpoint-900/pytorch_model_fsdp_0/__1_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc6eeec4ee15e351bb219a66605141b9517335029cb588cc29adcf2304ca12e8 +size 1489536 diff --git a/checkpoint-900/pytorch_model_fsdp_0/__2_0.distcp b/checkpoint-900/pytorch_model_fsdp_0/__2_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..1cfb29f00391252d793350d225bb41cc62b42c4a --- /dev/null +++ b/checkpoint-900/pytorch_model_fsdp_0/__2_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b5db200cf653cefd8f470157ddb11f9017bcca4bb157b486389b132b84fda99 +size 1489536 diff --git a/checkpoint-900/pytorch_model_fsdp_0/__3_0.distcp b/checkpoint-900/pytorch_model_fsdp_0/__3_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..70ff07e06f42f9b9b45b9c9481ee30ce2e2d572b --- /dev/null +++ b/checkpoint-900/pytorch_model_fsdp_0/__3_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f40a1cfdf9a5114a1c5a219a976a227a68145d93746667a468437d9424a7cd11 +size 1489536 diff --git a/checkpoint-900/pytorch_model_fsdp_0/__4_0.distcp b/checkpoint-900/pytorch_model_fsdp_0/__4_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..bcedd3774550480622741d8dfa3e65c292e69d6f --- /dev/null +++ b/checkpoint-900/pytorch_model_fsdp_0/__4_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c224dc598af0ddf613488316338fdf48f57f89cf44be271bf0c278277f886ff +size 1489536 diff --git a/checkpoint-900/pytorch_model_fsdp_0/__5_0.distcp b/checkpoint-900/pytorch_model_fsdp_0/__5_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..495a48b0cad96f48abf9a3270005be52d9d3ca59 --- /dev/null +++ b/checkpoint-900/pytorch_model_fsdp_0/__5_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4012708f2cd5709e9bdce9e41eea450da72c5b6b7fb3bd3b8747ea4bf3ad093d +size 1489536 diff --git a/checkpoint-900/pytorch_model_fsdp_0/__6_0.distcp b/checkpoint-900/pytorch_model_fsdp_0/__6_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..10bdf1503bffb5789de3aa00f63996cf0f37226f --- /dev/null +++ b/checkpoint-900/pytorch_model_fsdp_0/__6_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd80c6f3861caa819d09b93d65a5dcafc269ce8eb2f9d214af25c41cb8af281f +size 1489536 diff --git a/checkpoint-900/pytorch_model_fsdp_0/__7_0.distcp b/checkpoint-900/pytorch_model_fsdp_0/__7_0.distcp new file mode 100644 index 0000000000000000000000000000000000000000..b4c22dfa7633b6499aee54c77f798758993d6030 --- /dev/null +++ b/checkpoint-900/pytorch_model_fsdp_0/__7_0.distcp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5598fd987fe4535ef4eb14d56c46678c51b3a68b14e556092bf1ae15df64717c +size 1489536 diff --git a/checkpoint-900/rng_state_0.pth b/checkpoint-900/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..6f7212620384ca896ddbb8c23d9b0098e49cd3ff --- /dev/null +++ b/checkpoint-900/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d13a36e628236695a972ee9fb76f88cc36c00608604d5bd263206cb5124dbd7d +size 15920 diff --git a/checkpoint-900/rng_state_1.pth b/checkpoint-900/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..4f150aed2990427a652f468bc7413a6d9482ff81 --- /dev/null +++ b/checkpoint-900/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23f82f54b09d8f2dac91585825ecf36127e36b821dbab5ed25796ec90d0e9b04 +size 15984 diff --git a/checkpoint-900/rng_state_2.pth b/checkpoint-900/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..08cbb03242a69ade96d61402b2b294aa0f01232e --- /dev/null +++ b/checkpoint-900/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:138709c99223895b208c586f871a225b0faf3972db74e752eb5533badae97672 +size 15984 diff --git a/checkpoint-900/rng_state_3.pth b/checkpoint-900/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..cd7a3edee7d843e261b8f2d0704079bf9aa33a96 --- /dev/null +++ b/checkpoint-900/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:356d53601f7cbb7bd3c2b24fa4b30c939a248284d345d26fe1b313b58687aea7 +size 15984 diff --git a/checkpoint-900/rng_state_4.pth b/checkpoint-900/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..8936b7750cd6a493f9a495f396ad3616522a5a25 --- /dev/null +++ b/checkpoint-900/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:29f832fb5ca9e51ef7d884042fbdebee0a99641db20294d78e0bbbd1eb8faca1 +size 15984 diff --git a/checkpoint-900/rng_state_5.pth b/checkpoint-900/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..9a4a53e31d212b6f2550bae6e5c38d65c6b72b5e --- /dev/null +++ b/checkpoint-900/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:762540bbafcd78de93499144446247bf3b263115df9227761d32e784f965a8d5 +size 15984 diff --git a/checkpoint-900/rng_state_6.pth b/checkpoint-900/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..996605c16fe19ccb4add64768e7a35890431b5e4 --- /dev/null +++ b/checkpoint-900/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a06a2dfb2b1b635be1df9ea42d970b61333bd2b01e51d9e7dbf3873784b30e00 +size 15984 diff --git a/checkpoint-900/rng_state_7.pth b/checkpoint-900/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..24fcd3eb7ec7ad8e89ae99d0e36b21b1fa6b28b8 --- /dev/null +++ b/checkpoint-900/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf2b8ee4118d412fb54eaa07e2a5280ba0272c643737804cc19a5ab5c30bc01b +size 15984 diff --git a/checkpoint-900/scheduler.pt b/checkpoint-900/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..72a700f2d08b7ec27a4b1bf9093382c22731b5de --- /dev/null +++ b/checkpoint-900/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b9752ff9fe4b64d860340a91f85fd2a22d00770203412a2f6bf01b73aa0f846 +size 1064 diff --git a/checkpoint-900/trainer_state.json b/checkpoint-900/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..4bbecd276da0ff2fa4c05210b2369590beec3fa8 --- /dev/null +++ b/checkpoint-900/trainer_state.json @@ -0,0 +1,358 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.45, + "eval_steps": 100, + "global_step": 900, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0125, + "grad_norm": 0.08136817067861557, + "learning_rate": 0.0004, + "loss": 1.158, + "step": 25 + }, + { + "epoch": 0.025, + "grad_norm": 0.0653829574584961, + "learning_rate": 0.0004998852503731983, + "loss": 1.0957, + "step": 50 + }, + { + "epoch": 0.0375, + "grad_norm": 0.11592712253332138, + "learning_rate": 0.0004993848168027977, + "loss": 0.9276, + "step": 75 + }, + { + "epoch": 0.05, + "grad_norm": 0.08926476538181305, + "learning_rate": 0.0004984880506341147, + "loss": 1.0337, + "step": 100 + }, + { + "epoch": 0.05, + "eval_loss": 0.9111642837524414, + "eval_runtime": 843.8058, + "eval_samples_per_second": 1.3, + "eval_steps_per_second": 0.021, + "step": 100 + }, + { + "epoch": 0.0625, + "grad_norm": 0.08632034063339233, + "learning_rate": 0.0004971963770447935, + "loss": 1.0219, + "step": 125 + }, + { + "epoch": 0.075, + "grad_norm": 0.12949417531490326, + "learning_rate": 0.0004955118488155782, + "loss": 0.784, + "step": 150 + }, + { + "epoch": 0.0875, + "grad_norm": 0.09222520887851715, + "learning_rate": 0.0004934371430679492, + "loss": 1.0, + "step": 175 + }, + { + "epoch": 0.1, + "grad_norm": 0.10075929015874863, + "learning_rate": 0.0004909755570095319, + "loss": 0.9617, + "step": 200 + }, + { + "epoch": 0.1, + "eval_loss": 0.9380430579185486, + "eval_runtime": 856.156, + "eval_samples_per_second": 1.281, + "eval_steps_per_second": 0.021, + "step": 200 + }, + { + "epoch": 0.1125, + "grad_norm": 0.09567111730575562, + "learning_rate": 0.0004881310026940389, + "loss": 0.7051, + "step": 225 + }, + { + "epoch": 0.125, + "grad_norm": 0.09723825007677078, + "learning_rate": 0.0004849080008040734, + "loss": 0.9906, + "step": 250 + }, + { + "epoch": 0.1375, + "grad_norm": 0.09430444985628128, + "learning_rate": 0.00048131167346667446, + "loss": 0.9113, + "step": 275 + }, + { + "epoch": 0.15, + "grad_norm": 0.09439756721258163, + "learning_rate": 0.00047734773611302284, + "loss": 0.674, + "step": 300 + }, + { + "epoch": 0.15, + "eval_loss": 0.9544369578361511, + "eval_runtime": 850.9461, + "eval_samples_per_second": 1.289, + "eval_steps_per_second": 0.021, + "step": 300 + }, + { + "epoch": 0.1625, + "grad_norm": 0.10961435735225677, + "learning_rate": 0.0004730224883952422, + "loss": 0.9701, + "step": 325 + }, + { + "epoch": 0.175, + "grad_norm": 0.09628895670175552, + "learning_rate": 0.0004683428041747334, + "loss": 0.8976, + "step": 350 + }, + { + "epoch": 0.1875, + "grad_norm": 0.11085063964128494, + "learning_rate": 0.0004633161205979517, + "loss": 0.6683, + "step": 375 + }, + { + "epoch": 0.2, + "grad_norm": 0.10140710324048996, + "learning_rate": 0.0004579504262769877, + "loss": 0.9373, + "step": 400 + }, + { + "epoch": 0.2, + "eval_loss": 0.968166172504425, + "eval_runtime": 853.0258, + "eval_samples_per_second": 1.286, + "eval_steps_per_second": 0.021, + "step": 400 + }, + { + "epoch": 0.2125, + "grad_norm": 0.13868793845176697, + "learning_rate": 0.0004522542485937369, + "loss": 0.8822, + "step": 425 + }, + { + "epoch": 0.225, + "grad_norm": 0.11853040754795074, + "learning_rate": 0.00044623664014783386, + "loss": 0.6483, + "step": 450 + }, + { + "epoch": 0.2375, + "grad_norm": 0.11668186634778976, + "learning_rate": 0.00043990716436988924, + "loss": 0.9374, + "step": 475 + }, + { + "epoch": 0.25, + "grad_norm": 0.11911585181951523, + "learning_rate": 0.0004332758803228925, + "loss": 0.8434, + "step": 500 + }, + { + "epoch": 0.25, + "eval_loss": 0.9753186702728271, + "eval_runtime": 851.2812, + "eval_samples_per_second": 1.289, + "eval_steps_per_second": 0.021, + "step": 500 + }, + { + "epoch": 0.2625, + "grad_norm": 0.09807440638542175, + "learning_rate": 0.00042635332671593575, + "loss": 0.6661, + "step": 525 + }, + { + "epoch": 0.275, + "grad_norm": 0.09675773978233337, + "learning_rate": 0.00041915050515566445, + "loss": 0.8999, + "step": 550 + }, + { + "epoch": 0.2875, + "grad_norm": 0.0979737937450409, + "learning_rate": 0.00041167886266207167, + "loss": 0.8616, + "step": 575 + }, + { + "epoch": 0.3, + "grad_norm": 0.08990875631570816, + "learning_rate": 0.0004039502734764241, + "loss": 0.8167, + "step": 600 + }, + { + "epoch": 0.3, + "eval_loss": 0.9926204681396484, + "eval_runtime": 851.6484, + "eval_samples_per_second": 1.288, + "eval_steps_per_second": 0.021, + "step": 600 + }, + { + "epoch": 0.3125, + "grad_norm": 0.09505137801170349, + "learning_rate": 0.0003959770201902294, + "loss": 0.8191, + "step": 625 + }, + { + "epoch": 0.325, + "grad_norm": 0.09105059504508972, + "learning_rate": 0.0003877717742252371, + "loss": 0.7203, + "step": 650 + }, + { + "epoch": 0.3375, + "grad_norm": 0.09274734556674957, + "learning_rate": 0.00037934757569549495, + "loss": 0.8154, + "step": 675 + }, + { + "epoch": 0.35, + "grad_norm": 0.10026416182518005, + "learning_rate": 0.00037071781268346345, + "loss": 0.8336, + "step": 700 + }, + { + "epoch": 0.35, + "eval_loss": 0.9959912896156311, + "eval_runtime": 849.7662, + "eval_samples_per_second": 1.291, + "eval_steps_per_second": 0.021, + "step": 700 + }, + { + "epoch": 0.3625, + "grad_norm": 0.09517039358615875, + "learning_rate": 0.00036189619996312495, + "loss": 0.6845, + "step": 725 + }, + { + "epoch": 0.375, + "grad_norm": 0.11264201998710632, + "learning_rate": 0.00035289675720390174, + "loss": 0.8445, + "step": 750 + }, + { + "epoch": 0.3875, + "grad_norm": 0.0979958102107048, + "learning_rate": 0.00034373378669002105, + "loss": 0.8269, + "step": 775 + }, + { + "epoch": 0.4, + "grad_norm": 0.10223093628883362, + "learning_rate": 0.00033442185059073706, + "loss": 0.6517, + "step": 800 + }, + { + "epoch": 0.4, + "eval_loss": 1.006990671157837, + "eval_runtime": 845.1184, + "eval_samples_per_second": 1.298, + "eval_steps_per_second": 0.021, + "step": 800 + }, + { + "epoch": 0.4125, + "grad_norm": 0.09464213252067566, + "learning_rate": 0.00032497574781753367, + "loss": 0.8455, + "step": 825 + }, + { + "epoch": 0.425, + "grad_norm": 0.10187377035617828, + "learning_rate": 0.000315410490505086, + "loss": 0.8217, + "step": 850 + }, + { + "epoch": 0.4375, + "grad_norm": 0.09793733805418015, + "learning_rate": 0.0003057412801533589, + "loss": 0.6218, + "step": 875 + }, + { + "epoch": 0.45, + "grad_norm": 0.09902466833591461, + "learning_rate": 0.0002959834834687587, + "loss": 0.869, + "step": 900 + }, + { + "epoch": 0.45, + "eval_loss": 1.0097707509994507, + "eval_runtime": 847.3981, + "eval_samples_per_second": 1.295, + "eval_steps_per_second": 0.021, + "step": 900 + } + ], + "logging_steps": 25, + "max_steps": 2000, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 9.04659656025047e+16, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}