CocoRoF commited on
Commit
739899b
·
verified ·
1 Parent(s): 356225d

Training in progress, step 12000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f780082d28637030284ffc574043a34ddaa98ada59f8a3be1bfdb021c71c2bad
3
  size 737582948
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b198e43aac6859985d7c9cb18c2860594033b256136cc0b0e915d584614c895c
3
  size 737582948
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e671142c88647d06eb6e0a94f1a24408c862ffed004bb1ca30f0573a2d1c7012
3
  size 1475256250
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:64ec40cbe8543eb2855a915aee21dd1f77e088ec666a079a32133adde9da7af4
3
  size 1475256250
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dc0e9f2e59db7655969dde1769cc75a30dad30f3fb3535bb1fb168c359c23919
3
- size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ae238f666763a7993ec652c03f60677cb3de9003ea7ee1bc1dac41c2065a9c25
3
+ size 15920
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cd453d78ea5ca1a82b6f584465a57ad489bfab9616cbdb9a319d2dab9ab08613
3
  size 15920
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d5ab92b6e335feba5c54de89db3c87b707994c34e8ae94b68ceaf1c0e44c4698
3
  size 15920
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4f12a9c4c1465a704b7b3eaf4aa2d8035411567425c5ecce8b3c4ae7135320de
3
  size 15920
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2632a9a94c203af7029ed1ba1b5fb0c1a8126e97bbd443fb5dba117f62e54913
3
  size 15920
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:239b5759365c936b114c9dcc21e7ab09914f3a9a4c108de0bdb1302b0f35c2e7
3
- size 15920
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:add85b850cbfe5b8cf5c4f2e6f71a61a7d77d12000e589671d2903fa92c8b4c3
3
+ size 15984
last-checkpoint/rng_state_4.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8ca26abb54fe1044979e1329080f72484b4242cf1804164290d4589e9fdbb36c
3
- size 15920
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7645a2766e30c501c310ca2b1baf3bd1106ec431388b54ca1a7f2f6cc5531dbd
3
+ size 15984
last-checkpoint/rng_state_5.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c8981e55cd625904ad1dee349576a81728c47048cacc714333032c180a5a297e
3
- size 15920
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cad911d83e601a147b8872de9ba34bade0b9837051abcd270f992115bb282348
3
+ size 15984
last-checkpoint/rng_state_6.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:65a2dd7e2640e0c36b079249689491f8727a8030baf600b1fd43563b4bdc8180
3
- size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:36c25b50d403f4b10a416c3c4294b21a8f3a8f0d8b348d5a613cd951ffd7b66c
3
+ size 15920
last-checkpoint/rng_state_7.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:735fc97734689aa61f8105f95f09470f72d267ae593ce19591b5e87bb4d95bbc
3
- size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f4aa3be7aba10932fe3b181dbc7c647b64be83ff98de84fe2b9cd6b26e86aafe
3
+ size 15920
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:79f4d900a561875f28344d42b001a83b1900f32917ac0099a98da66cf1c47ef8
3
  size 1000
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4297fd04c7ed2579ce63e17f4b5a76a418be4b988ee50b810797fa07318b7ac1
3
  size 1000
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 5.598259319957358,
5
  "eval_steps": 500,
6
- "global_step": 11000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -7707,6 +7707,706 @@
7707
  "learning_rate": 4.98437306883095e-05,
7708
  "loss": 0.682,
7709
  "step": 11000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7710
  }
7711
  ],
7712
  "logging_steps": 10,
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 6.106922942290251,
5
  "eval_steps": 500,
6
+ "global_step": 12000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
7707
  "learning_rate": 4.98437306883095e-05,
7708
  "loss": 0.682,
7709
  "step": 11000
7710
+ },
7711
+ {
7712
+ "epoch": 5.603350888637847,
7713
+ "grad_norm": 0.10835061222314835,
7714
+ "learning_rate": 4.9843588625298863e-05,
7715
+ "loss": 0.6829,
7716
+ "step": 11010
7717
+ },
7718
+ {
7719
+ "epoch": 5.6084424573183345,
7720
+ "grad_norm": 0.1209336370229721,
7721
+ "learning_rate": 4.9843446562288236e-05,
7722
+ "loss": 0.6808,
7723
+ "step": 11020
7724
+ },
7725
+ {
7726
+ "epoch": 5.613534025998822,
7727
+ "grad_norm": 0.12438962608575821,
7728
+ "learning_rate": 4.984330449927761e-05,
7729
+ "loss": 0.6768,
7730
+ "step": 11030
7731
+ },
7732
+ {
7733
+ "epoch": 5.618625594679311,
7734
+ "grad_norm": 0.1364268809556961,
7735
+ "learning_rate": 4.984316243626698e-05,
7736
+ "loss": 0.6781,
7737
+ "step": 11040
7738
+ },
7739
+ {
7740
+ "epoch": 5.623717163359799,
7741
+ "grad_norm": 0.11569849401712418,
7742
+ "learning_rate": 4.9843020373256356e-05,
7743
+ "loss": 0.6825,
7744
+ "step": 11050
7745
+ },
7746
+ {
7747
+ "epoch": 5.628808732040287,
7748
+ "grad_norm": 0.10072596371173859,
7749
+ "learning_rate": 4.984287831024573e-05,
7750
+ "loss": 0.6764,
7751
+ "step": 11060
7752
+ },
7753
+ {
7754
+ "epoch": 5.633900300720775,
7755
+ "grad_norm": 0.15180449187755585,
7756
+ "learning_rate": 4.98427362472351e-05,
7757
+ "loss": 0.6782,
7758
+ "step": 11070
7759
+ },
7760
+ {
7761
+ "epoch": 5.638991869401263,
7762
+ "grad_norm": 0.14204277098178864,
7763
+ "learning_rate": 4.9842594184224475e-05,
7764
+ "loss": 0.6806,
7765
+ "step": 11080
7766
+ },
7767
+ {
7768
+ "epoch": 5.6440834380817515,
7769
+ "grad_norm": 0.12409929186105728,
7770
+ "learning_rate": 4.984245212121385e-05,
7771
+ "loss": 0.6806,
7772
+ "step": 11090
7773
+ },
7774
+ {
7775
+ "epoch": 5.649175006762239,
7776
+ "grad_norm": 0.1692194640636444,
7777
+ "learning_rate": 4.9842310058203215e-05,
7778
+ "loss": 0.6723,
7779
+ "step": 11100
7780
+ },
7781
+ {
7782
+ "epoch": 5.654266575442728,
7783
+ "grad_norm": 0.2566402852535248,
7784
+ "learning_rate": 4.984216799519259e-05,
7785
+ "loss": 0.6845,
7786
+ "step": 11110
7787
+ },
7788
+ {
7789
+ "epoch": 5.659358144123216,
7790
+ "grad_norm": 0.13745322823524475,
7791
+ "learning_rate": 4.984202593218196e-05,
7792
+ "loss": 0.6748,
7793
+ "step": 11120
7794
+ },
7795
+ {
7796
+ "epoch": 5.664449712803704,
7797
+ "grad_norm": 0.16598811745643616,
7798
+ "learning_rate": 4.9841883869171334e-05,
7799
+ "loss": 0.6798,
7800
+ "step": 11130
7801
+ },
7802
+ {
7803
+ "epoch": 5.669541281484192,
7804
+ "grad_norm": 0.13570183515548706,
7805
+ "learning_rate": 4.984174180616071e-05,
7806
+ "loss": 0.6797,
7807
+ "step": 11140
7808
+ },
7809
+ {
7810
+ "epoch": 5.674632850164681,
7811
+ "grad_norm": 0.17549622058868408,
7812
+ "learning_rate": 4.984159974315008e-05,
7813
+ "loss": 0.6773,
7814
+ "step": 11150
7815
+ },
7816
+ {
7817
+ "epoch": 5.6797244188451685,
7818
+ "grad_norm": 0.15479332208633423,
7819
+ "learning_rate": 4.984145768013945e-05,
7820
+ "loss": 0.6795,
7821
+ "step": 11160
7822
+ },
7823
+ {
7824
+ "epoch": 5.684815987525656,
7825
+ "grad_norm": 0.1562296450138092,
7826
+ "learning_rate": 4.9841315617128826e-05,
7827
+ "loss": 0.6803,
7828
+ "step": 11170
7829
+ },
7830
+ {
7831
+ "epoch": 5.689907556206145,
7832
+ "grad_norm": 0.13014480471611023,
7833
+ "learning_rate": 4.98411735541182e-05,
7834
+ "loss": 0.6793,
7835
+ "step": 11180
7836
+ },
7837
+ {
7838
+ "epoch": 5.694999124886633,
7839
+ "grad_norm": 0.1577223241329193,
7840
+ "learning_rate": 4.984103149110757e-05,
7841
+ "loss": 0.6845,
7842
+ "step": 11190
7843
+ },
7844
+ {
7845
+ "epoch": 5.700090693567121,
7846
+ "grad_norm": 0.14906632900238037,
7847
+ "learning_rate": 4.9840889428096946e-05,
7848
+ "loss": 0.6771,
7849
+ "step": 11200
7850
+ },
7851
+ {
7852
+ "epoch": 5.705182262247609,
7853
+ "grad_norm": 0.15042632818222046,
7854
+ "learning_rate": 4.984074736508632e-05,
7855
+ "loss": 0.6737,
7856
+ "step": 11210
7857
+ },
7858
+ {
7859
+ "epoch": 5.710273830928098,
7860
+ "grad_norm": 0.1530093252658844,
7861
+ "learning_rate": 4.9840605302075685e-05,
7862
+ "loss": 0.6804,
7863
+ "step": 11220
7864
+ },
7865
+ {
7866
+ "epoch": 5.715365399608586,
7867
+ "grad_norm": 0.18300846219062805,
7868
+ "learning_rate": 4.984046323906506e-05,
7869
+ "loss": 0.6752,
7870
+ "step": 11230
7871
+ },
7872
+ {
7873
+ "epoch": 5.720456968289074,
7874
+ "grad_norm": 0.14398545026779175,
7875
+ "learning_rate": 4.9840321176054424e-05,
7876
+ "loss": 0.6793,
7877
+ "step": 11240
7878
+ },
7879
+ {
7880
+ "epoch": 5.725548536969562,
7881
+ "grad_norm": 0.12745435535907745,
7882
+ "learning_rate": 4.98401791130438e-05,
7883
+ "loss": 0.6765,
7884
+ "step": 11250
7885
+ },
7886
+ {
7887
+ "epoch": 5.73064010565005,
7888
+ "grad_norm": 0.15162277221679688,
7889
+ "learning_rate": 4.984003705003317e-05,
7890
+ "loss": 0.6744,
7891
+ "step": 11260
7892
+ },
7893
+ {
7894
+ "epoch": 5.735731674330538,
7895
+ "grad_norm": 0.12970998883247375,
7896
+ "learning_rate": 4.9839894987022544e-05,
7897
+ "loss": 0.6818,
7898
+ "step": 11270
7899
+ },
7900
+ {
7901
+ "epoch": 5.740823243011026,
7902
+ "grad_norm": 0.1195228323340416,
7903
+ "learning_rate": 4.983975292401192e-05,
7904
+ "loss": 0.6749,
7905
+ "step": 11280
7906
+ },
7907
+ {
7908
+ "epoch": 5.745914811691515,
7909
+ "grad_norm": 0.14821238815784454,
7910
+ "learning_rate": 4.983961086100129e-05,
7911
+ "loss": 0.6759,
7912
+ "step": 11290
7913
+ },
7914
+ {
7915
+ "epoch": 5.751006380372003,
7916
+ "grad_norm": 0.18345175683498383,
7917
+ "learning_rate": 4.983946879799066e-05,
7918
+ "loss": 0.6736,
7919
+ "step": 11300
7920
+ },
7921
+ {
7922
+ "epoch": 5.75609794905249,
7923
+ "grad_norm": 0.14165613055229187,
7924
+ "learning_rate": 4.9839326734980036e-05,
7925
+ "loss": 0.6777,
7926
+ "step": 11310
7927
+ },
7928
+ {
7929
+ "epoch": 5.761189517732979,
7930
+ "grad_norm": 0.16045770049095154,
7931
+ "learning_rate": 4.983918467196941e-05,
7932
+ "loss": 0.678,
7933
+ "step": 11320
7934
+ },
7935
+ {
7936
+ "epoch": 5.766281086413467,
7937
+ "grad_norm": 0.1490974873304367,
7938
+ "learning_rate": 4.983904260895878e-05,
7939
+ "loss": 0.68,
7940
+ "step": 11330
7941
+ },
7942
+ {
7943
+ "epoch": 5.7713726550939555,
7944
+ "grad_norm": 0.11064887046813965,
7945
+ "learning_rate": 4.9838900545948156e-05,
7946
+ "loss": 0.6832,
7947
+ "step": 11340
7948
+ },
7949
+ {
7950
+ "epoch": 5.776464223774443,
7951
+ "grad_norm": 0.11848734319210052,
7952
+ "learning_rate": 4.983875848293753e-05,
7953
+ "loss": 0.6792,
7954
+ "step": 11350
7955
+ },
7956
+ {
7957
+ "epoch": 5.781555792454932,
7958
+ "grad_norm": 0.1246313750743866,
7959
+ "learning_rate": 4.9838616419926895e-05,
7960
+ "loss": 0.6794,
7961
+ "step": 11360
7962
+ },
7963
+ {
7964
+ "epoch": 5.78664736113542,
7965
+ "grad_norm": 0.17359575629234314,
7966
+ "learning_rate": 4.983847435691627e-05,
7967
+ "loss": 0.6762,
7968
+ "step": 11370
7969
+ },
7970
+ {
7971
+ "epoch": 5.791738929815908,
7972
+ "grad_norm": 0.16471154987812042,
7973
+ "learning_rate": 4.983833229390564e-05,
7974
+ "loss": 0.6742,
7975
+ "step": 11380
7976
+ },
7977
+ {
7978
+ "epoch": 5.796830498496396,
7979
+ "grad_norm": 0.1479930430650711,
7980
+ "learning_rate": 4.9838190230895014e-05,
7981
+ "loss": 0.678,
7982
+ "step": 11390
7983
+ },
7984
+ {
7985
+ "epoch": 5.801922067176884,
7986
+ "grad_norm": 0.11385341733694077,
7987
+ "learning_rate": 4.983804816788439e-05,
7988
+ "loss": 0.6791,
7989
+ "step": 11400
7990
+ },
7991
+ {
7992
+ "epoch": 5.8070136358573725,
7993
+ "grad_norm": 0.13574256002902985,
7994
+ "learning_rate": 4.983790610487376e-05,
7995
+ "loss": 0.6795,
7996
+ "step": 11410
7997
+ },
7998
+ {
7999
+ "epoch": 5.81210520453786,
8000
+ "grad_norm": 0.1701575517654419,
8001
+ "learning_rate": 4.9837764041863134e-05,
8002
+ "loss": 0.6791,
8003
+ "step": 11420
8004
+ },
8005
+ {
8006
+ "epoch": 5.817196773218349,
8007
+ "grad_norm": 0.11972179263830185,
8008
+ "learning_rate": 4.98376219788525e-05,
8009
+ "loss": 0.6802,
8010
+ "step": 11430
8011
+ },
8012
+ {
8013
+ "epoch": 5.822288341898837,
8014
+ "grad_norm": 0.15830230712890625,
8015
+ "learning_rate": 4.983747991584187e-05,
8016
+ "loss": 0.6761,
8017
+ "step": 11440
8018
+ },
8019
+ {
8020
+ "epoch": 5.827379910579325,
8021
+ "grad_norm": 0.16592001914978027,
8022
+ "learning_rate": 4.9837337852831246e-05,
8023
+ "loss": 0.6768,
8024
+ "step": 11450
8025
+ },
8026
+ {
8027
+ "epoch": 5.832471479259813,
8028
+ "grad_norm": 0.21496979892253876,
8029
+ "learning_rate": 4.983719578982062e-05,
8030
+ "loss": 0.6783,
8031
+ "step": 11460
8032
+ },
8033
+ {
8034
+ "epoch": 5.837563047940302,
8035
+ "grad_norm": 0.14850680530071259,
8036
+ "learning_rate": 4.983705372680999e-05,
8037
+ "loss": 0.6781,
8038
+ "step": 11470
8039
+ },
8040
+ {
8041
+ "epoch": 5.8426546166207896,
8042
+ "grad_norm": 0.12256158143281937,
8043
+ "learning_rate": 4.9836911663799365e-05,
8044
+ "loss": 0.6776,
8045
+ "step": 11480
8046
+ },
8047
+ {
8048
+ "epoch": 5.847746185301277,
8049
+ "grad_norm": 0.14311592280864716,
8050
+ "learning_rate": 4.983676960078874e-05,
8051
+ "loss": 0.6717,
8052
+ "step": 11490
8053
+ },
8054
+ {
8055
+ "epoch": 5.852837753981766,
8056
+ "grad_norm": 0.1648699939250946,
8057
+ "learning_rate": 4.9836627537778105e-05,
8058
+ "loss": 0.6779,
8059
+ "step": 11500
8060
+ },
8061
+ {
8062
+ "epoch": 5.857929322662254,
8063
+ "grad_norm": 0.13590501248836517,
8064
+ "learning_rate": 4.983648547476748e-05,
8065
+ "loss": 0.6824,
8066
+ "step": 11510
8067
+ },
8068
+ {
8069
+ "epoch": 5.863020891342742,
8070
+ "grad_norm": 0.13972793519496918,
8071
+ "learning_rate": 4.983634341175685e-05,
8072
+ "loss": 0.679,
8073
+ "step": 11520
8074
+ },
8075
+ {
8076
+ "epoch": 5.86811246002323,
8077
+ "grad_norm": 0.11360618472099304,
8078
+ "learning_rate": 4.9836201348746224e-05,
8079
+ "loss": 0.6746,
8080
+ "step": 11530
8081
+ },
8082
+ {
8083
+ "epoch": 5.873204028703718,
8084
+ "grad_norm": 0.14063167572021484,
8085
+ "learning_rate": 4.98360592857356e-05,
8086
+ "loss": 0.6818,
8087
+ "step": 11540
8088
+ },
8089
+ {
8090
+ "epoch": 5.878295597384207,
8091
+ "grad_norm": 0.12393573671579361,
8092
+ "learning_rate": 4.983591722272497e-05,
8093
+ "loss": 0.6771,
8094
+ "step": 11550
8095
+ },
8096
+ {
8097
+ "epoch": 5.883387166064694,
8098
+ "grad_norm": 0.12383928149938583,
8099
+ "learning_rate": 4.9835775159714344e-05,
8100
+ "loss": 0.6807,
8101
+ "step": 11560
8102
+ },
8103
+ {
8104
+ "epoch": 5.888478734745183,
8105
+ "grad_norm": 0.11464569717645645,
8106
+ "learning_rate": 4.983563309670372e-05,
8107
+ "loss": 0.6823,
8108
+ "step": 11570
8109
+ },
8110
+ {
8111
+ "epoch": 5.893570303425671,
8112
+ "grad_norm": 0.15896569192409515,
8113
+ "learning_rate": 4.983549103369309e-05,
8114
+ "loss": 0.678,
8115
+ "step": 11580
8116
+ },
8117
+ {
8118
+ "epoch": 5.898661872106159,
8119
+ "grad_norm": 0.11153749376535416,
8120
+ "learning_rate": 4.983534897068246e-05,
8121
+ "loss": 0.6799,
8122
+ "step": 11590
8123
+ },
8124
+ {
8125
+ "epoch": 5.903753440786647,
8126
+ "grad_norm": 0.13557817041873932,
8127
+ "learning_rate": 4.9835206907671836e-05,
8128
+ "loss": 0.678,
8129
+ "step": 11600
8130
+ },
8131
+ {
8132
+ "epoch": 5.908845009467136,
8133
+ "grad_norm": 0.12681804597377777,
8134
+ "learning_rate": 4.98350648446612e-05,
8135
+ "loss": 0.6853,
8136
+ "step": 11610
8137
+ },
8138
+ {
8139
+ "epoch": 5.913936578147624,
8140
+ "grad_norm": 0.11007581651210785,
8141
+ "learning_rate": 4.9834922781650575e-05,
8142
+ "loss": 0.6799,
8143
+ "step": 11620
8144
+ },
8145
+ {
8146
+ "epoch": 5.919028146828111,
8147
+ "grad_norm": 0.14073921740055084,
8148
+ "learning_rate": 4.983478071863995e-05,
8149
+ "loss": 0.6809,
8150
+ "step": 11630
8151
+ },
8152
+ {
8153
+ "epoch": 5.9241197155086,
8154
+ "grad_norm": 0.17294389009475708,
8155
+ "learning_rate": 4.9834638655629315e-05,
8156
+ "loss": 0.677,
8157
+ "step": 11640
8158
+ },
8159
+ {
8160
+ "epoch": 5.929211284189088,
8161
+ "grad_norm": 0.11901852488517761,
8162
+ "learning_rate": 4.983449659261869e-05,
8163
+ "loss": 0.6814,
8164
+ "step": 11650
8165
+ },
8166
+ {
8167
+ "epoch": 5.9343028528695765,
8168
+ "grad_norm": 0.1563209444284439,
8169
+ "learning_rate": 4.983435452960806e-05,
8170
+ "loss": 0.6803,
8171
+ "step": 11660
8172
+ },
8173
+ {
8174
+ "epoch": 5.939394421550064,
8175
+ "grad_norm": 0.1763051152229309,
8176
+ "learning_rate": 4.9834212466597434e-05,
8177
+ "loss": 0.6713,
8178
+ "step": 11670
8179
+ },
8180
+ {
8181
+ "epoch": 5.944485990230553,
8182
+ "grad_norm": 0.1412787139415741,
8183
+ "learning_rate": 4.983407040358681e-05,
8184
+ "loss": 0.6791,
8185
+ "step": 11680
8186
+ },
8187
+ {
8188
+ "epoch": 5.949577558911041,
8189
+ "grad_norm": 0.13946793973445892,
8190
+ "learning_rate": 4.983392834057618e-05,
8191
+ "loss": 0.674,
8192
+ "step": 11690
8193
+ },
8194
+ {
8195
+ "epoch": 5.954669127591529,
8196
+ "grad_norm": 0.1848699301481247,
8197
+ "learning_rate": 4.9833786277565553e-05,
8198
+ "loss": 0.6785,
8199
+ "step": 11700
8200
+ },
8201
+ {
8202
+ "epoch": 5.959760696272017,
8203
+ "grad_norm": 0.14714594185352325,
8204
+ "learning_rate": 4.9833644214554927e-05,
8205
+ "loss": 0.6764,
8206
+ "step": 11710
8207
+ },
8208
+ {
8209
+ "epoch": 5.964852264952505,
8210
+ "grad_norm": 0.14410807192325592,
8211
+ "learning_rate": 4.98335021515443e-05,
8212
+ "loss": 0.6755,
8213
+ "step": 11720
8214
+ },
8215
+ {
8216
+ "epoch": 5.9699438336329935,
8217
+ "grad_norm": 0.11196265369653702,
8218
+ "learning_rate": 4.983336008853367e-05,
8219
+ "loss": 0.6801,
8220
+ "step": 11730
8221
+ },
8222
+ {
8223
+ "epoch": 5.975035402313481,
8224
+ "grad_norm": 0.14931631088256836,
8225
+ "learning_rate": 4.9833218025523046e-05,
8226
+ "loss": 0.6761,
8227
+ "step": 11740
8228
+ },
8229
+ {
8230
+ "epoch": 5.98012697099397,
8231
+ "grad_norm": 0.1235998123884201,
8232
+ "learning_rate": 4.983307596251241e-05,
8233
+ "loss": 0.6816,
8234
+ "step": 11750
8235
+ },
8236
+ {
8237
+ "epoch": 5.985218539674458,
8238
+ "grad_norm": 0.14235694706439972,
8239
+ "learning_rate": 4.9832933899501785e-05,
8240
+ "loss": 0.6784,
8241
+ "step": 11760
8242
+ },
8243
+ {
8244
+ "epoch": 5.9903101083549455,
8245
+ "grad_norm": 0.11291839182376862,
8246
+ "learning_rate": 4.983279183649116e-05,
8247
+ "loss": 0.6857,
8248
+ "step": 11770
8249
+ },
8250
+ {
8251
+ "epoch": 5.995401677035434,
8252
+ "grad_norm": 0.12273520231246948,
8253
+ "learning_rate": 4.983264977348053e-05,
8254
+ "loss": 0.6801,
8255
+ "step": 11780
8256
+ },
8257
+ {
8258
+ "epoch": 6.0,
8259
+ "grad_norm": 0.025783156976103783,
8260
+ "learning_rate": 4.9832507710469905e-05,
8261
+ "loss": 0.6142,
8262
+ "step": 11790
8263
+ },
8264
+ {
8265
+ "epoch": 6.005091568680488,
8266
+ "grad_norm": 0.1227310448884964,
8267
+ "learning_rate": 4.983236564745928e-05,
8268
+ "loss": 0.679,
8269
+ "step": 11800
8270
+ },
8271
+ {
8272
+ "epoch": 6.010183137360976,
8273
+ "grad_norm": 0.14122678339481354,
8274
+ "learning_rate": 4.983222358444865e-05,
8275
+ "loss": 0.677,
8276
+ "step": 11810
8277
+ },
8278
+ {
8279
+ "epoch": 6.015274706041464,
8280
+ "grad_norm": 0.14405541121959686,
8281
+ "learning_rate": 4.9832081521438024e-05,
8282
+ "loss": 0.6799,
8283
+ "step": 11820
8284
+ },
8285
+ {
8286
+ "epoch": 6.020366274721953,
8287
+ "grad_norm": 0.18694424629211426,
8288
+ "learning_rate": 4.98319394584274e-05,
8289
+ "loss": 0.675,
8290
+ "step": 11830
8291
+ },
8292
+ {
8293
+ "epoch": 6.025457843402441,
8294
+ "grad_norm": 0.1961718052625656,
8295
+ "learning_rate": 4.983179739541677e-05,
8296
+ "loss": 0.6819,
8297
+ "step": 11840
8298
+ },
8299
+ {
8300
+ "epoch": 6.030549412082929,
8301
+ "grad_norm": 0.1102224811911583,
8302
+ "learning_rate": 4.9831655332406137e-05,
8303
+ "loss": 0.682,
8304
+ "step": 11850
8305
+ },
8306
+ {
8307
+ "epoch": 6.035640980763417,
8308
+ "grad_norm": 0.1295260190963745,
8309
+ "learning_rate": 4.983151326939551e-05,
8310
+ "loss": 0.6794,
8311
+ "step": 11860
8312
+ },
8313
+ {
8314
+ "epoch": 6.040732549443905,
8315
+ "grad_norm": 0.12580661475658417,
8316
+ "learning_rate": 4.983137120638488e-05,
8317
+ "loss": 0.6791,
8318
+ "step": 11870
8319
+ },
8320
+ {
8321
+ "epoch": 6.0458241181243935,
8322
+ "grad_norm": 0.1288338154554367,
8323
+ "learning_rate": 4.9831229143374256e-05,
8324
+ "loss": 0.6805,
8325
+ "step": 11880
8326
+ },
8327
+ {
8328
+ "epoch": 6.050915686804881,
8329
+ "grad_norm": 0.1211671456694603,
8330
+ "learning_rate": 4.983108708036362e-05,
8331
+ "loss": 0.6764,
8332
+ "step": 11890
8333
+ },
8334
+ {
8335
+ "epoch": 6.05600725548537,
8336
+ "grad_norm": 0.15219536423683167,
8337
+ "learning_rate": 4.9830945017352995e-05,
8338
+ "loss": 0.6806,
8339
+ "step": 11900
8340
+ },
8341
+ {
8342
+ "epoch": 6.061098824165858,
8343
+ "grad_norm": 0.12759484350681305,
8344
+ "learning_rate": 4.983080295434237e-05,
8345
+ "loss": 0.676,
8346
+ "step": 11910
8347
+ },
8348
+ {
8349
+ "epoch": 6.066190392846346,
8350
+ "grad_norm": 0.1949695497751236,
8351
+ "learning_rate": 4.983066089133174e-05,
8352
+ "loss": 0.6832,
8353
+ "step": 11920
8354
+ },
8355
+ {
8356
+ "epoch": 6.071281961526834,
8357
+ "grad_norm": 0.11879277229309082,
8358
+ "learning_rate": 4.9830518828321115e-05,
8359
+ "loss": 0.6781,
8360
+ "step": 11930
8361
+ },
8362
+ {
8363
+ "epoch": 6.076373530207323,
8364
+ "grad_norm": 0.12636293470859528,
8365
+ "learning_rate": 4.983037676531049e-05,
8366
+ "loss": 0.6774,
8367
+ "step": 11940
8368
+ },
8369
+ {
8370
+ "epoch": 6.0814650988878105,
8371
+ "grad_norm": 0.13675157725811005,
8372
+ "learning_rate": 4.983023470229986e-05,
8373
+ "loss": 0.6789,
8374
+ "step": 11950
8375
+ },
8376
+ {
8377
+ "epoch": 6.086556667568298,
8378
+ "grad_norm": 0.13322140276432037,
8379
+ "learning_rate": 4.9830092639289234e-05,
8380
+ "loss": 0.6805,
8381
+ "step": 11960
8382
+ },
8383
+ {
8384
+ "epoch": 6.091648236248787,
8385
+ "grad_norm": 0.1352871060371399,
8386
+ "learning_rate": 4.982995057627861e-05,
8387
+ "loss": 0.6808,
8388
+ "step": 11970
8389
+ },
8390
+ {
8391
+ "epoch": 6.096739804929275,
8392
+ "grad_norm": 0.14976170659065247,
8393
+ "learning_rate": 4.982980851326798e-05,
8394
+ "loss": 0.6775,
8395
+ "step": 11980
8396
+ },
8397
+ {
8398
+ "epoch": 6.101831373609763,
8399
+ "grad_norm": 0.1250462532043457,
8400
+ "learning_rate": 4.982966645025735e-05,
8401
+ "loss": 0.6782,
8402
+ "step": 11990
8403
+ },
8404
+ {
8405
+ "epoch": 6.106922942290251,
8406
+ "grad_norm": 0.16815803945064545,
8407
+ "learning_rate": 4.9829524387246726e-05,
8408
+ "loss": 0.6721,
8409
+ "step": 12000
8410
  }
8411
  ],
8412
  "logging_steps": 10,