FormlessAI commited on
Commit
b2bceef
·
verified ·
1 Parent(s): 48e8e93

Training in progress, epoch 0, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a64a873d45775ddccac9ee78b1e12b16fcdd62971997377c3dd5ad690bf41a6c
3
  size 25191728
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:24d72ee90d76c81674597dcb0e10380b2068c60d3b91312ddf40281fe3f2f688
3
  size 25191728
last-checkpoint/global_step400/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c2daa77467a73cd9bb82b5b385ef34b551e3cb167da2550ee87b22916884e4fe
3
+ size 18984805
last-checkpoint/global_step400/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fcd67a8bdd2a3b3c9528fd0a1df9b021756ef46f33a1058534ba538677db0b27
3
+ size 18984805
last-checkpoint/global_step400/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0f5929d75f4d7120a1ebc3928292823618ea6f2f4e243da19cf25438f4031742
3
+ size 18984869
last-checkpoint/global_step400/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:60a21ecd251023e210801f80c09dcaea5751e75e23dc42a70014e47a5e4fffd1
3
+ size 18984869
last-checkpoint/global_step400/mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b96ec202e5dcc71bec9fc6ea6262560febf6e29bba0eae4307889cfc9940cb17
3
+ size 336491749
last-checkpoint/latest CHANGED
@@ -1 +1 @@
1
- global_step300
 
1
+ global_step400
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fe93893993c87c24f0d46f15fac0c7a3d8656e4fe0690c462da269c161456e0a
3
  size 15429
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3d65adc402fa2fe4f38fc2f77d5dca31cd82c7b022b2f8c04bcc271f15359315
3
  size 15429
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5c9e16fcf178aa5d16066ea40cd1adee1df67c4ea116c9e16f45572b01ca6d8a
3
  size 15365
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b96211db15079a80447ea224dd8d490fdb719f9c4f49a9d525e71e12040e35c9
3
  size 15365
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:45de998578ebb52844d84bdab92f735bd86ef126f0e856c874215a54e72eca5f
3
  size 15429
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d532cd2148ca59776629287179e81f4f180f4fd6a9868d52549ebf5b60992e2f
3
  size 15429
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5bb0e5c5b968031cb67c5fdf83c725ba288977a950e2bb2b6f8e63b3d8fe3336
3
  size 15429
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d5eb0ba76503d7ba24e635fd14190b4003414c2cdd868c6428e425fe7fa3d3b6
3
  size 15429
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:738f1bd52a0e5e3570eb826d97ec615ac0a30012cc827a5a3725e6329285a9f5
3
  size 1401
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:20868968ee1625b70346118dc2111e977bbac63a42d7a79aa9441567ffaca394
3
  size 1401
last-checkpoint/trainer_state.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
  "best_global_step": null,
3
- "best_metric": 0.6344618797302246,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.09894459102902374,
6
  "eval_steps": 100,
7
- "global_step": 300,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -1772,11 +1772,599 @@
1772
  "eval_samples_per_second": 5.619,
1773
  "eval_steps_per_second": 0.351,
1774
  "step": 300
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1775
  }
1776
  ],
1777
  "logging_steps": 5,
1778
  "max_steps": 1000,
1779
- "num_input_tokens_seen": 796625,
1780
  "num_train_epochs": 1,
1781
  "save_steps": 100,
1782
  "stateful_callbacks": {
 
1
  {
2
  "best_global_step": null,
3
+ "best_metric": 0.8869044184684753,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.13192612137203166,
6
  "eval_steps": 100,
7
+ "global_step": 400,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
1772
  "eval_samples_per_second": 5.619,
1773
  "eval_steps_per_second": 0.351,
1774
  "step": 300
1775
+ },
1776
+ {
1777
+ "clip_ratio/high_max": 0.0,
1778
+ "clip_ratio/high_mean": 0.0,
1779
+ "clip_ratio/low_mean": 0.0,
1780
+ "clip_ratio/low_min": 0.0,
1781
+ "clip_ratio/region_mean": 0.0,
1782
+ "completions/clipped_ratio": 0.3,
1783
+ "completions/max_length": 32.0,
1784
+ "completions/max_terminated_length": 20.9,
1785
+ "completions/mean_length": 11.653125,
1786
+ "completions/mean_terminated_length": 2.9934032917022706,
1787
+ "completions/min_length": 1.0,
1788
+ "completions/min_terminated_length": 1.0,
1789
+ "epoch": 0.10059366754617415,
1790
+ "grad_norm": 2.035731077194214,
1791
+ "kl": 9.2609375,
1792
+ "learning_rate": 0.00017569950556517566,
1793
+ "loss": 0.767,
1794
+ "num_tokens": 809213.0,
1795
+ "reward": 37.08262882232666,
1796
+ "reward_std": 9.928300952911377,
1797
+ "rewards/conciseness_reward/mean": 7.599223709106445,
1798
+ "rewards/conciseness_reward/std": 3.068327784538269,
1799
+ "rewards/reward_func_correct_answer/mean": 0.0,
1800
+ "rewards/reward_func_correct_answer/std": 0.0,
1801
+ "rewards/reward_func_keywords/mean": 0.0,
1802
+ "rewards/reward_func_keywords/std": 0.0,
1803
+ "step": 305
1804
+ },
1805
+ {
1806
+ "clip_ratio/high_max": 0.0,
1807
+ "clip_ratio/high_mean": 0.0,
1808
+ "clip_ratio/low_mean": 0.0,
1809
+ "clip_ratio/low_min": 0.0,
1810
+ "clip_ratio/region_mean": 0.0,
1811
+ "completions/clipped_ratio": 0.25625,
1812
+ "completions/max_length": 32.0,
1813
+ "completions/max_terminated_length": 20.6,
1814
+ "completions/mean_length": 10.4125,
1815
+ "completions/mean_terminated_length": 3.0596010208129885,
1816
+ "completions/min_length": 1.0,
1817
+ "completions/min_terminated_length": 1.0,
1818
+ "epoch": 0.10224274406332454,
1819
+ "grad_norm": 2.1636366844177246,
1820
+ "kl": 8.353125,
1821
+ "learning_rate": 0.00017454759996828623,
1822
+ "loss": 0.7837,
1823
+ "num_tokens": 821771.0,
1824
+ "reward": 38.445166778564456,
1825
+ "reward_std": 10.677533721923828,
1826
+ "rewards/conciseness_reward/mean": 7.878444194793701,
1827
+ "rewards/conciseness_reward/std": 2.9698015213012696,
1828
+ "rewards/reward_func_correct_answer/mean": 0.0,
1829
+ "rewards/reward_func_correct_answer/std": 0.0,
1830
+ "rewards/reward_func_keywords/mean": 0.0,
1831
+ "rewards/reward_func_keywords/std": 0.0,
1832
+ "step": 310
1833
+ },
1834
+ {
1835
+ "clip_ratio/high_max": 0.0,
1836
+ "clip_ratio/high_mean": 0.0,
1837
+ "clip_ratio/low_mean": 0.0,
1838
+ "clip_ratio/low_min": 0.0,
1839
+ "clip_ratio/region_mean": 0.0,
1840
+ "completions/clipped_ratio": 0.19375,
1841
+ "completions/max_length": 32.0,
1842
+ "completions/max_terminated_length": 18.6,
1843
+ "completions/mean_length": 7.9625,
1844
+ "completions/mean_terminated_length": 2.1980216979980467,
1845
+ "completions/min_length": 1.0,
1846
+ "completions/min_terminated_length": 1.0,
1847
+ "epoch": 0.10389182058047493,
1848
+ "grad_norm": 1.422290563583374,
1849
+ "kl": 9.959375,
1850
+ "learning_rate": 0.00017337298645028764,
1851
+ "loss": 0.7172,
1852
+ "num_tokens": 832413.0,
1853
+ "reward": 41.570855712890626,
1854
+ "reward_std": 6.847911691665649,
1855
+ "rewards/conciseness_reward/mean": 8.51898136138916,
1856
+ "rewards/conciseness_reward/std": 2.5959963321685793,
1857
+ "rewards/reward_func_correct_answer/mean": 0.0,
1858
+ "rewards/reward_func_correct_answer/std": 0.0,
1859
+ "rewards/reward_func_keywords/mean": 0.0,
1860
+ "rewards/reward_func_keywords/std": 0.0,
1861
+ "step": 315
1862
+ },
1863
+ {
1864
+ "clip_ratio/high_max": 0.0,
1865
+ "clip_ratio/high_mean": 0.0,
1866
+ "clip_ratio/low_mean": 0.0,
1867
+ "clip_ratio/low_min": 0.0,
1868
+ "clip_ratio/region_mean": 0.0,
1869
+ "completions/clipped_ratio": 0.24375,
1870
+ "completions/max_length": 32.0,
1871
+ "completions/max_terminated_length": 11.6,
1872
+ "completions/mean_length": 9.3375,
1873
+ "completions/mean_terminated_length": 1.976455068588257,
1874
+ "completions/min_length": 1.0,
1875
+ "completions/min_terminated_length": 1.0,
1876
+ "epoch": 0.10554089709762533,
1877
+ "grad_norm": 5.7738847732543945,
1878
+ "kl": 11.91875,
1879
+ "learning_rate": 0.00017217602280983623,
1880
+ "loss": 0.9398,
1881
+ "num_tokens": 844013.0,
1882
+ "reward": 40.131536865234374,
1883
+ "reward_std": 10.431174755096436,
1884
+ "rewards/conciseness_reward/mean": 8.224026775360107,
1885
+ "rewards/conciseness_reward/std": 2.8040316104888916,
1886
+ "rewards/reward_func_correct_answer/mean": 0.0,
1887
+ "rewards/reward_func_correct_answer/std": 0.0,
1888
+ "rewards/reward_func_keywords/mean": 0.0,
1889
+ "rewards/reward_func_keywords/std": 0.0,
1890
+ "step": 320
1891
+ },
1892
+ {
1893
+ "clip_ratio/high_max": 0.0,
1894
+ "clip_ratio/high_mean": 0.0,
1895
+ "clip_ratio/low_mean": 0.0,
1896
+ "clip_ratio/low_min": 0.0,
1897
+ "clip_ratio/region_mean": 0.0,
1898
+ "completions/clipped_ratio": 0.2375,
1899
+ "completions/max_length": 32.0,
1900
+ "completions/max_terminated_length": 6.0,
1901
+ "completions/mean_length": 8.51875,
1902
+ "completions/mean_terminated_length": 1.1981538534164429,
1903
+ "completions/min_length": 1.0,
1904
+ "completions/min_terminated_length": 1.0,
1905
+ "epoch": 0.10718997361477572,
1906
+ "grad_norm": 1.6407678127288818,
1907
+ "kl": 9.725,
1908
+ "learning_rate": 0.0001709570736536521,
1909
+ "loss": 0.7634,
1910
+ "num_tokens": 855598.0,
1911
+ "reward": 40.76496963500976,
1912
+ "reward_std": 8.526632690429688,
1913
+ "rewards/conciseness_reward/mean": 8.353833961486817,
1914
+ "rewards/conciseness_reward/std": 2.905502271652222,
1915
+ "rewards/reward_func_correct_answer/mean": 0.0,
1916
+ "rewards/reward_func_correct_answer/std": 0.0,
1917
+ "rewards/reward_func_keywords/mean": 0.0,
1918
+ "rewards/reward_func_keywords/std": 0.0,
1919
+ "step": 325
1920
+ },
1921
+ {
1922
+ "clip_ratio/high_max": 0.0,
1923
+ "clip_ratio/high_mean": 0.0,
1924
+ "clip_ratio/low_mean": 0.0,
1925
+ "clip_ratio/low_min": 0.0,
1926
+ "clip_ratio/region_mean": 0.0,
1927
+ "completions/clipped_ratio": 0.2125,
1928
+ "completions/max_length": 32.0,
1929
+ "completions/max_terminated_length": 12.4,
1930
+ "completions/mean_length": 7.99375,
1931
+ "completions/mean_terminated_length": 1.5209110260009766,
1932
+ "completions/min_length": 1.0,
1933
+ "completions/min_terminated_length": 1.0,
1934
+ "epoch": 0.10883905013192612,
1935
+ "grad_norm": 2.852661609649658,
1936
+ "kl": 11.234375,
1937
+ "learning_rate": 0.00016971651028545648,
1938
+ "loss": 0.8528,
1939
+ "num_tokens": 869583.0,
1940
+ "reward": 40.956661987304685,
1941
+ "reward_std": 9.296725082397462,
1942
+ "rewards/conciseness_reward/mean": 8.393116474151611,
1943
+ "rewards/conciseness_reward/std": 2.911441469192505,
1944
+ "rewards/reward_func_correct_answer/mean": 0.0,
1945
+ "rewards/reward_func_correct_answer/std": 0.0,
1946
+ "rewards/reward_func_keywords/mean": 0.0,
1947
+ "rewards/reward_func_keywords/std": 0.0,
1948
+ "step": 330
1949
+ },
1950
+ {
1951
+ "clip_ratio/high_max": 0.0,
1952
+ "clip_ratio/high_mean": 0.0,
1953
+ "clip_ratio/low_mean": 0.0,
1954
+ "clip_ratio/low_min": 0.0,
1955
+ "clip_ratio/region_mean": 0.0,
1956
+ "completions/clipped_ratio": 0.2,
1957
+ "completions/max_length": 32.0,
1958
+ "completions/max_terminated_length": 17.8,
1959
+ "completions/mean_length": 7.90625,
1960
+ "completions/mean_terminated_length": 1.8667908191680909,
1961
+ "completions/min_length": 1.0,
1962
+ "completions/min_terminated_length": 1.0,
1963
+ "epoch": 0.11048812664907652,
1964
+ "grad_norm": 1.3491684198379517,
1965
+ "kl": 9.65625,
1966
+ "learning_rate": 0.00016845471059286887,
1967
+ "loss": 0.7327,
1968
+ "num_tokens": 882242.0,
1969
+ "reward": 41.10959243774414,
1970
+ "reward_std": 8.142712497711182,
1971
+ "rewards/conciseness_reward/mean": 8.424456214904785,
1972
+ "rewards/conciseness_reward/std": 2.8741564750671387,
1973
+ "rewards/reward_func_correct_answer/mean": 0.0,
1974
+ "rewards/reward_func_correct_answer/std": 0.0,
1975
+ "rewards/reward_func_keywords/mean": 0.0,
1976
+ "rewards/reward_func_keywords/std": 0.0,
1977
+ "step": 335
1978
+ },
1979
+ {
1980
+ "clip_ratio/high_max": 0.0,
1981
+ "clip_ratio/high_mean": 0.0,
1982
+ "clip_ratio/low_mean": 0.0,
1983
+ "clip_ratio/low_min": 0.0,
1984
+ "clip_ratio/region_mean": 0.0,
1985
+ "completions/clipped_ratio": 0.31875,
1986
+ "completions/max_length": 32.0,
1987
+ "completions/max_terminated_length": 16.6,
1988
+ "completions/mean_length": 11.775,
1989
+ "completions/mean_terminated_length": 2.221480059623718,
1990
+ "completions/min_length": 1.0,
1991
+ "completions/min_terminated_length": 1.0,
1992
+ "epoch": 0.11213720316622691,
1993
+ "grad_norm": 2.546013593673706,
1994
+ "kl": 8.3375,
1995
+ "learning_rate": 0.00016717205893229903,
1996
+ "loss": 0.6472,
1997
+ "num_tokens": 894454.0,
1998
+ "reward": 36.69198989868164,
1999
+ "reward_std": 8.701870346069336,
2000
+ "rewards/conciseness_reward/mean": 7.519171237945557,
2001
+ "rewards/conciseness_reward/std": 3.3117987632751467,
2002
+ "rewards/reward_func_correct_answer/mean": 0.0,
2003
+ "rewards/reward_func_correct_answer/std": 0.0,
2004
+ "rewards/reward_func_keywords/mean": 0.0,
2005
+ "rewards/reward_func_keywords/std": 0.0,
2006
+ "step": 340
2007
+ },
2008
+ {
2009
+ "clip_ratio/high_max": 0.0,
2010
+ "clip_ratio/high_mean": 0.0,
2011
+ "clip_ratio/low_mean": 0.0,
2012
+ "clip_ratio/low_min": 0.0,
2013
+ "clip_ratio/region_mean": 0.0,
2014
+ "completions/clipped_ratio": 0.23125,
2015
+ "completions/max_length": 32.0,
2016
+ "completions/max_terminated_length": 13.6,
2017
+ "completions/mean_length": 8.6375,
2018
+ "completions/mean_terminated_length": 1.6123589992523193,
2019
+ "completions/min_length": 1.0,
2020
+ "completions/min_terminated_length": 1.0,
2021
+ "epoch": 0.1137862796833773,
2022
+ "grad_norm": 1.6078628301620483,
2023
+ "kl": 5.08125,
2024
+ "learning_rate": 0.00016586894601186805,
2025
+ "loss": 0.4841,
2026
+ "num_tokens": 907630.0,
2027
+ "reward": 40.832821655273435,
2028
+ "reward_std": 7.08829927444458,
2029
+ "rewards/conciseness_reward/mean": 8.367738628387452,
2030
+ "rewards/conciseness_reward/std": 2.808896017074585,
2031
+ "rewards/reward_func_correct_answer/mean": 0.0,
2032
+ "rewards/reward_func_correct_answer/std": 0.0,
2033
+ "rewards/reward_func_keywords/mean": 0.0,
2034
+ "rewards/reward_func_keywords/std": 0.0,
2035
+ "step": 345
2036
+ },
2037
+ {
2038
+ "clip_ratio/high_max": 0.0,
2039
+ "clip_ratio/high_mean": 0.0,
2040
+ "clip_ratio/low_mean": 0.0,
2041
+ "clip_ratio/low_min": 0.0,
2042
+ "clip_ratio/region_mean": 0.0,
2043
+ "completions/clipped_ratio": 0.5,
2044
+ "completions/max_length": 32.0,
2045
+ "completions/max_terminated_length": 23.2,
2046
+ "completions/mean_length": 17.94375,
2047
+ "completions/mean_terminated_length": 3.825910973548889,
2048
+ "completions/min_length": 1.0,
2049
+ "completions/min_terminated_length": 1.0,
2050
+ "epoch": 0.11543535620052771,
2051
+ "grad_norm": 8.127303123474121,
2052
+ "kl": 16.021875,
2053
+ "learning_rate": 0.00016454576877239507,
2054
+ "loss": 1.0026,
2055
+ "num_tokens": 920553.0,
2056
+ "reward": 31.658840942382813,
2057
+ "reward_std": 11.956652450561524,
2058
+ "rewards/conciseness_reward/mean": 6.487744331359863,
2059
+ "rewards/conciseness_reward/std": 3.047306680679321,
2060
+ "rewards/reward_func_correct_answer/mean": 0.0,
2061
+ "rewards/reward_func_correct_answer/std": 0.0,
2062
+ "rewards/reward_func_keywords/mean": 0.0,
2063
+ "rewards/reward_func_keywords/std": 0.0,
2064
+ "step": 350
2065
+ },
2066
+ {
2067
+ "clip_ratio/high_max": 0.0,
2068
+ "clip_ratio/high_mean": 0.0,
2069
+ "clip_ratio/low_mean": 0.0,
2070
+ "clip_ratio/low_min": 0.0,
2071
+ "clip_ratio/region_mean": 0.0,
2072
+ "completions/clipped_ratio": 0.49375,
2073
+ "completions/max_length": 32.0,
2074
+ "completions/max_terminated_length": 20.6,
2075
+ "completions/mean_length": 17.4375,
2076
+ "completions/mean_terminated_length": 3.490882396697998,
2077
+ "completions/min_length": 1.0,
2078
+ "completions/min_terminated_length": 1.0,
2079
+ "epoch": 0.1170844327176781,
2080
+ "grad_norm": 2.003816843032837,
2081
+ "kl": 5.35625,
2082
+ "learning_rate": 0.0001632029302664851,
2083
+ "loss": 0.5399,
2084
+ "num_tokens": 936001.0,
2085
+ "reward": 30.59487419128418,
2086
+ "reward_std": 10.253981018066407,
2087
+ "rewards/conciseness_reward/mean": 6.269708824157715,
2088
+ "rewards/conciseness_reward/std": 3.309423828125,
2089
+ "rewards/reward_func_correct_answer/mean": 0.0,
2090
+ "rewards/reward_func_correct_answer/std": 0.0,
2091
+ "rewards/reward_func_keywords/mean": 0.0,
2092
+ "rewards/reward_func_keywords/std": 0.0,
2093
+ "step": 355
2094
+ },
2095
+ {
2096
+ "clip_ratio/high_max": 0.0,
2097
+ "clip_ratio/high_mean": 0.0,
2098
+ "clip_ratio/low_mean": 0.0,
2099
+ "clip_ratio/low_min": 0.0,
2100
+ "clip_ratio/region_mean": 0.0,
2101
+ "completions/clipped_ratio": 0.2625,
2102
+ "completions/max_length": 32.0,
2103
+ "completions/max_terminated_length": 21.8,
2104
+ "completions/mean_length": 10.33125,
2105
+ "completions/mean_terminated_length": 2.5056591749191286,
2106
+ "completions/min_length": 1.0,
2107
+ "completions/min_terminated_length": 1.0,
2108
+ "epoch": 0.11873350923482849,
2109
+ "grad_norm": 1.3860801458358765,
2110
+ "kl": 6.5125,
2111
+ "learning_rate": 0.0001618408395357554,
2112
+ "loss": 0.6358,
2113
+ "num_tokens": 947848.0,
2114
+ "reward": 37.97140731811523,
2115
+ "reward_std": 9.808005714416504,
2116
+ "rewards/conciseness_reward/mean": 7.781358432769776,
2117
+ "rewards/conciseness_reward/std": 3.182269048690796,
2118
+ "rewards/reward_func_correct_answer/mean": 0.0,
2119
+ "rewards/reward_func_correct_answer/std": 0.0,
2120
+ "rewards/reward_func_keywords/mean": 0.0,
2121
+ "rewards/reward_func_keywords/std": 0.0,
2122
+ "step": 360
2123
+ },
2124
+ {
2125
+ "clip_ratio/high_max": 0.0,
2126
+ "clip_ratio/high_mean": 0.0,
2127
+ "clip_ratio/low_mean": 0.0,
2128
+ "clip_ratio/low_min": 0.0,
2129
+ "clip_ratio/region_mean": 0.0,
2130
+ "completions/clipped_ratio": 0.1875,
2131
+ "completions/max_length": 32.0,
2132
+ "completions/max_terminated_length": 19.0,
2133
+ "completions/mean_length": 7.55625,
2134
+ "completions/mean_terminated_length": 1.9193000078201294,
2135
+ "completions/min_length": 1.0,
2136
+ "completions/min_terminated_length": 1.0,
2137
+ "epoch": 0.1203825857519789,
2138
+ "grad_norm": 3.3567373752593994,
2139
+ "kl": 13.759375,
2140
+ "learning_rate": 0.0001604599114862375,
2141
+ "loss": 0.8571,
2142
+ "num_tokens": 959841.0,
2143
+ "reward": 41.46042251586914,
2144
+ "reward_std": 6.935988235473633,
2145
+ "rewards/conciseness_reward/mean": 8.496350860595703,
2146
+ "rewards/conciseness_reward/std": 2.798071002960205,
2147
+ "rewards/reward_func_correct_answer/mean": 0.0,
2148
+ "rewards/reward_func_correct_answer/std": 0.0,
2149
+ "rewards/reward_func_keywords/mean": 0.0,
2150
+ "rewards/reward_func_keywords/std": 0.0,
2151
+ "step": 365
2152
+ },
2153
+ {
2154
+ "clip_ratio/high_max": 0.0,
2155
+ "clip_ratio/high_mean": 0.0,
2156
+ "clip_ratio/low_mean": 0.0,
2157
+ "clip_ratio/low_min": 0.0,
2158
+ "clip_ratio/region_mean": 0.0,
2159
+ "completions/clipped_ratio": 0.25625,
2160
+ "completions/max_length": 32.0,
2161
+ "completions/max_terminated_length": 14.2,
2162
+ "completions/mean_length": 9.6625,
2163
+ "completions/mean_terminated_length": 2.0050908803939818,
2164
+ "completions/min_length": 1.0,
2165
+ "completions/min_terminated_length": 1.0,
2166
+ "epoch": 0.12203166226912929,
2167
+ "grad_norm": 1.5818016529083252,
2168
+ "kl": 8.46875,
2169
+ "learning_rate": 0.00015906056676199255,
2170
+ "loss": 0.7285,
2171
+ "num_tokens": 971895.0,
2172
+ "reward": 38.987307739257815,
2173
+ "reward_std": 9.816894721984863,
2174
+ "rewards/conciseness_reward/mean": 7.989543151855469,
2175
+ "rewards/conciseness_reward/std": 3.1488665103912354,
2176
+ "rewards/reward_func_correct_answer/mean": 0.0,
2177
+ "rewards/reward_func_correct_answer/std": 0.0,
2178
+ "rewards/reward_func_keywords/mean": 0.0,
2179
+ "rewards/reward_func_keywords/std": 0.0,
2180
+ "step": 370
2181
+ },
2182
+ {
2183
+ "clip_ratio/high_max": 0.0,
2184
+ "clip_ratio/high_mean": 0.0,
2185
+ "clip_ratio/low_mean": 0.0,
2186
+ "clip_ratio/low_min": 0.0,
2187
+ "clip_ratio/region_mean": 0.0,
2188
+ "completions/clipped_ratio": 0.2375,
2189
+ "completions/max_length": 32.0,
2190
+ "completions/max_terminated_length": 12.8,
2191
+ "completions/mean_length": 8.95,
2192
+ "completions/mean_terminated_length": 1.782608699798584,
2193
+ "completions/min_length": 1.0,
2194
+ "completions/min_terminated_length": 1.0,
2195
+ "epoch": 0.12368073878627968,
2196
+ "grad_norm": 2.568260669708252,
2197
+ "kl": 8.846875,
2198
+ "learning_rate": 0.00015764323161697935,
2199
+ "loss": 0.7342,
2200
+ "num_tokens": 983269.0,
2201
+ "reward": 40.019395446777345,
2202
+ "reward_std": 8.695895671844482,
2203
+ "rewards/conciseness_reward/mean": 8.201046085357666,
2204
+ "rewards/conciseness_reward/std": 2.975964069366455,
2205
+ "rewards/reward_func_correct_answer/mean": 0.0,
2206
+ "rewards/reward_func_correct_answer/std": 0.0,
2207
+ "rewards/reward_func_keywords/mean": 0.0,
2208
+ "rewards/reward_func_keywords/std": 0.0,
2209
+ "step": 375
2210
+ },
2211
+ {
2212
+ "clip_ratio/high_max": 0.0,
2213
+ "clip_ratio/high_mean": 0.0,
2214
+ "clip_ratio/low_mean": 0.0,
2215
+ "clip_ratio/low_min": 0.0,
2216
+ "clip_ratio/region_mean": 0.0,
2217
+ "completions/clipped_ratio": 0.2375,
2218
+ "completions/max_length": 32.0,
2219
+ "completions/max_terminated_length": 10.0,
2220
+ "completions/mean_length": 8.64375,
2221
+ "completions/mean_terminated_length": 1.3947399377822876,
2222
+ "completions/min_length": 1.0,
2223
+ "completions/min_terminated_length": 1.0,
2224
+ "epoch": 0.12532981530343007,
2225
+ "grad_norm": 1.7566519975662231,
2226
+ "kl": 10.090625,
2227
+ "learning_rate": 0.00015620833778521307,
2228
+ "loss": 0.7109,
2229
+ "num_tokens": 994490.0,
2230
+ "reward": 40.795552825927736,
2231
+ "reward_std": 6.594453907012939,
2232
+ "rewards/conciseness_reward/mean": 8.360101222991943,
2233
+ "rewards/conciseness_reward/std": 2.781657338142395,
2234
+ "rewards/reward_func_correct_answer/mean": 0.0,
2235
+ "rewards/reward_func_correct_answer/std": 0.0,
2236
+ "rewards/reward_func_keywords/mean": 0.0,
2237
+ "rewards/reward_func_keywords/std": 0.0,
2238
+ "step": 380
2239
+ },
2240
+ {
2241
+ "clip_ratio/high_max": 0.0,
2242
+ "clip_ratio/high_mean": 0.0,
2243
+ "clip_ratio/low_mean": 0.0,
2244
+ "clip_ratio/low_min": 0.0,
2245
+ "clip_ratio/region_mean": 0.0,
2246
+ "completions/clipped_ratio": 0.36875,
2247
+ "completions/max_length": 32.0,
2248
+ "completions/max_terminated_length": 1.0,
2249
+ "completions/mean_length": 12.43125,
2250
+ "completions/mean_terminated_length": 1.0,
2251
+ "completions/min_length": 1.0,
2252
+ "completions/min_terminated_length": 1.0,
2253
+ "epoch": 0.12697889182058048,
2254
+ "grad_norm": 1.1844645738601685,
2255
+ "kl": 6.378125,
2256
+ "learning_rate": 0.00015475632234925504,
2257
+ "loss": 0.6129,
2258
+ "num_tokens": 1006117.0,
2259
+ "reward": 36.47860527038574,
2260
+ "reward_std": 11.079174518585205,
2261
+ "rewards/conciseness_reward/mean": 7.475443267822266,
2262
+ "rewards/conciseness_reward/std": 3.2058629512786867,
2263
+ "rewards/reward_func_correct_answer/mean": 0.0,
2264
+ "rewards/reward_func_correct_answer/std": 0.0,
2265
+ "rewards/reward_func_keywords/mean": 0.0,
2266
+ "rewards/reward_func_keywords/std": 0.0,
2267
+ "step": 385
2268
+ },
2269
+ {
2270
+ "clip_ratio/high_max": 0.0,
2271
+ "clip_ratio/high_mean": 0.0,
2272
+ "clip_ratio/low_mean": 0.0,
2273
+ "clip_ratio/low_min": 0.0,
2274
+ "clip_ratio/region_mean": 0.0,
2275
+ "completions/clipped_ratio": 0.5125,
2276
+ "completions/max_length": 32.0,
2277
+ "completions/max_terminated_length": 9.4,
2278
+ "completions/mean_length": 17.15,
2279
+ "completions/mean_terminated_length": 1.519215726852417,
2280
+ "completions/min_length": 1.0,
2281
+ "completions/min_terminated_length": 1.0,
2282
+ "epoch": 0.12862796833773088,
2283
+ "grad_norm": 1.8607726097106934,
2284
+ "kl": 10.7625,
2285
+ "learning_rate": 0.000153287627607073,
2286
+ "loss": 0.8252,
2287
+ "num_tokens": 1019911.0,
2288
+ "reward": 31.58481674194336,
2289
+ "reward_std": 12.696942138671876,
2290
+ "rewards/conciseness_reward/mean": 6.472575092315674,
2291
+ "rewards/conciseness_reward/std": 3.4355133533477784,
2292
+ "rewards/reward_func_correct_answer/mean": 0.0,
2293
+ "rewards/reward_func_correct_answer/std": 0.0,
2294
+ "rewards/reward_func_keywords/mean": 0.0,
2295
+ "rewards/reward_func_keywords/std": 0.0,
2296
+ "step": 390
2297
+ },
2298
+ {
2299
+ "clip_ratio/high_max": 0.0,
2300
+ "clip_ratio/high_mean": 0.0,
2301
+ "clip_ratio/low_mean": 0.0,
2302
+ "clip_ratio/low_min": 0.0,
2303
+ "clip_ratio/region_mean": 0.0,
2304
+ "completions/clipped_ratio": 0.26875,
2305
+ "completions/max_length": 32.0,
2306
+ "completions/max_terminated_length": 9.2,
2307
+ "completions/mean_length": 9.78125,
2308
+ "completions/mean_terminated_length": 1.5907407760620118,
2309
+ "completions/min_length": 1.0,
2310
+ "completions/min_terminated_length": 1.0,
2311
+ "epoch": 0.13027704485488126,
2312
+ "grad_norm": 1.307873010635376,
2313
+ "kl": 7.2125,
2314
+ "learning_rate": 0.00015180270093731303,
2315
+ "loss": 0.6376,
2316
+ "num_tokens": 1034198.0,
2317
+ "reward": 39.4156982421875,
2318
+ "reward_std": 8.192217206954956,
2319
+ "rewards/conciseness_reward/mean": 8.075745105743408,
2320
+ "rewards/conciseness_reward/std": 3.032507038116455,
2321
+ "rewards/reward_func_correct_answer/mean": 0.0,
2322
+ "rewards/reward_func_correct_answer/std": 0.0,
2323
+ "rewards/reward_func_keywords/mean": 0.002083333395421505,
2324
+ "rewards/reward_func_keywords/std": 0.01178511381149292,
2325
+ "step": 395
2326
+ },
2327
+ {
2328
+ "epoch": 0.13192612137203166,
2329
+ "grad_norm": 1.1687965393066406,
2330
+ "learning_rate": 0.00015030199466302353,
2331
+ "loss": 0.5685,
2332
+ "step": 400
2333
+ },
2334
+ {
2335
+ "epoch": 0.13192612137203166,
2336
+ "eval_clip_ratio/high_max": 0.0,
2337
+ "eval_clip_ratio/high_mean": 0.0,
2338
+ "eval_clip_ratio/low_mean": 0.0,
2339
+ "eval_clip_ratio/low_min": 0.0,
2340
+ "eval_clip_ratio/region_mean": 0.0,
2341
+ "eval_completions/clipped_ratio": 0.2789835164835165,
2342
+ "eval_completions/max_length": 32.0,
2343
+ "eval_completions/max_terminated_length": 8.082417582417582,
2344
+ "eval_completions/mean_length": 10.17390110204508,
2345
+ "eval_completions/mean_terminated_length": 1.7402353758340354,
2346
+ "eval_completions/min_length": 1.0,
2347
+ "eval_completions/min_terminated_length": 1.0,
2348
+ "eval_kl": 12.785199175824175,
2349
+ "eval_loss": 0.8869044184684753,
2350
+ "eval_num_tokens": 1045628.0,
2351
+ "eval_reward": 39.34467090355171,
2352
+ "eval_reward_std": 9.140103686150614,
2353
+ "eval_rewards/conciseness_reward/mean": 8.06277670441093,
2354
+ "eval_rewards/conciseness_reward/std": 2.9405157920587195,
2355
+ "eval_rewards/reward_func_correct_answer/mean": 0.0,
2356
+ "eval_rewards/reward_func_correct_answer/std": 0.0,
2357
+ "eval_rewards/reward_func_keywords/mean": 0.0,
2358
+ "eval_rewards/reward_func_keywords/std": 0.0,
2359
+ "eval_runtime": 259.9033,
2360
+ "eval_samples_per_second": 5.598,
2361
+ "eval_steps_per_second": 0.35,
2362
+ "step": 400
2363
  }
2364
  ],
2365
  "logging_steps": 5,
2366
  "max_steps": 1000,
2367
+ "num_input_tokens_seen": 1045628,
2368
  "num_train_epochs": 1,
2369
  "save_steps": 100,
2370
  "stateful_callbacks": {