diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..e7aa951fcf47d3a7b41348e96165658f2c25e83d 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,316 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +checkpoint_1861001/checkpoint filter=lfs diff=lfs merge=lfs -text +checkpoint_1861001/target.decoder.decoder.depth_decoder.depth_layers_3.self_attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1861001/target.decoder.decoder.depth_decoder.depth_layers_3.self_attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1861001/target.decoder.decoder.temporal_decoder.layers_11.encoder_decoder_attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1861001/target.decoder.decoder.temporal_decoder.layers_11.self_attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1861001/target.decoder.decoder.temporal_decoder.layers_14.encoder_decoder_attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1861001/target.decoder.decoder.temporal_decoder.layers_14.mlp.wi_0.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1861001/target.decoder.decoder.temporal_decoder.layers_14.self_attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1861001/target.decoder.decoder.temporal_decoder.layers_15.self_attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1861001/target.decoder.decoder.temporal_decoder.layers_15.self_attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1861001/target.decoder.decoder.temporal_decoder.layers_16.encoder_decoder_attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1861001/target.decoder.decoder.temporal_decoder.layers_16.encoder_decoder_attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1861001/target.decoder.decoder.temporal_decoder.layers_16.mlp.wi_0.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1861001/target.decoder.decoder.temporal_decoder.layers_16.mlp.wi_1.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1861001/target.decoder.decoder.temporal_decoder.layers_16.self_attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1861001/target.decoder.decoder.temporal_decoder.layers_16.self_attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1861001/target.decoder.decoder.temporal_decoder.layers_17.encoder_decoder_attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1861001/target.decoder.decoder.temporal_decoder.layers_17.encoder_decoder_attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1861001/target.decoder.decoder.temporal_decoder.layers_18.encoder_decoder_attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1861001/target.decoder.decoder.temporal_decoder.layers_19.encoder_decoder_attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1861001/target.decoder.decoder.temporal_decoder.layers_19.mlp.wi_1.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1861001/target.decoder.decoder.temporal_decoder.layers_19.self_attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1861001/target.decoder.decoder.temporal_decoder.layers_19.self_attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1861001/target.decoder.decoder.temporal_decoder.layers_2.encoder_decoder_attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1861001/target.decoder.decoder.temporal_decoder.layers_2.encoder_decoder_attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1861001/target.decoder.decoder.temporal_decoder.layers_2.mlp.wi_0.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1861001/target.decoder.decoder.temporal_decoder.layers_2.mlp.wi_1.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1861001/target.decoder.decoder.temporal_decoder.layers_3.encoder_decoder_attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1861001/target.decoder.decoder.temporal_decoder.layers_3.encoder_decoder_attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1861001/target.decoder.decoder.temporal_decoder.layers_3.mlp.wi_0.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1861001/target.decoder.decoder.temporal_decoder.layers_3.mlp.wi_1.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1861001/target.decoder.decoder.temporal_decoder.layers_3.self_attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1861001/target.decoder.decoder.temporal_decoder.layers_4.encoder_decoder_attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1861001/target.decoder.decoder.temporal_decoder.layers_4.self_attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1861001/target.decoder.decoder.temporal_decoder.layers_5.encoder_decoder_attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1861001/target.decoder.decoder.temporal_decoder.layers_5.mlp.wo.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1861001/target.decoder.decoder.temporal_decoder.layers_5.self_attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1861001/target.decoder.decoder.temporal_decoder.layers_5.self_attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1861001/target.decoder.decoder.temporal_decoder.layers_7.self_attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1861001/target.decoder.decoder.temporal_decoder.layers_7.self_attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1861001/target.decoder.decoder.temporal_decoder.layers_7.self_attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1861001/target.decoder.decoder.temporal_decoder.layers_7.self_attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1861001/target.decoder.decoder.temporal_decoder.layers_9.self_attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1861001/target.decoder.decoder.temporal_decoder.layers_9.self_attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1861001/target.encoder.layers_1.attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1861001/target.encoder.layers_1.mlp.wi_0.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1861001/target.encoder.layers_1.mlp.wi_1.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1861001/target.encoder.layers_1.mlp.wo.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1861001/target.encoder.layers_10.attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1861001/target.encoder.layers_10.attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1861001/target.encoder.layers_10.attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1861001/target.encoder.layers_10.attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1861001/target.encoder.layers_10.mlp.wi_0.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1861001/target.encoder.layers_11.attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1861001/target.encoder.layers_15.attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1861001/target.encoder.layers_15.attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1861001/target.encoder.layers_15.attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1861001/target.encoder.layers_15.mlp.wi_1.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1861001/target.encoder.layers_15.mlp.wo.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1861001/target.encoder.layers_16.attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1861001/target.encoder.layers_18.attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1861001/target.encoder.layers_18.mlp.wo.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1861001/target.encoder.layers_19.attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1861001/target.encoder.layers_19.attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1861001/target.encoder.layers_2.attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1861001/target.encoder.layers_2.mlp.wi_1.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1861001/target.encoder.layers_2.mlp.wo.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1861001/target.encoder.layers_20.attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1861001/target.encoder.layers_22.mlp.wo.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1861001/target.encoder.layers_3.mlp.wi_1.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1861001/target.encoder.layers_3.mlp.wo.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1861001/target.encoder.layers_4.attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1861001/target.encoder.layers_4.mlp.wi_1.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1861001/target.encoder.layers_5.attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1861001/target.encoder.layers_5.attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1861001/target.encoder.layers_5.mlp.wi_0.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1861001/target.encoder.layers_5.mlp.wi_1.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1861001/target.encoder.layers_5.mlp.wo.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1861001/target.encoder.layers_6.attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1861001/target.encoder.layers_6.mlp.wi_1.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1861001/target.encoder.layers_6.mlp.wo.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1861001/target.encoder.layers_7.attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1861001/target.encoder.layers_7.mlp.wo.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1862001/checkpoint filter=lfs diff=lfs merge=lfs -text +checkpoint_1862001/target.decoder.decoder.temporal_decoder.layers_1.self_attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1862001/target.decoder.decoder.temporal_decoder.layers_11.self_attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1862001/target.decoder.decoder.temporal_decoder.layers_14.self_attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1862001/target.decoder.decoder.temporal_decoder.layers_14.self_attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1862001/target.decoder.decoder.temporal_decoder.layers_15.encoder_decoder_attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1862001/target.decoder.decoder.temporal_decoder.layers_15.self_attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1862001/target.decoder.decoder.temporal_decoder.layers_16.encoder_decoder_attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1862001/target.decoder.decoder.temporal_decoder.layers_16.encoder_decoder_attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1862001/target.decoder.decoder.temporal_decoder.layers_18.mlp.wi_0.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1862001/target.decoder.decoder.temporal_decoder.layers_19.self_attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1862001/target.decoder.decoder.temporal_decoder.layers_2.encoder_decoder_attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1862001/target.decoder.decoder.temporal_decoder.layers_3.encoder_decoder_attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1862001/target.decoder.decoder.temporal_decoder.layers_6.encoder_decoder_attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1862001/target.decoder.decoder.temporal_decoder.layers_7.encoder_decoder_attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1862001/target.decoder.decoder.temporal_decoder.layers_8.encoder_decoder_attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1862001/target.decoder.decoder.temporal_decoder.layers_8.encoder_decoder_attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1862001/target.decoder.decoder.temporal_decoder.layers_8.encoder_decoder_attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1862001/target.decoder.decoder.temporal_decoder.layers_8.mlp.wi_0.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1862001/target.decoder.decoder.temporal_decoder.layers_8.mlp.wi_1.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1862001/target.encoder.layers_10.attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1862001/target.encoder.layers_11.attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1862001/target.encoder.layers_12.mlp.wi_1.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1862001/target.encoder.layers_13.attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1862001/target.encoder.layers_13.attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1862001/target.encoder.layers_14.mlp.wi_0.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1862001/target.encoder.layers_15.mlp.wi_0.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1862001/target.encoder.layers_16.attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1862001/target.encoder.layers_16.attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1862001/target.encoder.layers_18.attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1862001/target.encoder.layers_19.attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1862001/target.encoder.layers_21.attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1862001/target.encoder.layers_21.attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1862001/target.encoder.layers_22.mlp.wi_0.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1862001/target.encoder.layers_23.mlp.wo.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1862001/target.encoder.layers_3.attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1862001/target.encoder.layers_4.attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1862001/target.encoder.layers_5.attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1862001/target.encoder.layers_6.mlp.wi_0.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1862001/target.encoder.layers_7.attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1862001/target.encoder.layers_9.attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1863001/checkpoint filter=lfs diff=lfs merge=lfs -text +checkpoint_1863001/target.decoder.decoder.temporal_decoder.layers_14.encoder_decoder_attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1863001/target.decoder.decoder.temporal_decoder.layers_14.encoder_decoder_attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1863001/target.decoder.decoder.temporal_decoder.layers_14.encoder_decoder_attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1863001/target.decoder.decoder.temporal_decoder.layers_14.mlp.wi_0.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1863001/target.decoder.decoder.temporal_decoder.layers_14.mlp.wo.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1863001/target.decoder.decoder.temporal_decoder.layers_14.self_attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1863001/target.decoder.decoder.temporal_decoder.layers_16.self_attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1863001/target.decoder.decoder.temporal_decoder.layers_17.encoder_decoder_attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1863001/target.decoder.decoder.temporal_decoder.layers_17.mlp.wi_0.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1863001/target.decoder.decoder.temporal_decoder.layers_9.encoder_decoder_attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1863001/target.encoder.layers_0.attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1863001/target.encoder.layers_10.mlp.wi_1.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1863001/target.encoder.layers_11.attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1863001/target.encoder.layers_12.attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1863001/target.encoder.layers_12.mlp.wi_0.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1863001/target.encoder.layers_12.mlp.wi_1.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1863001/target.encoder.layers_12.mlp.wo.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1863001/target.encoder.layers_15.attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1863001/target.encoder.layers_16.attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1863001/target.encoder.layers_16.attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1863001/target.encoder.layers_18.attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1863001/target.encoder.layers_19.attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1863001/target.encoder.layers_2.attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1863001/target.encoder.layers_20.attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1863001/target.encoder.layers_21.attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1863001/target.encoder.layers_21.mlp.wi_1.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1863001/target.encoder.layers_23.mlp.wi_1.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1863001/target.encoder.layers_3.mlp.wi_0.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1863001/target.encoder.layers_4.attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1863001/target.encoder.layers_5.attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1863001/target.encoder.layers_7.attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1863001/target.encoder.layers_7.attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1863001/target.encoder.layers_8.attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1863001/target.encoder.layers_8.attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1863001/target.encoder.layers_8.mlp.wi_0.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1863001/target.encoder.layers_8.mlp.wo.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1863001/target.encoder.layers_9.attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1863001/target.encoder.layers_9.attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1863001/target.encoder.layers_9.attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1863001/target.encoder.layers_9.mlp.wi_1.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1864001/checkpoint filter=lfs diff=lfs merge=lfs -text +checkpoint_1864001/target.decoder.decoder.temporal_decoder.layers_13.encoder_decoder_attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1864001/target.decoder.decoder.temporal_decoder.layers_13.mlp.wo.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1864001/target.decoder.decoder.temporal_decoder.layers_13.self_attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1864001/target.decoder.decoder.temporal_decoder.layers_13.self_attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1864001/target.decoder.decoder.temporal_decoder.layers_14.self_attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1864001/target.decoder.decoder.temporal_decoder.layers_15.encoder_decoder_attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1864001/target.decoder.decoder.temporal_decoder.layers_15.mlp.wi_0.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1864001/target.decoder.decoder.temporal_decoder.layers_15.self_attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1864001/target.decoder.decoder.temporal_decoder.layers_16.encoder_decoder_attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1864001/target.decoder.decoder.temporal_decoder.layers_16.encoder_decoder_attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1864001/target.decoder.decoder.temporal_decoder.layers_16.encoder_decoder_attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1864001/target.decoder.decoder.temporal_decoder.layers_16.mlp.wi_1.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1864001/target.decoder.decoder.temporal_decoder.layers_16.self_attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1864001/target.decoder.decoder.temporal_decoder.layers_17.encoder_decoder_attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1864001/target.decoder.decoder.temporal_decoder.layers_17.encoder_decoder_attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1864001/target.decoder.decoder.temporal_decoder.layers_17.mlp.wo.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1864001/target.decoder.decoder.temporal_decoder.layers_17.self_attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1864001/target.decoder.decoder.temporal_decoder.layers_17.self_attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1864001/target.decoder.decoder.temporal_decoder.layers_17.self_attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1864001/target.decoder.decoder.temporal_decoder.layers_18.mlp.wi_0.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1864001/target.decoder.decoder.temporal_decoder.layers_18.mlp.wi_1.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1864001/target.decoder.decoder.temporal_decoder.layers_18.self_attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1864001/target.decoder.decoder.temporal_decoder.layers_2.encoder_decoder_attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1864001/target.decoder.decoder.temporal_decoder.layers_2.mlp.wo.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1864001/target.decoder.decoder.temporal_decoder.layers_2.self_attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1864001/target.decoder.decoder.temporal_decoder.layers_4.encoder_decoder_attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1864001/target.decoder.decoder.temporal_decoder.layers_5.encoder_decoder_attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1864001/target.decoder.decoder.temporal_decoder.layers_5.self_attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1864001/target.decoder.decoder.temporal_decoder.layers_6.self_attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1864001/target.decoder.decoder.temporal_decoder.layers_6.self_attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1864001/target.decoder.decoder.temporal_decoder.layers_6.self_attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1864001/target.decoder.decoder.temporal_decoder.layers_8.encoder_decoder_attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1864001/target.decoder.decoder.temporal_decoder.layers_9.self_attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1864001/target.decoder.logits_dense.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1864001/target.decoder.logits_dense.kernel/0.1 filter=lfs diff=lfs merge=lfs -text +checkpoint_1864001/target.encoder.layers_1.attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1864001/target.encoder.layers_1.attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1864001/target.encoder.layers_1.mlp.wo.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1864001/target.encoder.layers_10.attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1864001/target.encoder.layers_11.attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1864001/target.encoder.layers_12.attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1864001/target.encoder.layers_12.attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1864001/target.encoder.layers_12.mlp.wi_0.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1864001/target.encoder.layers_14.attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1864001/target.encoder.layers_14.attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1864001/target.encoder.layers_14.mlp.wo.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1864001/target.encoder.layers_15.mlp.wi_0.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1864001/target.encoder.layers_15.mlp.wi_1.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1864001/target.encoder.layers_16.attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1864001/target.encoder.layers_16.mlp.wo.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1864001/target.encoder.layers_17.mlp.wo.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1864001/target.encoder.layers_18.attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1864001/target.encoder.layers_18.attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1864001/target.encoder.layers_18.mlp.wi_0.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1864001/target.encoder.layers_19.attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1864001/target.encoder.layers_19.mlp.wi_1.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1864001/target.encoder.layers_2.attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1864001/target.encoder.layers_2.attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1864001/target.encoder.layers_20.attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1864001/target.encoder.layers_22.attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1864001/target.encoder.layers_22.mlp.wi_0.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1864001/target.encoder.layers_23.attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1864001/target.encoder.layers_23.attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1864001/target.encoder.layers_23.mlp.wi_1.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1864001/target.encoder.layers_3.attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1864001/target.encoder.layers_3.attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1864001/target.encoder.layers_4.attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1864001/target.encoder.layers_4.mlp.wi_0.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1864001/target.encoder.layers_4.mlp.wi_1.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1864001/target.encoder.layers_4.mlp.wo.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1864001/target.encoder.layers_5.attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1864001/target.encoder.layers_5.mlp.wo.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1864001/target.encoder.layers_6.mlp.wo.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1864001/target.encoder.layers_7.attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1864001/target.encoder.layers_7.mlp.wi_1.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1864001/target.encoder.layers_8.attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1864001/target.encoder.layers_8.mlp.wi_1.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1864001/target.encoder.layers_9.attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1864001/target.encoder.layers_9.mlp.wi_0.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1864001/target.encoder.layers_9.mlp.wi_1.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1864001/target.encoder.layers_9.mlp.wo.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1864001/target.token_embedder.embedding/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1864001/target.token_embedder.embedding/1.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1865001/checkpoint filter=lfs diff=lfs merge=lfs -text +checkpoint_1865001/target.encoder.layers_10.attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1865001/target.encoder.layers_10.mlp.wi_1.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1865001/target.encoder.layers_11.mlp.wi_1.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1865001/target.encoder.layers_11.mlp.wo.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1865001/target.encoder.layers_14.attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1865001/target.encoder.layers_15.attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1865001/target.encoder.layers_16.attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1865001/target.encoder.layers_16.attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1865001/target.encoder.layers_17.attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1865001/target.encoder.layers_17.attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1865001/target.encoder.layers_18.attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1865001/target.encoder.layers_18.attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1865001/target.encoder.layers_18.mlp.wi_0.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1865001/target.encoder.layers_18.mlp.wi_1.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1865001/target.encoder.layers_18.mlp.wo.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1865001/target.encoder.layers_19.mlp.wi_0.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1865001/target.encoder.layers_2.attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1865001/target.encoder.layers_2.attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1865001/target.encoder.layers_2.mlp.wi_1.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1865001/target.encoder.layers_20.attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1865001/target.encoder.layers_20.mlp.wo.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1865001/target.encoder.layers_21.attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1865001/target.encoder.layers_21.attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1865001/target.encoder.layers_21.mlp.wi_0.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1865001/target.encoder.layers_21.mlp.wo.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1865001/target.encoder.layers_22.attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1865001/target.encoder.layers_22.mlp.wi_1.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1865001/target.encoder.layers_22.mlp.wo.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1865001/target.encoder.layers_23.attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1865001/target.encoder.layers_23.attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1865001/target.encoder.layers_23.attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1865001/target.encoder.layers_23.mlp.wi_0.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1865001/target.encoder.layers_3.attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1865001/target.encoder.layers_3.mlp.wi_1.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1865001/target.encoder.layers_3.mlp.wo.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1865001/target.encoder.layers_4.attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1865001/target.encoder.layers_4.attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1865001/target.encoder.layers_4.attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1865001/target.encoder.layers_4.attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1865001/target.encoder.layers_4.mlp.wi_1.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1865001/target.encoder.layers_4.mlp.wo.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1865001/target.encoder.layers_5.attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1865001/target.encoder.layers_5.attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1865001/target.encoder.layers_5.mlp.wi_0.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1865001/target.encoder.layers_5.mlp.wo.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1865001/target.encoder.layers_6.attention.key.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1865001/target.encoder.layers_6.attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1865001/target.encoder.layers_6.attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1865001/target.encoder.layers_6.attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1865001/target.encoder.layers_6.mlp.wi_0.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1865001/target.encoder.layers_6.mlp.wo.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1865001/target.encoder.layers_7.attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1865001/target.encoder.layers_7.attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1865001/target.encoder.layers_7.attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1865001/target.encoder.layers_7.mlp.wi_0.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1865001/target.encoder.layers_7.mlp.wi_1.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1865001/target.encoder.layers_7.mlp.wo.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1865001/target.encoder.layers_8.attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1865001/target.encoder.layers_8.attention.value.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1865001/target.encoder.layers_8.mlp.wi_1.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1865001/target.encoder.layers_8.mlp.wo.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1865001/target.encoder.layers_9.attention.out.kernel/0.0 filter=lfs diff=lfs merge=lfs -text +checkpoint_1865001/target.encoder.layers_9.attention.query.kernel/0.0 filter=lfs diff=lfs merge=lfs -text diff --git a/checkpoint_1861001/checkpoint b/checkpoint_1861001/checkpoint new file mode 100644 index 0000000000000000000000000000000000000000..71215aee5f08aed37f2e21d0b800c9d4a5a155de --- /dev/null +++ b/checkpoint_1861001/checkpoint @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb08c8b8fdce3ca579ac7ea65ee5cff255069de54af6adb0141191336be23cd8 +size 4880667 diff --git a/checkpoint_1861001/state.param_states.decoder.decoder.depth_decoder.depth_layers_0.pre_mlp_layer_norm.scale.v/.zarray b/checkpoint_1861001/state.param_states.decoder.decoder.depth_decoder.depth_layers_0.pre_mlp_layer_norm.scale.v/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..fb266c57e8e0dee6193b582ba985adeb7c50f8cf --- /dev/null +++ b/checkpoint_1861001/state.param_states.decoder.decoder.depth_decoder.depth_layers_0.pre_mlp_layer_norm.scale.v/.zarray @@ -0,0 +1 @@ +{"chunks":[1024],"compressor":{"id":"gzip","level":1},"dimension_separator":".","dtype":"