alenphilip commited on
Commit
e55d343
·
verified ·
1 Parent(s): fb6ab86

Training in progress, step 300, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:aeda7b371ff2e1752bf1aff362fa660259c343ff41adf1ebf4a35769f07ce5e5
3
  size 201880976
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8f643dbf65b92f1c5c43f0488d3b0424400933078475a5081bb3bfb31da7c67b
3
  size 201880976
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:21b4cbbc7c7e47ac572d0611695777730d7795b30ce4422d923e37f4c43b2d15
3
- size 102771467
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:380b3da97256bb7499e3718daa9ad4cb8e41346fcc168085abcd230d3cd7472c
3
+ size 102771659
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:de2f6fd8a366989100bcb570e1fd69da9deb6a29ce5bba1d2c8889118062705c
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d3e9275e9bcf8d784f094b4cc97e271b21581c7795c6976a444a4601101c07b6
3
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:63ca6d6866d748b90a4b2173e0ca24db709af27b45b8531207b094cb85539103
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f579db49b2c936dca1651451380432d685b899aacc0578d54ef97a37d2a2fdb6
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "best_global_step": 200,
3
- "best_metric": 0.7492690359164101,
4
- "best_model_checkpoint": "./qwen2.5-7b-sft-qlora/checkpoint-200",
5
- "epoch": 0.935672514619883,
6
  "eval_steps": 50,
7
- "global_step": 200,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -260,6 +260,132 @@
260
  "eval_samples_per_second": 27.23,
261
  "eval_steps_per_second": 1.704,
262
  "step": 200
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
263
  }
264
  ],
265
  "logging_steps": 10,
@@ -279,7 +405,7 @@
279
  "attributes": {}
280
  }
281
  },
282
- "total_flos": 4.157882340289413e+17,
283
  "train_batch_size": 16,
284
  "trial_name": null,
285
  "trial_params": null
 
1
  {
2
+ "best_global_step": 300,
3
+ "best_metric": 0.7527724116774704,
4
+ "best_model_checkpoint": "./qwen2.5-7b-sft-qlora/checkpoint-300",
5
+ "epoch": 1.4023391812865498,
6
  "eval_steps": 50,
7
+ "global_step": 300,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
260
  "eval_samples_per_second": 27.23,
261
  "eval_steps_per_second": 1.704,
262
  "step": 200
263
+ },
264
+ {
265
+ "entropy": 0.607174352556467,
266
+ "epoch": 0.9824561403508771,
267
+ "grad_norm": 0.1787632554769516,
268
+ "learning_rate": 0.00012145582208119497,
269
+ "loss": 0.6041,
270
+ "mean_token_accuracy": 0.826733535528183,
271
+ "num_tokens": 5046903.0,
272
+ "step": 210
273
+ },
274
+ {
275
+ "entropy": 0.5812954008579254,
276
+ "epoch": 1.0280701754385966,
277
+ "grad_norm": 0.18372896313667297,
278
+ "learning_rate": 0.00011342332658176555,
279
+ "loss": 0.5672,
280
+ "mean_token_accuracy": 0.8368643965476599,
281
+ "num_tokens": 5286085.0,
282
+ "step": 220
283
+ },
284
+ {
285
+ "entropy": 0.5580198377370834,
286
+ "epoch": 1.0748538011695907,
287
+ "grad_norm": 0.19435207545757294,
288
+ "learning_rate": 0.00010530150105862748,
289
+ "loss": 0.5539,
290
+ "mean_token_accuracy": 0.8394016489386559,
291
+ "num_tokens": 5522827.0,
292
+ "step": 230
293
+ },
294
+ {
295
+ "entropy": 0.5524544611573219,
296
+ "epoch": 1.1216374269005849,
297
+ "grad_norm": 0.2061990201473236,
298
+ "learning_rate": 9.71443949206304e-05,
299
+ "loss": 0.5445,
300
+ "mean_token_accuracy": 0.8409878596663475,
301
+ "num_tokens": 5764879.0,
302
+ "step": 240
303
+ },
304
+ {
305
+ "entropy": 0.5744347527623177,
306
+ "epoch": 1.168421052631579,
307
+ "grad_norm": 0.21095068752765656,
308
+ "learning_rate": 8.900629236329482e-05,
309
+ "loss": 0.5672,
310
+ "mean_token_accuracy": 0.8354128882288933,
311
+ "num_tokens": 6002932.0,
312
+ "step": 250
313
+ },
314
+ {
315
+ "epoch": 1.168421052631579,
316
+ "eval_bleu": 61.178798380428454,
317
+ "eval_entropy": 0.5307412986402158,
318
+ "eval_loss": 0.6018245816230774,
319
+ "eval_mean_token_accuracy": 0.8288786930066568,
320
+ "eval_num_tokens": 6002932.0,
321
+ "eval_rougeL": 0.7482538683957054,
322
+ "eval_runtime": 63.5756,
323
+ "eval_samples_per_second": 27.149,
324
+ "eval_steps_per_second": 1.699,
325
+ "step": 250
326
+ },
327
+ {
328
+ "entropy": 0.5755763545632362,
329
+ "epoch": 1.2152046783625732,
330
+ "grad_norm": 0.20451681315898895,
331
+ "learning_rate": 8.094135111644742e-05,
332
+ "loss": 0.568,
333
+ "mean_token_accuracy": 0.8352775603532792,
334
+ "num_tokens": 6236870.0,
335
+ "step": 260
336
+ },
337
+ {
338
+ "entropy": 0.5376525044441223,
339
+ "epoch": 1.2619883040935673,
340
+ "grad_norm": 0.21364448964595795,
341
+ "learning_rate": 7.300324203346431e-05,
342
+ "loss": 0.5367,
343
+ "mean_token_accuracy": 0.8422502785921097,
344
+ "num_tokens": 6475853.0,
345
+ "step": 270
346
+ },
347
+ {
348
+ "entropy": 0.5575953021645546,
349
+ "epoch": 1.3087719298245615,
350
+ "grad_norm": 0.2244759500026703,
351
+ "learning_rate": 6.524479192059698e-05,
352
+ "loss": 0.5487,
353
+ "mean_token_accuracy": 0.8396281078457832,
354
+ "num_tokens": 6717478.0,
355
+ "step": 280
356
+ },
357
+ {
358
+ "entropy": 0.5532271094620228,
359
+ "epoch": 1.3555555555555556,
360
+ "grad_norm": 0.22009848058223724,
361
+ "learning_rate": 5.7717631983292375e-05,
362
+ "loss": 0.5539,
363
+ "mean_token_accuracy": 0.839461912214756,
364
+ "num_tokens": 6956096.0,
365
+ "step": 290
366
+ },
367
+ {
368
+ "entropy": 0.5678240090608597,
369
+ "epoch": 1.4023391812865498,
370
+ "grad_norm": 0.22350303828716278,
371
+ "learning_rate": 5.047185422903928e-05,
372
+ "loss": 0.5536,
373
+ "mean_token_accuracy": 0.8384527832269668,
374
+ "num_tokens": 7198597.0,
375
+ "step": 300
376
+ },
377
+ {
378
+ "epoch": 1.4023391812865498,
379
+ "eval_bleu": 61.61816935351767,
380
+ "eval_entropy": 0.529030436442958,
381
+ "eval_loss": 0.5979623198509216,
382
+ "eval_mean_token_accuracy": 0.8302161036818115,
383
+ "eval_num_tokens": 7198597.0,
384
+ "eval_rougeL": 0.7527724116774704,
385
+ "eval_runtime": 63.2813,
386
+ "eval_samples_per_second": 27.275,
387
+ "eval_steps_per_second": 1.707,
388
+ "step": 300
389
  }
390
  ],
391
  "logging_steps": 10,
 
405
  "attributes": {}
406
  }
407
  },
408
+ "total_flos": 6.256838307941868e+17,
409
  "train_batch_size": 16,
410
  "trial_name": null,
411
  "trial_params": null