NorthernTribe-Research committed on
Commit
8de9739
·
verified ·
1 Parent(s): 633d172

Autonomous Space trainer update

Browse files
adapter_config.json CHANGED
@@ -29,9 +29,9 @@
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
 
32
  "c_fc",
33
- "c_attn",
34
- "c_proj"
35
  ],
36
  "target_parameters": null,
37
  "task_type": "CAUSAL_LM",
 
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
32
+ "c_proj",
33
  "c_fc",
34
+ "c_attn"
 
35
  ],
36
  "target_parameters": null,
37
  "task_type": "CAUSAL_LM",
effective_run_config.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "ce_weight_end": 0.5,
3
  "ce_weight_start": 0.35,
4
- "created_at": "2026-02-24T08:01:09.221857+00:00",
5
  "dataset_id": "NorthernTribe-Research/UMSR-v1",
6
  "distill_enabled": true,
7
  "enforce_inhouse_models": true,
@@ -23,8 +23,18 @@
23
  ],
24
  "min_quality": 0.72,
25
  "model_dtype": "bfloat16",
26
- "output_dir": "/app/runs/20260224_080047",
27
  "resume_from_checkpoint": "",
 
 
 
 
 
 
 
 
 
 
28
  "save_total_limit": 4,
29
  "student_model": "NorthernTribe-Research/UMSR-Reasoner-7B",
30
  "target_repo_id": "NorthernTribe-Research/UMSR-Reasoner-7B",
 
1
  {
2
  "ce_weight_end": 0.5,
3
  "ce_weight_start": 0.35,
4
+ "created_at": "2026-02-24T08:19:15.059846+00:00",
5
  "dataset_id": "NorthernTribe-Research/UMSR-v1",
6
  "distill_enabled": true,
7
  "enforce_inhouse_models": true,
 
23
  ],
24
  "min_quality": 0.72,
25
  "model_dtype": "bfloat16",
26
+ "output_dir": "/app/runs/20260224_081901",
27
  "resume_from_checkpoint": "",
28
+ "runtime_hardware": {
29
+ "cuda_available": false,
30
+ "cuda_compute_capability_0": "",
31
+ "cuda_device_0": "",
32
+ "cuda_device_count": 0,
33
+ "cuda_total_memory_gb_0": null,
34
+ "mps_available": false,
35
+ "torch_available": true,
36
+ "torch_version": "2.10.0+cu128"
37
+ },
38
  "save_total_limit": 4,
39
  "student_model": "NorthernTribe-Research/UMSR-Reasoner-7B",
40
  "target_repo_id": "NorthernTribe-Research/UMSR-Reasoner-7B",
live_events.jsonl CHANGED
The diff for this file is too large to render. See raw diff
 
live_progress.json CHANGED
@@ -16,6 +16,16 @@
16
  "learning_rate": 4.032258064516129e-07,
17
  "loss": 5.431174278259277
18
  },
 
 
 
 
 
 
 
 
 
 
19
  "status": "completed",
20
- "updated_at": "2026-02-24T08:07:49.286465+00:00"
21
  }
 
16
  "learning_rate": 4.032258064516129e-07,
17
  "loss": 5.431174278259277
18
  },
19
+ "runtime_hardware": {
20
+ "cuda_available": false,
21
+ "cuda_compute_capability_0": "",
22
+ "cuda_device_0": "",
23
+ "cuda_device_count": 0,
24
+ "cuda_total_memory_gb_0": null,
25
+ "mps_available": false,
26
+ "torch_available": true,
27
+ "torch_version": "2.10.0+cu128"
28
+ },
29
  "status": "completed",
30
+ "updated_at": "2026-02-24T08:23:53.900701+00:00"
31
  }
metrics/eval_metrics.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "eval_loss": 5.438441753387451,
3
- "eval_runtime": 16.5202,
4
  "eval_samples": 64,
5
- "eval_samples_per_second": 3.874,
6
- "eval_steps_per_second": 3.874
7
  }
 
1
  {
2
  "eval_loss": 5.438441753387451,
3
+ "eval_runtime": 17.0533,
4
  "eval_samples": 64,
5
+ "eval_samples_per_second": 3.753,
6
+ "eval_steps_per_second": 3.753
7
  }
metrics/train_metrics.json CHANGED
@@ -9,8 +9,8 @@
9
  "temperature_start": 2.5,
10
  "total_flos": 42322071132.0,
11
  "train_loss": 4.595640664920211,
12
- "train_runtime": 383.1142,
13
  "train_samples": 256,
14
- "train_samples_per_second": 0.668,
15
- "train_steps_per_second": 0.668
16
  }
 
9
  "temperature_start": 2.5,
10
  "total_flos": 42322071132.0,
11
  "train_loss": 4.595640664920211,
12
+ "train_runtime": 261.398,
13
  "train_samples": 256,
14
+ "train_samples_per_second": 0.979,
15
+ "train_steps_per_second": 0.979
16
  }
run_summary.json CHANGED
@@ -10,13 +10,13 @@
10
  "distill_enabled": true,
11
  "enforce_inhouse_models": true,
12
  "eval_rows": 64,
13
- "finished_at": "2026-02-24T08:07:49.286128+00:00",
14
  "fp16": false,
15
  "gradient_checkpointing": true,
16
  "kd_weight_end": 0.5,
17
  "kd_weight_start": 0.65,
18
- "live_events_path": "/app/runs/20260224_080047/live_events.jsonl",
19
- "live_progress_path": "/app/runs/20260224_080047/live_progress.json",
20
  "lora_alpha": 64,
21
  "lora_dropout": 0.05,
22
  "lora_enabled": true,
@@ -32,9 +32,19 @@
32
  ],
33
  "model_dtype": "bfloat16",
34
  "mps_available": false,
35
- "output_dir": "/app/runs/20260224_080047",
36
  "requested_warmup_steps": 0,
37
  "resume_from_checkpoint": "",
 
 
 
 
 
 
 
 
 
 
38
  "save_total_limit": 4,
39
  "target_repo_id": "NorthernTribe-Research/UMSR-Reasoner-7B",
40
  "teacher_count": 1,
 
10
  "distill_enabled": true,
11
  "enforce_inhouse_models": true,
12
  "eval_rows": 64,
13
+ "finished_at": "2026-02-24T08:23:53.900247+00:00",
14
  "fp16": false,
15
  "gradient_checkpointing": true,
16
  "kd_weight_end": 0.5,
17
  "kd_weight_start": 0.65,
18
+ "live_events_path": "/app/runs/20260224_081901/live_events.jsonl",
19
+ "live_progress_path": "/app/runs/20260224_081901/live_progress.json",
20
  "lora_alpha": 64,
21
  "lora_dropout": 0.05,
22
  "lora_enabled": true,
 
32
  ],
33
  "model_dtype": "bfloat16",
34
  "mps_available": false,
35
+ "output_dir": "/app/runs/20260224_081901",
36
  "requested_warmup_steps": 0,
37
  "resume_from_checkpoint": "",
38
+ "runtime_hardware": {
39
+ "cuda_available": false,
40
+ "cuda_compute_capability_0": "",
41
+ "cuda_device_0": "",
42
+ "cuda_device_count": 0,
43
+ "cuda_total_memory_gb_0": null,
44
+ "mps_available": false,
45
+ "torch_available": true,
46
+ "torch_version": "2.10.0+cu128"
47
+ },
48
  "save_total_limit": 4,
49
  "target_repo_id": "NorthernTribe-Research/UMSR-Reasoner-7B",
50
  "teacher_count": 1,
trainer_state.json CHANGED
@@ -317,9 +317,9 @@
317
  "distill_temperature": 2.373046875,
318
  "epoch": 0.09765625,
319
  "eval_loss": 3.927885055541992,
320
- "eval_runtime": 26.4719,
321
- "eval_samples_per_second": 2.418,
322
- "eval_steps_per_second": 2.418,
323
  "step": 25
324
  },
325
  {
@@ -630,9 +630,9 @@
630
  "distill_temperature": 2.24609375,
631
  "epoch": 0.1953125,
632
  "eval_loss": 4.088868141174316,
633
- "eval_runtime": 24.5155,
634
- "eval_samples_per_second": 2.611,
635
- "eval_steps_per_second": 2.611,
636
  "step": 50
637
  },
638
  {
@@ -943,9 +943,9 @@
943
  "distill_temperature": 2.119140625,
944
  "epoch": 0.29296875,
945
  "eval_loss": 4.2497992515563965,
946
- "eval_runtime": 24.4874,
947
- "eval_samples_per_second": 2.614,
948
- "eval_steps_per_second": 2.614,
949
  "step": 75
950
  },
951
  {
@@ -1256,9 +1256,9 @@
1256
  "distill_temperature": 1.9921875,
1257
  "epoch": 0.390625,
1258
  "eval_loss": 4.411334991455078,
1259
- "eval_runtime": 22.5608,
1260
- "eval_samples_per_second": 2.837,
1261
- "eval_steps_per_second": 2.837,
1262
  "step": 100
1263
  },
1264
  {
@@ -1569,9 +1569,9 @@
1569
  "distill_temperature": 1.865234375,
1570
  "epoch": 0.48828125,
1571
  "eval_loss": 4.573975086212158,
1572
- "eval_runtime": 21.1387,
1573
- "eval_samples_per_second": 3.028,
1574
- "eval_steps_per_second": 3.028,
1575
  "step": 125
1576
  },
1577
  {
@@ -1882,9 +1882,9 @@
1882
  "distill_temperature": 1.73828125,
1883
  "epoch": 0.5859375,
1884
  "eval_loss": 4.739337921142578,
1885
- "eval_runtime": 23.1527,
1886
- "eval_samples_per_second": 2.764,
1887
- "eval_steps_per_second": 2.764,
1888
  "step": 150
1889
  },
1890
  {
@@ -2195,9 +2195,9 @@
2195
  "distill_temperature": 1.611328125,
2196
  "epoch": 0.68359375,
2197
  "eval_loss": 4.90593957901001,
2198
- "eval_runtime": 21.2637,
2199
- "eval_samples_per_second": 3.01,
2200
- "eval_steps_per_second": 3.01,
2201
  "step": 175
2202
  },
2203
  {
@@ -2508,9 +2508,9 @@
2508
  "distill_temperature": 1.484375,
2509
  "epoch": 0.78125,
2510
  "eval_loss": 5.072885513305664,
2511
- "eval_runtime": 17.5798,
2512
- "eval_samples_per_second": 3.641,
2513
- "eval_steps_per_second": 3.641,
2514
  "step": 200
2515
  },
2516
  {
@@ -2821,9 +2821,9 @@
2821
  "distill_temperature": 1.357421875,
2822
  "epoch": 0.87890625,
2823
  "eval_loss": 5.237745761871338,
2824
- "eval_runtime": 16.7918,
2825
- "eval_samples_per_second": 3.811,
2826
- "eval_steps_per_second": 3.811,
2827
  "step": 225
2828
  },
2829
  {
@@ -3134,9 +3134,9 @@
3134
  "distill_temperature": 1.23046875,
3135
  "epoch": 0.9765625,
3136
  "eval_loss": 5.399942398071289,
3137
- "eval_runtime": 16.9158,
3138
- "eval_samples_per_second": 3.783,
3139
- "eval_steps_per_second": 3.783,
3140
  "step": 250
3141
  },
3142
  {
@@ -3221,9 +3221,9 @@
3221
  "step": 256,
3222
  "total_flos": 42322071132.0,
3223
  "train_loss": 4.595640664920211,
3224
- "train_runtime": 383.1142,
3225
- "train_samples_per_second": 0.668,
3226
- "train_steps_per_second": 0.668
3227
  }
3228
  ],
3229
  "logging_steps": 1,
 
317
  "distill_temperature": 2.373046875,
318
  "epoch": 0.09765625,
319
  "eval_loss": 3.927885055541992,
320
+ "eval_runtime": 13.3298,
321
+ "eval_samples_per_second": 4.801,
322
+ "eval_steps_per_second": 4.801,
323
  "step": 25
324
  },
325
  {
 
630
  "distill_temperature": 2.24609375,
631
  "epoch": 0.1953125,
632
  "eval_loss": 4.088868141174316,
633
+ "eval_runtime": 13.1424,
634
+ "eval_samples_per_second": 4.87,
635
+ "eval_steps_per_second": 4.87,
636
  "step": 50
637
  },
638
  {
 
943
  "distill_temperature": 2.119140625,
944
  "epoch": 0.29296875,
945
  "eval_loss": 4.2497992515563965,
946
+ "eval_runtime": 14.3197,
947
+ "eval_samples_per_second": 4.469,
948
+ "eval_steps_per_second": 4.469,
949
  "step": 75
950
  },
951
  {
 
1256
  "distill_temperature": 1.9921875,
1257
  "epoch": 0.390625,
1258
  "eval_loss": 4.411334991455078,
1259
+ "eval_runtime": 14.5811,
1260
+ "eval_samples_per_second": 4.389,
1261
+ "eval_steps_per_second": 4.389,
1262
  "step": 100
1263
  },
1264
  {
 
1569
  "distill_temperature": 1.865234375,
1570
  "epoch": 0.48828125,
1571
  "eval_loss": 4.573975086212158,
1572
+ "eval_runtime": 14.6477,
1573
+ "eval_samples_per_second": 4.369,
1574
+ "eval_steps_per_second": 4.369,
1575
  "step": 125
1576
  },
1577
  {
 
1882
  "distill_temperature": 1.73828125,
1883
  "epoch": 0.5859375,
1884
  "eval_loss": 4.739337921142578,
1885
+ "eval_runtime": 15.7116,
1886
+ "eval_samples_per_second": 4.073,
1887
+ "eval_steps_per_second": 4.073,
1888
  "step": 150
1889
  },
1890
  {
 
2195
  "distill_temperature": 1.611328125,
2196
  "epoch": 0.68359375,
2197
  "eval_loss": 4.90593957901001,
2198
+ "eval_runtime": 14.8353,
2199
+ "eval_samples_per_second": 4.314,
2200
+ "eval_steps_per_second": 4.314,
2201
  "step": 175
2202
  },
2203
  {
 
2508
  "distill_temperature": 1.484375,
2509
  "epoch": 0.78125,
2510
  "eval_loss": 5.072885513305664,
2511
+ "eval_runtime": 15.3273,
2512
+ "eval_samples_per_second": 4.176,
2513
+ "eval_steps_per_second": 4.176,
2514
  "step": 200
2515
  },
2516
  {
 
2821
  "distill_temperature": 1.357421875,
2822
  "epoch": 0.87890625,
2823
  "eval_loss": 5.237745761871338,
2824
+ "eval_runtime": 15.8537,
2825
+ "eval_samples_per_second": 4.037,
2826
+ "eval_steps_per_second": 4.037,
2827
  "step": 225
2828
  },
2829
  {
 
3134
  "distill_temperature": 1.23046875,
3135
  "epoch": 0.9765625,
3136
  "eval_loss": 5.399942398071289,
3137
+ "eval_runtime": 16.0564,
3138
+ "eval_samples_per_second": 3.986,
3139
+ "eval_steps_per_second": 3.986,
3140
  "step": 250
3141
  },
3142
  {
 
3221
  "step": 256,
3222
  "total_flos": 42322071132.0,
3223
  "train_loss": 4.595640664920211,
3224
+ "train_runtime": 261.398,
3225
+ "train_samples_per_second": 0.979,
3226
+ "train_steps_per_second": 0.979
3227
  }
3228
  ],
3229
  "logging_steps": 1,
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:be358fbb43e9d9e8f930a1c065bc3e315768742f862cda8dafe891203e7f0b93
3
  size 5201
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d2d23671895b8a0d20a8fc1fc999d056c5abf3a9e171b9f55654865cd05ff443
3
  size 5201