Autonomous Space trainer update
Browse files- adapter_config.json +2 -2
- effective_run_config.json +12 -2
- live_events.jsonl +0 -0
- live_progress.json +11 -1
- metrics/eval_metrics.json +3 -3
- metrics/train_metrics.json +3 -3
- run_summary.json +14 -4
- trainer_state.json +33 -33
- training_args.bin +1 -1
adapter_config.json
CHANGED
|
@@ -29,9 +29,9 @@
|
|
| 29 |
"rank_pattern": {},
|
| 30 |
"revision": null,
|
| 31 |
"target_modules": [
|
|
|
|
| 32 |
"c_fc",
|
| 33 |
-
"c_attn"
|
| 34 |
-
"c_proj"
|
| 35 |
],
|
| 36 |
"target_parameters": null,
|
| 37 |
"task_type": "CAUSAL_LM",
|
|
|
|
| 29 |
"rank_pattern": {},
|
| 30 |
"revision": null,
|
| 31 |
"target_modules": [
|
| 32 |
+
"c_proj",
|
| 33 |
"c_fc",
|
| 34 |
+
"c_attn"
|
|
|
|
| 35 |
],
|
| 36 |
"target_parameters": null,
|
| 37 |
"task_type": "CAUSAL_LM",
|
effective_run_config.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
{
|
| 2 |
"ce_weight_end": 0.5,
|
| 3 |
"ce_weight_start": 0.35,
|
| 4 |
-
"created_at": "2026-02-24T08:
|
| 5 |
"dataset_id": "NorthernTribe-Research/UMSR-v1",
|
| 6 |
"distill_enabled": true,
|
| 7 |
"enforce_inhouse_models": true,
|
|
@@ -23,8 +23,18 @@
|
|
| 23 |
],
|
| 24 |
"min_quality": 0.72,
|
| 25 |
"model_dtype": "bfloat16",
|
| 26 |
-
"output_dir": "/app/runs/
|
| 27 |
"resume_from_checkpoint": "",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
"save_total_limit": 4,
|
| 29 |
"student_model": "NorthernTribe-Research/UMSR-Reasoner-7B",
|
| 30 |
"target_repo_id": "NorthernTribe-Research/UMSR-Reasoner-7B",
|
|
|
|
| 1 |
{
|
| 2 |
"ce_weight_end": 0.5,
|
| 3 |
"ce_weight_start": 0.35,
|
| 4 |
+
"created_at": "2026-02-24T08:19:15.059846+00:00",
|
| 5 |
"dataset_id": "NorthernTribe-Research/UMSR-v1",
|
| 6 |
"distill_enabled": true,
|
| 7 |
"enforce_inhouse_models": true,
|
|
|
|
| 23 |
],
|
| 24 |
"min_quality": 0.72,
|
| 25 |
"model_dtype": "bfloat16",
|
| 26 |
+
"output_dir": "/app/runs/20260224_081901",
|
| 27 |
"resume_from_checkpoint": "",
|
| 28 |
+
"runtime_hardware": {
|
| 29 |
+
"cuda_available": false,
|
| 30 |
+
"cuda_compute_capability_0": "",
|
| 31 |
+
"cuda_device_0": "",
|
| 32 |
+
"cuda_device_count": 0,
|
| 33 |
+
"cuda_total_memory_gb_0": null,
|
| 34 |
+
"mps_available": false,
|
| 35 |
+
"torch_available": true,
|
| 36 |
+
"torch_version": "2.10.0+cu128"
|
| 37 |
+
},
|
| 38 |
"save_total_limit": 4,
|
| 39 |
"student_model": "NorthernTribe-Research/UMSR-Reasoner-7B",
|
| 40 |
"target_repo_id": "NorthernTribe-Research/UMSR-Reasoner-7B",
|
live_events.jsonl
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
live_progress.json
CHANGED
|
@@ -16,6 +16,16 @@
|
|
| 16 |
"learning_rate": 4.032258064516129e-07,
|
| 17 |
"loss": 5.431174278259277
|
| 18 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
"status": "completed",
|
| 20 |
-
"updated_at": "2026-02-24T08:
|
| 21 |
}
|
|
|
|
| 16 |
"learning_rate": 4.032258064516129e-07,
|
| 17 |
"loss": 5.431174278259277
|
| 18 |
},
|
| 19 |
+
"runtime_hardware": {
|
| 20 |
+
"cuda_available": false,
|
| 21 |
+
"cuda_compute_capability_0": "",
|
| 22 |
+
"cuda_device_0": "",
|
| 23 |
+
"cuda_device_count": 0,
|
| 24 |
+
"cuda_total_memory_gb_0": null,
|
| 25 |
+
"mps_available": false,
|
| 26 |
+
"torch_available": true,
|
| 27 |
+
"torch_version": "2.10.0+cu128"
|
| 28 |
+
},
|
| 29 |
"status": "completed",
|
| 30 |
+
"updated_at": "2026-02-24T08:23:53.900701+00:00"
|
| 31 |
}
|
metrics/eval_metrics.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
{
|
| 2 |
"eval_loss": 5.438441753387451,
|
| 3 |
-
"eval_runtime":
|
| 4 |
"eval_samples": 64,
|
| 5 |
-
"eval_samples_per_second": 3.
|
| 6 |
-
"eval_steps_per_second": 3.
|
| 7 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"eval_loss": 5.438441753387451,
|
| 3 |
+
"eval_runtime": 17.0533,
|
| 4 |
"eval_samples": 64,
|
| 5 |
+
"eval_samples_per_second": 3.753,
|
| 6 |
+
"eval_steps_per_second": 3.753
|
| 7 |
}
|
metrics/train_metrics.json
CHANGED
|
@@ -9,8 +9,8 @@
|
|
| 9 |
"temperature_start": 2.5,
|
| 10 |
"total_flos": 42322071132.0,
|
| 11 |
"train_loss": 4.595640664920211,
|
| 12 |
-
"train_runtime":
|
| 13 |
"train_samples": 256,
|
| 14 |
-
"train_samples_per_second": 0.
|
| 15 |
-
"train_steps_per_second": 0.
|
| 16 |
}
|
|
|
|
| 9 |
"temperature_start": 2.5,
|
| 10 |
"total_flos": 42322071132.0,
|
| 11 |
"train_loss": 4.595640664920211,
|
| 12 |
+
"train_runtime": 261.398,
|
| 13 |
"train_samples": 256,
|
| 14 |
+
"train_samples_per_second": 0.979,
|
| 15 |
+
"train_steps_per_second": 0.979
|
| 16 |
}
|
run_summary.json
CHANGED
|
@@ -10,13 +10,13 @@
|
|
| 10 |
"distill_enabled": true,
|
| 11 |
"enforce_inhouse_models": true,
|
| 12 |
"eval_rows": 64,
|
| 13 |
-
"finished_at": "2026-02-24T08:
|
| 14 |
"fp16": false,
|
| 15 |
"gradient_checkpointing": true,
|
| 16 |
"kd_weight_end": 0.5,
|
| 17 |
"kd_weight_start": 0.65,
|
| 18 |
-
"live_events_path": "/app/runs/
|
| 19 |
-
"live_progress_path": "/app/runs/
|
| 20 |
"lora_alpha": 64,
|
| 21 |
"lora_dropout": 0.05,
|
| 22 |
"lora_enabled": true,
|
|
@@ -32,9 +32,19 @@
|
|
| 32 |
],
|
| 33 |
"model_dtype": "bfloat16",
|
| 34 |
"mps_available": false,
|
| 35 |
-
"output_dir": "/app/runs/
|
| 36 |
"requested_warmup_steps": 0,
|
| 37 |
"resume_from_checkpoint": "",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
"save_total_limit": 4,
|
| 39 |
"target_repo_id": "NorthernTribe-Research/UMSR-Reasoner-7B",
|
| 40 |
"teacher_count": 1,
|
|
|
|
| 10 |
"distill_enabled": true,
|
| 11 |
"enforce_inhouse_models": true,
|
| 12 |
"eval_rows": 64,
|
| 13 |
+
"finished_at": "2026-02-24T08:23:53.900247+00:00",
|
| 14 |
"fp16": false,
|
| 15 |
"gradient_checkpointing": true,
|
| 16 |
"kd_weight_end": 0.5,
|
| 17 |
"kd_weight_start": 0.65,
|
| 18 |
+
"live_events_path": "/app/runs/20260224_081901/live_events.jsonl",
|
| 19 |
+
"live_progress_path": "/app/runs/20260224_081901/live_progress.json",
|
| 20 |
"lora_alpha": 64,
|
| 21 |
"lora_dropout": 0.05,
|
| 22 |
"lora_enabled": true,
|
|
|
|
| 32 |
],
|
| 33 |
"model_dtype": "bfloat16",
|
| 34 |
"mps_available": false,
|
| 35 |
+
"output_dir": "/app/runs/20260224_081901",
|
| 36 |
"requested_warmup_steps": 0,
|
| 37 |
"resume_from_checkpoint": "",
|
| 38 |
+
"runtime_hardware": {
|
| 39 |
+
"cuda_available": false,
|
| 40 |
+
"cuda_compute_capability_0": "",
|
| 41 |
+
"cuda_device_0": "",
|
| 42 |
+
"cuda_device_count": 0,
|
| 43 |
+
"cuda_total_memory_gb_0": null,
|
| 44 |
+
"mps_available": false,
|
| 45 |
+
"torch_available": true,
|
| 46 |
+
"torch_version": "2.10.0+cu128"
|
| 47 |
+
},
|
| 48 |
"save_total_limit": 4,
|
| 49 |
"target_repo_id": "NorthernTribe-Research/UMSR-Reasoner-7B",
|
| 50 |
"teacher_count": 1,
|
trainer_state.json
CHANGED
|
@@ -317,9 +317,9 @@
|
|
| 317 |
"distill_temperature": 2.373046875,
|
| 318 |
"epoch": 0.09765625,
|
| 319 |
"eval_loss": 3.927885055541992,
|
| 320 |
-
"eval_runtime":
|
| 321 |
-
"eval_samples_per_second":
|
| 322 |
-
"eval_steps_per_second":
|
| 323 |
"step": 25
|
| 324 |
},
|
| 325 |
{
|
|
@@ -630,9 +630,9 @@
|
|
| 630 |
"distill_temperature": 2.24609375,
|
| 631 |
"epoch": 0.1953125,
|
| 632 |
"eval_loss": 4.088868141174316,
|
| 633 |
-
"eval_runtime":
|
| 634 |
-
"eval_samples_per_second":
|
| 635 |
-
"eval_steps_per_second":
|
| 636 |
"step": 50
|
| 637 |
},
|
| 638 |
{
|
|
@@ -943,9 +943,9 @@
|
|
| 943 |
"distill_temperature": 2.119140625,
|
| 944 |
"epoch": 0.29296875,
|
| 945 |
"eval_loss": 4.2497992515563965,
|
| 946 |
-
"eval_runtime":
|
| 947 |
-
"eval_samples_per_second":
|
| 948 |
-
"eval_steps_per_second":
|
| 949 |
"step": 75
|
| 950 |
},
|
| 951 |
{
|
|
@@ -1256,9 +1256,9 @@
|
|
| 1256 |
"distill_temperature": 1.9921875,
|
| 1257 |
"epoch": 0.390625,
|
| 1258 |
"eval_loss": 4.411334991455078,
|
| 1259 |
-
"eval_runtime":
|
| 1260 |
-
"eval_samples_per_second":
|
| 1261 |
-
"eval_steps_per_second":
|
| 1262 |
"step": 100
|
| 1263 |
},
|
| 1264 |
{
|
|
@@ -1569,9 +1569,9 @@
|
|
| 1569 |
"distill_temperature": 1.865234375,
|
| 1570 |
"epoch": 0.48828125,
|
| 1571 |
"eval_loss": 4.573975086212158,
|
| 1572 |
-
"eval_runtime":
|
| 1573 |
-
"eval_samples_per_second":
|
| 1574 |
-
"eval_steps_per_second":
|
| 1575 |
"step": 125
|
| 1576 |
},
|
| 1577 |
{
|
|
@@ -1882,9 +1882,9 @@
|
|
| 1882 |
"distill_temperature": 1.73828125,
|
| 1883 |
"epoch": 0.5859375,
|
| 1884 |
"eval_loss": 4.739337921142578,
|
| 1885 |
-
"eval_runtime":
|
| 1886 |
-
"eval_samples_per_second":
|
| 1887 |
-
"eval_steps_per_second":
|
| 1888 |
"step": 150
|
| 1889 |
},
|
| 1890 |
{
|
|
@@ -2195,9 +2195,9 @@
|
|
| 2195 |
"distill_temperature": 1.611328125,
|
| 2196 |
"epoch": 0.68359375,
|
| 2197 |
"eval_loss": 4.90593957901001,
|
| 2198 |
-
"eval_runtime":
|
| 2199 |
-
"eval_samples_per_second":
|
| 2200 |
-
"eval_steps_per_second":
|
| 2201 |
"step": 175
|
| 2202 |
},
|
| 2203 |
{
|
|
@@ -2508,9 +2508,9 @@
|
|
| 2508 |
"distill_temperature": 1.484375,
|
| 2509 |
"epoch": 0.78125,
|
| 2510 |
"eval_loss": 5.072885513305664,
|
| 2511 |
-
"eval_runtime":
|
| 2512 |
-
"eval_samples_per_second":
|
| 2513 |
-
"eval_steps_per_second":
|
| 2514 |
"step": 200
|
| 2515 |
},
|
| 2516 |
{
|
|
@@ -2821,9 +2821,9 @@
|
|
| 2821 |
"distill_temperature": 1.357421875,
|
| 2822 |
"epoch": 0.87890625,
|
| 2823 |
"eval_loss": 5.237745761871338,
|
| 2824 |
-
"eval_runtime":
|
| 2825 |
-
"eval_samples_per_second":
|
| 2826 |
-
"eval_steps_per_second":
|
| 2827 |
"step": 225
|
| 2828 |
},
|
| 2829 |
{
|
|
@@ -3134,9 +3134,9 @@
|
|
| 3134 |
"distill_temperature": 1.23046875,
|
| 3135 |
"epoch": 0.9765625,
|
| 3136 |
"eval_loss": 5.399942398071289,
|
| 3137 |
-
"eval_runtime": 16.
|
| 3138 |
-
"eval_samples_per_second": 3.
|
| 3139 |
-
"eval_steps_per_second": 3.
|
| 3140 |
"step": 250
|
| 3141 |
},
|
| 3142 |
{
|
|
@@ -3221,9 +3221,9 @@
|
|
| 3221 |
"step": 256,
|
| 3222 |
"total_flos": 42322071132.0,
|
| 3223 |
"train_loss": 4.595640664920211,
|
| 3224 |
-
"train_runtime":
|
| 3225 |
-
"train_samples_per_second": 0.
|
| 3226 |
-
"train_steps_per_second": 0.
|
| 3227 |
}
|
| 3228 |
],
|
| 3229 |
"logging_steps": 1,
|
|
|
|
| 317 |
"distill_temperature": 2.373046875,
|
| 318 |
"epoch": 0.09765625,
|
| 319 |
"eval_loss": 3.927885055541992,
|
| 320 |
+
"eval_runtime": 13.3298,
|
| 321 |
+
"eval_samples_per_second": 4.801,
|
| 322 |
+
"eval_steps_per_second": 4.801,
|
| 323 |
"step": 25
|
| 324 |
},
|
| 325 |
{
|
|
|
|
| 630 |
"distill_temperature": 2.24609375,
|
| 631 |
"epoch": 0.1953125,
|
| 632 |
"eval_loss": 4.088868141174316,
|
| 633 |
+
"eval_runtime": 13.1424,
|
| 634 |
+
"eval_samples_per_second": 4.87,
|
| 635 |
+
"eval_steps_per_second": 4.87,
|
| 636 |
"step": 50
|
| 637 |
},
|
| 638 |
{
|
|
|
|
| 943 |
"distill_temperature": 2.119140625,
|
| 944 |
"epoch": 0.29296875,
|
| 945 |
"eval_loss": 4.2497992515563965,
|
| 946 |
+
"eval_runtime": 14.3197,
|
| 947 |
+
"eval_samples_per_second": 4.469,
|
| 948 |
+
"eval_steps_per_second": 4.469,
|
| 949 |
"step": 75
|
| 950 |
},
|
| 951 |
{
|
|
|
|
| 1256 |
"distill_temperature": 1.9921875,
|
| 1257 |
"epoch": 0.390625,
|
| 1258 |
"eval_loss": 4.411334991455078,
|
| 1259 |
+
"eval_runtime": 14.5811,
|
| 1260 |
+
"eval_samples_per_second": 4.389,
|
| 1261 |
+
"eval_steps_per_second": 4.389,
|
| 1262 |
"step": 100
|
| 1263 |
},
|
| 1264 |
{
|
|
|
|
| 1569 |
"distill_temperature": 1.865234375,
|
| 1570 |
"epoch": 0.48828125,
|
| 1571 |
"eval_loss": 4.573975086212158,
|
| 1572 |
+
"eval_runtime": 14.6477,
|
| 1573 |
+
"eval_samples_per_second": 4.369,
|
| 1574 |
+
"eval_steps_per_second": 4.369,
|
| 1575 |
"step": 125
|
| 1576 |
},
|
| 1577 |
{
|
|
|
|
| 1882 |
"distill_temperature": 1.73828125,
|
| 1883 |
"epoch": 0.5859375,
|
| 1884 |
"eval_loss": 4.739337921142578,
|
| 1885 |
+
"eval_runtime": 15.7116,
|
| 1886 |
+
"eval_samples_per_second": 4.073,
|
| 1887 |
+
"eval_steps_per_second": 4.073,
|
| 1888 |
"step": 150
|
| 1889 |
},
|
| 1890 |
{
|
|
|
|
| 2195 |
"distill_temperature": 1.611328125,
|
| 2196 |
"epoch": 0.68359375,
|
| 2197 |
"eval_loss": 4.90593957901001,
|
| 2198 |
+
"eval_runtime": 14.8353,
|
| 2199 |
+
"eval_samples_per_second": 4.314,
|
| 2200 |
+
"eval_steps_per_second": 4.314,
|
| 2201 |
"step": 175
|
| 2202 |
},
|
| 2203 |
{
|
|
|
|
| 2508 |
"distill_temperature": 1.484375,
|
| 2509 |
"epoch": 0.78125,
|
| 2510 |
"eval_loss": 5.072885513305664,
|
| 2511 |
+
"eval_runtime": 15.3273,
|
| 2512 |
+
"eval_samples_per_second": 4.176,
|
| 2513 |
+
"eval_steps_per_second": 4.176,
|
| 2514 |
"step": 200
|
| 2515 |
},
|
| 2516 |
{
|
|
|
|
| 2821 |
"distill_temperature": 1.357421875,
|
| 2822 |
"epoch": 0.87890625,
|
| 2823 |
"eval_loss": 5.237745761871338,
|
| 2824 |
+
"eval_runtime": 15.8537,
|
| 2825 |
+
"eval_samples_per_second": 4.037,
|
| 2826 |
+
"eval_steps_per_second": 4.037,
|
| 2827 |
"step": 225
|
| 2828 |
},
|
| 2829 |
{
|
|
|
|
| 3134 |
"distill_temperature": 1.23046875,
|
| 3135 |
"epoch": 0.9765625,
|
| 3136 |
"eval_loss": 5.399942398071289,
|
| 3137 |
+
"eval_runtime": 16.0564,
|
| 3138 |
+
"eval_samples_per_second": 3.986,
|
| 3139 |
+
"eval_steps_per_second": 3.986,
|
| 3140 |
"step": 250
|
| 3141 |
},
|
| 3142 |
{
|
|
|
|
| 3221 |
"step": 256,
|
| 3222 |
"total_flos": 42322071132.0,
|
| 3223 |
"train_loss": 4.595640664920211,
|
| 3224 |
+
"train_runtime": 261.398,
|
| 3225 |
+
"train_samples_per_second": 0.979,
|
| 3226 |
+
"train_steps_per_second": 0.979
|
| 3227 |
}
|
| 3228 |
],
|
| 3229 |
"logging_steps": 1,
|
training_args.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 5201
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d2d23671895b8a0d20a8fc1fc999d056c5abf3a9e171b9f55654865cd05ff443
|
| 3 |
size 5201
|