Autonomous Space trainer update

Browse files

Files changed (11) hide show

README.md +3 -1
adapter_config.json +42 -0
adapter_model.safetensors +3 -0
effective_run_config.json +37 -0
live_events.jsonl +0 -0
live_progress.json +21 -0
metrics/eval_metrics.json +4 -5
metrics/train_metrics.json +13 -6
run_summary.json +38 -4
trainer_state.json +0 -0
training_args.bin +1 -1

README.md CHANGED Viewed

@@ -14,7 +14,9 @@ tags:
 Standalone reasoning model trained from UMSR-v1 using the autonomous trainer Space.
 - Dataset: https://huggingface.co/datasets/NorthernTribe-Research/UMSR-v1
-- Base model: `sshleifer/tiny-gpt2`
 - Model repo: `https://huggingface.co/NorthernTribe-Research/UMSR-Reasoner-7B`
 ## Output Contract

 Standalone reasoning model trained from UMSR-v1 using the autonomous trainer Space.
 - Dataset: https://huggingface.co/datasets/NorthernTribe-Research/UMSR-v1
+- Base model: `NorthernTribe-Research/UMSR-Reasoner-7B`
+- Training mode: `teacher-student distillation`
+- Teacher model(s): `NorthernTribe-Research/UMSR-Reasoner-7B`
 - Model repo: `https://huggingface.co/NorthernTribe-Research/UMSR-Reasoner-7B`
 ## Output Contract

adapter_config.json ADDED Viewed

	@@ -0,0 +1,42 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "NorthernTribe-Research/UMSR-Reasoner-7B",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": true,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 64,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.18.1",
+  "qalora_group_size": 16,
+  "r": 32,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "c_attn",
+    "c_proj",
+    "c_fc"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a8444e8496b9822bac86d49856a46867e951265c5d385636a2ebc02d5a8b175f
+size 10120

effective_run_config.json ADDED Viewed

	@@ -0,0 +1,37 @@

+{
+  "ce_weight_end": 0.5,
+  "ce_weight_start": 0.35,
+  "created_at": "2026-02-24T05:06:50.053819+00:00",
+  "dataset_id": "NorthernTribe-Research/UMSR-v1",
+  "distill_enabled": true,
+  "enforce_inhouse_models": true,
+  "eval_split": "validation",
+  "kd_weight_end": 0.5,
+  "kd_weight_start": 0.65,
+  "lora_alpha": 64,
+  "lora_dropout": 0.05,
+  "lora_enabled": true,
+  "lora_r": 32,
+  "lora_target_modules": [
+    "c_attn",
+    "c_proj",
+    "c_fc"
+  ],
+  "min_quality": 0.72,
+  "model_dtype": "bfloat16",
+  "output_dir": "/app/runs/20260224_050637",
+  "resume_from_checkpoint": "",
+  "student_model": "NorthernTribe-Research/UMSR-Reasoner-7B",
+  "target_repo_id": "NorthernTribe-Research/UMSR-Reasoner-7B",
+  "teacher_dtype": "bfloat16",
+  "teacher_models": [
+    "NorthernTribe-Research/UMSR-Reasoner-7B"
+  ],
+  "temperature_end": 1.2,
+  "temperature_start": 2.5,
+  "train_split": "train",
+  "use_4bit_student_effective": false,
+  "use_4bit_student_requested": true,
+  "use_4bit_teacher_effective": false,
+  "use_4bit_teacher_requested": true
+}

live_events.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

live_progress.json ADDED Viewed

	@@ -0,0 +1,21 @@

+{
+  "distill_enabled": true,
+  "epoch": 1.0,
+  "global_step": 256,
+  "max_steps": 256,
+  "message": "training finished",
+  "metrics": {
+    "distill_ce_loss": 10.826530456542969,
+    "distill_ce_weight": 0.5,
+    "distill_kd_loss": 0.0498046875,
+    "distill_kd_weight": 0.5,
+    "distill_temperature": 1.2,
+    "epoch": 1.0,
+    "eval_loss": 5.438445091247559,
+    "grad_norm": 0.003390513826161623,
+    "learning_rate": 4.032258064516129e-07,
+    "loss": 5.431177616119385
+  },
+  "status": "completed",
+  "updated_at": "2026-02-24T05:10:53.354667+00:00"
+}

metrics/eval_metrics.json CHANGED Viewed

@@ -1,8 +1,7 @@
 {
-  "epoch": 1.0,
-  "eval_loss": 10.716222763061523,
-  "eval_runtime": 2.186,
   "eval_samples": 64,
-  "eval_samples_per_second": 29.278,
-  "eval_steps_per_second": 29.278
 }

 {
+  "eval_loss": 5.438445091247559,
+  "eval_runtime": 12.7364,
   "eval_samples": 64,
+  "eval_samples_per_second": 5.025,
+  "eval_steps_per_second": 5.025
 }

metrics/train_metrics.json CHANGED Viewed

@@ -1,9 +1,16 @@
 {
-  "epoch": 1.0,
-  "total_flos": 39501942396.0,
-  "train_loss": 10.738998085260391,
-  "train_runtime": 50.9815,
   "train_samples": 256,
-  "train_samples_per_second": 5.021,
-  "train_steps_per_second": 5.021
 }

 {
+  "ce_weight_end": 0.5,
+  "ce_weight_start": 0.35,
+  "distill_enabled": true,
+  "kd_weight_end": 0.5,
+  "kd_weight_start": 0.65,
+  "teacher_count": 1,
+  "temperature_end": 1.2,
+  "temperature_start": 2.5,
+  "total_flos": 42322071132.0,
+  "train_loss": 4.595647823996842,
+  "train_runtime": 230.1787,
   "train_samples": 256,
+  "train_samples_per_second": 1.112,
+  "train_steps_per_second": 1.112
 }

run_summary.json CHANGED Viewed

@@ -1,17 +1,51 @@
 {
-  "base_model": "sshleifer/tiny-gpt2",
   "bf16": false,
   "cuda_available": false,
   "dataset_id": "NorthernTribe-Research/UMSR-v1",
   "device": "cpu",
   "eval_rows": 64,
-  "finished_at": "2026-02-23T22:13:49.188375+00:00",
   "fp16": false,
   "mps_available": false,
-  "output_dir": "/app/runs/20260223_221248",
   "target_repo_id": "NorthernTribe-Research/UMSR-Reasoner-7B",
   "tie_word_embeddings": false,
   "total_train_steps_estimate": 256,
   "train_rows": 256,
-  "warmup_steps": 0
 }

 {
+  "attn_implementation": "",
+  "base_model": "NorthernTribe-Research/UMSR-Reasoner-7B",
   "bf16": false,
+  "ce_weight_end": 0.5,
+  "ce_weight_start": 0.35,
   "cuda_available": false,
   "dataset_id": "NorthernTribe-Research/UMSR-v1",
   "device": "cpu",
+  "distill_enabled": true,
+  "enforce_inhouse_models": true,
   "eval_rows": 64,
+  "finished_at": "2026-02-24T05:10:53.354403+00:00",
   "fp16": false,
+  "gradient_checkpointing": true,
+  "kd_weight_end": 0.5,
+  "kd_weight_start": 0.65,
+  "live_events_path": "/app/runs/20260224_050637/live_events.jsonl",
+  "live_progress_path": "/app/runs/20260224_050637/live_progress.json",
+  "lora_alpha": 64,
+  "lora_dropout": 0.05,
+  "lora_enabled": true,
+  "lora_r": 32,
+  "lora_target_modules": [
+    "c_attn",
+    "c_proj",
+    "c_fc"
+  ],
+  "model_dtype": "bfloat16",
   "mps_available": false,
+  "output_dir": "/app/runs/20260224_050637",
+  "requested_warmup_steps": 0,
+  "resume_from_checkpoint": "",
   "target_repo_id": "NorthernTribe-Research/UMSR-Reasoner-7B",
+  "teacher_count": 1,
+  "teacher_dtype": "bfloat16",
+  "teacher_models": [
+    "NorthernTribe-Research/UMSR-Reasoner-7B"
+  ],
+  "temperature_end": 1.2,
+  "temperature_start": 2.5,
   "tie_word_embeddings": false,
   "total_train_steps_estimate": 256,
   "train_rows": 256,
+  "use_4bit_effective": false,
+  "use_4bit_requested": true,
+  "use_4bit_teacher_effective": false,
+  "use_4bit_teacher_requested": true,
+  "warmup_ratio": 0.03,
+  "warmup_steps": 8
 }

trainer_state.json CHANGED Viewed

The diff for this file is too large to render. See raw diff

training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d7115dfd0cd49a909973dbd1dcbd2ddb2fa6c969feef938104a458e52997dfb8
 size 5201

 version https://git-lfs.github.com/spec/v1
+oid sha256:6ff001674e1758cd59d09d54fcb8a1ab0ffb4cf7b95e257802f7c5f963b5ba1a
 size 5201