MainStack
/

marvy-1-14B-lora

+{
+    "adapter_path": "train/adapters",
+    "batch_size": 1,
+    "clear_cache_threshold": 0,
+    "config": "train/lora_config.yaml",
+    "data": "train/data",
+    "fine_tune_type": "lora",
+    "grad_accumulation_steps": 16,
+    "grad_checkpoint": true,
+    "iters": 350,
+    "learning_rate": 0.0001,
+    "lora_parameters": {
+        "rank": 32,
+        "scale": 20.0,
+        "dropout": 0.0,
+        "keys": [
+            "self_attn.q_proj",
+            "self_attn.k_proj",
+            "self_attn.v_proj",
+            "self_attn.o_proj",
+            "mlp.gate_proj",
+            "mlp.up_proj",
+            "mlp.down_proj"
+        ]
+    },
+    "lr_schedule": {
+        "name": "cosine_decay",
+        "warmup": 20,
+        "arguments": [
+            0.0001,
+            350,
+            1e-06
+        ]
+    },
+    "mask_prompt": true,
+    "max_seq_length": 8192,
+    "model": "mlx-community/Qwen2.5-14B-Instruct-4bit",
+    "num_layers": 16,
+    "optimizer": "adamw",
+    "optimizer_config": {
+        "adam": {},
+        "adamw": {},
+        "muon": {},
+        "sgd": {},
+        "adafactor": {}
+    },
+    "project_name": null,
+    "report_to": null,
+    "resume_adapter_file": null,
+    "save_every": 50,
+    "seed": 42,
+    "steps_per_eval": 50,
+    "steps_per_report": 10,
+    "test": false,
+    "test_batches": 500,
+    "train": true,
+    "val_batches": 25
+}