TrueV1sion123
/

rae-training

Model card Files Files and versions

xet

Community

TrueV1sion123 committed 28 days ago

Commit

4957259

verified ·

1 Parent(s): 909a7a7

Upload configs/rae_training_config.json with huggingface_hub

Browse files

Files changed (1) hide show

configs/rae_training_config.json +71 -0

configs/rae_training_config.json ADDED Viewed

	@@ -0,0 +1,71 @@

+{
+  "model": {
+    "base_model": "Qwen/Qwen2.5-7B-Instruct",
+    "quantization": "int4",
+    "torch_dtype": "bfloat16",
+    "attn_implementation": "flash_attention_2",
+    "trust_remote_code": true
+  },
+  "lora": {
+    "enabled": true,
+    "r": 32,
+    "alpha": 64,
+    "dropout": 0.05,
+    "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
+    "task_type": "CAUSAL_LM",
+    "bias": "none"
+  },
+  "data": {
+    "train_path": "data/rae_training_data/train.jsonl",
+    "eval_path": "data/rae_training_data/validation.jsonl",
+    "max_seq_length": 4096,
+    "chat_template": "auto",
+    "phase_tags": {
+      "saturation": ["<SATURATION>", "</SATURATION>"],
+      "abstraction": ["<ABSTRACTION>", "</ABSTRACTION>"],
+      "descent": ["<DESCENT>", "</DESCENT>"],
+      "integration": ["<INTEGRATION>", "</INTEGRATION>"]
+    }
+  },
+  "training": {
+    "epochs": 3,
+    "batch_size": 1,
+    "gradient_accumulation_steps": 8,
+    "effective_batch_size": 8,
+    "learning_rate": 5e-6,
+    "lr_scheduler": "cosine",
+    "warmup_ratio": 0.1,
+    "weight_decay": 0.01,
+    "max_grad_norm": 1.0,
+    "bf16": true,
+    "logging_steps": 10,
+    "eval_steps": 100,
+    "save_steps": 200,
+    "save_total_limit": 3
+  },
+  "rae_loss": {
+    "_comment": "Multi-objective loss replicating the handwriting effect",
+    "enabled": true,
+    "phase_weights": {
+      "saturation": 1.0,
+      "abstraction": 1.5,
+      "descent": 1.5,
+      "integration": 1.0
+    },
+    "coherence_weight": 0.3,
+    "compression_weight": 0.2,
+    "_coherence_note": "Penalizes abstraction that doesn't follow from saturation",
+    "_compression_note": "Rewards abstraction being shorter than saturation (information compression)"
+  },
+  "output": {
+    "dir": "outputs/rae-trained-model",
+    "push_to_hub": true,
+    "hub_model_id": "rae-cognitive-model",
+    "hub_token_env": "HF_TOKEN"
+  },
+  "wandb": {
+    "enabled": true,
+    "project": "rae-training",
+    "name": "rae-handwriting-v1"
+  }
+}