TrueV1sion123 committed on
Commit
4957259
·
verified ·
1 Parent(s): 909a7a7

Upload configs/rae_training_config.json with huggingface_hub

Browse files
Files changed (1) hide show
  1. configs/rae_training_config.json +71 -0
configs/rae_training_config.json ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": {
3
+ "base_model": "Qwen/Qwen2.5-7B-Instruct",
4
+ "quantization": "int4",
5
+ "torch_dtype": "bfloat16",
6
+ "attn_implementation": "flash_attention_2",
7
+ "trust_remote_code": true
8
+ },
9
+ "lora": {
10
+ "enabled": true,
11
+ "r": 32,
12
+ "alpha": 64,
13
+ "dropout": 0.05,
14
+ "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
15
+ "task_type": "CAUSAL_LM",
16
+ "bias": "none"
17
+ },
18
+ "data": {
19
+ "train_path": "data/rae_training_data/train.jsonl",
20
+ "eval_path": "data/rae_training_data/validation.jsonl",
21
+ "max_seq_length": 4096,
22
+ "chat_template": "auto",
23
+ "phase_tags": {
24
+ "saturation": ["<SATURATION>", "</SATURATION>"],
25
+ "abstraction": ["<ABSTRACTION>", "</ABSTRACTION>"],
26
+ "descent": ["<DESCENT>", "</DESCENT>"],
27
+ "integration": ["<INTEGRATION>", "</INTEGRATION>"]
28
+ }
29
+ },
30
+ "training": {
31
+ "epochs": 3,
32
+ "batch_size": 1,
33
+ "gradient_accumulation_steps": 8,
34
+ "effective_batch_size": 8,
35
+ "learning_rate": 5e-6,
36
+ "lr_scheduler": "cosine",
37
+ "warmup_ratio": 0.1,
38
+ "weight_decay": 0.01,
39
+ "max_grad_norm": 1.0,
40
+ "bf16": true,
41
+ "logging_steps": 10,
42
+ "eval_steps": 100,
43
+ "save_steps": 200,
44
+ "save_total_limit": 3
45
+ },
46
+ "rae_loss": {
47
+ "_comment": "Multi-objective loss replicating the handwriting effect",
48
+ "enabled": true,
49
+ "phase_weights": {
50
+ "saturation": 1.0,
51
+ "abstraction": 1.5,
52
+ "descent": 1.5,
53
+ "integration": 1.0
54
+ },
55
+ "coherence_weight": 0.3,
56
+ "compression_weight": 0.2,
57
+ "_coherence_note": "Penalizes abstraction that doesn't follow from saturation",
58
+ "_compression_note": "Rewards abstraction being shorter than saturation (information compression)"
59
+ },
60
+ "output": {
61
+ "dir": "outputs/rae-trained-model",
62
+ "push_to_hub": true,
63
+ "hub_model_id": "rae-cognitive-model",
64
+ "hub_token_env": "HF_TOKEN"
65
+ },
66
+ "wandb": {
67
+ "enabled": true,
68
+ "project": "rae-training",
69
+ "name": "rae-handwriting-v1"
70
+ }
71
+ }