vxnuaj committed on
Commit
71857c9
·
verified ·
1 Parent(s): e8820a5

Upload ATHENA_V1_TINY_39.7M/checkpoints/RUN_002/RUN_002_DATETIME_2025-07-13_21-09-44_CONFIG.json with huggingface_hub

Browse files
ATHENA_V1_TINY_39.7M/checkpoints/RUN_002/RUN_002_DATETIME_2025-07-13_21-09-44_CONFIG.json ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "train_config": {
3
+ "vocab_size": 10000,
4
+ "context_length": 512,
5
+ "epochs": 30,
6
+ "checkpoint_steps": 1356,
7
+ "save_checkpoint_path": "main/checkpoints",
8
+ "save_hf": true,
9
+ "val_steps": 250,
10
+ "mixed_precision": false,
11
+ "max_grad_norm": 1.0,
12
+ "track_grad_norm": true,
13
+ "parallel_type": "fsdp",
14
+ "val_mixed_precision": false,
15
+ "val_mixed_precision_dtype": "f16",
16
+ "fsdp_wrap_policy": "transformer",
17
+ "wandb": true,
18
+ "log_level": "INFO",
19
+ "_compile": false,
20
+ "_compile_warmup_steps": null,
21
+ "hf_repo_config": {
22
+ "hf_repo_exists": true,
23
+ "hf_repo_id": "tiny-research/athena",
24
+ "hf_repo_type": "model",
25
+ "hf_root_path": "ATHENA_V1_TINY_39.7M/checkpoints"
26
+ },
27
+ "mixed_precision_dtype": "f16",
28
+ "log_root_path": "main/logs",
29
+ "load_checkpoint": true,
30
+ "load_checkpoint_path": "main/checkpoints/RUN_002/RUN_002_DATETIME_2025-07-12_21-36-20_EPOCH_4_STEP_1352_GLOBAL_STEPS_6780.pt",
31
+ "save_on_interrupt": true,
32
+ "extra_args": {}
33
+ },
34
+ "optimizer_config": {
35
+ "lr": 0.0005,
36
+ "betas": [
37
+ 0.9,
38
+ 0.999
39
+ ],
40
+ "eps": 1e-08,
41
+ "weight_decay": 0.01,
42
+ "fused": true,
43
+ "extra_args": {}
44
+ },
45
+ "scheduler_config": {
46
+ "warmup_steps": 1000,
47
+ "constant_steps": 1000,
48
+ "decay_steps": 8852,
49
+ "max_lr": 0.0001,
50
+ "min_lr": 1e-05,
51
+ "extra_args": {}
52
+ },
53
+ "dataloader_config": {
54
+ "train_dataloader_config": {
55
+ "train_batch_size": 192,
56
+ "train_num_workers": 12,
57
+ "train_shuffle": true,
58
+ "train_pin_memory": true,
59
+ "train_data_root_path": "data/tensors/train"
60
+ },
61
+ "val_dataloader_config": {
62
+ "val_batch_size": 64,
63
+ "val_num_workers": 12,
64
+ "val_shuffle": false,
65
+ "val_pin_memory": true,
66
+ "val_data_root_path": "data/tensors/val"
67
+ },
68
+ "extra_args": {}
69
+ },
70
+ "model_config": {
71
+ "context_len": 512,
72
+ "d_model": 512,
73
+ "n_heads": 8,
74
+ "n_blocks": 12,
75
+ "vocab_size": 10000,
76
+ "pos_emb_dropout_p": 0.1,
77
+ "pos_emb_type": "rope",
78
+ "learned": false,
79
+ "ntk_rope_scaling": false,
80
+ "dyn_scaling": false,
81
+ "attn_type": "mhsa",
82
+ "n_groups": null,
83
+ "top_k_sparsev": null,
84
+ "p_threshold": null,
85
+ "p_threshold_steps_fraction": null,
86
+ "flash_attn": true,
87
+ "flash_attn_dtype": "float16",
88
+ "supress_warnings": true,
89
+ "verbose": null,
90
+ "model_name": "ATHENA_V1_TINY_39.7M",
91
+ "model_series_name": "ATHENA \n 39.7M",
92
+ "extra_args": {}
93
+ },
94
+ "criterion_config": {
95
+ "reduction": "none",
96
+ "ignore_index": 0,
97
+ "extra_args": {}
98
+ },
99
+ "wandb_config": {
100
+ "project": "ATHENA",
101
+ "name": "ATHENA_V1_TINY_39.7M",
102
+ "entity": "vxnuaj",
103
+ "tags": [
104
+ "ATHENA",
105
+ "39.7M",
106
+ "PRETRAIN",
107
+ "TINY",
108
+ "V1"
109
+ ],
110
+ "notes": "",
111
+ "id": "b630r3vt",
112
+ "extra_args": {}
113
+ }
114
+ }