{
  "model_type": "ao_gpt_hybrid",
  "architecture": "TinyDecoderLM",
  "vocab_size": 2481,
  "atomic_motifs": 2387,
  "freq_cutoff": 5000,
  "d_model": 512,
  "n_heads": 8,
  "n_layers": 8,
  "d_ff": 2048,
  "max_seq_len": 64,
  "dropout": 0.1,
  "use_adaln": true,
  "bidirectional": false,
  "dtype": "bfloat16",
  "epoch": 7,
  "n_params_total": 31099825,
  "training": {
    "dataset": "30M STAMP molecules (train split, all_pass=True)",
    "train_rows": 19148578,
    "valid_rows_sampled": 20000,
    "optimizer": "AdamW (fused, bf16)",
    "lr": 5e-4,
    "weight_decay": 0.01,
    "micro_batch_size": 6144,
    "global_batch_size": 6144,
    "grad_accum_steps": 1,
    "random_ratio": 0.9,
    "torch_compile": true,
    "fallback_p_low": 0.02,
    "fallback_p_high": 0.15
  },
  "default_sampling": {
    "temperature": 0.95,
    "top_p": 0.85,
    "top_k": 0,
    "max_new_tokens": 64
  },
  "eval": {
    "N": 1024,
    "validity_pct": 100.0,
    "uniqueness_pct": 100.0,
    "quality_over_valid_pct": 79.16,
    "genmol_pct": 79.00,
    "qed_mean": 0.727,
    "sa_mean": 2.92,
    "diversity": 0.860,
    "reference_ar_baseline_genmol_pct": 79.64
  }
}