File size: 2,440 Bytes
61bff85
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
# src/config.py
from pathlib import Path
from dataclasses import dataclass


@dataclass
class Config:
    """Project-wide settings: seeds, directory layout, and model hyperparameters.

    All fields have defaults, so ``Config()`` works without arguments. The
    directory attributes ``DATA_DIR``, ``RAW_DIR``, ``OUTPUT_DIR``,
    ``CASF_DIR`` and ``CASF13_DIR`` are not dataclass fields; they are derived
    from ``ROOT_DIR`` in ``__post_init__``.

    Side effect: constructing an instance creates ``DATA_DIR`` and
    ``OUTPUT_DIR`` on disk if they do not exist yet.
    """

    SEED: int = 42

    # Two levels above this file; may be overridden (a str is accepted too).
    ROOT_DIR: Path = Path(__file__).resolve().parent.parent

    # LP-PDBBind CSV
    LPPDB_CSV: str = "LP_PDBBind.csv"   # filename inside RAW_DIR

    # ESM — 35M everywhere (train = deploy = benchmark, honest)
    ESM_MODEL:  str   = "facebook/esm2_t12_35M_UR50D"
    ESM_LAYERS: tuple = (8, 10, 11)   # 0-indexed, proportional to 20/26/30 in 150M
    ESM_DIM:    int   = 480           # hidden dim of 35M

    # Long-sequence chunking
    MAX_SEQ_LEN:  int = 1022
    HALF_SEQ_LEN: int = 511

    # Ligand
    ECFP_BITS:   int = 1024
    ECFP_RADIUS: int = 2

    # Interaction projection
    INTERACT_DIM: int = 128

    # GBM training
    # N_TREES=3000 with EARLY_STOP=150 is the sweet spot:
    #   - LR=0.02 + 3000 trees → actual stopping usually ~800-1200 trees
    #   - current 3 seeds × 4 models × 5 folds = 60 runs × ~6 min = ~6 hrs total
    #   - 5 seeds (add 789, 1337) → 100 runs × ~6 min = ~10 hrs total
    N_FOLDS:    int   = 5
    SEEDS:      tuple = (42, 123, 456)   # 3 seeds ~6 hrs; add 789,1337 if time allows
    LR:         float = 0.02
    N_TREES:    int   = 3000
    EARLY_STOP: int   = 150

    # TTA
    TTA_SCREEN:   int = 5
    TTA_ACCURATE: int = 20

    # Applicability domain
    AD_KNN_K:      int   = 5
    AD_PERCENTILE: float = 95.0
    AD_MAX_MONO:   float = 0.40   # max single-AA fraction before flagging

    def __post_init__(self):
        """Derive the directory layout from ROOT_DIR and create output folders."""
        # Dataclass annotations are not enforced, so a caller may hand in a
        # plain string; normalise it so the `/` joins below always work.
        self.ROOT_DIR = Path(self.ROOT_DIR)

        data_root = self.ROOT_DIR / "data"
        external = data_root / "external"

        self.DATA_DIR: Path = data_root / "processed"
        self.RAW_DIR: Path = data_root / "raw"
        self.OUTPUT_DIR: Path = self.ROOT_DIR / "output"
        self.CASF_DIR: Path = external / "CASF-2016"
        self.CASF13_DIR: Path = external / "CASF-2013"

        # Only the directories this project writes into are created here;
        # RAW_DIR and the CASF directories are left as-is.
        self.DATA_DIR.mkdir(parents=True, exist_ok=True)
        self.OUTPUT_DIR.mkdir(parents=True, exist_ok=True)


# Shared module-level instance. Constructing it runs Config.__post_init__,
# which creates DATA_DIR and OUTPUT_DIR, so that happens at import time.
config = Config()


if __name__ == "__main__":
    # Diagnostic run: list every configured location and whether it exists.
    lp = config.RAW_DIR / config.LPPDB_CSV
    summary = (
        f"ROOT:   {config.ROOT_DIR}",
        f"DATA:   {config.DATA_DIR}   exists={config.DATA_DIR.exists()}",
        f"OUTPUT: {config.OUTPUT_DIR}  exists={config.OUTPUT_DIR.exists()}",
        f"CASF:   {config.CASF_DIR}   exists={config.CASF_DIR.exists()}",
        f"CASF13:   {config.CASF13_DIR}   exists={config.CASF13_DIR.exists()}",
        f"LP-PDBBind CSV: {lp}   exists={lp.exists()}",
    )
    print(*summary, sep="\n")