| from fuson_plm.utils.logging import CustomParams | |
| # Clustering Parameters | |
| # Need to be stacked, because there are 4 properties | |
CLUSTER = CustomParams(
    # MMSeqs2 parameters: see the MMSeqs2 GitHub repository or Wiki for guidance
    MIN_SEQ_ID=0.3,  # minimum sequence identity, as a fraction (0.3 = 30%)
    C=0.5,  # sequence length overlap, as a fraction (0.5 = 50%)
    # cov-mode: 0 = bidirectional, 1 = target coverage, 2 = query coverage,
    # 3 = target-in-query length coverage
    COV_MODE=1,
    CLUSTER_MODE=2,  # MMSeqs2 cluster mode; see the MMSeqs2 docs for the available values
    # File paths
    INPUT_PATH='processed_data/all_albatross_seqs_and_properties.csv',
    PATH_TO_MMSEQS='../../mmseqs',  # path to where you installed MMSeqs2
)
# Splitting parameters: the data is first split into train & other, then other is split into val & test
SPLIT = CustomParams(
    IDR_DB_PATH='processed_data/all_albatross_seqs_and_properties.csv',
    CLUSTER_OUTPUT_PATH='clustering/mmseqs_full_results.csv',
    # Currently disabled single-split settings:
    # RANDOM_STATE = 7,  # state for splitting all data into train & test
    # VAL_SIZE = 0.10,   # val size for the train/val split, as a fraction of clusters
    #
    # Stage 1: all data -> train & other
    RANDOM_STATE_1=2,  # random state for splitting all data into train & other
    TEST_SIZE_1=0.21,  # fraction of clusters sent to other, e.g. 0.20 means 80% in train, 20% in other
    # Stage 2: other (from stage 1) -> val & test
    RANDOM_STATE_2=6,  # random state for splitting other into val & test
    TEST_SIZE_2=0.50,  # fraction of other's clusters sent to test, e.g. 0.50 means 50% in val, 50% in test
)
| # Which models to benchmark | |
TRAIN = CustomParams(
    BENCHMARK_FUSONPLM=True,
    # Either a dictionary (key = run name, values = epochs) or the string "FusOn-pLM"
    FUSONPLM_CKPTS="FusOn-pLM",
    BENCHMARK_ESM=True,
    # GPU configs
    CUDA_VISIBLE_DEVICES="0",
    # Overwriting configs
    # If False, the script will halt if it believes these embeddings have already been made.
    PERMISSION_TO_OVERWRITE_EMBEDDINGS=False,
    # If False, the script will halt if it believes these models have already been made.
    PERMISSION_TO_OVERWRITE_MODELS=False,
)