# -*- coding: utf-8 -*-
"""binder_design.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1fsVz3x72UZlRP7L2VCBwFIKHwGRpbqAS

# AfDesign - peptide binder design
For a given protein target and protein binder length, generate/hallucinate a
protein binder sequence AlphaFold thinks will bind to the target structure.
To do this, we maximize number of contacts at the interface and maximize pLDDT
of the binder.

**WARNING**
1.   This notebook is in active development and was designed for demonstration
     purposes only.
2.   Using AfDesign as the only "loss" function for design might be a bad idea,
     you may find adversarial sequences (aka. sequences that trick AlphaFold).
"""

#@title **setup**
import os
import subprocess

# One-time environment bootstrap: the "params" directory doubles as the marker
# that the package and the AlphaFold weights have already been installed.
if not os.path.isdir("params"):
    # get code
    os.system("pip -q install git+https://github.com/sokrypton/ColabDesign.git@v1.1.1")
    # for debugging
    os.system("ln -s /usr/local/lib/python3.*/dist-packages/colabdesign colabdesign")
    # download params
    os.system("mkdir params")
    os.system("apt-get install aria2 -qq")
    os.system("aria2c -q -x 16 https://storage.googleapis.com/alphafold/alphafold_params_2022-12-06.tar")
    os.system("tar -xf alphafold_params_2022-12-06.tar -C params")

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from colabdesign import mk_afdesign_model, clear_mem
from colabdesign.shared.utils import copy_dict
from colabdesign.af.alphafold.common import residue_constants

from IPython.display import HTML
from google.colab import files
import numpy as np

#########################
def get_pdb(pdb_code=""):
    """Return the path of a local PDB file for the requested target.

    Args:
        pdb_code: One of
            * ``None`` or ``""`` - prompt for a file upload, saved as
              ``tmp.pdb``;
            * a path to an existing file - returned unchanged;
            * a 4-character string - fetched from files.rcsb.org;
            * anything else - fetched from alphafold.ebi.ac.uk as
              ``AF-<code>-F1-model_v3.pdb``.

    Returns:
        Path of the PDB file on local disk.

    Raises:
        ValueError: if the upload dialog returned no file.
        FileNotFoundError: if the download did not produce the expected file.
    """
    if pdb_code is None or pdb_code == "":
        upload_dict = files.upload()
        if not upload_dict:
            raise ValueError("no PDB file was uploaded")
        # Only the first uploaded file is used.
        pdb_string = next(iter(upload_dict.values()))
        with open("tmp.pdb", "wb") as out:
            out.write(pdb_string)
        return "tmp.pdb"

    if os.path.isfile(pdb_code):
        return pdb_code

    if len(pdb_code) == 4:
        url = f"https://files.rcsb.org/view/{pdb_code}.pdb"
        path = f"{pdb_code}.pdb"
    else:
        url = f"https://alphafold.ebi.ac.uk/files/AF-{pdb_code}-F1-model_v3.pdb"
        path = f"AF-{pdb_code}-F1-model_v3.pdb"

    # Argument list (no shell) so the user-supplied code cannot be interpreted
    # as shell syntax. "-nc" keeps an already-downloaded copy.
    subprocess.run(["wget", "-qnc", url], check=False)
    if not os.path.isfile(path):
        raise FileNotFoundError(f"could not download '{url}' to '{path}'")
    return path

#@title **prep inputs**
import re
#@markdown ---
#@markdown **target info**
pdb = "5F9R" #@param {type:"string"}
#@markdown - enter PDB code or UniProt code (to fetch AlphaFoldDB model) or leave blank to upload your own
target_chain = "B" #@param {type:"string"}
target_hotspot = "" #@param {type:"string"}
if target_hotspot == "":
    target_hotspot = None
#@markdown - restrict loss to predefined positions on target (eg. "1-10,12,15")
target_flexible = False #@param {type:"boolean"}
#@markdown - allow backbone of target structure to be flexible

#@markdown ---
#@markdown **binder info**
binder_len = 25 #@param {type:"integer"}
#@markdown - length of binder to hallucinate
binder_seq = "" #@param {type:"string"}
# Keep only upper-case letters; a non-empty sequence overrides binder_len.
binder_seq = re.sub("[^A-Z]", "", binder_seq.upper())
if len(binder_seq) > 0:
    binder_len = len(binder_seq)
else:
    binder_seq = None
#@markdown - if defined, will initialize design with this sequence

binder_chain = "" #@param {type:"string"}
if binder_chain == "":
    binder_chain = None
#@markdown - if defined, supervised loss is used (binder_len is ignored)

#@markdown ---
#@markdown **model config**
use_multimer = False #@param {type:"boolean"}
#@markdown - use alphafold-multimer for design
num_recycles = 1 #@param ["0", "1", "3", "6"] {type:"raw"}
num_models = "1" #@param ["1", "2", "3", "4", "5", "all"]
num_models = 5 if num_models == "all" else int(num_models)
#@markdown - number of trained models to use during optimization

x = {"pdb_filename":pdb,
     "chain":target_chain,
     "binder_len":binder_len,
     "binder_chain":binder_chain,
     "hotspot":target_hotspot,
     "use_multimer":use_multimer,
     "rm_target_seq":target_flexible}

x["pdb_filename"] = get_pdb(x["pdb_filename"])

# Rebuild the model only when the inputs differ from the previous run of this
# cell (x_prev survives between cell executions in the notebook).
if "x_prev" not in dir() or x != x_prev:
    clear_mem()
    model = mk_afdesign_model(protocol="binder",
                              use_multimer=x["use_multimer"],
                              num_recycles=num_recycles,
                              recycle_mode="sample")
    model.prep_inputs(**x, ignore_missing=False)

    x_prev = copy_dict(x)
    print("target length:", model._target_len)
    print("binder length:", model._binder_len)
    binder_len = model._binder_len

#@title **run AfDesign**
from scipy.special import softmax

optimizer = "pssm_semigreedy" #@param ["pssm_semigreedy", "3stage", "semigreedy", "pssm", "logits", "soft", "hard"]
#@markdown - `pssm_semigreedy` - uses the designed PSSM to bias semigreedy opt. (Recommended)
#@markdown - `3stage` - gradient based optimization (GD) (logits → soft → hard)
#@markdown - `pssm` - GD optimize (logits → soft) to get a sequence profile (PSSM).
#@markdown - `semigreedy` - tries X random mutations, accepts those that decrease loss
#@markdown - `logits` - GD optimize logits inputs (continuous)
#@markdown - `soft` - GD optimize softmax(logits) inputs (probabilities)
#@markdown - `hard` - GD optimize one_hot(logits) inputs (discrete)

#@markdown WARNING: The output sequence from `pssm`,`logits`,`soft` is not one_hot. To get a valid sequence use the other optimizers, or redesign the output backbone with another protocol like ProteinMPNN.

#@markdown ----
#@markdown #### advanced GD settings
GD_method = "sgd" #@param ["adabelief", "adafactor", "adagrad", "adam", "adamw", "fromage", "lamb", "lars", "noisy_sgd", "dpsgd", "radam", "rmsprop", "sgd", "sm3", "yogi"]
learning_rate = 0.1 #@param {type:"raw"}
norm_seq_grad = True #@param {type:"boolean"}
dropout = True #@param {type:"boolean"}

model.restart(seq=binder_seq)
model.set_optimizer(optimizer=GD_method,
                    learning_rate=learning_rate,
                    norm_seq_grad=norm_seq_grad)
models = model._model_names[:num_models]

flags = {"num_recycles":num_recycles,
         "models":models,
         "dropout":dropout}

# Single-mode gradient-descent optimizers, dispatched by name.
O = {"logits":model.design_logits,
     "soft":model.design_soft,
     "hard":model.design_hard}

if optimizer == "3stage":
    model.design_3stage(120, 60, 10, **flags)
    pssm = softmax(model._tmp["seq_logits"],-1)

elif optimizer == "pssm_semigreedy":
    model.design_pssm_semigreedy(120, 32, **flags)
    # Normalise over the last axis, matching every other branch; the result is
    # plotted below as per-amino-acid probabilities.
    # NOTE(review): assumes seq_logits has amino acids on the last axis - confirm.
    pssm = softmax(model._tmp["seq_logits"],-1)

elif optimizer == "semigreedy":
    model.design_pssm_semigreedy(0, 32, **flags)
    pssm = None

elif optimizer == "pssm":
    model.design_logits(120, e_soft=1.0, num_models=1, ramp_recycles=True, **flags)
    model.design_soft(32, num_models=1, **flags)
    # Final steps: dropout off, keep the best result, use all selected models.
    flags.update({"dropout":False,"save_best":True})
    model.design_soft(10, num_models=num_models, **flags)
    pssm = softmax(model.aux["seq"]["logits"],-1)

elif optimizer in O:
    O[optimizer](120, num_models=1, ramp_recycles=True, **flags)
    # Final steps: dropout off, keep the best result, use all selected models.
    flags.update({"dropout":False,"save_best":True})
    O[optimizer](10, num_models=num_models, **flags)
    pssm = softmax(model.aux["seq"]["logits"],-1)

model.save_pdb(f"{model.protocol}.pdb")

#@title display hallucinated protein {run: "auto"}
color = "pLDDT" #@param ["chain", "pLDDT", "rainbow"]
show_sidechains = False #@param {type:"boolean"}
show_mainchains = False #@param {type:"boolean"}
color_HP = False #@param {type:"boolean"}
animate = True #@param {type:"boolean"}
model.plot_pdb(show_sidechains=show_sidechains,
               show_mainchains=show_mainchains,
               color=color, color_HP=color_HP, animate=animate)

# Bare expression: shown as the cell output when run as a notebook cell.
HTML(model.animate(dpi=100))

model.save_pdb(f"{model.protocol}.pdb")
model.get_seqs()

#@markdown ### Amino acid probabilities
import plotly.express as px
alphabet = "ACDEFGHIKLMNPQRSTVWY"
# "pssm" only exists once the run cell has executed, and is None for the
# "semigreedy" optimizer.
if "pssm" in dir() and pssm is not None:
    fig = px.imshow(pssm.mean(0).T,
                    labels=dict(x="positions", y="amino acids", color="probability"),
                    y=residue_constants.restypes,
                    zmin=0,
                    zmax=1,
                    template="simple_white",
                    )
    fig.update_xaxes(side="top")
    fig.show()

# log
model._tmp["best"]["aux"]["log"]