42Cummer committed on
Commit
e639e39
·
verified ·
1 Parent(s): f2a576a

Upload 5 files

Browse files
Files changed (2) hide show
  1. scripts/generator.py +75 -50
  2. scripts/refine.py +176 -13
scripts/generator.py CHANGED
@@ -11,21 +11,14 @@ def run_broteinshake_generator(pdb_path, fixed_chains, variable_chains, num_seqs
11
 
12
  Args:
13
  pdb_path: Path to the target complex (e.g., 'data/3KAS.pdb').
14
- fixed_chains: Chains to remain unchanged (e.g., 'A').
15
- variable_chains: Chains to be redesigned/repainted (e.g., 'B').
16
  """
17
  # 1. Setup project identifiers and directories
18
  pdb_name = os.path.basename(pdb_path).split('.')[0]
19
  output_dir = f"./generated/{pdb_name}"
20
  os.makedirs(output_dir, exist_ok=True)
21
 
22
- # 2. Parse the PDB into JSONL format for the model
23
- # parse_multiple_chains.py expects a folder, not a file
24
- pdb_dir = os.path.dirname(os.path.abspath(pdb_path))
25
- if not pdb_dir:
26
- pdb_dir = "."
27
- jsonl_path = os.path.join(output_dir, "parsed_pdbs.jsonl")
28
-
29
  # Get the project root directory (where ProteinMPNN should be)
30
  script_dir = os.path.dirname(os.path.abspath(__file__))
31
  project_root = os.path.dirname(script_dir)
@@ -42,56 +35,88 @@ def run_broteinshake_generator(pdb_path, fixed_chains, variable_chains, num_seqs
42
  stderr=subprocess.DEVNULL
43
  )
44
 
45
- parse_script = os.path.join(proteinmpnn_dir, "helper_scripts", "parse_multiple_chains.py")
46
 
47
- parse_cmd = f"python -W ignore {parse_script} --input_path={pdb_dir}/ --output_path={jsonl_path}"
48
- subprocess.run(parse_cmd, shell=True, check=True, stderr=subprocess.DEVNULL)
49
-
50
- # Update the name in parsed JSONL to include "_clones"
51
- pdb_name_clones = f"{pdb_name}_clones"
52
- with open(jsonl_path, 'r') as f:
53
- jsonl_data = json.loads(f.readline())
54
- jsonl_data['name'] = pdb_name_clones
55
- with open(jsonl_path, 'w') as f:
56
- f.write(json.dumps(jsonl_data) + '\n')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
 
58
- # 3. Generate the Chain Configuration JSONs (The 'Engine' Logic)
59
- # Format: {"name": [masked_chains_list, visible_chains_list]}
60
- # masked_chains = chains to redesign, visible_chains = chains to keep fixed
61
- masked_chains_list = [c for c in variable_chains]
62
- visible_chains_list = [c for c in fixed_chains]
63
- chain_id_dict = {pdb_name_clones: [masked_chains_list, visible_chains_list]}
64
-
65
- chain_id_json = os.path.join(output_dir, "chain_id_dict.json")
66
- with open(chain_id_json, 'w') as f:
67
- json.dump(chain_id_dict, f)
68
-
69
- # fixed_positions_jsonl is for specific residue positions, not entire chains
70
- # Since we're fixing entire chains via chain_id_dict, we don't need fixed_positions_jsonl
71
- fixed_chain_json = os.path.join(output_dir, "fixed_chain_dict.json")
72
- # Create empty file or omit the argument - let's just not pass it
73
 
74
- # 4. Execute optimized ProteinMPNN command
75
- # Uses the user-specified fixed and variable chains to 'repaint' the binder.
76
- # Note: We don't pass --fixed_positions_jsonl since we're fixing entire chains via chain_id_jsonl
77
- mpnn_script = os.path.join(proteinmpnn_dir, "protein_mpnn_run.py")
78
- mpnn_cmd = (
79
- f"python -W ignore {mpnn_script} "
80
- f"--jsonl_path {jsonl_path} "
81
- f"--chain_id_jsonl {chain_id_json} "
82
- f"--out_folder {output_dir} "
83
- f"--num_seq_per_target {num_seqs} "
84
- f"--sampling_temp {temp} "
85
- f"--seed 42"
86
- )
87
 
88
- print(f"🚀 Designing sequences for {pdb_name}...")
89
- print(f"🔒 Fixed: {fixed_chains} | ✏️ Redesigning: {variable_chains}")
 
 
 
 
 
 
 
 
 
 
 
 
90
  # Suppress warnings by redirecting stderr
91
  env = os.environ.copy()
92
  env['PYTHONWARNINGS'] = 'ignore'
93
  subprocess.run(mpnn_cmd, shell=True, check=True, env=env, stderr=subprocess.DEVNULL)
94
 
 
 
 
 
 
 
 
 
 
 
95
  print(f"✅ Success! Fold the top sequences at https://esmatlas.com/resources?action=fold")
96
 
97
  if __name__ == "__main__":
 
11
 
12
  Args:
13
  pdb_path: Path to the target complex (e.g., 'data/3KAS.pdb').
14
+ fixed_chains: Chains to remain unchanged (e.g., 'A'). Empty for single-chain proteins.
15
+ variable_chains: Chains to be redesigned/repainted (e.g., 'B'). For single-chain, this is the only chain.
16
  """
17
  # 1. Setup project identifiers and directories
18
  pdb_name = os.path.basename(pdb_path).split('.')[0]
19
  output_dir = f"./generated/{pdb_name}"
20
  os.makedirs(output_dir, exist_ok=True)
21
 
 
 
 
 
 
 
 
22
  # Get the project root directory (where ProteinMPNN should be)
23
  script_dir = os.path.dirname(os.path.abspath(__file__))
24
  project_root = os.path.dirname(script_dir)
 
35
  stderr=subprocess.DEVNULL
36
  )
37
 
38
+ mpnn_script = os.path.join(proteinmpnn_dir, "protein_mpnn_run.py")
39
 
40
+ # 2. Check if single-chain protein (no fixed chains means single-chain)
41
+ if not fixed_chains or len(fixed_chains) == 0:
42
+ # Single-chain protein: use direct PDB path command
43
+ # For single-chain, variable_chains should be the only chain (e.g., "A")
44
+ chain_to_design = variable_chains[0] if variable_chains else "A"
45
+
46
+ mpnn_cmd = (
47
+ f"python -W ignore {mpnn_script} "
48
+ f"--pdb_path {pdb_path} "
49
+ f"--pdb_path_chains {chain_to_design} "
50
+ f"--out_folder {output_dir} "
51
+ f"--num_seq_per_target {num_seqs} "
52
+ f"--sampling_temp {temp} "
53
+ f"--seed 42 "
54
+ f"--batch_size 1"
55
+ )
56
+
57
+ print(f"🚀 Designing sequences for {pdb_name} (single-chain mode)...")
58
+ print(f"✏️ Redesigning chain: {chain_to_design}")
59
+ else:
60
+ # Multi-chain protein: use JSONL-based command
61
+ # 2. Parse the PDB into JSONL format for the model
62
+ pdb_dir = os.path.dirname(os.path.abspath(pdb_path))
63
+ if not pdb_dir:
64
+ pdb_dir = "."
65
+ jsonl_path = os.path.join(output_dir, "parsed_pdbs.jsonl")
66
+
67
+ parse_script = os.path.join(proteinmpnn_dir, "helper_scripts", "parse_multiple_chains.py")
68
+
69
+ parse_cmd = f"python -W ignore {parse_script} --input_path={pdb_dir}/ --output_path={jsonl_path}"
70
+ subprocess.run(parse_cmd, shell=True, check=True, stderr=subprocess.DEVNULL)
71
 
72
+ # Update the name in parsed JSONL to include "_clones"
73
+ pdb_name_clones = f"{pdb_name}_clones"
74
+ with open(jsonl_path, 'r') as f:
75
+ jsonl_data = json.loads(f.readline())
76
+ jsonl_data['name'] = pdb_name_clones
77
+ with open(jsonl_path, 'w') as f:
78
+ f.write(json.dumps(jsonl_data) + '\n')
 
 
 
 
 
 
 
 
79
 
80
+ # 3. Generate the Chain Configuration JSONs (The 'Engine' Logic)
81
+ # Format: {"name": [masked_chains_list, visible_chains_list]}
82
+ # masked_chains = chains to redesign, visible_chains = chains to keep fixed
83
+ masked_chains_list = [c for c in variable_chains]
84
+ visible_chains_list = [c for c in fixed_chains]
85
+ chain_id_dict = {pdb_name_clones: [masked_chains_list, visible_chains_list]}
86
+
87
+ chain_id_json = os.path.join(output_dir, "chain_id_dict.json")
88
+ with open(chain_id_json, 'w') as f:
89
+ json.dump(chain_id_dict, f)
 
 
 
90
 
91
+ # 4. Execute optimized ProteinMPNN command for multi-chain
92
+ mpnn_cmd = (
93
+ f"python -W ignore {mpnn_script} "
94
+ f"--jsonl_path {jsonl_path} "
95
+ f"--chain_id_jsonl {chain_id_json} "
96
+ f"--out_folder {output_dir} "
97
+ f"--num_seq_per_target {num_seqs} "
98
+ f"--sampling_temp {temp} "
99
+ f"--seed 42"
100
+ )
101
+
102
+ print(f"🚀 Designing sequences for {pdb_name}...")
103
+ print(f"🔒 Fixed: {fixed_chains} | ✏️ Redesigning: {variable_chains}")
104
+
105
  # Suppress warnings by redirecting stderr
106
  env = os.environ.copy()
107
  env['PYTHONWARNINGS'] = 'ignore'
108
  subprocess.run(mpnn_cmd, shell=True, check=True, env=env, stderr=subprocess.DEVNULL)
109
 
110
+ # For single-chain proteins, ProteinMPNN saves as {pdb_name}.fa
111
+ # Rename it to {pdb_name}_clones.fa for consistency
112
+ if not fixed_chains or len(fixed_chains) == 0:
113
+ seqs_dir = os.path.join(output_dir, "seqs")
114
+ old_file = os.path.join(seqs_dir, f"{pdb_name}.fa")
115
+ new_file = os.path.join(seqs_dir, f"{pdb_name}_clones.fa")
116
+ if os.path.exists(old_file) and not os.path.exists(new_file):
117
+ os.rename(old_file, new_file)
118
+ print(f"📝 Renamed {pdb_name}.fa → {pdb_name}_clones.fa")
119
+
120
  print(f"✅ Success! Fold the top sequences at https://esmatlas.com/resources?action=fold")
121
 
122
  if __name__ == "__main__":
scripts/refine.py CHANGED
@@ -1,31 +1,148 @@
1
  import os
2
  from Bio.PDB import PDBParser, Superimposer, PDBIO
3
 
4
- def polish_design(target_pdb_id, uploaded_file_path):
5
  """
6
- Performs high-precision structural alignment without the OpenMM headache.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  """
8
  # 1. Setup paths
9
- # target_path should point to your local data/3kas.pdb
10
  target_path = os.path.join("data", f"{target_pdb_id.lower()}.pdb")
11
  output_name = "Refined_Shuttle.pdb"
12
 
13
- # 2. ALIGNMENT (The Core Scientific Proof)
14
  parser = PDBParser(QUIET=True)
15
  target_struct = parser.get_structure("target", target_path)
16
  design_struct = parser.get_structure("design", uploaded_file_path)
17
 
18
- sup = Superimposer()
19
- # Aligning using Alpha Carbons (the backbone 'skeleton')
20
- t_atoms = [a for a in target_struct.get_atoms() if a.get_name() == 'CA']
21
- d_atoms = [a for a in design_struct.get_atoms() if a.get_name() == 'CA']
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
 
23
- # Ensure we are comparing the same number of atoms
24
- min_len = min(len(t_atoms), len(d_atoms))
25
- sup.set_atoms(t_atoms[:min_len], d_atoms[:min_len])
 
 
 
 
 
 
 
 
 
26
  sup.apply(design_struct.get_atoms())
27
 
28
- rmsd = sup.rms # This is your 0.75A proof
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
 
30
  # 3. EXPORT
31
  # This saves the design in the same 3D coordinate space as the human receptor
@@ -33,4 +150,50 @@ def polish_design(target_pdb_id, uploaded_file_path):
33
  io.set_structure(design_struct)
34
  io.save(output_name)
35
 
36
- return output_name, rmsd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
  from Bio.PDB import PDBParser, Superimposer, PDBIO
3
 
4
def get_core_rmsd(reference_pdb, design_pdb, plddt_threshold=70.0):
    """
    Calculate RMSD using only high-confidence residues (pLDDT >= threshold).

    Focuses the alignment on the core scaffold, ignoring low-confidence
    regions. Handles both normalized (0-1) and raw pLDDT (0-100) values
    stored in the B-factor column (ESMFold/AlphaFold convention).

    Args:
        reference_pdb: Path to the reference (target) PDB file.
        design_pdb: Path to the designed/predicted PDB file whose B-factors
            hold per-residue pLDDT.
        plddt_threshold: Confidence cutoff on the raw 0-100 pLDDT scale.

    Returns:
        (rmsd, n_residues): RMSD in Angstroms over the aligned CA atoms and
        the number of residue pairs used.
    """
    parser = PDBParser(QUIET=True)
    ref_struct = parser.get_structure("ref", reference_pdb)
    des_struct = parser.get_structure("des", design_pdb)

    ref_atoms = []
    des_atoms = []

    # Detect whether B-factors are normalized (0-1) or raw pLDDT (0-100).
    # Scan ALL CA B-factors and use the maximum: sampling a single residue
    # can misclassify the scale if that residue happens to be low-confidence.
    ca_bfactors = [
        res['CA'].get_bfactor()
        for res in des_struct.get_residues()
        if 'CA' in res
    ]

    # If the largest B-factor is <= 1.0, assume normalized (0-1 scale);
    # otherwise assume raw pLDDT (0-100 scale).
    is_normalized = bool(ca_bfactors) and max(ca_bfactors) <= 1.0

    # Adjust the cutoff to the detected scale (70 pLDDT -> 0.70 normalized).
    if is_normalized:
        actual_threshold = plddt_threshold / 100.0
    else:
        actual_threshold = plddt_threshold

    # Pair residues positionally and keep high-confidence CA atoms.
    # NOTE(review): zip() assumes both structures list residues in the same
    # order and correspondence — confirm inputs share numbering/topology.
    for ref_res, des_res in zip(ref_struct.get_residues(), des_struct.get_residues()):
        # ESMFold/AlphaFold store pLDDT in the B-factor column.
        # Only alpha carbons (CA) are used for a standard backbone alignment.
        if 'CA' in des_res and 'CA' in ref_res:
            plddt = des_res['CA'].get_bfactor()

            if plddt >= actual_threshold:
                ref_atoms.append(ref_res['CA'])
                des_atoms.append(des_res['CA'])

    if len(ref_atoms) == 0:
        # Fallback: no residue cleared the cutoff, so align on all CA atoms
        # (truncated to the shorter structure so the atom lists match).
        ref_atoms = [a for a in ref_struct.get_atoms() if a.get_name() == 'CA']
        des_atoms = [a for a in des_struct.get_atoms() if a.get_name() == 'CA']
        min_len = min(len(ref_atoms), len(des_atoms))
        ref_atoms = ref_atoms[:min_len]
        des_atoms = des_atoms[:min_len]

    # Superimpose the design onto the reference and report the RMSD.
    super_imposer = Superimposer()
    super_imposer.set_atoms(ref_atoms, des_atoms)
    super_imposer.apply(des_struct.get_atoms())

    return super_imposer.rms, len(ref_atoms)
62
+
63
+ def polish_design(target_pdb_id, uploaded_file_path, plddt_threshold=70.0):
64
+ """
65
+ Performs high-precision structural alignment using core-scaffold RMSD.
66
+ Uses only high-confidence residues (pLDDT > threshold) for more meaningful metrics.
67
+ Returns both global and core RMSD values.
68
  """
69
  # 1. Setup paths
 
70
  target_path = os.path.join("data", f"{target_pdb_id.lower()}.pdb")
71
  output_name = "Refined_Shuttle.pdb"
72
 
73
+ # 2. ALIGNMENT using core-scaffold RMSD (high-confidence residues only)
74
  parser = PDBParser(QUIET=True)
75
  target_struct = parser.get_structure("target", target_path)
76
  design_struct = parser.get_structure("design", uploaded_file_path)
77
 
78
+ # Get atoms for alignment - filter by pLDDT if available
79
+ ref_atoms = []
80
+ des_atoms = []
81
+ ref_atoms_high_conf = [] # For pLDDT > 80
82
+ des_atoms_high_conf = [] # For pLDDT > 80
83
+
84
+ # Detect if B-factors are normalized (0-1) or raw pLDDT (0-100)
85
+ sample_bfactor = None
86
+ for res in design_struct.get_residues():
87
+ if 'CA' in res:
88
+ sample_bfactor = res['CA'].get_bfactor()
89
+ break
90
+
91
+ is_normalized = sample_bfactor is not None and sample_bfactor < 1.0
92
+ actual_threshold = (plddt_threshold / 100.0) if is_normalized else plddt_threshold
93
+ high_conf_threshold = (80.0 / 100.0) if is_normalized else 80.0
94
+
95
+ # Collect atoms for alignment (using plddt_threshold)
96
+ # Also collect high-confidence atoms (pLDDT > 80) for detailed report
97
+ for ref_res, des_res in zip(target_struct.get_residues(), design_struct.get_residues()):
98
+ if 'CA' in des_res and 'CA' in ref_res:
99
+ plddt = des_res['CA'].get_bfactor()
100
+ if plddt >= actual_threshold:
101
+ ref_atoms.append(ref_res['CA'])
102
+ des_atoms.append(des_res['CA'])
103
+ if plddt >= high_conf_threshold:
104
+ ref_atoms_high_conf.append(ref_res['CA'])
105
+ des_atoms_high_conf.append(des_res['CA'])
106
 
107
+ # Fallback to all CA atoms if no high-confidence ones found
108
+ if len(ref_atoms) == 0:
109
+ print(f"⚠️ No residues with pLDDT >= {plddt_threshold}. Using all residues.")
110
+ ref_atoms = [a for a in target_struct.get_atoms() if a.get_name() == 'CA']
111
+ des_atoms = [a for a in design_struct.get_atoms() if a.get_name() == 'CA']
112
+ min_len = min(len(ref_atoms), len(des_atoms))
113
+ ref_atoms = ref_atoms[:min_len]
114
+ des_atoms = des_atoms[:min_len]
115
+
116
+ # Perform alignment using the main threshold atoms
117
+ sup = Superimposer()
118
+ sup.set_atoms(ref_atoms, des_atoms)
119
  sup.apply(design_struct.get_atoms())
120
 
121
+ core_rmsd = sup.rms
122
+ num_residues = len(ref_atoms)
123
+ print(f"🎯 Core-Scaffold RMSD (pLDDT > {plddt_threshold}): {core_rmsd:.3f} Å ({num_residues} residues)")
124
+
125
+ # Calculate global RMSD (all CA atoms)
126
+ all_ref_atoms = [a for a in target_struct.get_atoms() if a.get_name() == 'CA']
127
+ all_des_atoms = [a for a in design_struct.get_atoms() if a.get_name() == 'CA']
128
+ min_len = min(len(all_ref_atoms), len(all_des_atoms))
129
+ all_ref_atoms = all_ref_atoms[:min_len]
130
+ all_des_atoms = all_des_atoms[:min_len]
131
+
132
+ # Calculate global RMSD after alignment
133
+ sup_global = Superimposer()
134
+ sup_global.set_atoms(all_ref_atoms, all_des_atoms)
135
+ global_rmsd = sup_global.rms
136
+
137
+ # Calculate high-confidence core RMSD (pLDDT > 80)
138
+ high_conf_rmsd = None
139
+ if len(ref_atoms_high_conf) > 0:
140
+ sup_high_conf = Superimposer()
141
+ sup_high_conf.set_atoms(ref_atoms_high_conf, des_atoms_high_conf)
142
+ high_conf_rmsd = sup_high_conf.rms
143
+ else:
144
+ # If no high-confidence atoms, use core_rmsd as fallback
145
+ high_conf_rmsd = core_rmsd
146
 
147
  # 3. EXPORT
148
  # This saves the design in the same 3D coordinate space as the human receptor
 
150
  io.set_structure(design_struct)
151
  io.save(output_name)
152
 
153
+ return output_name, global_rmsd, core_rmsd, high_conf_rmsd
154
+
155
def process_results(target_pdb_id, result_pdb, global_rmsd, core_rmsd):
    """
    Generate a detailed structural validation report with tiered RMSD analysis.

    Args:
        target_pdb_id: Target PDB ID
        result_pdb: Path to the aligned result PDB
        global_rmsd: Global RMSD (all residues)
        core_rmsd: High-confidence core RMSD (pLDDT > 80)

    Returns:
        str: Formatted validation report
    """
    # Verdict tiers, checked in ascending RMSD order; the first upper bound
    # that core_rmsd falls under wins. Anything >= 2.0 A needs review.
    tiers = (
        (1.0, "✅", "Success - High-Precision Core Match"),
        (2.0, "⚠️", "Good - Minor Core Deviation"),
    )
    status_emoji, status = "❌", "Possible Fold Drift - Review Required"
    for upper_bound, emoji, label in tiers:
        if core_rmsd < upper_bound:
            status_emoji, status = emoji, label
            break

    report = f"""
### 🔬 Structural Validation Report

**Target:** {target_pdb_id.upper()}

**RMSD Metrics:**
- **Global RMSD:** {global_rmsd:.2f} Å (all residues)
- **High-Confidence Core RMSD (pLDDT > 80):** {core_rmsd:.2f} Å

**Design Status:** {status_emoji} {status}

**Interpretation:**
- Core RMSD < 1.0 Å: Excellent scaffold preservation
- Core RMSD 1.0-2.0 Å: Good structural match
- Core RMSD > 2.0 Å: Possible fold drift, review structure
"""
    return report