42Cummer committed on
Commit
e639e39
·
verified ·
1 Parent(s): f2a576a

Upload 5 files

Browse files
Files changed (2) hide show
  1. scripts/generator.py +75 -50
  2. scripts/refine.py +176 -13
scripts/generator.py CHANGED
@@ -11,21 +11,14 @@ def run_broteinshake_generator(pdb_path, fixed_chains, variable_chains, num_seqs
11
 
12
  Args:
13
  pdb_path: Path to the target complex (e.g., 'data/3KAS.pdb').
14
- fixed_chains: Chains to remain unchanged (e.g., 'A').
15
- variable_chains: Chains to be redesigned/repainted (e.g., 'B').
16
  """
17
  # 1. Setup project identifiers and directories
18
  pdb_name = os.path.basename(pdb_path).split('.')[0]
19
  output_dir = f"./generated/{pdb_name}"
20
  os.makedirs(output_dir, exist_ok=True)
21
 
22
- # 2. Parse the PDB into JSONL format for the model
23
- # parse_multiple_chains.py expects a folder, not a file
24
- pdb_dir = os.path.dirname(os.path.abspath(pdb_path))
25
- if not pdb_dir:
26
- pdb_dir = "."
27
- jsonl_path = os.path.join(output_dir, "parsed_pdbs.jsonl")
28
-
29
  # Get the project root directory (where ProteinMPNN should be)
30
  script_dir = os.path.dirname(os.path.abspath(__file__))
31
  project_root = os.path.dirname(script_dir)
@@ -42,56 +35,88 @@ def run_broteinshake_generator(pdb_path, fixed_chains, variable_chains, num_seqs
42
  stderr=subprocess.DEVNULL
43
  )
44
 
45
- parse_script = os.path.join(proteinmpnn_dir, "helper_scripts", "parse_multiple_chains.py")
46
 
47
- parse_cmd = f"python -W ignore {parse_script} --input_path={pdb_dir}/ --output_path={jsonl_path}"
48
- subprocess.run(parse_cmd, shell=True, check=True, stderr=subprocess.DEVNULL)
49
-
50
- # Update the name in parsed JSONL to include "_clones"
51
- pdb_name_clones = f"{pdb_name}_clones"
52
- with open(jsonl_path, 'r') as f:
53
- jsonl_data = json.loads(f.readline())
54
- jsonl_data['name'] = pdb_name_clones
55
- with open(jsonl_path, 'w') as f:
56
- f.write(json.dumps(jsonl_data) + '\n')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
 
58
- # 3. Generate the Chain Configuration JSONs (The 'Engine' Logic)
59
- # Format: {"name": [masked_chains_list, visible_chains_list]}
60
- # masked_chains = chains to redesign, visible_chains = chains to keep fixed
61
- masked_chains_list = [c for c in variable_chains]
62
- visible_chains_list = [c for c in fixed_chains]
63
- chain_id_dict = {pdb_name_clones: [masked_chains_list, visible_chains_list]}
64
-
65
- chain_id_json = os.path.join(output_dir, "chain_id_dict.json")
66
- with open(chain_id_json, 'w') as f:
67
- json.dump(chain_id_dict, f)
68
-
69
- # fixed_positions_jsonl is for specific residue positions, not entire chains
70
- # Since we're fixing entire chains via chain_id_dict, we don't need fixed_positions_jsonl
71
- fixed_chain_json = os.path.join(output_dir, "fixed_chain_dict.json")
72
- # Create empty file or omit the argument - let's just not pass it
73
 
74
- # 4. Execute optimized ProteinMPNN command
75
- # Uses the user-specified fixed and variable chains to 'repaint' the binder.
76
- # Note: We don't pass --fixed_positions_jsonl since we're fixing entire chains via chain_id_jsonl
77
- mpnn_script = os.path.join(proteinmpnn_dir, "protein_mpnn_run.py")
78
- mpnn_cmd = (
79
- f"python -W ignore {mpnn_script} "
80
- f"--jsonl_path {jsonl_path} "
81
- f"--chain_id_jsonl {chain_id_json} "
82
- f"--out_folder {output_dir} "
83
- f"--num_seq_per_target {num_seqs} "
84
- f"--sampling_temp {temp} "
85
- f"--seed 42"
86
- )
87
 
88
- print(f"🚀 Designing sequences for {pdb_name}...")
89
- print(f"🔒 Fixed: {fixed_chains} | ✏️ Redesigning: {variable_chains}")
 
 
 
 
 
 
 
 
 
 
 
 
90
  # Suppress warnings by redirecting stderr
91
  env = os.environ.copy()
92
  env['PYTHONWARNINGS'] = 'ignore'
93
  subprocess.run(mpnn_cmd, shell=True, check=True, env=env, stderr=subprocess.DEVNULL)
94
 
 
 
 
 
 
 
 
 
 
 
95
  print(f"✅ Success! Fold the top sequences at https://esmatlas.com/resources?action=fold")
96
 
97
  if __name__ == "__main__":
 
11
 
12
  Args:
13
  pdb_path: Path to the target complex (e.g., 'data/3KAS.pdb').
14
+ fixed_chains: Chains to remain unchanged (e.g., 'A'). Empty for single-chain proteins.
15
+ variable_chains: Chains to be redesigned/repainted (e.g., 'B'). For single-chain, this is the only chain.
16
  """
17
  # 1. Setup project identifiers and directories
18
  pdb_name = os.path.basename(pdb_path).split('.')[0]
19
  output_dir = f"./generated/{pdb_name}"
20
  os.makedirs(output_dir, exist_ok=True)
21
 
 
 
 
 
 
 
 
22
  # Get the project root directory (where ProteinMPNN should be)
23
  script_dir = os.path.dirname(os.path.abspath(__file__))
24
  project_root = os.path.dirname(script_dir)
 
35
  stderr=subprocess.DEVNULL
36
  )
37
 
38
+ mpnn_script = os.path.join(proteinmpnn_dir, "protein_mpnn_run.py")
39
 
40
+ # 2. Check if single-chain protein (no fixed chains means single-chain)
41
+ if not fixed_chains or len(fixed_chains) == 0:
42
+ # Single-chain protein: use direct PDB path command
43
+ # For single-chain, variable_chains should be the only chain (e.g., "A")
44
+ chain_to_design = variable_chains[0] if variable_chains else "A"
45
+
46
+ mpnn_cmd = (
47
+ f"python -W ignore {mpnn_script} "
48
+ f"--pdb_path {pdb_path} "
49
+ f"--pdb_path_chains {chain_to_design} "
50
+ f"--out_folder {output_dir} "
51
+ f"--num_seq_per_target {num_seqs} "
52
+ f"--sampling_temp {temp} "
53
+ f"--seed 42 "
54
+ f"--batch_size 1"
55
+ )
56
+
57
+ print(f"🚀 Designing sequences for {pdb_name} (single-chain mode)...")
58
+ print(f"✏️ Redesigning chain: {chain_to_design}")
59
+ else:
60
+ # Multi-chain protein: use JSONL-based command
61
+ # 2. Parse the PDB into JSONL format for the model
62
+ pdb_dir = os.path.dirname(os.path.abspath(pdb_path))
63
+ if not pdb_dir:
64
+ pdb_dir = "."
65
+ jsonl_path = os.path.join(output_dir, "parsed_pdbs.jsonl")
66
+
67
+ parse_script = os.path.join(proteinmpnn_dir, "helper_scripts", "parse_multiple_chains.py")
68
+
69
+ parse_cmd = f"python -W ignore {parse_script} --input_path={pdb_dir}/ --output_path={jsonl_path}"
70
+ subprocess.run(parse_cmd, shell=True, check=True, stderr=subprocess.DEVNULL)
71
 
72
+ # Update the name in parsed JSONL to include "_clones"
73
+ pdb_name_clones = f"{pdb_name}_clones"
74
+ with open(jsonl_path, 'r') as f:
75
+ jsonl_data = json.loads(f.readline())
76
+ jsonl_data['name'] = pdb_name_clones
77
+ with open(jsonl_path, 'w') as f:
78
+ f.write(json.dumps(jsonl_data) + '\n')
 
 
 
 
 
 
 
 
79
 
80
+ # 3. Generate the Chain Configuration JSONs (The 'Engine' Logic)
81
+ # Format: {"name": [masked_chains_list, visible_chains_list]}
82
+ # masked_chains = chains to redesign, visible_chains = chains to keep fixed
83
+ masked_chains_list = [c for c in variable_chains]
84
+ visible_chains_list = [c for c in fixed_chains]
85
+ chain_id_dict = {pdb_name_clones: [masked_chains_list, visible_chains_list]}
86
+
87
+ chain_id_json = os.path.join(output_dir, "chain_id_dict.json")
88
+ with open(chain_id_json, 'w') as f:
89
+ json.dump(chain_id_dict, f)
 
 
 
90
 
91
+ # 4. Execute optimized ProteinMPNN command for multi-chain
92
+ mpnn_cmd = (
93
+ f"python -W ignore {mpnn_script} "
94
+ f"--jsonl_path {jsonl_path} "
95
+ f"--chain_id_jsonl {chain_id_json} "
96
+ f"--out_folder {output_dir} "
97
+ f"--num_seq_per_target {num_seqs} "
98
+ f"--sampling_temp {temp} "
99
+ f"--seed 42"
100
+ )
101
+
102
+ print(f"🚀 Designing sequences for {pdb_name}...")
103
+ print(f"🔒 Fixed: {fixed_chains} | ✏️ Redesigning: {variable_chains}")
104
+
105
  # Suppress warnings by redirecting stderr
106
  env = os.environ.copy()
107
  env['PYTHONWARNINGS'] = 'ignore'
108
  subprocess.run(mpnn_cmd, shell=True, check=True, env=env, stderr=subprocess.DEVNULL)
109
 
110
+ # For single-chain proteins, ProteinMPNN saves as {pdb_name}.fa
111
+ # Rename it to {pdb_name}_clones.fa for consistency
112
+ if not fixed_chains or len(fixed_chains) == 0:
113
+ seqs_dir = os.path.join(output_dir, "seqs")
114
+ old_file = os.path.join(seqs_dir, f"{pdb_name}.fa")
115
+ new_file = os.path.join(seqs_dir, f"{pdb_name}_clones.fa")
116
+ if os.path.exists(old_file) and not os.path.exists(new_file):
117
+ os.rename(old_file, new_file)
118
+ print(f"📝 Renamed {pdb_name}.fa → {pdb_name}_clones.fa")
119
+
120
  print(f"✅ Success! Fold the top sequences at https://esmatlas.com/resources?action=fold")
121
 
122
  if __name__ == "__main__":
scripts/refine.py CHANGED
@@ -1,31 +1,148 @@
1
  import os
2
  from Bio.PDB import PDBParser, Superimposer, PDBIO
3
 
4
- def polish_design(target_pdb_id, uploaded_file_path):
5
  """
6
- Performs high-precision structural alignment without the OpenMM headache.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  """
8
  # 1. Setup paths
9
- # target_path should point to your local data/3kas.pdb
10
  target_path = os.path.join("data", f"{target_pdb_id.lower()}.pdb")
11
  output_name = "Refined_Shuttle.pdb"
12
 
13
- # 2. ALIGNMENT (The Core Scientific Proof)
14
  parser = PDBParser(QUIET=True)
15
  target_struct = parser.get_structure("target", target_path)
16
  design_struct = parser.get_structure("design", uploaded_file_path)
17
 
18
- sup = Superimposer()
19
- # Aligning using Alpha Carbons (the backbone 'skeleton')
20
- t_atoms = [a for a in target_struct.get_atoms() if a.get_name() == 'CA']
21
- d_atoms = [a for a in design_struct.get_atoms() if a.get_name() == 'CA']
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
 
23
- # Ensure we are comparing the same number of atoms
24
- min_len = min(len(t_atoms), len(d_atoms))
25
- sup.set_atoms(t_atoms[:min_len], d_atoms[:min_len])
 
 
 
 
 
 
 
 
 
26
  sup.apply(design_struct.get_atoms())
27
 
28
- rmsd = sup.rms # This is your 0.75A proof
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
 
30
  # 3. EXPORT
31
  # This saves the design in the same 3D coordinate space as the human receptor
@@ -33,4 +150,50 @@ def polish_design(target_pdb_id, uploaded_file_path):
33
  io.set_structure(design_struct)
34
  io.save(output_name)
35
 
36
- return output_name, rmsd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
  from Bio.PDB import PDBParser, Superimposer, PDBIO
3
 
4
def get_core_rmsd(reference_pdb, design_pdb, plddt_threshold=70.0):
    """
    Calculate RMSD using only high-confidence residues (pLDDT >= threshold).

    Focuses the alignment on the core scaffold, ignoring low-confidence
    regions. Handles both normalized (0-1) and raw pLDDT (0-100) values
    stored in the B-factor column (ESMFold/AlphaFold convention).

    Args:
        reference_pdb: Path to the reference (target) PDB file.
        design_pdb: Path to the designed/predicted PDB file whose B-factors
            hold per-residue pLDDT.
        plddt_threshold: Confidence cutoff on the raw 0-100 pLDDT scale.

    Returns:
        (rmsd, n_residues): RMSD in Angstroms over the aligned CA atoms and
        the number of residue pairs used.
    """
    parser = PDBParser(QUIET=True)
    ref_struct = parser.get_structure("ref", reference_pdb)
    des_struct = parser.get_structure("des", design_pdb)

    ref_atoms = []
    des_atoms = []

    # Detect whether B-factors are normalized (0-1) or raw pLDDT (0-100).
    # Scan ALL CA B-factors and use the maximum: sampling a single residue
    # can misclassify the scale if that residue happens to be low-confidence.
    ca_bfactors = [
        res['CA'].get_bfactor()
        for res in des_struct.get_residues()
        if 'CA' in res
    ]

    # If the largest B-factor is <= 1.0, assume normalized (0-1 scale);
    # otherwise assume raw pLDDT (0-100 scale).
    is_normalized = bool(ca_bfactors) and max(ca_bfactors) <= 1.0

    # Adjust the cutoff to the detected scale (70 pLDDT -> 0.70 normalized).
    if is_normalized:
        actual_threshold = plddt_threshold / 100.0
    else:
        actual_threshold = plddt_threshold

    # Pair residues positionally and keep high-confidence CA atoms.
    # NOTE(review): zip() assumes both structures list residues in the same
    # order and correspondence — confirm inputs share numbering/topology.
    for ref_res, des_res in zip(ref_struct.get_residues(), des_struct.get_residues()):
        # ESMFold/AlphaFold store pLDDT in the B-factor column.
        # Only alpha carbons (CA) are used for a standard backbone alignment.
        if 'CA' in des_res and 'CA' in ref_res:
            plddt = des_res['CA'].get_bfactor()

            if plddt >= actual_threshold:
                ref_atoms.append(ref_res['CA'])
                des_atoms.append(des_res['CA'])

    if len(ref_atoms) == 0:
        # Fallback: no residue cleared the cutoff, so align on all CA atoms
        # (truncated to the shorter structure so the atom lists match).
        ref_atoms = [a for a in ref_struct.get_atoms() if a.get_name() == 'CA']
        des_atoms = [a for a in des_struct.get_atoms() if a.get_name() == 'CA']
        min_len = min(len(ref_atoms), len(des_atoms))
        ref_atoms = ref_atoms[:min_len]
        des_atoms = des_atoms[:min_len]

    # Superimpose the design onto the reference and report the RMSD.
    super_imposer = Superimposer()
    super_imposer.set_atoms(ref_atoms, des_atoms)
    super_imposer.apply(des_struct.get_atoms())

    return super_imposer.rms, len(ref_atoms)
62
+
63
+ def polish_design(target_pdb_id, uploaded_file_path, plddt_threshold=70.0):
64
+ """
65
+ Performs high-precision structural alignment using core-scaffold RMSD.
66
+ Uses only high-confidence residues (pLDDT > threshold) for more meaningful metrics.
67
+ Returns both global and core RMSD values.
68
  """
69
  # 1. Setup paths
 
70
  target_path = os.path.join("data", f"{target_pdb_id.lower()}.pdb")
71
  output_name = "Refined_Shuttle.pdb"
72
 
73
+ # 2. ALIGNMENT using core-scaffold RMSD (high-confidence residues only)
74
  parser = PDBParser(QUIET=True)
75
  target_struct = parser.get_structure("target", target_path)
76
  design_struct = parser.get_structure("design", uploaded_file_path)
77
 
78
+ # Get atoms for alignment - filter by pLDDT if available
79
+ ref_atoms = []
80
+ des_atoms = []
81
+ ref_atoms_high_conf = [] # For pLDDT > 80
82
+ des_atoms_high_conf = [] # For pLDDT > 80
83
+
84
+ # Detect if B-factors are normalized (0-1) or raw pLDDT (0-100)
85
+ sample_bfactor = None
86
+ for res in design_struct.get_residues():
87
+ if 'CA' in res:
88
+ sample_bfactor = res['CA'].get_bfactor()
89
+ break
90
+
91
+ is_normalized = sample_bfactor is not None and sample_bfactor < 1.0
92
+ actual_threshold = (plddt_threshold / 100.0) if is_normalized else plddt_threshold
93
+ high_conf_threshold = (80.0 / 100.0) if is_normalized else 80.0
94
+
95
+ # Collect atoms for alignment (using plddt_threshold)
96
+ # Also collect high-confidence atoms (pLDDT > 80) for detailed report
97
+ for ref_res, des_res in zip(target_struct.get_residues(), design_struct.get_residues()):
98
+ if 'CA' in des_res and 'CA' in ref_res:
99
+ plddt = des_res['CA'].get_bfactor()
100
+ if plddt >= actual_threshold:
101
+ ref_atoms.append(ref_res['CA'])
102
+ des_atoms.append(des_res['CA'])
103
+ if plddt >= high_conf_threshold:
104
+ ref_atoms_high_conf.append(ref_res['CA'])
105
+ des_atoms_high_conf.append(des_res['CA'])
106
 
107
+ # Fallback to all CA atoms if no high-confidence ones found
108
+ if len(ref_atoms) == 0:
109
+ print(f"⚠️ No residues with pLDDT >= {plddt_threshold}. Using all residues.")
110
+ ref_atoms = [a for a in target_struct.get_atoms() if a.get_name() == 'CA']
111
+ des_atoms = [a for a in design_struct.get_atoms() if a.get_name() == 'CA']
112
+ min_len = min(len(ref_atoms), len(des_atoms))
113
+ ref_atoms = ref_atoms[:min_len]
114
+ des_atoms = des_atoms[:min_len]
115
+
116
+ # Perform alignment using the main threshold atoms
117
+ sup = Superimposer()
118
+ sup.set_atoms(ref_atoms, des_atoms)
119
  sup.apply(design_struct.get_atoms())
120
 
121
+ core_rmsd = sup.rms
122
+ num_residues = len(ref_atoms)
123
+ print(f"🎯 Core-Scaffold RMSD (pLDDT > {plddt_threshold}): {core_rmsd:.3f} Å ({num_residues} residues)")
124
+
125
+ # Calculate global RMSD (all CA atoms)
126
+ all_ref_atoms = [a for a in target_struct.get_atoms() if a.get_name() == 'CA']
127
+ all_des_atoms = [a for a in design_struct.get_atoms() if a.get_name() == 'CA']
128
+ min_len = min(len(all_ref_atoms), len(all_des_atoms))
129
+ all_ref_atoms = all_ref_atoms[:min_len]
130
+ all_des_atoms = all_des_atoms[:min_len]
131
+
132
+ # Calculate global RMSD after alignment
133
+ sup_global = Superimposer()
134
+ sup_global.set_atoms(all_ref_atoms, all_des_atoms)
135
+ global_rmsd = sup_global.rms
136
+
137
+ # Calculate high-confidence core RMSD (pLDDT > 80)
138
+ high_conf_rmsd = None
139
+ if len(ref_atoms_high_conf) > 0:
140
+ sup_high_conf = Superimposer()
141
+ sup_high_conf.set_atoms(ref_atoms_high_conf, des_atoms_high_conf)
142
+ high_conf_rmsd = sup_high_conf.rms
143
+ else:
144
+ # If no high-confidence atoms, use core_rmsd as fallback
145
+ high_conf_rmsd = core_rmsd
146
 
147
  # 3. EXPORT
148
  # This saves the design in the same 3D coordinate space as the human receptor
 
150
  io.set_structure(design_struct)
151
  io.save(output_name)
152
 
153
+ return output_name, global_rmsd, core_rmsd, high_conf_rmsd
154
+
155
def process_results(target_pdb_id, result_pdb, global_rmsd, core_rmsd):
    """
    Generate a detailed structural validation report with tiered RMSD analysis.

    Args:
        target_pdb_id: Target PDB ID
        result_pdb: Path to the aligned result PDB
        global_rmsd: Global RMSD (all residues)
        core_rmsd: High-confidence core RMSD (pLDDT > 80)

    Returns:
        str: Formatted validation report
    """
    # Verdict tiers, checked in ascending RMSD order; the first upper bound
    # that core_rmsd falls under wins. Anything >= 2.0 A needs review.
    tiers = (
        (1.0, "✅", "Success - High-Precision Core Match"),
        (2.0, "⚠️", "Good - Minor Core Deviation"),
    )
    status_emoji, status = "❌", "Possible Fold Drift - Review Required"
    for upper_bound, emoji, label in tiers:
        if core_rmsd < upper_bound:
            status_emoji, status = emoji, label
            break

    report = f"""
### 🔬 Structural Validation Report

**Target:** {target_pdb_id.upper()}

**RMSD Metrics:**
- **Global RMSD:** {global_rmsd:.2f} Å (all residues)
- **High-Confidence Core RMSD (pLDDT > 80):** {core_rmsd:.2f} Å

**Design Status:** {status_emoji} {status}

**Interpretation:**
- Core RMSD < 1.0 Å: Excellent scaffold preservation
- Core RMSD 1.0-2.0 Å: Good structural match
- Core RMSD > 2.0 Å: Possible fold drift, review structure
"""
    return report