nikraf commited on
Commit
714cf46
·
verified ·
1 Parent(s): 9b113fb

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. README.md +65 -0
  2. config.json +116 -0
  3. model.safetensors +3 -0
  4. packaged_probe_model.py +215 -0
  5. protify/FastPLMs/__init__.py +0 -0
  6. protify/FastPLMs/boltz/scripts/eval/aggregate_evals.py +753 -0
  7. protify/FastPLMs/boltz/scripts/eval/physcialsim_metrics.py +304 -0
  8. protify/FastPLMs/boltz/scripts/eval/run_evals.py +167 -0
  9. protify/FastPLMs/boltz/scripts/process/ccd.py +295 -0
  10. protify/FastPLMs/boltz/scripts/process/cluster.py +111 -0
  11. protify/FastPLMs/boltz/scripts/process/mmcif.py +1123 -0
  12. protify/FastPLMs/boltz/scripts/process/msa.py +130 -0
  13. protify/FastPLMs/boltz/scripts/process/rcsb.py +359 -0
  14. protify/FastPLMs/boltz/scripts/train/train.py +241 -0
  15. protify/FastPLMs/boltz/src/boltz/__init__.py +7 -0
  16. protify/FastPLMs/boltz/src/boltz/data/__init__.py +0 -0
  17. protify/FastPLMs/boltz/src/boltz/data/const.py +1184 -0
  18. protify/FastPLMs/boltz/src/boltz/data/crop/__init__.py +0 -0
  19. protify/FastPLMs/boltz/src/boltz/data/crop/affinity.py +164 -0
  20. protify/FastPLMs/boltz/src/boltz/data/crop/boltz.py +296 -0
  21. protify/FastPLMs/boltz/src/boltz/data/crop/cropper.py +45 -0
  22. protify/FastPLMs/boltz/src/boltz/data/feature/__init__.py +0 -0
  23. protify/FastPLMs/boltz/src/boltz/data/feature/featurizer.py +1225 -0
  24. protify/FastPLMs/boltz/src/boltz/data/feature/featurizerv2.py +2354 -0
  25. protify/FastPLMs/boltz/src/boltz/data/feature/symmetry.py +602 -0
  26. protify/FastPLMs/boltz/src/boltz/data/filter/__init__.py +0 -0
  27. protify/FastPLMs/boltz/src/boltz/data/filter/dynamic/__init__.py +0 -0
  28. protify/FastPLMs/boltz/src/boltz/data/filter/dynamic/date.py +76 -0
  29. protify/FastPLMs/boltz/src/boltz/data/filter/dynamic/filter.py +24 -0
  30. protify/FastPLMs/boltz/src/boltz/data/filter/dynamic/max_residues.py +37 -0
  31. protify/FastPLMs/boltz/src/boltz/data/filter/dynamic/resolution.py +34 -0
  32. protify/FastPLMs/boltz/src/boltz/data/filter/dynamic/size.py +38 -0
  33. protify/FastPLMs/boltz/src/boltz/data/filter/dynamic/subset.py +42 -0
  34. protify/FastPLMs/boltz/src/boltz/data/filter/static/__init__.py +0 -0
  35. protify/FastPLMs/boltz/src/boltz/data/filter/static/filter.py +26 -0
  36. protify/FastPLMs/boltz/src/boltz/data/filter/static/ligand.py +37 -0
  37. protify/FastPLMs/boltz/src/boltz/data/filter/static/polymer.py +299 -0
  38. protify/FastPLMs/boltz/src/boltz/data/module/__init__.py +0 -0
  39. protify/FastPLMs/boltz/src/boltz/data/module/inference.py +310 -0
  40. protify/FastPLMs/boltz/src/boltz/data/module/inferencev2.py +433 -0
  41. protify/FastPLMs/boltz/src/boltz/data/module/training.py +687 -0
  42. protify/FastPLMs/boltz/src/boltz/data/module/trainingv2.py +660 -0
  43. protify/FastPLMs/boltz/src/boltz/data/mol.py +900 -0
  44. protify/FastPLMs/boltz/src/boltz/data/msa/__init__.py +0 -0
  45. protify/FastPLMs/boltz/src/boltz/data/msa/mmseqs2.py +286 -0
  46. protify/FastPLMs/boltz/src/boltz/data/pad.py +84 -0
  47. protify/FastPLMs/boltz/src/boltz/data/parse/__init__.py +0 -0
  48. protify/FastPLMs/boltz/src/boltz/data/parse/a3m.py +134 -0
  49. protify/FastPLMs/boltz/src/boltz/data/parse/csv.py +100 -0
  50. protify/FastPLMs/boltz/src/boltz/data/parse/fasta.py +138 -0
README.md ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ tags: []
4
+ ---
5
+
6
+ # nikraf/OmniPath_2class_clustered-30_ESMC-600_2026-03-11-15-46_NQRV
7
+
8
+ Fine-tuned with Protify.
9
+
10
+ ## About Protify
11
+
12
+ Protify is an open source platform designed to simplify and democratize workflows for chemical language models. With Protify, deep learning models can be trained to predict chemical properties without requiring extensive coding knowledge or computational resources.
13
+
14
+ ### Why Protify?
15
+
16
+ - Benchmark multiple models efficiently.
17
+ - Flexible for all skill levels.
18
+ - Accessible computing with support for precomputed embeddings.
19
+ - Cost-effective workflows for training and evaluation.
20
+
21
+ ## Training Run
22
+
23
+ - `dataset`: OmniPath_2class_clustered-30
24
+ - `model`: ESMC-600
25
+ - `run_id`: 2026-03-11-15-46_NQRV
26
+ - `task_type`: singlelabel
27
+ - `num_runs`: 1
28
+
29
+ ## Dataset Statistics
30
+
31
+ - `train_size`: 102872
32
+ - `valid_size`: 18102
33
+ - `test_size`: 18074
34
+
35
+ ## Validation Metrics
36
+
37
+ - `epoch`: 5.000000
38
+ - `eval_accuracy`: 0.789750
39
+ - `eval_f1`: 0.789330
40
+ - `eval_loss`: 0.445219
41
+ - `eval_mcc`: 0.581780
42
+ - `eval_model_preparation_time`: 0.000300
43
+ - `eval_pr_auc`: 0.884610
44
+ - `eval_precision`: 0.792040
45
+ - `eval_recall`: 0.789750
46
+ - `eval_roc_auc`: 0.880010
47
+ - `eval_runtime`: 21.260300
48
+ - `eval_samples_per_second`: 851.444000
49
+ - `eval_steps_per_second`: 13.311000
50
+
51
+ ## Test Metrics
52
+
53
+ - `test_accuracy`: 0.779350
54
+ - `test_f1`: 0.778210
55
+ - `test_loss`: 0.455012
56
+ - `test_mcc`: 0.564560
57
+ - `test_model_preparation_time`: 0.000300
58
+ - `test_pr_auc`: 0.884200
59
+ - `test_precision`: 0.785240
60
+ - `test_recall`: 0.779350
61
+ - `test_roc_auc`: 0.874270
62
+ - `test_runtime`: 21.119900
63
+ - `test_samples_per_second`: 855.780000
64
+ - `test_steps_per_second`: 13.400000
65
+ - `training_time_seconds`: 1235.285100
config.json ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_token_ids": false,
3
+ "architectures": [
4
+ "PackagedProbeModel"
5
+ ],
6
+ "auto_map": {
7
+ "AutoConfig": "packaged_probe_model.PackagedProbeConfig",
8
+ "AutoModel": "packaged_probe_model.PackagedProbeModel"
9
+ },
10
+ "base_model_name": "ESMC-600",
11
+ "dtype": "float32",
12
+ "matrix_embed": true,
13
+ "model_type": "packaged_probe",
14
+ "pooling_types": [
15
+ "mean",
16
+ "var"
17
+ ],
18
+ "ppi": true,
19
+ "probe_config": {
20
+ "_name_or_path": "",
21
+ "add_cross_attention": false,
22
+ "add_token_ids": false,
23
+ "architectures": [
24
+ "TransformerForSequenceClassification"
25
+ ],
26
+ "bad_words_ids": null,
27
+ "begin_suppress_tokens": null,
28
+ "bos_token_id": null,
29
+ "chunk_size_feed_forward": 0,
30
+ "classifier_dropout": 0.2,
31
+ "classifier_size": 4096,
32
+ "cross_attention_hidden_size": null,
33
+ "decoder_start_token_id": null,
34
+ "diversity_penalty": 0.0,
35
+ "do_sample": false,
36
+ "dropout": 0.2,
37
+ "dtype": "float32",
38
+ "early_stopping": false,
39
+ "encoder_no_repeat_ngram_size": 0,
40
+ "eos_token_id": null,
41
+ "exponential_decay_length_penalty": null,
42
+ "finetuning_task": null,
43
+ "forced_bos_token_id": null,
44
+ "forced_eos_token_id": null,
45
+ "hidden_size": 512,
46
+ "id2label": {
47
+ "0": "LABEL_0",
48
+ "1": "LABEL_1"
49
+ },
50
+ "input_size": 1152,
51
+ "is_decoder": false,
52
+ "is_encoder_decoder": false,
53
+ "label2id": {
54
+ "LABEL_0": 0,
55
+ "LABEL_1": 1
56
+ },
57
+ "length_penalty": 1.0,
58
+ "lora": false,
59
+ "lora_alpha": 32.0,
60
+ "lora_dropout": 0.01,
61
+ "lora_r": 8,
62
+ "max_length": 20,
63
+ "min_length": 0,
64
+ "model_type": "probe",
65
+ "n_heads": 4,
66
+ "n_layers": 1,
67
+ "no_repeat_ngram_size": 0,
68
+ "num_beam_groups": 1,
69
+ "num_beams": 1,
70
+ "num_return_sequences": 1,
71
+ "output_attentions": false,
72
+ "output_hidden_states": false,
73
+ "output_scores": false,
74
+ "pad_token_id": null,
75
+ "pooling_types": [
76
+ "mean",
77
+ "cls"
78
+ ],
79
+ "pre_ln": true,
80
+ "prefix": null,
81
+ "probe_type": "transformer",
82
+ "problem_type": null,
83
+ "pruned_heads": {},
84
+ "remove_invalid_values": false,
85
+ "repetition_penalty": 1.0,
86
+ "return_dict": true,
87
+ "return_dict_in_generate": false,
88
+ "rotary": true,
89
+ "sep_token_id": null,
90
+ "sim_type": "dot",
91
+ "suppress_tokens": null,
92
+ "task_specific_params": null,
93
+ "task_type": "singlelabel",
94
+ "temperature": 1.0,
95
+ "tf_legacy_loss": false,
96
+ "tie_encoder_decoder": false,
97
+ "tie_word_embeddings": true,
98
+ "token_attention": false,
99
+ "tokenizer_class": null,
100
+ "tokenwise": false,
101
+ "top_k": 50,
102
+ "top_p": 1.0,
103
+ "torchscript": false,
104
+ "transformer_dropout": 0.1,
105
+ "transformer_hidden_size": 512,
106
+ "transformers_version": "4.57.6",
107
+ "typical_p": 1.0,
108
+ "use_bfloat16": false,
109
+ "use_bias": false
110
+ },
111
+ "probe_type": "transformer",
112
+ "sep_token_id": 2,
113
+ "task_type": "singlelabel",
114
+ "tokenwise": false,
115
+ "transformers_version": "4.57.6"
116
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ec8ea16612d2975dad1abb1da8977591cbba6ff2b0566374755120e6e950bded
3
+ size 2331568712
packaged_probe_model.py ADDED
@@ -0,0 +1,215 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ from typing import Any, Dict, Optional
4
+
5
+ import torch
6
+ from torch import nn
7
+ from transformers import AutoModel, PreTrainedModel, PretrainedConfig
8
+ from transformers.modeling_outputs import SequenceClassifierOutput, TokenClassifierOutput
9
+
10
+
11
+ try:
12
+ from protify.base_models.supported_models import all_presets_with_paths
13
+ from protify.pooler import Pooler
14
+ from protify.probes.get_probe import rebuild_probe_from_saved_config
15
+ except ImportError:
16
+ current_dir = os.path.dirname(os.path.abspath(__file__))
17
+ candidate_paths = [
18
+ current_dir,
19
+ os.path.dirname(current_dir),
20
+ os.path.dirname(os.path.dirname(current_dir)),
21
+ os.path.join(current_dir, "src"),
22
+ ]
23
+ for candidate in candidate_paths:
24
+ if os.path.isdir(candidate) and candidate not in sys.path:
25
+ sys.path.insert(0, candidate)
26
+ from protify.base_models.supported_models import all_presets_with_paths
27
+ from protify.pooler import Pooler
28
+ from protify.probes.get_probe import rebuild_probe_from_saved_config
29
+
30
+
31
+ class PackagedProbeConfig(PretrainedConfig):
32
+ model_type = "packaged_probe"
33
+
34
+ def __init__(
35
+ self,
36
+ base_model_name: str = "",
37
+ probe_type: str = "linear",
38
+ probe_config: Optional[Dict[str, Any]] = None,
39
+ tokenwise: bool = False,
40
+ matrix_embed: bool = False,
41
+ pooling_types: Optional[list[str]] = None,
42
+ task_type: str = "singlelabel",
43
+ num_labels: int = 2,
44
+ ppi: bool = False,
45
+ add_token_ids: bool = False,
46
+ sep_token_id: Optional[int] = None,
47
+ **kwargs,
48
+ ):
49
+ super().__init__(**kwargs)
50
+ self.base_model_name = base_model_name
51
+ self.probe_type = probe_type
52
+ self.probe_config = {} if probe_config is None else probe_config
53
+ self.tokenwise = tokenwise
54
+ self.matrix_embed = matrix_embed
55
+ self.pooling_types = ["mean"] if pooling_types is None else pooling_types
56
+ self.task_type = task_type
57
+ self.num_labels = num_labels
58
+ self.ppi = ppi
59
+ self.add_token_ids = add_token_ids
60
+ self.sep_token_id = sep_token_id
61
+
62
+
63
+ class PackagedProbeModel(PreTrainedModel):
64
+ config_class = PackagedProbeConfig
65
+ base_model_prefix = "backbone"
66
+ all_tied_weights_keys = {}
67
+
68
+ def __init__(
69
+ self,
70
+ config: PackagedProbeConfig,
71
+ base_model: Optional[nn.Module] = None,
72
+ probe: Optional[nn.Module] = None,
73
+ ):
74
+ super().__init__(config)
75
+ self.config = config
76
+ self.backbone = self._load_base_model() if base_model is None else base_model
77
+ self.probe = self._load_probe() if probe is None else probe
78
+ self.pooler = Pooler(self.config.pooling_types)
79
+
80
+ def _load_base_model(self) -> nn.Module:
81
+ if self.config.base_model_name in all_presets_with_paths:
82
+ model_path = all_presets_with_paths[self.config.base_model_name]
83
+ else:
84
+ model_path = self.config.base_model_name
85
+ model = AutoModel.from_pretrained(model_path, trust_remote_code=True)
86
+ model.eval()
87
+ return model
88
+
89
+ def _load_probe(self) -> nn.Module:
90
+ return rebuild_probe_from_saved_config(
91
+ probe_type=self.config.probe_type,
92
+ tokenwise=self.config.tokenwise,
93
+ probe_config=self.config.probe_config,
94
+ )
95
+
96
+ @staticmethod
97
+ def _extract_hidden_states(backbone_output: Any) -> torch.Tensor:
98
+ if isinstance(backbone_output, tuple):
99
+ return backbone_output[0]
100
+ if hasattr(backbone_output, "last_hidden_state"):
101
+ return backbone_output.last_hidden_state
102
+ if isinstance(backbone_output, torch.Tensor):
103
+ return backbone_output
104
+ raise ValueError("Unsupported backbone output format for packaged probe model")
105
+
106
+ @staticmethod
107
+ def _extract_attentions(backbone_output: Any) -> Optional[torch.Tensor]:
108
+ if hasattr(backbone_output, "attentions"):
109
+ return backbone_output.attentions
110
+ return None
111
+
112
+ def _build_ppi_segment_masks(
113
+ self,
114
+ input_ids: torch.Tensor,
115
+ attention_mask: torch.Tensor,
116
+ token_type_ids: Optional[torch.Tensor],
117
+ ) -> tuple[torch.Tensor, torch.Tensor]:
118
+ if token_type_ids is not None and torch.any(token_type_ids == 1):
119
+ mask_a = ((token_type_ids == 0) & (attention_mask == 1)).long()
120
+ mask_b = ((token_type_ids == 1) & (attention_mask == 1)).long()
121
+ assert torch.all(mask_a.sum(dim=1) > 0), "PPI token_type_ids produced empty segment A"
122
+ assert torch.all(mask_b.sum(dim=1) > 0), "PPI token_type_ids produced empty segment B"
123
+ return mask_a, mask_b
124
+
125
+ assert self.config.sep_token_id is not None, "sep_token_id is required for PPI fallback segmentation"
126
+ batch_size, seq_len = input_ids.shape
127
+ mask_a = torch.zeros((batch_size, seq_len), dtype=torch.long, device=input_ids.device)
128
+ mask_b = torch.zeros((batch_size, seq_len), dtype=torch.long, device=input_ids.device)
129
+
130
+ for batch_idx in range(batch_size):
131
+ valid_positions = torch.where(attention_mask[batch_idx] == 1)[0]
132
+ sep_positions = torch.where((input_ids[batch_idx] == self.config.sep_token_id) & (attention_mask[batch_idx] == 1))[0]
133
+ if len(valid_positions) == 0:
134
+ continue
135
+
136
+ if len(sep_positions) >= 2:
137
+ first_sep = int(sep_positions[0].item())
138
+ second_sep = int(sep_positions[1].item())
139
+ mask_a[batch_idx, :first_sep + 1] = 1
140
+ mask_b[batch_idx, first_sep + 1:second_sep + 1] = 1
141
+ elif len(sep_positions) == 1:
142
+ first_sep = int(sep_positions[0].item())
143
+ mask_a[batch_idx, :first_sep + 1] = 1
144
+ mask_b[batch_idx, first_sep + 1: int(valid_positions[-1].item()) + 1] = 1
145
+ else:
146
+ midpoint = len(valid_positions) // 2
147
+ mask_a[batch_idx, valid_positions[:midpoint]] = 1
148
+ mask_b[batch_idx, valid_positions[midpoint:]] = 1
149
+
150
+ assert torch.all(mask_a.sum(dim=1) > 0), "PPI fallback segmentation produced empty segment A"
151
+ assert torch.all(mask_b.sum(dim=1) > 0), "PPI fallback segmentation produced empty segment B"
152
+ return mask_a, mask_b
153
+
154
+ def _build_probe_inputs(
155
+ self,
156
+ hidden_states: torch.Tensor,
157
+ input_ids: torch.Tensor,
158
+ attention_mask: torch.Tensor,
159
+ token_type_ids: Optional[torch.Tensor],
160
+ attentions: Optional[torch.Tensor],
161
+ ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
162
+ if self.config.ppi and (not self.config.matrix_embed) and (not self.config.tokenwise):
163
+ mask_a, mask_b = self._build_ppi_segment_masks(input_ids, attention_mask, token_type_ids)
164
+ vec_a = self.pooler(hidden_states, attention_mask=mask_a, attentions=attentions)
165
+ vec_b = self.pooler(hidden_states, attention_mask=mask_b, attentions=attentions)
166
+ return torch.cat((vec_a, vec_b), dim=-1), None
167
+
168
+ if self.config.matrix_embed or self.config.tokenwise:
169
+ return hidden_states, attention_mask
170
+
171
+ pooled = self.pooler(hidden_states, attention_mask=attention_mask, attentions=attentions)
172
+ return pooled, None
173
+
174
+ def forward(
175
+ self,
176
+ input_ids: torch.Tensor,
177
+ attention_mask: Optional[torch.Tensor] = None,
178
+ token_type_ids: Optional[torch.Tensor] = None,
179
+ labels: Optional[torch.Tensor] = None,
180
+ ) -> SequenceClassifierOutput | TokenClassifierOutput:
181
+ if attention_mask is None:
182
+ attention_mask = torch.ones_like(input_ids, dtype=torch.long)
183
+
184
+ requires_attentions = "parti" in self.config.pooling_types and (not self.config.matrix_embed) and (not self.config.tokenwise)
185
+ backbone_kwargs: Dict[str, Any] = {"input_ids": input_ids, "attention_mask": attention_mask}
186
+ if requires_attentions:
187
+ backbone_kwargs["output_attentions"] = True
188
+ backbone_output = self.backbone(**backbone_kwargs)
189
+ hidden_states = self._extract_hidden_states(backbone_output)
190
+ attentions = self._extract_attentions(backbone_output)
191
+ if requires_attentions:
192
+ assert attentions is not None, "parti pooling requires base model attentions"
193
+ probe_embeddings, probe_attention_mask = self._build_probe_inputs(
194
+ hidden_states=hidden_states,
195
+ input_ids=input_ids,
196
+ attention_mask=attention_mask,
197
+ token_type_ids=token_type_ids,
198
+ attentions=attentions,
199
+ )
200
+
201
+ if self.config.probe_type == "linear":
202
+ return self.probe(embeddings=probe_embeddings, labels=labels)
203
+
204
+ if self.config.probe_type == "transformer":
205
+ forward_kwargs: Dict[str, Any] = {"embeddings": probe_embeddings, "labels": labels}
206
+ if probe_attention_mask is not None:
207
+ forward_kwargs["attention_mask"] = probe_attention_mask
208
+ if self.config.add_token_ids and token_type_ids is not None and probe_attention_mask is not None:
209
+ forward_kwargs["token_type_ids"] = token_type_ids
210
+ return self.probe(**forward_kwargs)
211
+
212
+ if self.config.probe_type in ["retrievalnet", "lyra"]:
213
+ return self.probe(embeddings=probe_embeddings, attention_mask=probe_attention_mask, labels=labels)
214
+
215
+ raise ValueError(f"Unsupported probe type for packaged model: {self.config.probe_type}")
protify/FastPLMs/__init__.py ADDED
File without changes
protify/FastPLMs/boltz/scripts/eval/aggregate_evals.py ADDED
@@ -0,0 +1,753 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from pathlib import Path
3
+
4
+ import matplotlib.pyplot as plt
5
+ import numpy as np
6
+ import pandas as pd
7
+ from tqdm import tqdm
8
+
9
+ METRICS = ["lddt", "bb_lddt", "tm_score", "rmsd"]
10
+
11
+
12
+ def compute_af3_metrics(preds, evals, name):
13
+ metrics = {}
14
+
15
+ top_model = None
16
+ top_confidence = -1000
17
+ for model_id in range(5):
18
+ # Load confidence file
19
+ confidence_file = (
20
+ Path(preds) / f"seed-1_sample-{model_id}" / "summary_confidences.json"
21
+ )
22
+ with confidence_file.open("r") as f:
23
+ confidence_data = json.load(f)
24
+ confidence = confidence_data["ranking_score"]
25
+ if confidence > top_confidence:
26
+ top_model = model_id
27
+ top_confidence = confidence
28
+
29
+ # Load eval file
30
+ eval_file = Path(evals) / f"{name}_model_{model_id}.json"
31
+ with eval_file.open("r") as f:
32
+ eval_data = json.load(f)
33
+ for metric_name in METRICS:
34
+ if metric_name in eval_data:
35
+ metrics.setdefault(metric_name, []).append(eval_data[metric_name])
36
+
37
+ if "dockq" in eval_data and eval_data["dockq"] is not None:
38
+ metrics.setdefault("dockq_>0.23", []).append(
39
+ np.mean(
40
+ [float(v > 0.23) for v in eval_data["dockq"] if v is not None]
41
+ )
42
+ )
43
+ metrics.setdefault("dockq_>0.49", []).append(
44
+ np.mean(
45
+ [float(v > 0.49) for v in eval_data["dockq"] if v is not None]
46
+ )
47
+ )
48
+ metrics.setdefault("len_dockq_", []).append(
49
+ len([v for v in eval_data["dockq"] if v is not None])
50
+ )
51
+
52
+ eval_file = Path(evals) / f"{name}_model_{model_id}_ligand.json"
53
+ with eval_file.open("r") as f:
54
+ eval_data = json.load(f)
55
+ if "lddt_pli" in eval_data:
56
+ lddt_plis = [
57
+ x["score"] for x in eval_data["lddt_pli"]["assigned_scores"]
58
+ ]
59
+ for _ in eval_data["lddt_pli"][
60
+ "model_ligand_unassigned_reason"
61
+ ].items():
62
+ lddt_plis.append(0)
63
+ if not lddt_plis:
64
+ continue
65
+ lddt_pli = np.mean([x for x in lddt_plis])
66
+ metrics.setdefault("lddt_pli", []).append(lddt_pli)
67
+ metrics.setdefault("len_lddt_pli", []).append(len(lddt_plis))
68
+
69
+ if "rmsd" in eval_data:
70
+ rmsds = [x["score"] for x in eval_data["rmsd"]["assigned_scores"]]
71
+ for _ in eval_data["rmsd"]["model_ligand_unassigned_reason"].items():
72
+ rmsds.append(100)
73
+ if not rmsds:
74
+ continue
75
+ rmsd2 = np.mean([x < 2.0 for x in rmsds])
76
+ rmsd5 = np.mean([x < 5.0 for x in rmsds])
77
+ metrics.setdefault("rmsd<2", []).append(rmsd2)
78
+ metrics.setdefault("rmsd<5", []).append(rmsd5)
79
+ metrics.setdefault("len_rmsd", []).append(len(rmsds))
80
+
81
+ # Get oracle
82
+ oracle = {k: min(v) if k == "rmsd" else max(v) for k, v in metrics.items()}
83
+ avg = {k: sum(v) / len(v) for k, v in metrics.items()}
84
+ top1 = {k: v[top_model] for k, v in metrics.items()}
85
+
86
+ results = {}
87
+ for metric_name in metrics:
88
+ if metric_name.startswith("len_"):
89
+ continue
90
+ if metric_name == "lddt_pli":
91
+ l = metrics["len_lddt_pli"][0]
92
+ elif metric_name == "rmsd<2" or metric_name == "rmsd<5":
93
+ l = metrics["len_rmsd"][0]
94
+ elif metric_name == "dockq_>0.23" or metric_name == "dockq_>0.49":
95
+ l = metrics["len_dockq_"][0]
96
+ else:
97
+ l = 1
98
+ results[metric_name] = {
99
+ "oracle": oracle[metric_name],
100
+ "average": avg[metric_name],
101
+ "top1": top1[metric_name],
102
+ "len": l,
103
+ }
104
+
105
+ return results
106
+
107
+
108
+ def compute_chai_metrics(preds, evals, name):
109
+ metrics = {}
110
+
111
+ top_model = None
112
+ top_confidence = 0
113
+ for model_id in range(5):
114
+ # Load confidence file
115
+ confidence_file = Path(preds) / f"scores.model_idx_{model_id}.npz"
116
+ confidence_data = np.load(confidence_file)
117
+ confidence = confidence_data["aggregate_score"].item()
118
+ if confidence > top_confidence:
119
+ top_model = model_id
120
+ top_confidence = confidence
121
+
122
+ # Load eval file
123
+ eval_file = Path(evals) / f"{name}_model_{model_id}.json"
124
+ with eval_file.open("r") as f:
125
+ eval_data = json.load(f)
126
+ for metric_name in METRICS:
127
+ if metric_name in eval_data:
128
+ metrics.setdefault(metric_name, []).append(eval_data[metric_name])
129
+
130
+ if "dockq" in eval_data and eval_data["dockq"] is not None:
131
+ metrics.setdefault("dockq_>0.23", []).append(
132
+ np.mean(
133
+ [float(v > 0.23) for v in eval_data["dockq"] if v is not None]
134
+ )
135
+ )
136
+ metrics.setdefault("dockq_>0.49", []).append(
137
+ np.mean(
138
+ [float(v > 0.49) for v in eval_data["dockq"] if v is not None]
139
+ )
140
+ )
141
+ metrics.setdefault("len_dockq_", []).append(
142
+ len([v for v in eval_data["dockq"] if v is not None])
143
+ )
144
+
145
+ eval_file = Path(evals) / f"{name}_model_{model_id}_ligand.json"
146
+ with eval_file.open("r") as f:
147
+ eval_data = json.load(f)
148
+ if "lddt_pli" in eval_data:
149
+ lddt_plis = [
150
+ x["score"] for x in eval_data["lddt_pli"]["assigned_scores"]
151
+ ]
152
+ for _ in eval_data["lddt_pli"][
153
+ "model_ligand_unassigned_reason"
154
+ ].items():
155
+ lddt_plis.append(0)
156
+ if not lddt_plis:
157
+ continue
158
+ lddt_pli = np.mean([x for x in lddt_plis])
159
+ metrics.setdefault("lddt_pli", []).append(lddt_pli)
160
+ metrics.setdefault("len_lddt_pli", []).append(len(lddt_plis))
161
+
162
+ if "rmsd" in eval_data:
163
+ rmsds = [x["score"] for x in eval_data["rmsd"]["assigned_scores"]]
164
+ for _ in eval_data["rmsd"]["model_ligand_unassigned_reason"].items():
165
+ rmsds.append(100)
166
+ if not rmsds:
167
+ continue
168
+ rmsd2 = np.mean([x < 2.0 for x in rmsds])
169
+ rmsd5 = np.mean([x < 5.0 for x in rmsds])
170
+ metrics.setdefault("rmsd<2", []).append(rmsd2)
171
+ metrics.setdefault("rmsd<5", []).append(rmsd5)
172
+ metrics.setdefault("len_rmsd", []).append(len(rmsds))
173
+
174
+ # Get oracle
175
+ oracle = {k: min(v) if k == "rmsd" else max(v) for k, v in metrics.items()}
176
+ avg = {k: sum(v) / len(v) for k, v in metrics.items()}
177
+ top1 = {k: v[top_model] for k, v in metrics.items()}
178
+
179
+ results = {}
180
+ for metric_name in metrics:
181
+ if metric_name.startswith("len_"):
182
+ continue
183
+ if metric_name == "lddt_pli":
184
+ l = metrics["len_lddt_pli"][0]
185
+ elif metric_name == "rmsd<2" or metric_name == "rmsd<5":
186
+ l = metrics["len_rmsd"][0]
187
+ elif metric_name == "dockq_>0.23" or metric_name == "dockq_>0.49":
188
+ l = metrics["len_dockq_"][0]
189
+ else:
190
+ l = 1
191
+ results[metric_name] = {
192
+ "oracle": oracle[metric_name],
193
+ "average": avg[metric_name],
194
+ "top1": top1[metric_name],
195
+ "len": l,
196
+ }
197
+
198
+ return results
199
+
200
+
201
+ def compute_boltz_metrics(preds, evals, name):
202
+ metrics = {}
203
+
204
+ top_model = None
205
+ top_confidence = 0
206
+ for model_id in range(5):
207
+ # Load confidence file
208
+ confidence_file = (
209
+ Path(preds) / f"confidence_{Path(preds).name}_model_{model_id}.json"
210
+ )
211
+ with confidence_file.open("r") as f:
212
+ confidence_data = json.load(f)
213
+ confidence = confidence_data["confidence_score"]
214
+ if confidence > top_confidence:
215
+ top_model = model_id
216
+ top_confidence = confidence
217
+
218
+ # Load eval file
219
+ eval_file = Path(evals) / f"{name}_model_{model_id}.json"
220
+ with eval_file.open("r") as f:
221
+ eval_data = json.load(f)
222
+ for metric_name in METRICS:
223
+ if metric_name in eval_data:
224
+ metrics.setdefault(metric_name, []).append(eval_data[metric_name])
225
+
226
+ if "dockq" in eval_data and eval_data["dockq"] is not None:
227
+ metrics.setdefault("dockq_>0.23", []).append(
228
+ np.mean(
229
+ [float(v > 0.23) for v in eval_data["dockq"] if v is not None]
230
+ )
231
+ )
232
+ metrics.setdefault("dockq_>0.49", []).append(
233
+ np.mean(
234
+ [float(v > 0.49) for v in eval_data["dockq"] if v is not None]
235
+ )
236
+ )
237
+ metrics.setdefault("len_dockq_", []).append(
238
+ len([v for v in eval_data["dockq"] if v is not None])
239
+ )
240
+
241
+ eval_file = Path(evals) / f"{name}_model_{model_id}_ligand.json"
242
+ with eval_file.open("r") as f:
243
+ eval_data = json.load(f)
244
+ if "lddt_pli" in eval_data:
245
+ lddt_plis = [
246
+ x["score"] for x in eval_data["lddt_pli"]["assigned_scores"]
247
+ ]
248
+ for _ in eval_data["lddt_pli"][
249
+ "model_ligand_unassigned_reason"
250
+ ].items():
251
+ lddt_plis.append(0)
252
+ if not lddt_plis:
253
+ continue
254
+ lddt_pli = np.mean([x for x in lddt_plis])
255
+ metrics.setdefault("lddt_pli", []).append(lddt_pli)
256
+ metrics.setdefault("len_lddt_pli", []).append(len(lddt_plis))
257
+
258
+ if "rmsd" in eval_data:
259
+ rmsds = [x["score"] for x in eval_data["rmsd"]["assigned_scores"]]
260
+ for _ in eval_data["rmsd"]["model_ligand_unassigned_reason"].items():
261
+ rmsds.append(100)
262
+ if not rmsds:
263
+ continue
264
+ rmsd2 = np.mean([x < 2.0 for x in rmsds])
265
+ rmsd5 = np.mean([x < 5.0 for x in rmsds])
266
+ metrics.setdefault("rmsd<2", []).append(rmsd2)
267
+ metrics.setdefault("rmsd<5", []).append(rmsd5)
268
+ metrics.setdefault("len_rmsd", []).append(len(rmsds))
269
+
270
+ # Get oracle
271
+ oracle = {k: min(v) if k == "rmsd" else max(v) for k, v in metrics.items()}
272
+ avg = {k: sum(v) / len(v) for k, v in metrics.items()}
273
+ top1 = {k: v[top_model] for k, v in metrics.items()}
274
+
275
+ results = {}
276
+ for metric_name in metrics:
277
+ if metric_name.startswith("len_"):
278
+ continue
279
+ if metric_name == "lddt_pli":
280
+ l = metrics["len_lddt_pli"][0]
281
+ elif metric_name == "rmsd<2" or metric_name == "rmsd<5":
282
+ l = metrics["len_rmsd"][0]
283
+ elif metric_name == "dockq_>0.23" or metric_name == "dockq_>0.49":
284
+ l = metrics["len_dockq_"][0]
285
+ else:
286
+ l = 1
287
+ results[metric_name] = {
288
+ "oracle": oracle[metric_name],
289
+ "average": avg[metric_name],
290
+ "top1": top1[metric_name],
291
+ "len": l,
292
+ }
293
+
294
+ return results
295
+
296
+
297
def eval_models(
    chai_preds,
    chai_evals,
    af3_preds,
    af3_evals,
    boltz_preds,
    boltz_evals,
    boltz_preds_x,
    boltz_evals_x,
):
    """Aggregate per-target metrics for AF3, Chai-1, Boltz-1 and Boltz-1x.

    Only targets predicted by all four tools are kept, and a metric is
    retained only when every tool reports it over the same number of
    entities (so the averages are comparable).

    Returns
    -------
    pd.DataFrame
        Long-format results with columns: tool, target, metric, value.
    """

    def _index_preds(preds_dir):
        # Map lowercase target name -> prediction folder, skipping hidden entries.
        return {
            entry.name.lower(): entry
            for entry in Path(preds_dir).iterdir()
            if not entry.name.lower().startswith(".")
        }

    chai_names = _index_preds(chai_preds)
    af3_names = _index_preds(af3_preds)
    boltz_names = _index_preds(boltz_preds)
    boltz_names_x = _index_preds(boltz_preds_x)

    print("Chai preds", len(chai_names))
    print("Af3 preds", len(af3_names))
    print("Boltz preds", len(boltz_names))
    print("Boltzx preds", len(boltz_names_x))

    common = (
        set(chai_names) & set(af3_names) & set(boltz_names) & set(boltz_names_x)
    )

    # Remove examples in the validation set
    common -= {"t1133", "h1134", "r1134s1", "t1134s2", "t1121", "t1123", "t1159"}
    print("Common", len(common))

    def _safe_compute(compute_fn, pred_path, evals, name, label):
        # Compute metrics for one tool; log the traceback and return None on failure.
        try:
            return compute_fn(pred_path, evals, name)
        except Exception as e:
            import traceback

            traceback.print_exc()
            print(f"Error evaluating {label} {name}: {e}")
            return None

    # Create a dataframe with the following schema:
    # tool, name, metric, oracle, average, top1
    results = []
    for name in tqdm(common):
        # (results label, compute fn, prediction folder, eval folder, log label)
        specs = [
            ("AF3", compute_af3_metrics, af3_names[name], af3_evals, "AF3"),
            ("Chai-1", compute_chai_metrics, chai_names[name], chai_evals, "Chai"),
            ("Boltz-1", compute_boltz_metrics, boltz_names[name], boltz_evals, "Boltz"),
            ("Boltz-1x", compute_boltz_metrics, boltz_names_x[name], boltz_evals_x, "Boltzx"),
        ]
        per_tool = {}
        for tool_label, compute_fn, pred_path, evals, log_label in specs:
            metrics = _safe_compute(compute_fn, pred_path, evals, name, log_label)
            if metrics is None:
                break
            per_tool[tool_label] = metrics
        if len(per_tool) != len(specs):
            # At least one tool failed for this target; skip it entirely.
            continue

        af3_results = per_tool["AF3"]
        chai_results = per_tool["Chai-1"]
        boltz_results = per_tool["Boltz-1"]
        boltz_results_x = per_tool["Boltz-1x"]

        for metric_name in af3_results:
            # NOTE: the Boltz-1x membership check is included here; without it
            # a metric missing only from Boltz-1x raised an uncaught KeyError.
            if not (
                metric_name in chai_results
                and metric_name in boltz_results
                and metric_name in boltz_results_x
            ):
                print(
                    "Missing metric",
                    name,
                    metric_name,
                    metric_name in chai_results,
                    metric_name in boltz_results,
                    metric_name in boltz_results_x,
                )
                continue

            ref_len = af3_results[metric_name]["len"]
            if not all(
                per_tool[tool][metric_name]["len"] == ref_len for tool in per_tool
            ):
                print(
                    "Different lengths",
                    name,
                    metric_name,
                    af3_results[metric_name]["len"],
                    chai_results[metric_name]["len"],
                    boltz_results[metric_name]["len"],
                    boltz_results_x[metric_name]["len"],
                )
                continue

            # Emit oracle and top-1 rows for every tool, in a fixed order.
            for tool_label, metrics in per_tool.items():
                for suffix, key in (("oracle", "oracle"), ("top-1", "top1")):
                    results.append(
                        {
                            "tool": f"{tool_label} {suffix}",
                            "target": name,
                            "metric": metric_name,
                            "value": metrics[metric_name][key],
                        }
                    )

    # Only targets & metrics where all tools succeeded reach this point.
    return pd.DataFrame(results)
506
+
507
+
508
def eval_validity_checks(df):
    """Convert per-model physical-validity flags into tool/target/metric rows.

    Produces one "physical validity" row per (tool, target) for the top-1
    model (model_idx == 0) and one for the oracle (best over all models).
    """

    def _format(frame, label_by_tool):
        # Normalize a (tool, pdb_id, valid) frame into the shared results schema.
        frame = frame.rename(columns={"pdb_id": "target", "valid": "value"})
        frame["tool"] = frame["tool"].apply(lambda t: label_by_tool[t])
        frame["metric"] = "physical validity"
        frame["target"] = frame["target"].apply(lambda t: t.lower())
        return frame[["tool", "target", "metric", "value"]]

    top1_labels = {
        "af3": "AF3 top-1",
        "chai": "Chai-1 top-1",
        "boltz1": "Boltz-1 top-1",
        "boltz1x": "Boltz-1x top-1",
    }
    top1 = df.loc[df["model_idx"] == 0, ["tool", "pdb_id", "valid"]].copy()
    top1 = _format(top1, top1_labels)

    oracle_labels = {
        "af3": "AF3 oracle",
        "chai": "Chai-1 oracle",
        "boltz1": "Boltz-1 oracle",
        "boltz1x": "Boltz-1x oracle",
    }
    # Oracle = a target is valid if any sampled model passes the checks.
    oracle = df.groupby(["tool", "pdb_id"])["valid"].max().reset_index()
    oracle = _format(oracle, oracle_labels)

    return pd.concat([top1, oracle])
541
+
542
+
543
def bootstrap_ci(series, n_boot=1000, alpha=0.05):
    """Mean of `series` with a (1 - alpha) bootstrap confidence interval.

    Returns
    -------
    tuple
        (mean, lower percentile bound, upper percentile bound).
    """
    size = len(series)
    # Resample with replacement n_boot times and record each sample mean.
    resampled_means = np.array(
        [series.sample(size, replace=True).mean() for _ in range(n_boot)]
    )
    point_estimate = np.mean(series)
    lower_bound = np.percentile(resampled_means, 100 * alpha / 2)
    upper_bound = np.percentile(resampled_means, 100 * (1 - alpha / 2))
    return point_estimate, lower_bound, upper_bound
559
+
560
+
561
def plot_data(desired_tools, desired_metrics, df, dataset, filename):
    """Grouped bar plot of mean performance per tool/metric with bootstrap CIs."""
    subset = df[df["tool"].isin(desired_tools) & df["metric"].isin(desired_metrics)]

    # Bootstrap mean and CI bounds for every (tool, metric) pair.
    stats = subset.groupby(["tool", "metric"])["value"].apply(bootstrap_ci)
    stats = stats.apply(pd.Series)  # expand the (mean, lower, upper) tuples
    stats.columns = ["mean", "lower", "upper"]

    # Pivot to metric-by-tool tables in the requested metric order.
    means = stats["mean"].unstack("tool").reindex(desired_metrics)
    lowers = stats["lower"].unstack("tool").reindex(desired_metrics)
    uppers = stats["upper"].unstack("tool").reindex(desired_metrics)

    # Fixed tool order so colors stay stable across figures.
    tool_order = [
        "AF3 oracle",
        "AF3 top-1",
        "Chai-1 oracle",
        "Chai-1 top-1",
        "Boltz-1 oracle",
        "Boltz-1 top-1",
        "Boltz-1x oracle",
        "Boltz-1x top-1",
    ]
    means = means[tool_order]
    lowers = lowers[tool_order]
    uppers = uppers[tool_order]

    # Human-readable metric labels.
    renaming = {
        "lddt_pli": "Mean LDDT-PLI",
        "rmsd<2": "L-RMSD < 2A",
        "lddt": "Mean LDDT",
        "dockq_>0.23": "DockQ > 0.23",
        "physical validity": "Physical Validity",
    }
    means = means.rename(index=renaming)
    lowers = lowers.rename(index=renaming)
    uppers = uppers.rename(index=renaming)
    mean_vals = means.values

    tool_colors = [
        "#994C00",  # AF3 oracle
        "#FFB55A",  # AF3 top-1
        "#931652",  # Chai-1 oracle
        "#FC8AD9",  # Chai-1 top-1
        "#188F52",  # Boltz-1 oracle
        "#86E935",  # Boltz-1 top-1
        "#004D80",  # Boltz-1x oracle
        "#55C2FF",  # Boltz-1x top-1
    ]

    fig, ax = plt.subplots(figsize=(10, 5))

    positions = np.arange(len(means.index))
    bar_spacing = 0.015
    total_width = 0.7
    # Shrink bar width so the group (bars + gaps) spans total_width.
    width = (total_width - (len(tool_order) - 1) * bar_spacing) / len(tool_order)

    for idx, tool in enumerate(tool_order):
        # Each subsequent bar shifts right by width + bar_spacing.
        offsets = positions - (total_width - width) / 2 + idx * (width + bar_spacing)
        heights = means[tool].values
        # Asymmetric error bars from the bootstrap percentile bounds.
        err_low = mean_vals[:, idx] - lowers.values[:, idx]
        err_high = uppers.values[:, idx] - mean_vals[:, idx]
        ax.bar(
            offsets,
            heights,
            width=width,
            color=tool_colors[idx],
            label=tool,
            yerr=np.vstack([err_low, err_high]),
            capsize=2,
            error_kw={"elinewidth": 0.75},
        )

    ax.set_xticks(positions)
    ax.set_xticklabels(means.index, rotation=0)
    ax.set_ylabel("Value")
    ax.set_title(f"Performances on {dataset} with 95% CI (Bootstrap)")

    plt.tight_layout()
    ax.legend(loc="lower center", bbox_to_anchor=(0.5, 0.85), ncols=4, frameon=False)

    plt.savefig(filename)
    plt.show()
662
+
663
+
664
def main():
    """Aggregate evaluations for the PDB test set and CASP15.

    For each dataset: load the physical-validity checks, aggregate the
    per-tool structure metrics, write the combined CSV, and render the
    comparison plot.
    """
    eval_folder = "../../boltz_results_final/"
    output_folder = "../../boltz_results_final/"

    desired_tools = [
        "AF3 oracle",
        "AF3 top-1",
        "Chai-1 oracle",
        "Chai-1 top-1",
        "Boltz-1 oracle",
        "Boltz-1 top-1",
        "Boltz-1x oracle",
        "Boltz-1x top-1",
    ]
    desired_metrics = ["lddt", "dockq_>0.23", "lddt_pli", "rmsd<2", "physical validity"]

    def _run_dataset(subset, validity_name, csv_name, dataset_label, plot_name):
        # One end-to-end aggregation pass for a single benchmark subset.
        df_validity_checks = pd.read_csv(eval_folder + validity_name)
        df_validity_checks = eval_validity_checks(df_validity_checks)

        df = eval_models(
            eval_folder + f"outputs/{subset}/chai",
            eval_folder + f"evals/{subset}/chai",
            eval_folder + f"outputs/{subset}/af3",
            eval_folder + f"evals/{subset}/af3",
            eval_folder + f"outputs/{subset}/boltz/predictions",
            eval_folder + f"evals/{subset}/boltz",
            eval_folder + f"outputs/{subset}/boltzx/predictions",
            eval_folder + f"evals/{subset}/boltzx",
        )

        df = pd.concat([df, df_validity_checks]).reset_index(drop=True)
        df.to_csv(output_folder + csv_name, index=False)
        plot_data(
            desired_tools, desired_metrics, df, dataset_label, output_folder + plot_name
        )

    # Eval the test set
    _run_dataset(
        "test", "physical_checks_test.csv", "results_test.csv", "PDB Test", "plot_test.pdf"
    )

    # Eval CASP
    _run_dataset(
        "casp15", "physical_checks_casp.csv", "results_casp.csv", "CASP15", "plot_casp.pdf"
    )
750
+
751
+
752
+ if __name__ == "__main__":
753
+ main()
protify/FastPLMs/boltz/scripts/eval/physcialsim_metrics.py ADDED
@@ -0,0 +1,304 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pickle
3
+
4
+ import numpy as np
5
+ import torch
6
+ from pathlib import Path
7
+ from tqdm import tqdm
8
+ import pandas as pd
9
+ from boltz.data.mol import load_molecules
10
+ from boltz.data import const
11
+ from boltz.data.parse.mmcif_with_constraints import parse_mmcif
12
+ from multiprocessing import Pool
13
+
14
+
15
+ def compute_torsion_angles(coords, torsion_index):
16
+ r_ij = coords[..., torsion_index[0], :] - coords[..., torsion_index[1], :]
17
+ r_kj = coords[..., torsion_index[2], :] - coords[..., torsion_index[1], :]
18
+ r_kl = coords[..., torsion_index[2], :] - coords[..., torsion_index[3], :]
19
+ n_ijk = np.cross(r_ij, r_kj, axis=-1)
20
+ n_jkl = np.cross(r_kj, r_kl, axis=-1)
21
+ r_kj_norm = np.linalg.norm(r_kj, axis=-1)
22
+ n_ijk_norm = np.linalg.norm(n_ijk, axis=-1)
23
+ n_jkl_norm = np.linalg.norm(n_jkl, axis=-1)
24
+ sign_phi = np.sign(
25
+ r_kj[..., None, :] @ np.cross(n_ijk, n_jkl, axis=-1)[..., None]
26
+ ).squeeze(axis=(-1, -2))
27
+ phi = sign_phi * np.arccos(
28
+ np.clip(
29
+ (n_ijk[..., None, :] @ n_jkl[..., None]).squeeze(axis=(-1, -2))
30
+ / (n_ijk_norm * n_jkl_norm),
31
+ -1 + 1e-8,
32
+ 1 - 1e-8,
33
+ )
34
+ )
35
+ return phi
36
+
37
+
38
def check_ligand_distance_geometry(
    structure, constraints, bond_buffer=0.25, angle_buffer=0.25, clash_buffer=0.2
):
    """Count RDKit distance-geometry violations for ligand atom pairs.

    Compares observed pairwise distances against the RDKit lower/upper
    bounds, with a relative tolerance per category: bonded pairs, angle
    (1-3) pairs, and all remaining pairs (internal clashes).
    """
    coords = structure.coords["coords"]
    bounds = constraints.rdkit_bounds_constraints
    pairs = bounds["atom_idxs"].copy().astype(np.int64).T
    is_bond = bounds["is_bond"].copy().astype(bool)
    is_angle = bounds["is_angle"].copy().astype(bool)
    upper = bounds["upper_bound"].copy().astype(np.float32)
    lower = bounds["lower_bound"].copy().astype(np.float32)

    dists = np.linalg.norm(coords[pairs[0]] - coords[pairs[1]], axis=-1)

    # A pair violates its bounds when it falls outside the buffered interval.
    bond_violations = (dists[is_bond] <= lower[is_bond] * (1.0 - bond_buffer)) | (
        dists[is_bond] >= upper[is_bond] * (1.0 + bond_buffer)
    )
    angle_violations = (dists[is_angle] <= lower[is_angle] * (1.0 - angle_buffer)) | (
        dists[is_angle] >= upper[is_angle] * (1.0 + angle_buffer)
    )
    neither = ~is_bond & ~is_angle
    clash_violations = dists[neither] <= lower[neither] * (1.0 - clash_buffer)

    ligand_count = sum(
        int(const.chain_types[chain["mol_type"]] == "NONPOLYMER")
        for chain in structure.chains
    )
    return {
        "num_ligands": ligand_count,
        "num_bond_length_violations": bond_violations.sum(),
        "num_bonds": is_bond.sum(),
        "num_bond_angle_violations": angle_violations.sum(),
        "num_angles": is_angle.sum(),
        "num_internal_clash_violations": clash_violations.sum(),
        "num_non_neighbors": neither.sum(),
    }
73
+
74
+
75
def check_ligand_stereochemistry(structure, constraints):
    """Count chiral-center and stereo-bond violations against the references.

    Chirality is predicted from the sign of the improper torsion; double-bond
    E/Z is predicted from whether the torsion magnitude exceeds pi/2.
    """
    coords = structure.coords["coords"]

    chiral = constraints.chiral_atom_constraints
    chiral_ref_mask = chiral["is_reference"]
    chiral_index = chiral["atom_idxs"].T[:, chiral_ref_mask]
    chiral_truth = chiral["is_r"][chiral_ref_mask]
    # Positive improper torsion corresponds to an R configuration.
    chiral_pred = compute_torsion_angles(coords, chiral_index) > 0
    chiral_mismatch = chiral_pred != chiral_truth

    stereo = constraints.stereo_bond_constraints
    stereo_ref_mask = stereo["is_reference"]
    stereo_index = stereo["atom_idxs"].T[:, stereo_ref_mask]
    stereo_truth = stereo["is_e"][stereo_ref_mask]
    # Torsion magnitude above pi/2 corresponds to an E (trans) bond.
    stereo_pred = np.abs(compute_torsion_angles(coords, stereo_index)) > np.pi / 2
    stereo_mismatch = stereo_pred != stereo_truth

    return {
        "num_chiral_atom_violations": chiral_mismatch.sum(),
        "num_chiral_atoms": chiral_index.shape[1],
        "num_stereo_bond_violations": stereo_mismatch.sum(),
        "num_stereo_bonds": stereo_index.shape[1],
    }
110
+
111
+
112
def check_ligand_flatness(structure, constraints, buffer=0.25):
    """Count planarity violations for 5-rings, 6-rings and double bonds.

    A group violates planarity when any of its atoms lies further than
    `buffer` Angstroms from the group's best-fit (least-squares) plane.

    Note: the original 5-ring check used ``np.all(dists <= buffer)``, which
    counted *flat* rings as violations — inverted relative to the 6-ring and
    double-bond checks. All three now share the same criterion.
    """
    coords = structure.coords["coords"]

    def _planarity_violations(atom_idxs):
        # Distance of each atom to the SVD best-fit plane of its group; the
        # last right-singular vector is the plane normal.
        group_coords = coords[atom_idxs, :]
        centered = group_coords - group_coords.mean(axis=-2, keepdims=True)
        normals = np.linalg.svd(centered)[2][..., -1, :, None]
        dists = np.abs((centered @ normals).squeeze(axis=-1))
        return np.any(dists >= buffer, axis=-1)

    ring_5_violations = _planarity_violations(
        constraints.planar_ring_5_constraints["atom_idxs"]
    )
    ring_6_violations = _planarity_violations(
        constraints.planar_ring_6_constraints["atom_idxs"]
    )
    bond_violations = _planarity_violations(
        constraints.planar_bond_constraints["atom_idxs"]
    )

    return {
        "num_planar_5_ring_violations": ring_5_violations.sum(),
        "num_planar_5_rings": ring_5_violations.shape[0],
        "num_planar_6_ring_violations": ring_6_violations.sum(),
        "num_planar_6_rings": ring_6_violations.shape[0],
        "num_planar_double_bond_violations": bond_violations.sum(),
        "num_planar_double_bonds": bond_violations.shape[0],
    }
144
+
145
+
146
def check_steric_clash(structure, molecules, buffer=0.25):
    """Count inter-chain steric clashes using van der Waals radii.

    Two chains clash when any cross-chain atom pair is closer than the sum of
    the atoms' vdW radii scaled by (1 - buffer). Single-atom chains and
    covalently bonded chain pairs are skipped. Counts are bucketed by chain
    type, separately for symmetric pairs (same entity and atom count) and
    asymmetric pairs. Removes the original's stray dead
    ``np.array([a.GetAtomicNum() ...])`` expression after the radii loop.
    """
    result = {}
    # Pre-populate every counter so all type combinations appear in the output.
    for type_i in const.chain_types:
        out_type_i = type_i.lower()
        out_type_i = out_type_i if out_type_i != "nonpolymer" else "ligand"
        result[f"num_chain_pairs_sym_{out_type_i}"] = 0
        result[f"num_chain_clashes_sym_{out_type_i}"] = 0
        for type_j in const.chain_types:
            out_type_j = type_j.lower()
            out_type_j = out_type_j if out_type_j != "nonpolymer" else "ligand"
            result[f"num_chain_pairs_asym_{out_type_i}_{out_type_j}"] = 0
            result[f"num_chain_clashes_asym_{out_type_i}_{out_type_j}"] = 0

    # Chain pairs joined by an explicit inter-chain bond are not clash candidates.
    connected_chains = set()
    for bond in structure.bonds:
        if bond["chain_1"] != bond["chain_2"]:
            connected_chains.add(tuple(sorted((bond["chain_1"], bond["chain_2"]))))

    # Per-atom vdW radii, looked up from each residue's reference molecule.
    vdw_radii = []
    for res in structure.residues:
        mol = molecules[res["name"]]
        token_atoms = structure.atoms[
            res["atom_idx"] : res["atom_idx"] + res["atom_num"]
        ]
        atom_name_to_ref = {a.GetProp("name"): a for a in mol.GetAtoms()}
        token_atoms_ref = [atom_name_to_ref[a["name"]] for a in token_atoms]
        vdw_radii.extend(
            [const.vdw_radii[a.GetAtomicNum() - 1] for a in token_atoms_ref]
        )
    vdw_radii = np.array(vdw_radii, dtype=np.float32)

    for i, chain_i in enumerate(structure.chains):
        for j, chain_j in enumerate(structure.chains):
            # Skip single-atom chains, the lower triangle / diagonal, and
            # covalently connected chain pairs.
            if (
                chain_i["atom_num"] == 1
                or chain_j["atom_num"] == 1
                or j <= i
                or (i, j) in connected_chains
            ):
                continue
            coords_i = structure.coords["coords"][
                chain_i["atom_idx"] : chain_i["atom_idx"] + chain_i["atom_num"]
            ]
            coords_j = structure.coords["coords"][
                chain_j["atom_idx"] : chain_j["atom_idx"] + chain_j["atom_num"]
            ]
            dists = np.linalg.norm(coords_i[:, None, :] - coords_j[None, :, :], axis=-1)
            radii_i = vdw_radii[
                chain_i["atom_idx"] : chain_i["atom_idx"] + chain_i["atom_num"]
            ]
            radii_j = vdw_radii[
                chain_j["atom_idx"] : chain_j["atom_idx"] + chain_j["atom_num"]
            ]
            radii_sum = radii_i[:, None] + radii_j[None, :]
            is_clashing = np.any(dists < radii_sum * (1.00 - buffer))

            type_i = const.chain_types[chain_i["mol_type"]].lower()
            type_j = const.chain_types[chain_j["mol_type"]].lower()
            type_i = type_i if type_i != "nonpolymer" else "ligand"
            type_j = type_j if type_j != "nonpolymer" else "ligand"
            is_symmetric = (
                chain_i["entity_id"] == chain_j["entity_id"]
                and chain_i["atom_num"] == chain_j["atom_num"]
            )
            key = "sym_" + type_i if is_symmetric else "asym_" + type_i + "_" + type_j
            result["num_chain_pairs_" + key] += 1
            result["num_chain_clashes_" + key] += int(is_clashing)
    return result
217
+
218
+
219
# Shared CCD cache used to interpret residues in the predicted structures.
cache_dir = Path("/data/rbg/users/jwohlwend/boltz-cache")
ccd_path = cache_dir / "ccd.pkl"
moldir = cache_dir / "mols"
with ccd_path.open("rb") as file:
    ccd = pickle.load(file)

# Prediction folders for each tool on the PDB test set.
boltz1_dir = Path(
    "/data/rbg/shared/projects/foldeverything/boltz_results_final/outputs/test/boltz/predictions"
)
boltz1x_dir = Path(
    "/data/scratch/getzn/boltz_private/boltz_1x_test_results_final_new/full_predictions"
)
chai_dir = Path(
    "/data/rbg/shared/projects/foldeverything/boltz_results_final/outputs/test/chai"
)
af3_dir = Path(
    "/data/rbg/shared/projects/foldeverything/boltz_results_final/outputs/test/af3"
)

# Only evaluate targets for which every tool produced predictions.
boltz1_pdb_ids = set(os.listdir(boltz1_dir))
boltz1x_pdb_ids = set(os.listdir(boltz1x_dir))
chai_pdb_ids = set(os.listdir(chai_dir))
af3_pdb_ids = set(os.listdir(af3_dir))
common_pdb_ids = boltz1_pdb_ids & boltz1x_pdb_ids & chai_pdb_ids & af3_pdb_ids

tools = ["boltz1", "boltz1x", "chai", "af3"]
# Number of sampled models per target to check.
num_samples = 5
246
+
247
+
248
def process_fn(key):
    """Parse one predicted mmCIF and compute its physical-validity metrics.

    Parameters
    ----------
    key : tuple
        (tool name, PDB target id, model index); the tool determines the
        on-disk layout of the prediction file.

    Returns
    -------
    dict
        Tool/target/model identifiers plus all violation counters.

    Raises
    ------
    ValueError
        If the tool name is not one of the supported layouts (the original
        left ``cif_path`` unbound and failed later with a NameError).
    """
    tool, pdb_id, model_idx = key
    if tool == "boltz1":
        cif_path = boltz1_dir / pdb_id / f"{pdb_id}_model_{model_idx}.cif"
    elif tool == "boltz1x":
        cif_path = boltz1x_dir / pdb_id / f"{pdb_id}_model_{model_idx}.cif"
    elif tool == "chai":
        cif_path = chai_dir / pdb_id / f"pred.model_idx_{model_idx}.cif"
    elif tool == "af3":
        cif_path = af3_dir / pdb_id.lower() / f"seed-1_sample-{model_idx}" / "model.cif"
    else:
        msg = f"Unknown tool: {tool}"
        raise ValueError(msg)

    parsed_structure = parse_mmcif(
        cif_path,
        ccd,
        moldir,
    )
    structure = parsed_structure.data
    constraints = parsed_structure.residue_constraints

    record = {
        "tool": tool,
        "pdb_id": pdb_id,
        "model_idx": model_idx,
    }
    record.update(check_ligand_distance_geometry(structure, constraints))
    record.update(check_ligand_stereochemistry(structure, constraints))
    record.update(check_ligand_flatness(structure, constraints))
    record.update(check_steric_clash(structure, molecules=ccd))
    return record
277
+
278
+
279
# Run all checks in parallel and aggregate the results. Guarded by
# __main__ so worker processes (and plain imports) do not re-execute the
# driver — required for the spawn multiprocessing start method.
if __name__ == "__main__":
    # Enumerate every (tool, target, model) job.
    keys = []
    for tool in tools:
        for pdb_id in common_pdb_ids:
            for model_idx in range(num_samples):
                keys.append((tool, pdb_id, model_idx))

    # Smoke-test one job serially so obvious failures surface before the pool.
    if keys:
        process_fn(keys[0])

    records = []
    with Pool(48) as p:
        with tqdm(total=len(keys)) as pbar:
            for record in p.imap_unordered(process_fn, keys):
                records.append(record)
                pbar.update(1)
    df = pd.DataFrame.from_records(records)

    # Aggregate per-model validity flags from the individual counters.
    df["num_chain_clashes_all"] = df[
        [key for key in df.columns if "chain_clash" in key]
    ].sum(axis=1)
    df["num_pairs_all"] = df[
        [key for key in df.columns if "chain_pair" in key]
    ].sum(axis=1)
    df["clash_free"] = df["num_chain_clashes_all"] == 0
    df["valid_ligand"] = (
        df[[key for key in df.columns if "violation" in key]].sum(axis=1) == 0
    )
    # A model is valid only if it is clash-free AND ligand-valid.
    df["valid"] = (df["clash_free"]) & (df["valid_ligand"])

    df.to_csv("physical_checks_test.csv")
protify/FastPLMs/boltz/scripts/eval/run_evals.py ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import concurrent.futures
3
+ import subprocess
4
+ from pathlib import Path
5
+
6
+ from tqdm import tqdm
7
+
8
+ OST_COMPARE_STRUCTURE = r"""
9
+ #!/bin/bash
10
+ # https://openstructure.org/docs/2.7/actions/#ost-compare-structures
11
+
12
+ IMAGE_NAME=openstructure-0.2.8
13
+
14
+ command="compare-structures \
15
+ -m {model_file} \
16
+ -r {reference_file} \
17
+ --fault-tolerant \
18
+ --min-pep-length 4 \
19
+ --min-nuc-length 4 \
20
+ -o {output_path} \
21
+ --lddt --bb-lddt --qs-score --dockq \
22
+ --ics --ips --rigid-scores --patch-scores --tm-score"
23
+
24
+ sudo docker run -u $(id -u):$(id -g) --rm --volume {mount}:{mount} $IMAGE_NAME $command
25
+ """
26
+
27
+
28
+ OST_COMPARE_LIGAND = r"""
29
+ #!/bin/bash
30
+ # https://openstructure.org/docs/2.7/actions/#ost-compare-structures
31
+
32
+ IMAGE_NAME=openstructure-0.2.8
33
+
34
+ command="compare-ligand-structures \
35
+ -m {model_file} \
36
+ -r {reference_file} \
37
+ --fault-tolerant \
38
+ --lddt-pli --rmsd \
39
+ --substructure-match \
40
+ -o {output_path}"
41
+
42
+ sudo docker run -u $(id -u):$(id -g) --rm --volume {mount}:{mount} $IMAGE_NAME $command
43
+ """
44
+
45
+
46
def evaluate_structure(
    name: str,
    pred: Path,
    reference: Path,
    outdir: str,
    mount: str,
    executable: str = "/bin/bash",
) -> None:
    """Run OpenStructure polymer and ligand comparisons for one prediction.

    Results are written to ``{outdir}/{name}.json`` (polymer metrics) and
    ``{outdir}/{name}_ligand.json`` (ligand metrics). Existing output files
    are not recomputed, so interrupted runs can be resumed.

    Parameters
    ----------
    name : str
        Identifier used for the output file names.
    pred : Path
        Predicted structure file.
    reference : Path
        Reference structure file.
    outdir : str
        Directory receiving the JSON outputs.
    mount : str
        Host path mounted into the docker container.
    executable : str
        Shell used to run the docker command.
    """

    def _run_comparison(template: str, out_path: Path, kind: str) -> None:
        # Shared runner for both comparison flavors; skips existing outputs.
        if out_path.exists():
            print(  # noqa: T201
                f"Skipping recomputation of {name} as {kind} json file already exists"
            )
            return
        subprocess.run(
            template.format(
                model_file=str(pred),
                reference_file=str(reference),
                output_path=str(out_path),
                mount=mount,
            ),
            shell=True,  # noqa: S602
            check=False,
            executable=executable,
            capture_output=True,
        )

    # Evaluate polymer metrics
    _run_comparison(OST_COMPARE_STRUCTURE, Path(outdir) / f"{name}.json", "protein")
    # Evaluate ligand metrics
    _run_comparison(OST_COMPARE_LIGAND, Path(outdir) / f"{name}_ligand.json", "ligand")
93
+
94
+
95
def main(args):
    """Evaluate every prediction (5 models per target) against its reference.

    The first job runs synchronously so the docker image is pulled once
    before the thread pool starts; the rest run in parallel.
    """
    # Aggregate the predictions and references
    files = list(args.data.iterdir())
    names = {f.stem.lower(): f for f in files}

    # Create the output directory
    args.outdir.mkdir(parents=True, exist_ok=True)

    first_item = True
    with concurrent.futures.ThreadPoolExecutor(args.max_workers) as executor:
        futures = []
        for name, folder in names.items():
            for model_id in range(5):
                # Locate the model file according to each tool's layout.
                if args.format == "af3":
                    pred_path = folder / f"seed-1_sample-{model_id}" / "model.cif"
                elif args.format == "chai":
                    pred_path = folder / f"pred.model_idx_{model_id}.cif"
                elif args.format == "boltz":
                    # CASP target names are capitalized; PDB test names lowercased.
                    name_file = (
                        f"{name[0].upper()}{name[1:]}"
                        if args.testset == "casp"
                        else name.lower()
                    )
                    pred_path = folder / f"{name_file}_model_{model_id}.cif"
                else:
                    # Fail loudly instead of leaving pred_path unbound.
                    msg = f"Unknown format: {args.format}"
                    raise ValueError(msg)

                if args.testset == "casp":
                    ref_path = args.pdb / f"{name[0].upper()}{name[1:]}.cif"
                elif args.testset == "test":
                    ref_path = args.pdb / f"{name.lower()}.cif.gz"
                else:
                    msg = f"Unknown testset: {args.testset}"
                    raise ValueError(msg)

                if first_item:
                    # Evaluate the first item synchronously so the docker
                    # image is downloaded before the parallel workers start.
                    evaluate_structure(
                        name=f"{name}_model_{model_id}",
                        pred=str(pred_path),
                        reference=str(ref_path),
                        outdir=str(args.outdir),
                        mount=args.mount,
                        executable=args.executable,
                    )
                    first_item = False
                else:
                    futures.append(
                        executor.submit(
                            evaluate_structure,
                            name=f"{name}_model_{model_id}",
                            pred=str(pred_path),
                            reference=str(ref_path),
                            outdir=str(args.outdir),
                            mount=args.mount,
                            executable=args.executable,
                        )
                    )

        # Wait for all tasks to complete
        with tqdm(total=len(futures)) as pbar:
            for _ in concurrent.futures.as_completed(futures):
                pbar.update(1)
155
+
156
+ if __name__ == "__main__":
157
+ parser = argparse.ArgumentParser()
158
+ parser.add_argument("data", type=Path)
159
+ parser.add_argument("pdb", type=Path)
160
+ parser.add_argument("outdir", type=Path)
161
+ parser.add_argument("--format", type=str, default="af3")
162
+ parser.add_argument("--testset", type=str, default="casp")
163
+ parser.add_argument("--mount", type=str)
164
+ parser.add_argument("--executable", type=str, default="/bin/bash")
165
+ parser.add_argument("--max-workers", type=int, default=32)
166
+ args = parser.parse_args()
167
+ main(args)
protify/FastPLMs/boltz/scripts/process/ccd.py ADDED
@@ -0,0 +1,295 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Compute conformers and symmetries for all the CCD molecules."""
2
+
3
+ import argparse
4
+ import multiprocessing
5
+ import pickle
6
+ import sys
7
+ from functools import partial
8
+ from pathlib import Path
9
+
10
+ import pandas as pd
11
+ import rdkit
12
+ from p_tqdm import p_uimap
13
+ from pdbeccdutils.core import ccd_reader
14
+ from pdbeccdutils.core.component import ConformerType
15
+ from rdkit import rdBase
16
+ from rdkit.Chem import AllChem
17
+ from rdkit.Chem.rdchem import Conformer, Mol
18
+ from tqdm import tqdm
19
+
20
+
21
def load_molecules(components: str) -> list[Mol]:
    """Load the CCD components file.

    Parameters
    ----------
    components : str
        Path to the CCD components file.

    Returns
    -------
    list[Mol]
        The parsed component molecules, each tagged with its PDB name
        via the "PDB_NAME" property.

    """
    parsed: dict[str, ccd_reader.CCDReaderResult]
    parsed = ccd_reader.read_pdb_components_file(components)

    molecules = []
    for pdb_name, entry in parsed.items():
        rdkit_mol = entry.component.mol
        rdkit_mol.SetProp("PDB_NAME", pdb_name)
        molecules.append(rdkit_mol)

    return molecules
44
+
45
+
46
def compute_3d(mol: Mol, version: str = "v3") -> bool:
    """Generate 3D coordinates using the ETKDG method.

    Taken from `pdbeccdutils.core.component.Component`.

    Parameters
    ----------
    mol: Mol
        The RDKit molecule to process
    version: str, optional
        The ETKDG version, defaults to v3

    Returns
    -------
    bool
        Whether computation was successful.

    """
    # Any version other than "v3" falls back to ETKDGv2,
    # matching the original behavior for unknown version strings.
    if version == "v3":
        params = rdkit.Chem.AllChem.ETKDGv3()
    else:
        params = rdkit.Chem.AllChem.ETKDGv2()

    params.clearConfs = False

    conformer_id = -1
    try:
        conformer_id = rdkit.Chem.AllChem.EmbedMolecule(mol, params)
        rdkit.Chem.AllChem.UFFOptimizeMolecule(
            mol, confId=conformer_id, maxIters=1000
        )
    except RuntimeError:
        pass  # Force field issue here
    except ValueError:
        pass  # sanitization issue here

    if conformer_id == -1:
        return False

    # Tag the new conformer so get_conformer() can find it later.
    conformer = mol.GetConformer(conformer_id)
    conformer.SetProp("name", ConformerType.Computed.name)
    conformer.SetProp("coord_generation", f"ETKDG{version}")
    return True
91
+
92
+
93
def get_conformer(mol: Mol, c_type: ConformerType) -> Conformer:
    """Retrieve an rdkit object for a deemed conformer.

    Taken from `pdbeccdutils.core.component.Component`.

    Parameters
    ----------
    mol: Mol
        The molecule to process.
    c_type: ConformerType
        The conformer type to extract.

    Returns
    -------
    Conformer
        The desired conformer, if any.

    Raises
    ------
    ValueError
        If there are no conformers of the given type.

    """
    wanted = c_type.name
    for conformer in mol.GetConformers():
        # Conformers without a "name" property raise KeyError; skip them.
        try:
            found = conformer.GetProp("name") == wanted
        except KeyError:
            continue
        if found:
            return conformer

    raise ValueError(f"Conformer {wanted} does not exist.")
125
+
126
+
127
def compute_symmetries(mol: Mol) -> list[list[int]]:
    """Compute the symmetries of a molecule.

    Uses substructure self-matching to enumerate atom permutations
    that map the heavy-atom (non-leaving) skeleton onto itself.

    Parameters
    ----------
    mol : Mol
        The molecule to process

    Returns
    -------
    list[list[int]]
        The symmetries as a list of index permutations

    """
    mol = AllChem.RemoveHs(mol)
    # Map original atom indices to compact indices over non-leaving atoms.
    idx_map = {}
    atom_idx = 0
    for i, atom in enumerate(mol.GetAtoms()):
        # Skip if leaving atoms
        if int(atom.GetProp("leaving_atom")):
            continue
        idx_map[i] = atom_idx
        atom_idx += 1

    # Calculate self permutations
    permutations = []
    raw_permutations = mol.GetSubstructMatches(mol, uniquify=False)
    for raw_permutation in raw_permutations:
        # Filter out permutations with leaving atoms
        try:
            # Keep only permutations that map the non-leaving atom set
            # onto itself, and re-index them through idx_map.
            if {raw_permutation[idx] for idx in idx_map} == set(idx_map.keys()):
                permutation = [
                    idx_map[idx] for idx in raw_permutation if idx in idx_map
                ]
                permutations.append(permutation)
        except Exception:  # noqa: S110, PERF203, BLE001
            pass
    # NOTE(review): the "symmetries" property is set on the H-stripped copy
    # returned by RemoveHs, not on the caller's molecule — confirm intended.
    serialized_permutations = pickle.dumps(permutations)
    mol.SetProp("symmetries", serialized_permutations.hex())
    return permutations
167
+
168
+
169
def process(mol: Mol, output: str) -> tuple[str, str]:
    """Process a CCD component.

    Parameters
    ----------
    mol : Mol
        The molecule to process
    output : str
        The directory to save the molecules

    Returns
    -------
    str
        The name of the component
    str
        The result of the conformer generation

    """
    name = mol.GetProp("PDB_NAME")

    if mol.GetNumAtoms() == 1:
        # Single-atom components need no conformer generation.
        result = "single"
    else:
        try:
            # Prefer a freshly computed 3D conformer; fall back to the
            # ideal coordinates shipped with the CCD entry.
            if compute_3d(mol, version="v3"):
                _ = get_conformer(mol, ConformerType.Computed)
                result = "computed"
            else:
                _ = get_conformer(mol, ConformerType.Ideal)
                result = "ideal"
        except ValueError:
            result = "failed"

    # Persist the molecule for later aggregation.
    with (Path(output) / f"{name}.pkl").open("wb") as handle:
        pickle.dump(mol, handle)

    return name, result
216
+
217
+
218
def main(args: argparse.Namespace) -> None:
    """Process conformers.

    Loads the CCD components file, generates a conformer for each
    component (in parallel when possible), writes per-component pickles,
    a results CSV, and an aggregate ``ccd.pkl`` of all successful mols.

    Parameters
    ----------
    args : argparse.Namespace
        Parsed CLI arguments: components, outdir, num_processes.

    """
    # Set property saving
    rdkit.Chem.SetDefaultPickleProperties(rdkit.Chem.PropertyPickleOptions.AllProps)

    # Load components
    print("Loading components")  # noqa: T201
    molecules = load_molecules(args.components)

    # Reset stdout and stderr, as pdbccdutils messes with them
    sys.stdout = sys.__stdout__
    sys.stderr = sys.__stderr__

    # Disable rdkit warnings (blocker must stay alive for the duration)
    blocker = rdBase.BlockLogs()  # noqa: F841

    # Setup processing function
    outdir = Path(args.outdir)
    outdir.mkdir(parents=True, exist_ok=True)
    mol_output = outdir / "mols"
    mol_output.mkdir(parents=True, exist_ok=True)
    process_fn = partial(process, output=str(mol_output))

    # Process the files in parallel
    print("Processing components")  # noqa: T201
    metadata = []

    # Check if we can run in parallel
    max_processes = multiprocessing.cpu_count()
    num_processes = max(1, min(args.num_processes, max_processes, len(molecules)))
    parallel = num_processes > 1

    if parallel:
        # p_uimap yields results in completion order, not input order
        for name, result in p_uimap(
            process_fn,
            molecules,
            num_cpus=num_processes,
        ):
            metadata.append({"name": name, "result": result})
    else:
        for mol in tqdm(molecules):
            name, result = process_fn(mol)
            metadata.append({"name": name, "result": result})

    # Load and group outputs, skipping failed conformer generations
    molecules = {}
    for item in metadata:
        if item["result"] == "failed":
            continue

        # Load the mol file
        path = mol_output / f"{item['name']}.pkl"
        with path.open("rb") as f:
            mol = pickle.load(f)  # noqa: S301
        molecules[item["name"]] = mol

    # Dump metadata
    path = outdir / "results.csv"
    metadata = pd.DataFrame(metadata)
    metadata.to_csv(path)

    # Dump the components
    path = outdir / "ccd.pkl"
    with path.open("wb") as f:
        pickle.dump(molecules, f)
283
+
284
+
285
if __name__ == "__main__":
    # Command-line entry point.
    cli = argparse.ArgumentParser()
    cli.add_argument("--components", type=str)
    cli.add_argument("--outdir", type=str)
    cli.add_argument(
        "--num_processes",
        type=int,
        default=multiprocessing.cpu_count(),
    )
    main(cli.parse_args())
protify/FastPLMs/boltz/scripts/process/cluster.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Create a mapping from structure and chain ID to MSA indices."""
2
+
3
+ import argparse
4
+ import hashlib
5
+ import json
6
+ import pickle
7
+ import subprocess
8
+ from pathlib import Path
9
+
10
+ import pandas as pd
11
+ from Bio import SeqIO
12
+
13
+
14
def hash_sequence(seq: str) -> str:
    """Return the SHA-256 hex digest of a sequence string."""
    digest = hashlib.sha256()
    digest.update(seq.encode())
    return digest.hexdigest()
17
+
18
+
19
def main(args: argparse.Namespace) -> None:
    """Create clustering.

    Groups protein sequences with ``mmseqs easy-cluster``; each short
    sequence, nucleotide sequence, and ligand CCD code becomes its own
    singleton cluster. Writes ``clustering.json`` mapping item id to
    cluster id.

    Parameters
    ----------
    args : argparse.Namespace
        Parsed CLI arguments: sequences, ccd, outdir, mmseqs.

    """
    # Set output directory
    outdir = Path(args.outdir)
    outdir.mkdir(parents=True, exist_ok=True)

    # Split the sequences into proteins and nucleotides
    with Path(args.sequences).open("r") as f:
        data = list(SeqIO.parse(f, "fasta"))

    proteins = set()
    shorts = set()
    nucleotides = set()

    # Separate the sequences into proteins, nucleotides and short sequences
    # Short sequences cause a bug in the clustering, so they are separated
    for seq in data:
        seq_str = str(seq.seq).strip()
        if set(str(seq.seq)).issubset({"A", "C", "G", "T", "U", "N"}):
            nucleotides.add(seq_str)
        elif len(seq_str) < 10:  # noqa: PLR2004
            shorts.add(seq_str)
        else:
            proteins.add(seq_str)

    # Run mmseqs on the protein data
    proteins = [f">{hash_sequence(seq)}\n{seq}" for seq in proteins]
    with (outdir / "proteins.fasta").open("w") as f:
        f.write("\n".join(proteins))

    # Use an argument list (no shell) so paths containing spaces are
    # handled safely and shell injection is impossible.
    subprocess.run(
        [
            args.mmseqs,
            "easy-cluster",
            str(outdir / "proteins.fasta"),
            str(outdir / "clust_prot"),
            str(outdir / "tmp"),
            "--min-seq-id",
            "0.4",
        ],
        check=True,
    )

    # Load protein clusters: column 0 is the cluster representative,
    # column 1 is the member id.
    clustering_path = outdir / "clust_prot_cluster.tsv"
    protein_data = pd.read_csv(clustering_path, sep="\t", header=None)
    clustering = dict(zip(protein_data[1], protein_data[0]))

    # Each short sequence is given its own singleton cluster id
    for short in shorts:
        short_id = hash_sequence(short)
        clustering[short_id] = short_id

    # Each unique nucleotide sequence is given its own singleton cluster id
    for nucl in nucleotides:
        nucl_id = hash_sequence(nucl)
        clustering[nucl_id] = nucl_id

    # Load ligand data
    with Path(args.ccd).open("rb") as handle:
        ligand_data = pickle.load(handle)  # noqa: S301

    # Each unique ligand CCD code is given its own singleton cluster id
    for ccd_code in ligand_data:
        clustering[ccd_code] = ccd_code

    # Save clustering
    with (outdir / "clustering.json").open("w") as handle:
        json.dump(clustering, handle)
82
+
83
+
84
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--sequences",
        type=str,
        help="Path to the input sequences fasta file.",
        required=True,
    )
    parser.add_argument(
        "--ccd",
        type=str,
        # Fixed help text: this is the CCD pickle, not an RNA fasta.
        help="Path to the processed CCD components pickle.",
        required=True,
    )
    parser.add_argument(
        "--outdir",
        type=str,
        help="Output directory.",
        required=True,
    )
    parser.add_argument(
        "--mmseqs",
        type=str,
        help="Path to mmseqs program.",
        default="mmseqs",
    )
    args = parser.parse_args()
    main(args)
protify/FastPLMs/boltz/scripts/process/mmcif.py ADDED
@@ -0,0 +1,1123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import contextlib
2
+ from dataclasses import dataclass, replace
3
+ from typing import Optional
4
+
5
+ import gemmi
6
+ import numpy as np
7
+ from rdkit import rdBase
8
+ from rdkit.Chem import AllChem
9
+ from rdkit.Chem.rdchem import Conformer, Mol
10
+ from sklearn.neighbors import KDTree
11
+
12
+ from boltz.data import const
13
+ from boltz.data.types import (
14
+ Atom,
15
+ Bond,
16
+ Chain,
17
+ Connection,
18
+ Interface,
19
+ Residue,
20
+ Structure,
21
+ StructureInfo,
22
+ )
23
+
24
+ ####################################################################################################
25
+ # DATACLASSES
26
+ ####################################################################################################
27
+
28
+
29
@dataclass(frozen=True, slots=True)
class ParsedAtom:
    """A parsed atom object."""

    name: str  # Atom name, e.g. "CA"
    element: int  # Atomic number
    charge: int  # Formal charge
    coords: tuple[float, float, float]  # Structure coordinates, (0, 0, 0) if absent
    conformer: tuple[float, float, float]  # Reference conformer coordinates
    is_present: bool  # Whether the atom is resolved in the structure
    chirality: int  # Chirality type id (from const.chirality_type_ids)
40
+
41
+
42
@dataclass(frozen=True, slots=True)
class ParsedBond:
    """A parsed bond object."""

    atom_1: int  # Index of the first atom, local to the residue
    atom_2: int  # Index of the second atom, local to the residue
    type: int  # Bond type id (from const.bond_type_ids)
49
+
50
+
51
@dataclass(frozen=True, slots=True)
class ParsedResidue:
    """A parsed residue object."""

    name: str  # Residue / CCD component name
    type: int  # Residue token id
    idx: int  # Residue index within the parsed chain
    atoms: list[ParsedAtom]  # Atoms belonging to this residue
    bonds: list[ParsedBond]  # Intra-residue bonds
    # NOTE(review): parse_ccd_residue stores a string id (num + icode)
    # here, despite the Optional[int] annotation — confirm intended type.
    orig_idx: Optional[int]  # Original author residue id, None if unresolved
    atom_center: int  # Center atom index (0 placeholder for ligands)
    atom_disto: int  # Distogram atom index (0 placeholder for ligands)
    is_standard: bool  # Whether this is a standard token residue
    is_present: bool  # Whether the residue is resolved in the structure
65
+
66
+
67
@dataclass(frozen=True, slots=True)
class ParsedChain:
    """A parsed chain object."""

    name: str  # Chain identifier
    entity: str  # Entity name
    type: str  # Chain type label
    residues: list[ParsedResidue]  # Residues, in sequence order
    sequence: list[str]  # Full entity sequence (one residue name per position)
76
+
77
+
78
@dataclass(frozen=True, slots=True)
class ParsedConnection:
    """A parsed connection (inter-residue link) object."""

    chain_1: str  # Name of the first chain
    chain_2: str  # Name of the second chain
    residue_index_1: int  # Residue index in the first chain
    residue_index_2: int  # Residue index in the second chain
    # NOTE(review): the two fields below are annotated str although named
    # "index" — confirm whether they hold atom names or indices.
    atom_index_1: str
    atom_index_2: str
88
+
89
+
90
@dataclass(frozen=True, slots=True)
class ParsedStructure:
    """A parsed structure object."""

    data: Structure  # The parsed structure arrays
    info: StructureInfo  # Structure-level metadata
    # NOTE(review): element semantics of covalents are not visible in this
    # file — confirm what the ints identify before relying on them.
    covalents: list[int]
97
+
98
+
99
+ ####################################################################################################
100
+ # HELPERS
101
+ ####################################################################################################
102
+
103
+
104
def get_dates(block: gemmi.cif.Block) -> tuple[str, str, str]:
    """Get the deposited, released, and last revision dates.

    Parameters
    ----------
    block : gemmi.cif.Block
        The block to process.

    Returns
    -------
    str
        The deposited date.
    str
        The released date.
    str
        The last revision date.

    """
    deposit_key = "_pdbx_database_status.recvd_initial_deposition_date"
    revision_key = "_pdbx_audit_revision_history.revision_date"

    deposit_date = ""
    release_date = ""
    revision_date = ""
    # A single suppress: if any lookup fails, the remaining dates stay "".
    with contextlib.suppress(Exception):
        deposit_date = block.find([deposit_key])[0][0]
        # First revision entry is the release, the last is the latest.
        release_date = block.find([revision_key])[0][0]
        revision_date = block.find([revision_key])[-1][0]

    return deposit_date, release_date, revision_date
131
+
132
+
133
def get_resolution(block: gemmi.cif.Block) -> float:
    """Get the resolution from a gemmi structure.

    Parameters
    ----------
    block : gemmi.cif.Block
        The block to process.

    Returns
    -------
    float
        The resolution, or 0.0 if no known key is present.

    """
    # Try each known resolution key in priority order.
    candidate_keys = (
        "_refine.ls_d_res_high",
        "_em_3d_reconstruction.resolution",
        "_reflns.d_resolution_high",
    )
    for key in candidate_keys:
        try:
            return float(block.find([key])[0].str(0))
        except Exception:  # noqa: BLE001
            continue
    return 0.0
157
+
158
+
159
def get_method(block: gemmi.cif.Block) -> str:
    """Get the method from a gemmi structure.

    Parameters
    ----------
    block : gemmi.cif.Block
        The block to process.

    Returns
    -------
    str
        The experimental method(s), lower-cased and comma-separated.

    """
    result = ""
    with contextlib.suppress(Exception):
        rows = block.find(["_exptl.method"])
        result = ",".join(row.str(0).lower() for row in rows)
    return result
180
+
181
+
182
def convert_atom_name(name: str) -> tuple[int, int, int, int]:
    """Convert an atom name to a standard format.

    Each character becomes its ASCII code minus 32, and the result is
    zero-padded to exactly four entries.

    Parameters
    ----------
    name : str
        The atom name.

    Returns
    -------
    tuple[int, int, int, int]
        The converted atom name.

    """
    codes = [ord(char) - 32 for char in name.strip()]
    padding = [0] * (4 - len(codes))
    return tuple(codes + padding)
200
+
201
+
202
def get_unk_token(dtype: gemmi.PolymerType) -> str:
    """Get the unknown token for a given polymer type.

    Parameters
    ----------
    dtype : gemmi.PolymerType
        The polymer type.

    Returns
    -------
    str
        The unknown token.

    Raises
    ------
    ValueError
        If the polymer type is not protein, DNA, or RNA.

    """
    if dtype == gemmi.PolymerType.PeptideL:
        return const.unk_token["PROTEIN"]
    if dtype == gemmi.PolymerType.Dna:
        return const.unk_token["DNA"]
    if dtype == gemmi.PolymerType.Rna:
        return const.unk_token["RNA"]
    raise ValueError(f"Unknown polymer type: {dtype}")
227
+
228
+
229
def get_conformer(mol: Mol) -> Conformer:
    """Retrieve an rdkit object for a deemed conformer.

    Inspired by `pdbeccdutils.core.component.Component`.

    Preference order: a "Computed" conformer first, then an "Ideal" one.

    Parameters
    ----------
    mol: Mol
        The molecule to process.

    Returns
    -------
    Conformer
        The desired conformer, if any.

    Raises
    ------
    ValueError
        If no "Computed" or "Ideal" conformer exists.

    """
    for wanted in ("Computed", "Ideal"):
        for conformer in mol.GetConformers():
            # Conformers lacking a "name" property raise KeyError; skip.
            try:
                label = conformer.GetProp("name")
            except KeyError:
                continue
            if label == wanted:
                return conformer

    raise ValueError("Conformer does not exist.")
266
+
267
+
268
def compute_covalent_ligands(
    connections: list[gemmi.Connection],
    subchain_map: dict[tuple[str, int], str],
    entities: dict[str, gemmi.Entity],
) -> set[str]:
    """Compute the covalent ligands from a list of connections.

    Only connections of type "Covale" are considered; a subchain is
    flagged when its entity type is NonPolymer or Branched.

    Parameters
    ----------
    connections: List[gemmi.Connection]
        The connections to process.
    subchain_map: dict[tuple[str, int], str]
        The mapping from chain, residue index to subchain name.
    entities: dict[str, gemmi.Entity]
        The entities in the structure.

    Returns
    -------
    set
        The covalent ligand subchains.

    """
    # Get covalent chain ids
    covalent_chain_ids = set()
    for connection in connections:
        if connection.type.name != "Covale":
            continue

        # Map to correct subchain
        chain_1_name = connection.partner1.chain_name
        chain_2_name = connection.partner2.chain_name

        # Residue ids are built as "<num><icode>" strings; this must match
        # how subchain_map keys are built by the caller.
        res_1_id = connection.partner1.res_id.seqid
        res_1_id = str(res_1_id.num) + str(res_1_id.icode).strip()

        res_2_id = connection.partner2.res_id.seqid
        res_2_id = str(res_2_id.num) + str(res_2_id.icode).strip()

        subchain_1 = subchain_map[(chain_1_name, res_1_id)]
        subchain_2 = subchain_map[(chain_2_name, res_2_id)]

        # If non-polymer or branched, add to set
        entity_1 = entities[subchain_1].entity_type.name
        entity_2 = entities[subchain_2].entity_type.name

        if entity_1 in {"NonPolymer", "Branched"}:
            covalent_chain_ids.add(subchain_1)
        if entity_2 in {"NonPolymer", "Branched"}:
            covalent_chain_ids.add(subchain_2)

    return covalent_chain_ids
319
+
320
+
321
def compute_interfaces(atom_data: np.ndarray, chain_data: np.ndarray) -> np.ndarray:
    """Compute the chain-chain interfaces from a gemmi structure.

    Two chains form an interface when any pair of their present atoms
    lies within `const.atom_interface_cutoff` of each other.

    Parameters
    ----------
    atom_data : np.ndarray
        Structured atom array with "coords" and "is_present" fields.
    chain_data : np.ndarray
        Structured chain array with an "atom_num" field.

    Returns
    -------
    np.ndarray
        The interfaces, as a structured array of unique chain-index pairs.

    """
    # Compute chain_id per atom
    chain_ids = []
    for idx, chain in enumerate(chain_data):
        chain_ids.extend([idx] * chain["atom_num"])
    chain_ids = np.array(chain_ids)

    # Filter to present atoms
    coords = atom_data["coords"]
    mask = atom_data["is_present"]

    coords = coords[mask]
    chain_ids = chain_ids[mask]

    # Radius query over all atoms within the interface cutoff
    tree = KDTree(coords, metric="euclidean")
    query = tree.query_radius(coords, const.atom_interface_cutoff)

    # Collect chain pairs in contact (excluding self-contacts)
    interfaces = set()
    for c1, pairs in zip(chain_ids, query):
        chains = np.unique(chain_ids[pairs])
        chains = chains[chains != c1]
        interfaces.update((c1, c2) for c2 in chains)

    # Deduplicate as ordered (min, max) pairs
    interfaces = [(min(i, j), max(i, j)) for i, j in interfaces]
    interfaces = list({(int(i), int(j)) for i, j in interfaces})
    interfaces = np.array(interfaces, dtype=Interface)
    return interfaces
366
+
367
+
368
+ ####################################################################################################
369
+ # PARSING
370
+ ####################################################################################################
371
+
372
+
373
def parse_ccd_residue(  # noqa: PLR0915, C901
    name: str,
    components: dict[str, Mol],
    res_idx: int,
    gemmi_mol: Optional[gemmi.Residue] = None,
    is_covalent: bool = False,
) -> Optional[ParsedResidue]:
    """Parse an MMCIF ligand.

    First tries to get the SMILES string from the RCSB.
    Then, tries to infer atom ordering using RDKit.

    Parameters
    ----------
    name: str
        The name of the molecule to parse.
    components : dict
        The preprocessed PDB components dictionary.
    res_idx : int
        The residue index.
    gemmi_mol : Optional[gemmi.Residue]
        The PDB molecule, as a gemmi Residue object, if any.
    is_covalent : bool
        Whether the residue is covalently bound; if so, leaving atoms
        absent from the PDB entry are skipped.

    Returns
    -------
    ParsedResidue, optional
        The output ParsedResidue, if successful.

    """
    unk_chirality = const.chirality_type_ids[const.unk_chirality_type]
    # Check if we have a PDB structure for this residue,
    # it could be a missing residue from the sequence
    is_present = gemmi_mol is not None

    # Save original index (required for parsing connections)
    if is_present:
        orig_idx = gemmi_mol.seqid
        orig_idx = str(orig_idx.num) + str(orig_idx.icode).strip()
    else:
        orig_idx = None

    # Get reference component
    ref_mol = components[name]

    # Remove hydrogens
    ref_mol = AllChem.RemoveHs(ref_mol, sanitize=False)

    # Check if this is a single atom CCD residue
    if ref_mol.GetNumAtoms() == 1:
        pos = (0, 0, 0)
        if is_present:
            pos = (
                gemmi_mol[0].pos.x,
                gemmi_mol[0].pos.y,
                gemmi_mol[0].pos.z,
            )
        ref_atom = ref_mol.GetAtoms()[0]
        chirality_type = const.chirality_type_ids.get(
            str(ref_atom.GetChiralTag()), unk_chirality
        )
        atom = ParsedAtom(
            name=ref_atom.GetProp("name"),
            element=ref_atom.GetAtomicNum(),
            charge=ref_atom.GetFormalCharge(),
            coords=pos,
            conformer=(0, 0, 0),
            is_present=is_present,
            chirality=chirality_type,
        )
        unk_prot_id = const.unk_token_ids["PROTEIN"]
        residue = ParsedResidue(
            name=name,
            type=unk_prot_id,
            atoms=[atom],
            bonds=[],
            idx=res_idx,
            orig_idx=orig_idx,
            atom_center=0,  # Placeholder, no center
            atom_disto=0,  # Placeholder, no center
            is_standard=False,
            is_present=is_present,
        )
        return residue

    # If multi-atom, start by getting the PDB coordinates
    pdb_pos = {}
    if is_present:
        # Match atoms based on names
        for atom in gemmi_mol:
            atom: gemmi.Atom
            pos = (atom.pos.x, atom.pos.y, atom.pos.z)
            pdb_pos[atom.name] = pos

    # Get reference conformer coordinates
    conformer = get_conformer(ref_mol)

    # Parse each atom in order of the reference mol
    atoms = []
    atom_idx = 0
    idx_map = {}  # Used for bonds later

    for i, atom in enumerate(ref_mol.GetAtoms()):
        # Get atom name, charge, element and reference coordinates
        atom_name = atom.GetProp("name")
        charge = atom.GetFormalCharge()
        element = atom.GetAtomicNum()
        ref_coords = conformer.GetAtomPosition(atom.GetIdx())
        ref_coords = (ref_coords.x, ref_coords.y, ref_coords.z)
        chirality_type = const.chirality_type_ids.get(
            str(atom.GetChiralTag()), unk_chirality
        )

        # If the atom is a leaving atom, skip if not in the PDB and is_covalent
        if (
            int(atom.GetProp("leaving_atom")) == 1
            and is_covalent
            and (atom_name not in pdb_pos)
        ):
            continue

        # Get PDB coordinates, if any
        coords = pdb_pos.get(atom_name)
        if coords is None:
            atom_is_present = False
            coords = (0, 0, 0)
        else:
            atom_is_present = True

        # Add atom to list
        atoms.append(
            ParsedAtom(
                name=atom_name,
                element=element,
                charge=charge,
                coords=coords,
                conformer=ref_coords,
                is_present=atom_is_present,
                chirality=chirality_type,
            )
        )
        idx_map[i] = atom_idx
        atom_idx += 1

    # Load bonds, re-indexed to the kept (non-skipped) atoms
    bonds = []
    unk_bond = const.bond_type_ids[const.unk_bond_type]
    for bond in ref_mol.GetBonds():
        idx_1 = bond.GetBeginAtomIdx()
        idx_2 = bond.GetEndAtomIdx()

        # Skip bonds with atoms ignored
        if (idx_1 not in idx_map) or (idx_2 not in idx_map):
            continue

        idx_1 = idx_map[idx_1]
        idx_2 = idx_map[idx_2]
        start = min(idx_1, idx_2)
        end = max(idx_1, idx_2)
        bond_type = bond.GetBondType().name
        bond_type = const.bond_type_ids.get(bond_type, unk_bond)
        bonds.append(ParsedBond(start, end, bond_type))

    unk_prot_id = const.unk_token_ids["PROTEIN"]
    return ParsedResidue(
        name=name,
        type=unk_prot_id,
        atoms=atoms,
        bonds=bonds,
        idx=res_idx,
        atom_center=0,
        atom_disto=0,
        orig_idx=orig_idx,
        is_standard=False,
        is_present=is_present,
    )
548
+
549
+
550
+ def parse_polymer( # noqa: C901, PLR0915, PLR0912
551
+ polymer: gemmi.ResidueSpan,
552
+ polymer_type: gemmi.PolymerType,
553
+ sequence: list[str],
554
+ chain_id: str,
555
+ entity: str,
556
+ components: dict[str, Mol],
557
+ ) -> Optional[ParsedChain]:
558
+ """Process a gemmi Polymer into a chain object.
559
+
560
+ Performs alignment of the full sequence to the polymer
561
+ residues. Loads coordinates and masks for the atoms in
562
+ the polymer, following the ordering in const.atom_order.
563
+
564
+ Parameters
565
+ ----------
566
+ polymer : gemmi.ResidueSpan
567
+ The polymer to process.
568
+ polymer_type : gemmi.PolymerType
569
+ The polymer type.
570
+ sequence : str
571
+ The full sequence of the polymer.
572
+ chain_id : str
573
+ The chain identifier.
574
+ entity : str
575
+ The entity name.
576
+ components : dict[str, Mol]
577
+ The preprocessed PDB components dictionary.
578
+
579
+ Returns
580
+ -------
581
+ ParsedChain, optional
582
+ The output chain, if successful.
583
+
584
+ Raises
585
+ ------
586
+ ValueError
587
+ If the alignment fails.
588
+
589
+ """
590
+ # Get unknown chirality token
591
+ unk_chirality = const.chirality_type_ids[const.unk_chirality_type]
592
+
593
+ # Ignore microheterogenities (pick first)
594
+ sequence = [gemmi.Entity.first_mon(item) for item in sequence]
595
+
596
+ # Align full sequence to polymer residues
597
+ # This is a simple way to handle all the different numbering schemes
598
+ result = gemmi.align_sequence_to_polymer(
599
+ sequence,
600
+ polymer,
601
+ polymer_type,
602
+ gemmi.AlignmentScoring(),
603
+ )
604
+
605
+ # Get coordinates and masks
606
+ i = 0
607
+ ref_res = set(const.tokens)
608
+ parsed = []
609
+ for j, match in enumerate(result.match_string):
610
+ # Get residue name from sequence
611
+ res_name = sequence[j]
612
+
613
+ # Check if we have a match in the structure
614
+ res = None
615
+ name_to_atom = {}
616
+
617
+ if match == "|":
618
+ # Get pdb residue
619
+ res = polymer[i]
620
+ name_to_atom = {a.name.upper(): a for a in res}
621
+
622
+ # Double check the match
623
+ if res.name != res_name:
624
+ msg = "Alignment mismatch!"
625
+ raise ValueError(msg)
626
+
627
+ # Increment polymer index
628
+ i += 1
629
+
630
+ # Map MSE to MET, put the selenium atom in the sulphur column
631
+ if res_name == "MSE":
632
+ res_name = "MET"
633
+ if "SE" in name_to_atom:
634
+ name_to_atom["SD"] = name_to_atom["SE"]
635
+
636
+ # Handle non-standard residues
637
+ elif res_name not in ref_res:
638
+ residue = parse_ccd_residue(
639
+ name=res_name,
640
+ components=components,
641
+ res_idx=j,
642
+ gemmi_mol=res,
643
+ is_covalent=True,
644
+ )
645
+ parsed.append(residue)
646
+ continue
647
+
648
+ # Load regular residues
649
+ ref_mol = components[res_name]
650
+ ref_mol = AllChem.RemoveHs(ref_mol, sanitize=False)
651
+ ref_conformer = get_conformer(ref_mol)
652
+
653
+ # Only use reference atoms set in constants
654
+ ref_name_to_atom = {a.GetProp("name"): a for a in ref_mol.GetAtoms()}
655
+ ref_atoms = [ref_name_to_atom[a] for a in const.ref_atoms[res_name]]
656
+
657
+ # Iterate, always in the same order
658
+ atoms: list[ParsedAtom] = []
659
+
660
+ for ref_atom in ref_atoms:
661
+ # Get atom name
662
+ atom_name = ref_atom.GetProp("name")
663
+ idx = ref_atom.GetIdx()
664
+
665
+ # Get conformer coordinates
666
+ ref_coords = ref_conformer.GetAtomPosition(idx)
667
+ ref_coords = (ref_coords.x, ref_coords.y, ref_coords.z)
668
+
669
+ # Get coordinated from PDB
670
+ if atom_name in name_to_atom:
671
+ atom = name_to_atom[atom_name]
672
+ atom_is_present = True
673
+ coords = (atom.pos.x, atom.pos.y, atom.pos.z)
674
+ else:
675
+ atom_is_present = False
676
+ coords = (0, 0, 0)
677
+
678
+ # Add atom to list
679
+ atoms.append(
680
+ ParsedAtom(
681
+ name=atom_name,
682
+ element=ref_atom.GetAtomicNum(),
683
+ charge=ref_atom.GetFormalCharge(),
684
+ coords=coords,
685
+ conformer=ref_coords,
686
+ is_present=atom_is_present,
687
+ chirality=const.chirality_type_ids.get(
688
+ str(ref_atom.GetChiralTag()), unk_chirality
689
+ ),
690
+ )
691
+ )
692
+
693
+ # Fix naming errors in arginine residues where NH2 is
694
+ # incorrectly assigned to be closer to CD than NH1
695
+ if (res is not None) and (res_name == "ARG"):
696
+ ref_atoms: list[str] = const.ref_atoms["ARG"]
697
+ cd = atoms[ref_atoms.index("CD")]
698
+ nh1 = atoms[ref_atoms.index("NH1")]
699
+ nh2 = atoms[ref_atoms.index("NH2")]
700
+
701
+ cd_coords = np.array(cd.coords)
702
+ nh1_coords = np.array(nh1.coords)
703
+ nh2_coords = np.array(nh2.coords)
704
+
705
+ if all(atom.is_present for atom in (cd, nh1, nh2)) and (
706
+ np.linalg.norm(nh1_coords - cd_coords)
707
+ > np.linalg.norm(nh2_coords - cd_coords)
708
+ ):
709
+ atoms[ref_atoms.index("NH1")] = replace(nh1, coords=nh2.coords)
710
+ atoms[ref_atoms.index("NH2")] = replace(nh2, coords=nh1.coords)
711
+
712
+ # Add residue to parsed list
713
+ if res is not None:
714
+ orig_idx = res.seqid
715
+ orig_idx = str(orig_idx.num) + str(orig_idx.icode).strip()
716
+ else:
717
+ orig_idx = None
718
+
719
+ atom_center = const.res_to_center_atom_id[res_name]
720
+ atom_disto = const.res_to_disto_atom_id[res_name]
721
+ parsed.append(
722
+ ParsedResidue(
723
+ name=res_name,
724
+ type=const.token_ids[res_name],
725
+ atoms=atoms,
726
+ bonds=[],
727
+ idx=j,
728
+ atom_center=atom_center,
729
+ atom_disto=atom_disto,
730
+ is_standard=True,
731
+ is_present=res is not None,
732
+ orig_idx=orig_idx,
733
+ )
734
+ )
735
+
736
+ # Get polymer class
737
+ if polymer_type == gemmi.PolymerType.PeptideL:
738
+ chain_type = const.chain_type_ids["PROTEIN"]
739
+ elif polymer_type == gemmi.PolymerType.Dna:
740
+ chain_type = const.chain_type_ids["DNA"]
741
+ elif polymer_type == gemmi.PolymerType.Rna:
742
+ chain_type = const.chain_type_ids["RNA"]
743
+
744
+ # Return polymer object
745
+ return ParsedChain(
746
+ name=chain_id,
747
+ entity=entity,
748
+ residues=parsed,
749
+ type=chain_type,
750
+ sequence=gemmi.one_letter_code(sequence),
751
+ )
752
+
753
+
754
def parse_connection(
    connection: gemmi.Connection,
    chains: list[ParsedChain],
    subchain_map: dict[tuple[str, int], str],
) -> ParsedConnection:
    """Parse a (covalent) connection from a gemmi Connection.

    Parameters
    ----------
    connection : gemmi.Connection
        The connection to parse.
    chains : list[ParsedChain]
        The parsed chains.
    subchain_map : dict[tuple[str, int], str]
        The mapping from (chain name, residue id) to subchain name.

    Returns
    -------
    ParsedConnection
        The parsed connection.

    Raises
    ------
    StopIteration
        If a referenced chain, residue or atom cannot be found.

    """

    def seqid_str(seqid: gemmi.SeqId) -> str:
        # Combine residue number with the stripped insertion code,
        # matching the keys used when building subchain_map.
        return str(seqid.num) + str(seqid.icode).strip()

    # Map the author chain/residue pairs to the correct subchains
    res_1_id = seqid_str(connection.partner1.res_id.seqid)
    res_2_id = seqid_str(connection.partner2.res_id.seqid)
    subchain_1 = subchain_map[(connection.partner1.chain_name, res_1_id)]
    subchain_2 = subchain_map[(connection.partner2.chain_name, res_2_id)]

    # Get the parsed chains by subchain name
    chain_1 = next(chain for chain in chains if chain.name == subchain_1)
    chain_2 = next(chain for chain in chains if chain.name == subchain_2)

    # Locate the residues by their original (author) index
    res_1_idx, res_1 = next(
        (idx, res)
        for idx, res in enumerate(chain_1.residues)
        if res.orig_idx == res_1_id
    )
    res_2_idx, res_2 = next(
        (idx, res)
        for idx, res in enumerate(chain_2.residues)
        if res.orig_idx == res_2_id
    )

    # Locate the bonded atoms within each residue
    atom_index_1 = next(
        idx
        for idx, atom in enumerate(res_1.atoms)
        if atom.name == connection.partner1.atom_name
    )
    atom_index_2 = next(
        idx
        for idx, atom in enumerate(res_2.atoms)
        if atom.name == connection.partner2.atom_name
    )

    return ParsedConnection(
        chain_1=subchain_1,
        chain_2=subchain_2,
        residue_index_1=res_1_idx,
        residue_index_2=res_2_idx,
        atom_index_1=atom_index_1,
        atom_index_2=atom_index_2,
    )
827
+
828
+
829
def parse_mmcif(  # noqa: C901, PLR0915, PLR0912
    path: str,
    components: dict[str, Mol],
    use_assembly: bool = True,
) -> ParsedStructure:
    """Parse a structure in MMCIF format.

    Parameters
    ----------
    path : str
        Path to the MMCIF file.
    components : dict[str, Mol]
        The preprocessed PDB components dictionary.
    use_assembly : bool
        Whether to expand the first assembly.

    Returns
    -------
    ParsedStructure
        The parsed structure.

    Raises
    ------
    ValueError
        If no chains could be parsed.

    """
    # Disable rdkit warnings for the duration of this call
    blocker = rdBase.BlockLogs()  # noqa: F841

    # Parse MMCIF input file
    block = gemmi.cif.read(str(path))[0]

    # Extract metadata
    deposit_date, release_date, revision_date = get_dates(block)
    resolution = get_resolution(block)
    method = get_method(block)

    # Load structure object
    structure = gemmi.make_structure_from_block(block)

    # Clean up the structure
    structure.merge_chain_parts()
    structure.remove_waters()
    structure.remove_hydrogens()
    structure.remove_alternative_conformations()
    structure.remove_empty_chains()

    # Expand assembly 1 (copied chains get a number appended to their name)
    if use_assembly and structure.assemblies:
        how = gemmi.HowToNameCopiedChain.AddNumber
        assembly_name = structure.assemblies[0].name
        structure.transform_to_assembly(assembly_name, how=how)

    # Parse entities
    # Create mapping from subchain id to entity, skipping waters
    entities: dict[str, gemmi.Entity] = {}
    entity_ids: dict[str, int] = {}
    for entity_id, entity in enumerate(structure.entities):
        entity: gemmi.Entity
        if entity.entity_type.name == "Water":
            continue
        for subchain_id in entity.subchains:
            entities[subchain_id] = entity
            entity_ids[subchain_id] = entity_id

    # Create mapping from (chain, residue id) to subchains,
    # since a Connection references chains and not subchains
    subchain_map = {}
    for chain in structure[0]:
        for residue in chain:
            seq_id = residue.seqid
            seq_id = str(seq_id.num) + str(seq_id.icode).strip()
            subchain_map[(chain.name, seq_id)] = residue.subchain

    # Find covalent ligands
    covalent_chain_ids = compute_covalent_ligands(
        connections=structure.connections,
        subchain_map=subchain_map,
        entities=entities,
    )

    # Parse chains
    chains: list[ParsedChain] = []
    chain_seqs = []  # collected sequences (not used further in this function)
    for raw_chain in structure[0].subchains():
        # Check chain type
        subchain_id = raw_chain.subchain_id()
        entity: gemmi.Entity = entities[subchain_id]
        entity_type = entity.entity_type.name

        # Parse a polymer
        if entity_type == "Polymer":
            # Skip PeptideD, DnaRnaHybrid, Pna, Other
            if entity.polymer_type.name not in {
                "PeptideL",
                "Dna",
                "Rna",
            }:
                continue

            # Add polymer if successful
            parsed_polymer = parse_polymer(
                polymer=raw_chain,
                polymer_type=entity.polymer_type,
                sequence=entity.full_sequence,
                chain_id=subchain_id,
                entity=entity.name,
                components=components,
            )
            if parsed_polymer is not None:
                chains.append(parsed_polymer)
                chain_seqs.append(parsed_polymer.sequence)

        # Parse a non-polymer
        elif entity_type in {"NonPolymer", "Branched"}:
            # Skip UNL or other ligands missing from the components dict
            if any(components.get(lig.name) is None for lig in raw_chain):
                continue

            residues = []
            for lig_idx, ligand in enumerate(raw_chain):
                # Check if ligand is covalent; branched entities always are
                if entity_type == "Branched":
                    is_covalent = True
                else:
                    is_covalent = subchain_id in covalent_chain_ids

                ligand: gemmi.Residue
                residue = parse_ccd_residue(
                    name=ligand.name,
                    components=components,
                    res_idx=lig_idx,
                    gemmi_mol=ligand,
                    is_covalent=is_covalent,
                )
                residues.append(residue)

            if residues:
                chains.append(
                    ParsedChain(
                        name=subchain_id,
                        entity=entity.name,
                        residues=residues,
                        type=const.chain_type_ids["NONPOLYMER"],
                        sequence=None,
                    )
                )

    # If no chains parsed, fail
    if not chains:
        msg = "No chains parsed!"
        raise ValueError(msg)

    # Parse covalent connections
    connections: list[ParsedConnection] = []
    for connection in structure.connections:
        # Skip non-covalent connections
        connection: gemmi.Connection
        if connection.type.name != "Covale":
            continue

        parsed_connection = parse_connection(
            connection=connection,
            chains=chains,
            subchain_map=subchain_map,
        )
        connections.append(parsed_connection)

    # Create flat tables from the parsed chains
    atom_data = []
    bond_data = []
    res_data = []
    chain_data = []
    connection_data = []

    # Convert parsed chains to tables, assigning global indices
    atom_idx = 0
    res_idx = 0
    asym_id = 0
    sym_count = {}
    chain_to_idx = {}
    res_to_idx = {}

    for asym_id, chain in enumerate(chains):
        # Compute number of atoms and residues
        res_num = len(chain.residues)
        atom_num = sum(len(res.atoms) for res in chain.residues)

        # Track how many copies of this entity exist in the assembly
        entity_id = entity_ids[chain.name]
        sym_id = sym_count.get(entity_id, 0)
        chain_data.append(
            (
                chain.name,
                chain.type,
                entity_id,
                sym_id,
                asym_id,
                atom_idx,
                atom_num,
                res_idx,
                res_num,
            )
        )
        chain_to_idx[chain.name] = asym_id
        sym_count[entity_id] = sym_id + 1

        # Add residue, atom, bond data
        for i, res in enumerate(chain.residues):
            # Center/disto atom ids are offsets within the residue
            atom_center = atom_idx + res.atom_center
            atom_disto = atom_idx + res.atom_disto
            res_data.append(
                (
                    res.name,
                    res.type,
                    res.idx,
                    atom_idx,
                    len(res.atoms),
                    atom_center,
                    atom_disto,
                    res.is_standard,
                    res.is_present,
                )
            )
            res_to_idx[(chain.name, i)] = (res_idx, atom_idx)

            for bond in res.bonds:
                atom_1 = atom_idx + bond.atom_1
                atom_2 = atom_idx + bond.atom_2
                bond_data.append((atom_1, atom_2, bond.type))

            for atom in res.atoms:
                atom_data.append(
                    (
                        convert_atom_name(atom.name),
                        atom.element,
                        atom.charge,
                        atom.coords,
                        atom.conformer,
                        atom.is_present,
                        atom.chirality,
                    )
                )
                atom_idx += 1

            res_idx += 1

    # Convert connections to tables (global residue/atom indices)
    for conn in connections:
        chain_1_idx = chain_to_idx[conn.chain_1]
        chain_2_idx = chain_to_idx[conn.chain_2]
        res_1_idx, atom_1_offset = res_to_idx[(conn.chain_1, conn.residue_index_1)]
        res_2_idx, atom_2_offset = res_to_idx[(conn.chain_2, conn.residue_index_2)]
        atom_1_idx = atom_1_offset + conn.atom_index_1
        atom_2_idx = atom_2_offset + conn.atom_index_2
        connection_data.append(
            (
                chain_1_idx,
                chain_2_idx,
                res_1_idx,
                res_2_idx,
                atom_1_idx,
                atom_2_idx,
            )
        )

    # Convert into structured numpy arrays
    atoms = np.array(atom_data, dtype=Atom)
    bonds = np.array(bond_data, dtype=Bond)
    residues = np.array(res_data, dtype=Residue)
    chains = np.array(chain_data, dtype=Chain)
    connections = np.array(connection_data, dtype=Connection)
    mask = np.ones(len(chain_data), dtype=bool)

    # Compute interface chains (find chains with a heavy atom within 5A)
    interfaces = compute_interfaces(atoms, chains)

    # Return parsed structure
    info = StructureInfo(
        deposited=deposit_date,
        revised=revision_date,
        released=release_date,
        resolution=resolution,
        method=method,
        num_chains=len(chains),
        num_interfaces=len(interfaces),
    )

    data = Structure(
        atoms=atoms,
        bonds=bonds,
        residues=residues,
        chains=chains,
        connections=connections,
        interfaces=interfaces,
        mask=mask,
    )

    return ParsedStructure(data=data, info=info, covalents=[])
protify/FastPLMs/boltz/scripts/process/msa.py ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import multiprocessing
3
+ from dataclasses import asdict
4
+ from functools import partial
5
+ from pathlib import Path
6
+ from typing import Any
7
+
8
+ import numpy as np
9
+ from p_tqdm import p_umap
10
+ from redis import Redis
11
+ from tqdm import tqdm
12
+
13
+ from boltz.data.parse.a3m import parse_a3m
14
+
15
+
16
class Resource:
    """A shared, Redis-backed resource for processing."""

    def __init__(self, host: str, port: int) -> None:
        """Connect to the Redis database at *host*:*port*."""
        self._redis = Redis(host=host, port=port)

    def get(self, key: str) -> Any:  # noqa: ANN401
        """Fetch *key* from the Redis database, or None when absent."""
        return self._redis.get(key)

    def __getitem__(self, key: str) -> Any:  # noqa: ANN401
        """Fetch *key*, raising KeyError when absent."""
        value = self.get(key)
        if value is None:
            raise KeyError(key)
        return value
33
+
34
+
35
def process_msa(
    path: Path,
    outdir: str,
    max_seqs: int,
    resource: Resource,
) -> None:
    """Parse one a3m MSA file and save it as a compressed npz.

    Skips the file when its output already exists, so re-runs are cheap.
    """
    out_path = Path(outdir) / f"{path.stem}.npz"
    if out_path.exists():
        return
    msa = parse_a3m(path, resource, max_seqs)
    np.savez_compressed(out_path, **asdict(msa))
47
+
48
+
49
def process(args) -> None:
    """Run the data processing task."""
    # Ensure the output directory exists
    args.outdir.mkdir(parents=True, exist_ok=True)

    # Connect to the shared Redis resource
    resource = Resource(host=args.redis_host, port=args.redis_port)

    # Collect the MSA files to process
    print("Fetching data...")
    data = list(args.msadir.rglob("*.a3m*"))
    print(f"Found {len(data)} MSA's.")

    # Decide how many workers to use; never more than CPUs or files
    cpu_limit = multiprocessing.cpu_count()
    num_processes = max(1, min(args.num_processes, cpu_limit, len(data)))

    if num_processes > 1:
        # Process the files in parallel
        worker = partial(
            process_msa,
            outdir=args.outdir,
            max_seqs=args.max_seqs,
            resource=resource,
        )
        p_umap(worker, data, num_cpus=num_processes)
    else:
        # Fall back to a serial loop
        for path in tqdm(data):
            process_msa(
                path,
                outdir=args.outdir,
                max_seqs=args.max_seqs,
                resource=resource,
            )
89
+
90
+
91
if __name__ == "__main__":
    # Command-line entry point for the MSA processing pipeline.
    parser = argparse.ArgumentParser(description="Process MSA data.")
    parser.add_argument(
        "--msadir",
        type=Path,
        required=True,
        help="The MSA data directory.",
    )
    parser.add_argument(
        "--outdir",
        type=Path,
        default="data",
        help="The output directory.",
    )
    parser.add_argument(
        "--num-processes",
        type=int,
        default=multiprocessing.cpu_count(),
        help="The number of processes.",
    )
    parser.add_argument(
        "--redis-host",
        type=str,
        default="localhost",
        help="The Redis host.",
    )
    parser.add_argument(
        "--redis-port",
        type=int,
        default=7777,
        help="The Redis port.",
    )
    parser.add_argument(
        "--max-seqs",
        type=int,
        default=16384,
        help="The maximum number of sequences.",
    )
    args = parser.parse_args()
    process(args)
protify/FastPLMs/boltz/scripts/process/rcsb.py ADDED
@@ -0,0 +1,359 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import json
3
+ import multiprocessing
4
+ import pickle
5
+ import traceback
6
+ from dataclasses import asdict, dataclass, replace
7
+ from functools import partial
8
+ from pathlib import Path
9
+ from typing import Any, Optional
10
+
11
+ import numpy as np
12
+ import rdkit
13
+ from mmcif import parse_mmcif
14
+ from p_tqdm import p_umap
15
+ from redis import Redis
16
+ from tqdm import tqdm
17
+
18
+ from boltz.data.filter.static.filter import StaticFilter
19
+ from boltz.data.filter.static.ligand import ExcludedLigands
20
+ from boltz.data.filter.static.polymer import (
21
+ ClashingChainsFilter,
22
+ ConsecutiveCA,
23
+ MinimumLengthFilter,
24
+ UnknownFilter,
25
+ )
26
+ from boltz.data.types import ChainInfo, InterfaceInfo, Record, Target
27
+
28
+
29
@dataclass(frozen=True, slots=True)
class PDB:
    """A raw MMCIF PDB file.

    Attributes
    ----------
    id : str
        The lowercase PDB identifier (derived from the file stem).
    path : str
        The path to the MMCIF file on disk.

    """

    id: str
    path: str
35
+
36
+
37
class Resource:
    """A shared resource for processing, backed by Redis."""

    def __init__(self, host: str, port: int) -> None:
        """Connect to the Redis database at *host*:*port*."""
        self._redis = Redis(host=host, port=port)

    def get(self, key: str) -> Any:  # noqa: ANN401
        """Fetch and unpickle *key* from Redis, or None when absent."""
        raw = self._redis.get(key)
        if raw is None:
            return None
        # Data in Redis is written by our own pipeline, hence trusted.
        return pickle.loads(raw)  # noqa: S301

    def __getitem__(self, key: str) -> Any:  # noqa: ANN401
        """Fetch *key*, raising KeyError when absent."""
        value = self.get(key)
        if value is None:
            raise KeyError(key)
        return value
57
+
58
+
59
def fetch(datadir: Path, max_file_size: Optional[int] = None) -> list[PDB]:
    """Collect the MMCIF files under *datadir* as PDB entries.

    Files larger than *max_file_size* bytes (when given) are skipped.
    """
    entries = []
    excluded = 0
    for file in datadir.rglob("*.cif*"):
        # The clustering file is annotated by pdb_entity id
        pdb_id = str(file.stem).lower()

        # Skip files exceeding the size limit, if one was provided
        if max_file_size is not None and file.stat().st_size > max_file_size:
            excluded += 1
            continue

        entries.append(PDB(id=pdb_id, path=str(file)))

    print(f"Excluded {excluded} files due to size.")  # noqa: T201
    return entries
78
+
79
+
80
def finalize(outdir: Path) -> None:
    """Run post-processing in main thread.

    Gathers all per-entry record JSON files under ``outdir / "records"``
    into a single ``manifest.json``.

    Parameters
    ----------
    outdir : Path
        The output directory.

    """
    records_dir = outdir / "records"

    records = []
    failed_count = 0
    for record in records_dir.iterdir():
        try:
            with record.open("r") as f:
                records.append(json.load(f))
        except:  # noqa: E722
            # Best-effort: count and report, but keep going
            failed_count += 1
            print(f"Failed to parse {record}")  # noqa: T201

    if failed_count > 0:
        print(f"Failed to parse {failed_count} entries.")  # noqa: T201
    else:
        print("All entries parsed successfully.")

    # Save manifest
    with (outdir / "manifest.json").open("w") as f:
        json.dump(records, f)
111
+
112
+
113
def parse(data: PDB, resource: Resource, clusters: dict) -> Target:
    """Process a structure.

    Parameters
    ----------
    data : PDB
        The raw input data.
    resource : Resource
        The shared resource.
    clusters : dict
        Mapping from "<pdb_id>_<entity_id>" keys to cluster ids;
        chains without an entry get cluster_id -1.

    Returns
    -------
    Target
        The processed data.

    """
    # Get the PDB id
    pdb_id = data.id.lower()

    # Parse structure
    parsed = parse_mmcif(data.path, resource)
    structure = parsed.data
    structure_info = parsed.info

    # Create chain metadata
    chain_info = []
    for i, chain in enumerate(structure.chains):
        key = f"{pdb_id}_{chain['entity_id']}"
        chain_info.append(
            ChainInfo(
                chain_id=i,
                chain_name=chain["name"],
                msa_id="",  # FIX
                mol_type=int(chain["mol_type"]),
                cluster_id=clusters.get(key, -1),
                num_residues=int(chain["res_num"]),
            )
        )

    # Get interface metadata
    interface_info = []
    for interface in structure.interfaces:
        chain_1 = int(interface["chain_1"])
        chain_2 = int(interface["chain_2"])
        interface_info.append(
            InterfaceInfo(
                chain_1=chain_1,
                chain_2=chain_2,
            )
        )

    # Create record
    record = Record(
        id=data.id,
        structure=structure_info,
        chains=chain_info,
        interfaces=interface_info,
    )

    return Target(structure=structure, record=record)
173
+
174
+
175
def process_structure(
    data: PDB,
    resource: Resource,
    outdir: Path,
    filters: list[StaticFilter],
    clusters: dict,
) -> None:
    """Process a target.

    Parameters
    ----------
    data : PDB
        The raw input data.
    resource : Resource
        The shared resource.
    outdir : Path
        The output directory.
    filters : list[StaticFilter]
        Static filters used to mark failing chains invalid.
    clusters : dict
        The chain clustering map passed through to ``parse``.

    """
    # Check if we need to process: skip entries already fully processed
    struct_path = outdir / "structures" / f"{data.id}.npz"
    record_path = outdir / "records" / f"{data.id}.json"

    if struct_path.exists() and record_path.exists():
        return

    try:
        # Parse the target
        target: Target = parse(data, resource, clusters)
        structure = target.structure

        # Apply the filters, AND-ing their per-chain masks together
        mask = structure.mask
        if filters is not None:
            for f in filters:
                filter_mask = f.filter(structure)
                mask = mask & filter_mask
    except Exception:  # noqa: BLE001
        # Best-effort pipeline: log the failure and skip this entry
        traceback.print_exc()
        print(f"Failed to parse {data.id}")
        return

    # Replace chains and interfaces with validity flags from the mask
    chains = []
    for i, chain in enumerate(target.record.chains):
        chains.append(replace(chain, valid=bool(mask[i])))

    # An interface is valid only when both of its chains are valid
    interfaces = []
    for interface in target.record.interfaces:
        chain_1 = bool(mask[interface.chain_1])
        chain_2 = bool(mask[interface.chain_2])
        interfaces.append(replace(interface, valid=(chain_1 and chain_2)))

    # Replace structure and record
    structure = replace(structure, mask=mask)
    record = replace(target.record, chains=chains, interfaces=interfaces)
    target = replace(target, structure=structure, record=record)

    # Dump structure
    np.savez_compressed(struct_path, **asdict(structure))

    # Dump record
    with record_path.open("w") as f:
        json.dump(asdict(record), f)
239
+
240
+
241
def process(args) -> None:
    """Run the data processing task.

    Parameters
    ----------
    args : argparse.Namespace
        The parsed command line arguments.

    """
    # Create output directory
    args.outdir.mkdir(parents=True, exist_ok=True)

    # Create output directories
    records_dir = args.outdir / "records"
    records_dir.mkdir(parents=True, exist_ok=True)

    structure_dir = args.outdir / "structures"
    structure_dir.mkdir(parents=True, exist_ok=True)

    # Load clusters, normalizing keys and values to lowercase
    with Path(args.clusters).open("r") as f:
        clusters: dict[str, str] = json.load(f)
    clusters = {k.lower(): v.lower() for k, v in clusters.items()}

    # Load filters
    filters = [
        ExcludedLigands(),
        MinimumLengthFilter(min_len=4, max_len=5000),
        UnknownFilter(),
        ConsecutiveCA(max_dist=10.0),
        ClashingChainsFilter(freq=0.3, dist=1.7),
    ]

    # Set default pickle properties so molecule properties survive pickling
    pickle_option = rdkit.Chem.PropertyPickleOptions.AllProps
    rdkit.Chem.SetDefaultPickleProperties(pickle_option)

    # Load shared data from redis
    resource = Resource(host=args.redis_host, port=args.redis_port)

    # Get data points. BUGFIX: forward --max-file-size, which was
    # previously parsed but silently ignored (fetch was called without it).
    print("Fetching data...")
    data = fetch(args.datadir, max_file_size=args.max_file_size)

    # NOTE(review): args.use_assembly is parsed but never used here;
    # confirm whether it should be forwarded to the parsing step.

    # Check if we can run in parallel
    max_processes = multiprocessing.cpu_count()
    num_processes = max(1, min(args.num_processes, max_processes, len(data)))
    parallel = num_processes > 1

    # Run processing
    print("Processing data...")
    if parallel:
        # Create processing function
        fn = partial(
            process_structure,
            resource=resource,
            outdir=args.outdir,
            clusters=clusters,
            filters=filters,
        )
        # Run processing in parallel
        p_umap(fn, data, num_cpus=num_processes)
    else:
        for item in tqdm(data):
            process_structure(
                item,
                resource=resource,
                outdir=args.outdir,
                clusters=clusters,
                filters=filters,
            )

    # Aggregate per-entry records into a manifest
    finalize(args.outdir)
308
+
309
+
310
if __name__ == "__main__":
    # Command-line entry point for processing RCSB MMCIF structures.
    # The description previously said "Process MSA data." (copy-paste
    # from the MSA script) — corrected here.
    parser = argparse.ArgumentParser(description="Process RCSB data.")
    parser.add_argument(
        "--datadir",
        type=Path,
        required=True,
        help="The directory containing the MMCIF files.",
    )
    parser.add_argument(
        "--clusters",
        type=Path,
        required=True,
        help="Path to the cluster file.",
    )
    parser.add_argument(
        "--outdir",
        type=Path,
        default="data",
        help="The output directory.",
    )
    parser.add_argument(
        "--num-processes",
        type=int,
        default=multiprocessing.cpu_count(),
        help="The number of processes.",
    )
    parser.add_argument(
        "--redis-host",
        type=str,
        default="localhost",
        help="The Redis host.",
    )
    parser.add_argument(
        "--redis-port",
        type=int,
        default=7777,
        help="The Redis port.",
    )
    parser.add_argument(
        "--use-assembly",
        action="store_true",
        help="Whether to use assembly 1.",
    )
    parser.add_argument(
        "--max-file-size",
        type=int,
        default=None,
        help="Skip MMCIF files larger than this many bytes.",
    )
    args = parser.parse_args()
    process(args)
protify/FastPLMs/boltz/scripts/train/train.py ADDED
@@ -0,0 +1,241 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import random
3
+ import string
4
+ import sys
5
+ from dataclasses import dataclass
6
+ from pathlib import Path
7
+ from typing import Optional
8
+
9
+ import hydra
10
+ import omegaconf
11
+ import pytorch_lightning as pl
12
+ import torch
13
+ import torch.multiprocessing
14
+ from omegaconf import OmegaConf, listconfig
15
+ from pytorch_lightning import LightningModule
16
+ from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint
17
+ from pytorch_lightning.loggers import WandbLogger
18
+ from pytorch_lightning.strategies import DDPStrategy
19
+ from pytorch_lightning.utilities import rank_zero_only
20
+
21
+ from boltz.data.module.training import BoltzTrainingDataModule, DataConfig
22
+
23
+
24
@dataclass
class TrainConfig:
    """Train configuration.

    Attributes
    ----------
    data : DataConfig
        The data configuration.
    model : LightningModule
        The model module to train (instantiated by hydra).
    output : str
        The output directory.
    trainer : Optional[dict]
        The trainer configuration.
    resume : Optional[str]
        The resume checkpoint.
    pretrained : Optional[str]
        The pretrained model.
    wandb : Optional[dict]
        The wandb configuration.
    disable_checkpoint : bool
        Disable checkpoint.
    matmul_precision : Optional[str]
        The matmul precision.
    find_unused_parameters : Optional[bool]
        Find unused parameters.
    save_top_k : Optional[int]
        Save top k checkpoints.
    validation_only : bool
        Run validation only.
    debug : bool
        Debug mode.
    strict_loading : bool
        Fail on mismatched checkpoint weights.
    load_confidence_from_trunk : Optional[bool]
        Load pre-trained confidence weights from trunk.

    """

    data: DataConfig
    model: LightningModule
    output: str
    trainer: Optional[dict] = None
    resume: Optional[str] = None
    pretrained: Optional[str] = None
    wandb: Optional[dict] = None
    disable_checkpoint: bool = False
    matmul_precision: Optional[str] = None
    find_unused_parameters: Optional[bool] = False
    save_top_k: Optional[int] = 1
    validation_only: bool = False
    debug: bool = False
    strict_loading: bool = True
    load_confidence_from_trunk: Optional[bool] = False
78
+
79
+
80
def train(raw_config: str, args: list[str]) -> None:  # noqa: C901, PLR0912, PLR0915
    """Run training.

    Parameters
    ----------
    raw_config : str
        The input yaml configuration.
    args : list[str]
        Any command line overrides (OmegaConf dotlist syntax).

    """
    # Load the configuration
    raw_config = omegaconf.OmegaConf.load(raw_config)

    # Apply input arguments on top of the file configuration
    args = omegaconf.OmegaConf.from_dotlist(args)
    raw_config = omegaconf.OmegaConf.merge(raw_config, args)

    # Instantiate the task
    cfg = hydra.utils.instantiate(raw_config)
    cfg = TrainConfig(**cfg)

    # Set matmul precision
    if cfg.matmul_precision is not None:
        torch.set_float32_matmul_precision(cfg.matmul_precision)

    # Create trainer dict
    trainer = cfg.trainer
    if trainer is None:
        trainer = {}

    # Flip some arguments in debug mode: single device, no workers, no wandb
    devices = trainer.get("devices", 1)

    wandb = cfg.wandb
    if cfg.debug:
        if isinstance(devices, int):
            devices = 1
        elif isinstance(devices, (list, listconfig.ListConfig)):
            devices = [devices[0]]
        trainer["devices"] = devices
        cfg.data.num_workers = 0
        if wandb:
            wandb = None

    # Create objects
    data_config = DataConfig(**cfg.data)
    data_module = BoltzTrainingDataModule(data_config)
    model_module = cfg.model

    if cfg.pretrained and not cfg.resume:
        # Load the pretrained weights into the confidence module
        if cfg.load_confidence_from_trunk:
            checkpoint = torch.load(cfg.pretrained, map_location="cpu")

            # Copy trunk weights under a "confidence_module." prefix, then
            # merge the original keys back so both trunk and confidence
            # module receive the pretrained weights.
            new_state_dict = {}
            for key, value in checkpoint["state_dict"].items():
                if not key.startswith("structure_module") and not key.startswith(
                    "distogram_module"
                ):
                    new_key = "confidence_module." + key
                    new_state_dict[new_key] = value
            new_state_dict.update(checkpoint["state_dict"])

            # Update the checkpoint with the new state_dict
            checkpoint["state_dict"] = new_state_dict

            # Save the modified checkpoint under a temporary random name
            random_string = "".join(
                random.choices(string.ascii_lowercase + string.digits, k=10)
            )
            file_path = os.path.dirname(cfg.pretrained) + "/" + random_string + ".ckpt"
            print(
                f"Saving modified checkpoint to {file_path} created by broadcasting trunk of {cfg.pretrained} to confidence module."
            )
            torch.save(checkpoint, file_path)
        else:
            file_path = cfg.pretrained

        print(f"Loading model from {file_path}")
        model_module = type(model_module).load_from_checkpoint(
            file_path, map_location="cpu", strict=False, **(model_module.hparams)
        )

        # Remove the temporary modified checkpoint once loaded
        if cfg.load_confidence_from_trunk:
            os.remove(file_path)

    # Create checkpoint callback
    callbacks = []
    dirpath = cfg.output
    if not cfg.disable_checkpoint:
        mc = ModelCheckpoint(
            monitor="val/lddt",
            save_top_k=cfg.save_top_k,
            save_last=True,
            mode="max",
            every_n_epochs=1,
        )
        callbacks = [mc]

    # Create wandb logger
    loggers = []
    if wandb:
        wdb_logger = WandbLogger(
            name=wandb["name"],
            group=wandb["name"],
            save_dir=cfg.output,
            project=wandb["project"],
            entity=wandb["entity"],
            log_model=False,
        )
        loggers.append(wdb_logger)
        # Save the config to wandb (only on rank zero)

        @rank_zero_only
        def save_config_to_wandb() -> None:
            config_out = Path(wdb_logger.experiment.dir) / "run.yaml"
            with Path.open(config_out, "w") as f:
                OmegaConf.save(raw_config, f)
            wdb_logger.experiment.save(str(config_out))

        save_config_to_wandb()

    # Set up trainer: use DDP only when more than one device is requested
    strategy = "auto"
    if (isinstance(devices, int) and devices > 1) or (
        isinstance(devices, (list, listconfig.ListConfig)) and len(devices) > 1
    ):
        strategy = DDPStrategy(find_unused_parameters=cfg.find_unused_parameters)

    trainer = pl.Trainer(
        default_root_dir=str(dirpath),
        strategy=strategy,
        callbacks=callbacks,
        logger=loggers,
        enable_checkpointing=not cfg.disable_checkpoint,
        reload_dataloaders_every_n_epochs=1,
        **trainer,
    )

    if not cfg.strict_loading:
        model_module.strict_loading = False

    if cfg.validation_only:
        trainer.validate(
            model_module,
            datamodule=data_module,
            ckpt_path=cfg.resume,
        )
    else:
        trainer.fit(
            model_module,
            datamodule=data_module,
            ckpt_path=cfg.resume,
        )
236
+
237
+
238
if __name__ == "__main__":
    # First CLI argument: path to the yaml config.
    # Remaining arguments: dotlist overrides applied on top of it.
    train(sys.argv[1], sys.argv[2:])
protify/FastPLMs/boltz/src/boltz/__init__.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ from importlib.metadata import PackageNotFoundError, version
2
+
3
+ try: # noqa: SIM105
4
+ __version__ = version("boltz")
5
+ except PackageNotFoundError:
6
+ # package is not installed
7
+ pass
protify/FastPLMs/boltz/src/boltz/data/__init__.py ADDED
File without changes
protify/FastPLMs/boltz/src/boltz/data/const.py ADDED
@@ -0,0 +1,1184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ####################################################################################################
2
+ # CHAINS
3
+ ####################################################################################################
4
+
5
+ chain_types = [
6
+ "PROTEIN",
7
+ "DNA",
8
+ "RNA",
9
+ "NONPOLYMER",
10
+ ]
11
+ chain_type_ids = {chain: i for i, chain in enumerate(chain_types)}
12
+
13
+ out_types = [
14
+ "dna_protein",
15
+ "rna_protein",
16
+ "ligand_protein",
17
+ "dna_ligand",
18
+ "rna_ligand",
19
+ "intra_ligand",
20
+ "intra_dna",
21
+ "intra_rna",
22
+ "intra_protein",
23
+ "protein_protein",
24
+ "modified",
25
+ ]
26
+
27
+ out_types_weights_af3 = {
28
+ "dna_protein": 10.0,
29
+ "rna_protein": 10.0,
30
+ "ligand_protein": 10.0,
31
+ "dna_ligand": 5.0,
32
+ "rna_ligand": 5.0,
33
+ "intra_ligand": 20.0,
34
+ "intra_dna": 4.0,
35
+ "intra_rna": 16.0,
36
+ "intra_protein": 20.0,
37
+ "protein_protein": 20.0,
38
+ "modified": 0.0,
39
+ }
40
+
41
+ out_types_weights = {
42
+ "dna_protein": 5.0,
43
+ "rna_protein": 5.0,
44
+ "ligand_protein": 20.0,
45
+ "dna_ligand": 2.0,
46
+ "rna_ligand": 2.0,
47
+ "intra_ligand": 20.0,
48
+ "intra_dna": 2.0,
49
+ "intra_rna": 8.0,
50
+ "intra_protein": 20.0,
51
+ "protein_protein": 20.0,
52
+ "modified": 0.0,
53
+ }
54
+
55
+
56
+ out_single_types = ["protein", "ligand", "dna", "rna"]
57
+
58
+ clash_types = [
59
+ "dna_protein",
60
+ "rna_protein",
61
+ "ligand_protein",
62
+ "protein_protein",
63
+ "dna_ligand",
64
+ "rna_ligand",
65
+ "ligand_ligand",
66
+ "rna_dna",
67
+ "dna_dna",
68
+ "rna_rna",
69
+ ]
70
+
71
+ chain_types_to_clash_type = {
72
+ frozenset(("PROTEIN", "DNA")): "dna_protein",
73
+ frozenset(("PROTEIN", "RNA")): "rna_protein",
74
+ frozenset(("PROTEIN", "NONPOLYMER")): "ligand_protein",
75
+ frozenset(("PROTEIN",)): "protein_protein",
76
+ frozenset(("NONPOLYMER", "DNA")): "dna_ligand",
77
+ frozenset(("NONPOLYMER", "RNA")): "rna_ligand",
78
+ frozenset(("NONPOLYMER",)): "ligand_ligand",
79
+ frozenset(("DNA", "RNA")): "rna_dna",
80
+ frozenset(("DNA",)): "dna_dna",
81
+ frozenset(("RNA",)): "rna_rna",
82
+ }
83
+
84
+ chain_type_to_out_single_type = {
85
+ "PROTEIN": "protein",
86
+ "DNA": "dna",
87
+ "RNA": "rna",
88
+ "NONPOLYMER": "ligand",
89
+ }
90
+ ####################################################################################################
91
+ # RESIDUES & TOKENS
92
+ ####################################################################################################
93
+
94
+
95
+ canonical_tokens = [
96
+ "ALA",
97
+ "ARG",
98
+ "ASN",
99
+ "ASP",
100
+ "CYS",
101
+ "GLN",
102
+ "GLU",
103
+ "GLY",
104
+ "HIS",
105
+ "ILE",
106
+ "LEU",
107
+ "LYS",
108
+ "MET",
109
+ "PHE",
110
+ "PRO",
111
+ "SER",
112
+ "THR",
113
+ "TRP",
114
+ "TYR",
115
+ "VAL",
116
+ "UNK", # unknown protein token
117
+ ]
118
+
119
+ tokens = [
120
+ "<pad>",
121
+ "-",
122
+ *canonical_tokens,
123
+ "A",
124
+ "G",
125
+ "C",
126
+ "U",
127
+ "N", # unknown rna token
128
+ "DA",
129
+ "DG",
130
+ "DC",
131
+ "DT",
132
+ "DN", # unknown dna token
133
+ ]
134
+
135
+ token_ids = {token: i for i, token in enumerate(tokens)}
136
+ num_tokens = len(tokens)
137
+ unk_token = {"PROTEIN": "UNK", "DNA": "DN", "RNA": "N"}
138
+ unk_token_ids = {m: token_ids[t] for m, t in unk_token.items()}
139
+
140
+ prot_letter_to_token = {
141
+ "A": "ALA",
142
+ "R": "ARG",
143
+ "N": "ASN",
144
+ "D": "ASP",
145
+ "C": "CYS",
146
+ "E": "GLU",
147
+ "Q": "GLN",
148
+ "G": "GLY",
149
+ "H": "HIS",
150
+ "I": "ILE",
151
+ "L": "LEU",
152
+ "K": "LYS",
153
+ "M": "MET",
154
+ "F": "PHE",
155
+ "P": "PRO",
156
+ "S": "SER",
157
+ "T": "THR",
158
+ "W": "TRP",
159
+ "Y": "TYR",
160
+ "V": "VAL",
161
+ "X": "UNK",
162
+ "J": "UNK",
163
+ "B": "UNK",
164
+ "Z": "UNK",
165
+ "O": "UNK",
166
+ "U": "UNK",
167
+ "-": "-",
168
+ }
169
+
170
+ prot_token_to_letter = {v: k for k, v in prot_letter_to_token.items()}
171
+ prot_token_to_letter["UNK"] = "X"
172
+
173
+ rna_letter_to_token = {
174
+ "A": "A",
175
+ "G": "G",
176
+ "C": "C",
177
+ "U": "U",
178
+ "N": "N",
179
+ }
180
+ rna_token_to_letter = {v: k for k, v in rna_letter_to_token.items()}
181
+
182
+ dna_letter_to_token = {
183
+ "A": "DA",
184
+ "G": "DG",
185
+ "C": "DC",
186
+ "T": "DT",
187
+ "N": "DN",
188
+ }
189
+ dna_token_to_letter = {v: k for k, v in dna_letter_to_token.items()}
190
+
191
+ ####################################################################################################
192
+ # ATOMS
193
+ ####################################################################################################
194
+
195
+ num_elements = 128
196
+
197
+ chirality_types = [
198
+ "CHI_UNSPECIFIED",
199
+ "CHI_TETRAHEDRAL_CW",
200
+ "CHI_TETRAHEDRAL_CCW",
201
+ "CHI_SQUAREPLANAR",
202
+ "CHI_OCTAHEDRAL",
203
+ "CHI_TRIGONALBIPYRAMIDAL",
204
+ "CHI_OTHER",
205
+ ]
206
+ chirality_type_ids = {chirality: i for i, chirality in enumerate(chirality_types)}
207
+ unk_chirality_type = "CHI_OTHER"
208
+
209
+ hybridization_map = [
210
+ "S",
211
+ "SP",
212
+ "SP2",
213
+ "SP2D",
214
+ "SP3",
215
+ "SP3D",
216
+ "SP3D2",
217
+ "OTHER",
218
+ "UNSPECIFIED",
219
+ ]
220
+ hybridization_type_ids = {hybrid: i for i, hybrid in enumerate(hybridization_map)}
221
+ unk_hybridization_type = "UNSPECIFIED"
222
+
223
+ # fmt: off
224
+ ref_atoms = {
225
+ "PAD": [],
226
+ "UNK": ["N", "CA", "C", "O", "CB"],
227
+ "-": [],
228
+ "ALA": ["N", "CA", "C", "O", "CB"],
229
+ "ARG": ["N", "CA", "C", "O", "CB", "CG", "CD", "NE", "CZ", "NH1", "NH2"],
230
+ "ASN": ["N", "CA", "C", "O", "CB", "CG", "OD1", "ND2"],
231
+ "ASP": ["N", "CA", "C", "O", "CB", "CG", "OD1", "OD2"],
232
+ "CYS": ["N", "CA", "C", "O", "CB", "SG"],
233
+ "GLN": ["N", "CA", "C", "O", "CB", "CG", "CD", "OE1", "NE2"],
234
+ "GLU": ["N", "CA", "C", "O", "CB", "CG", "CD", "OE1", "OE2"],
235
+ "GLY": ["N", "CA", "C", "O"],
236
+ "HIS": ["N", "CA", "C", "O", "CB", "CG", "ND1", "CD2", "CE1", "NE2"],
237
+ "ILE": ["N", "CA", "C", "O", "CB", "CG1", "CG2", "CD1"],
238
+ "LEU": ["N", "CA", "C", "O", "CB", "CG", "CD1", "CD2"],
239
+ "LYS": ["N", "CA", "C", "O", "CB", "CG", "CD", "CE", "NZ"],
240
+ "MET": ["N", "CA", "C", "O", "CB", "CG", "SD", "CE"],
241
+ "PHE": ["N", "CA", "C", "O", "CB", "CG", "CD1", "CD2", "CE1", "CE2", "CZ"],
242
+ "PRO": ["N", "CA", "C", "O", "CB", "CG", "CD"],
243
+ "SER": ["N", "CA", "C", "O", "CB", "OG"],
244
+ "THR": ["N", "CA", "C", "O", "CB", "OG1", "CG2"],
245
+ "TRP": ["N", "CA", "C", "O", "CB", "CG", "CD1", "CD2", "NE1", "CE2", "CE3", "CZ2", "CZ3", "CH2"], # noqa: E501
246
+ "TYR": ["N", "CA", "C", "O", "CB", "CG", "CD1", "CD2", "CE1", "CE2", "CZ", "OH"],
247
+ "VAL": ["N", "CA", "C", "O", "CB", "CG1", "CG2"],
248
+ "A": ["P", "OP1", "OP2", "O5'", "C5'", "C4'", "O4'", "C3'", "O3'", "C2'", "O2'", "C1'", "N9", "C8", "N7", "C5", "C6", "N6", "N1", "C2", "N3", "C4"], # noqa: E501
249
+ "G": ["P", "OP1", "OP2", "O5'", "C5'", "C4'", "O4'", "C3'", "O3'", "C2'", "O2'", "C1'", "N9", "C8", "N7", "C5", "C6", "O6", "N1", "C2", "N2", "N3", "C4"], # noqa: E501
250
+ "C": ["P", "OP1", "OP2", "O5'", "C5'", "C4'", "O4'", "C3'", "O3'", "C2'", "O2'", "C1'", "N1", "C2", "O2", "N3", "C4", "N4", "C5", "C6"], # noqa: E501
251
+ "U": ["P", "OP1", "OP2", "O5'", "C5'", "C4'", "O4'", "C3'", "O3'", "C2'", "O2'", "C1'", "N1", "C2", "O2", "N3", "C4", "O4", "C5", "C6"], # noqa: E501
252
+ "N": ["P", "OP1", "OP2", "O5'", "C5'", "C4'", "O4'", "C3'", "O3'", "C2'", "O2'", "C1'"], # noqa: E501
253
+ "DA": ["P", "OP1", "OP2", "O5'", "C5'", "C4'", "O4'", "C3'", "O3'", "C2'", "C1'", "N9", "C8", "N7", "C5", "C6", "N6", "N1", "C2", "N3", "C4"], # noqa: E501
254
+ "DG": ["P", "OP1", "OP2", "O5'", "C5'", "C4'", "O4'", "C3'", "O3'", "C2'", "C1'", "N9", "C8", "N7", "C5", "C6", "O6", "N1", "C2", "N2", "N3", "C4"], # noqa: E501
255
+ "DC": ["P", "OP1", "OP2", "O5'", "C5'", "C4'", "O4'", "C3'", "O3'", "C2'", "C1'", "N1", "C2", "O2", "N3", "C4", "N4", "C5", "C6"], # noqa: E501
256
+ "DT": ["P", "OP1", "OP2", "O5'", "C5'", "C4'", "O4'", "C3'", "O3'", "C2'", "C1'", "N1", "C2", "O2", "N3", "C4", "O4", "C5", "C7", "C6"], # noqa: E501
257
+ "DN": ["P", "OP1", "OP2", "O5'", "C5'", "C4'", "O4'", "C3'", "O3'", "C2'", "C1'"]
258
+ }
259
+
260
+ protein_backbone_atom_names = ["N", "CA", "C", "O"]
261
+ nucleic_backbone_atom_names = ["P", "OP1", "OP2", "O5'", "C5'", "C4'", "O4'", "C3'", "O3'", "C2'", "O2'", "C1'"]
262
+
263
+ protein_backbone_atom_index = {name: i for i, name in enumerate(protein_backbone_atom_names)}
264
+ nucleic_backbone_atom_index = {name: i for i, name in enumerate(nucleic_backbone_atom_names)}
265
+
266
+ ref_symmetries = {
267
+ "PAD": [],
268
+ "ALA": [],
269
+ "ARG": [],
270
+ "ASN": [],
271
+ "ASP": [[(6, 7), (7, 6)]],
272
+ "CYS": [],
273
+ "GLN": [],
274
+ "GLU": [[(7, 8), (8, 7)]],
275
+ "GLY": [],
276
+ "HIS": [],
277
+ "ILE": [],
278
+ "LEU": [],
279
+ "LYS": [],
280
+ "MET": [],
281
+ "PHE": [[(6, 7), (7, 6), (8, 9), (9, 8)]],
282
+ "PRO": [],
283
+ "SER": [],
284
+ "THR": [],
285
+ "TRP": [],
286
+ "TYR": [[(6, 7), (7, 6), (8, 9), (9, 8)]],
287
+ "VAL": [],
288
+ "A": [[(1, 2), (2, 1)]],
289
+ "G": [[(1, 2), (2, 1)]],
290
+ "C": [[(1, 2), (2, 1)]],
291
+ "U": [[(1, 2), (2, 1)]],
292
+ #"N": [[(1, 2), (2, 1)]],
293
+ "DA": [[(1, 2), (2, 1)]],
294
+ "DG": [[(1, 2), (2, 1)]],
295
+ "DC": [[(1, 2), (2, 1)]],
296
+ "DT": [[(1, 2), (2, 1)]],
297
+ #"DN": [[(1, 2), (2, 1)]]
298
+ }
299
+
300
+
301
+ res_to_center_atom = {
302
+ "UNK": "CA",
303
+ "ALA": "CA",
304
+ "ARG": "CA",
305
+ "ASN": "CA",
306
+ "ASP": "CA",
307
+ "CYS": "CA",
308
+ "GLN": "CA",
309
+ "GLU": "CA",
310
+ "GLY": "CA",
311
+ "HIS": "CA",
312
+ "ILE": "CA",
313
+ "LEU": "CA",
314
+ "LYS": "CA",
315
+ "MET": "CA",
316
+ "PHE": "CA",
317
+ "PRO": "CA",
318
+ "SER": "CA",
319
+ "THR": "CA",
320
+ "TRP": "CA",
321
+ "TYR": "CA",
322
+ "VAL": "CA",
323
+ "A": "C1'",
324
+ "G": "C1'",
325
+ "C": "C1'",
326
+ "U": "C1'",
327
+ "N": "C1'",
328
+ "DA": "C1'",
329
+ "DG": "C1'",
330
+ "DC": "C1'",
331
+ "DT": "C1'",
332
+ "DN": "C1'"
333
+ }
334
+
335
+ res_to_disto_atom = {
336
+ "UNK": "CB",
337
+ "ALA": "CB",
338
+ "ARG": "CB",
339
+ "ASN": "CB",
340
+ "ASP": "CB",
341
+ "CYS": "CB",
342
+ "GLN": "CB",
343
+ "GLU": "CB",
344
+ "GLY": "CA",
345
+ "HIS": "CB",
346
+ "ILE": "CB",
347
+ "LEU": "CB",
348
+ "LYS": "CB",
349
+ "MET": "CB",
350
+ "PHE": "CB",
351
+ "PRO": "CB",
352
+ "SER": "CB",
353
+ "THR": "CB",
354
+ "TRP": "CB",
355
+ "TYR": "CB",
356
+ "VAL": "CB",
357
+ "A": "C4",
358
+ "G": "C4",
359
+ "C": "C2",
360
+ "U": "C2",
361
+ "N": "C1'",
362
+ "DA": "C4",
363
+ "DG": "C4",
364
+ "DC": "C2",
365
+ "DT": "C2",
366
+ "DN": "C1'"
367
+ }
368
+
369
+ res_to_center_atom_id = {
370
+ res: ref_atoms[res].index(atom)
371
+ for res, atom in res_to_center_atom.items()
372
+ }
373
+
374
+ res_to_disto_atom_id = {
375
+ res: ref_atoms[res].index(atom)
376
+ for res, atom in res_to_disto_atom.items()
377
+ }
378
+
379
+ # fmt: on
380
+
381
+ ####################################################################################################
382
+ # BONDS
383
+ ####################################################################################################
384
+
385
+ atom_interface_cutoff = 5.0
386
+ interface_cutoff = 15.0
387
+
388
+ bond_types = [
389
+ "OTHER",
390
+ "SINGLE",
391
+ "DOUBLE",
392
+ "TRIPLE",
393
+ "AROMATIC",
394
+ "COVALENT",
395
+ ]
396
+ bond_type_ids = {bond: i for i, bond in enumerate(bond_types)}
397
+ unk_bond_type = "OTHER"
398
+
399
+
400
+ ####################################################################################################
401
+ # Contacts
402
+ ####################################################################################################
403
+
404
+
405
+ pocket_contact_info = {
406
+ "UNSPECIFIED": 0,
407
+ "UNSELECTED": 1,
408
+ "POCKET": 2,
409
+ "BINDER": 3,
410
+ }
411
+
412
+ contact_conditioning_info = {
413
+ "UNSPECIFIED": 0,
414
+ "UNSELECTED": 1,
415
+ "POCKET>BINDER": 2,
416
+ "BINDER>POCKET": 3,
417
+ "CONTACT": 4,
418
+ }
419
+
420
+
421
+ ####################################################################################################
422
+ # MSA
423
+ ####################################################################################################
424
+
425
+ max_msa_seqs = 16384
426
+ max_paired_seqs = 8192
427
+
428
+
429
+ ####################################################################################################
430
+ # CHUNKING
431
+ ####################################################################################################
432
+
433
+ chunk_size_threshold = 384
434
+
435
+ ####################################################################################################
436
+ # Method conditioning
437
+ ####################################################################################################
438
+
439
+ # Methods
440
+ method_types_ids = {
441
+ "MD": 0,
442
+ "X-RAY DIFFRACTION": 1,
443
+ "ELECTRON MICROSCOPY": 2,
444
+ "SOLUTION NMR": 3,
445
+ "SOLID-STATE NMR": 4,
446
+ "NEUTRON DIFFRACTION": 4,
447
+ "ELECTRON CRYSTALLOGRAPHY": 4,
448
+ "FIBER DIFFRACTION": 4,
449
+ "POWDER DIFFRACTION": 4,
450
+ "INFRARED SPECTROSCOPY": 4,
451
+ "FLUORESCENCE TRANSFER": 4,
452
+ "EPR": 4,
453
+ "THEORETICAL MODEL": 4,
454
+ "SOLUTION SCATTERING": 4,
455
+ "OTHER": 4,
456
+ "AFDB": 5,
457
+ "BOLTZ-1": 6,
458
+ "FUTURE1": 7, # Placeholder for future supervision sources
459
+ "FUTURE2": 8,
460
+ "FUTURE3": 9,
461
+ "FUTURE4": 10,
462
+ "FUTURE5": 11,
463
+ }
464
+ method_types_ids = {k.lower(): v for k, v in method_types_ids.items()}
465
+ num_method_types = len(set(method_types_ids.values()))
466
+
467
+ # Temperature
468
+ temperature_bins = [(265, 280), (280, 295), (295, 310)]
469
+ temperature_bins_ids = {temp: i for i, temp in enumerate(temperature_bins)}
470
+ temperature_bins_ids["other"] = len(temperature_bins)
471
+ num_temp_bins = len(temperature_bins_ids)
472
+
473
+
474
+ # pH
475
+ ph_bins = [(0, 6), (6, 8), (8, 14)]
476
+ ph_bins_ids = {ph: i for i, ph in enumerate(ph_bins)}
477
+ ph_bins_ids["other"] = len(ph_bins)
478
+ num_ph_bins = len(ph_bins_ids)
479
+
480
+ ####################################################################################################
481
+ # VDW_RADII
482
+ ####################################################################################################
483
+
484
+ # fmt: off
485
+ vdw_radii = [
486
+ 1.2, 1.4, 2.2, 1.9, 1.8, 1.7, 1.6, 1.55, 1.5, 1.54,
487
+ 2.4, 2.2, 2.1, 2.1, 1.95, 1.8, 1.8, 1.88, 2.8, 2.4,
488
+ 2.3, 2.15, 2.05, 2.05, 2.05, 2.05, 2.0, 2.0, 2.0, 2.1,
489
+ 2.1, 2.1, 2.05, 1.9, 1.9, 2.02, 2.9, 2.55, 2.4, 2.3,
490
+ 2.15, 2.1, 2.05, 2.05, 2.0, 2.05, 2.1, 2.2, 2.2, 2.25,
491
+ 2.2, 2.1, 2.1, 2.16, 3.0, 2.7, 2.5, 2.48, 2.47, 2.45,
492
+ 2.43, 2.42, 2.4, 2.38, 2.37, 2.35, 2.33, 2.32, 2.3, 2.28,
493
+ 2.27, 2.25, 2.2, 2.1, 2.05, 2.0, 2.0, 2.05, 2.1, 2.05,
494
+ 2.2, 2.3, 2.3, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.4,
495
+ 2.0, 2.3, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0,
496
+ 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0,
497
+ 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0
498
+ ]
499
+ # fmt: on
500
+
501
+ ####################################################################################################
502
+ # Excluded ligands
503
+ ####################################################################################################
504
+
505
+ ligand_exclusion = {
506
+ "144",
507
+ "15P",
508
+ "1PE",
509
+ "2F2",
510
+ "2JC",
511
+ "3HR",
512
+ "3SY",
513
+ "7N5",
514
+ "7PE",
515
+ "9JE",
516
+ "AAE",
517
+ "ABA",
518
+ "ACE",
519
+ "ACN",
520
+ "ACT",
521
+ "ACY",
522
+ "AZI",
523
+ "BAM",
524
+ "BCN",
525
+ "BCT",
526
+ "BDN",
527
+ "BEN",
528
+ "BME",
529
+ "BO3",
530
+ "BTB",
531
+ "BTC",
532
+ "BU1",
533
+ "C8E",
534
+ "CAD",
535
+ "CAQ",
536
+ "CBM",
537
+ "CCN",
538
+ "CIT",
539
+ "CL",
540
+ "CLR",
541
+ "CM",
542
+ "CMO",
543
+ "CO3",
544
+ "CPT",
545
+ "CXS",
546
+ "D10",
547
+ "DEP",
548
+ "DIO",
549
+ "DMS",
550
+ "DN",
551
+ "DOD",
552
+ "DOX",
553
+ "EDO",
554
+ "EEE",
555
+ "EGL",
556
+ "EOH",
557
+ "EOX",
558
+ "EPE",
559
+ "ETF",
560
+ "FCY",
561
+ "FJO",
562
+ "FLC",
563
+ "FMT",
564
+ "FW5",
565
+ "GOL",
566
+ "GSH",
567
+ "GTT",
568
+ "GYF",
569
+ "HED",
570
+ "IHP",
571
+ "IHS",
572
+ "IMD",
573
+ "IOD",
574
+ "IPA",
575
+ "IPH",
576
+ "LDA",
577
+ "MB3",
578
+ "MEG",
579
+ "MES",
580
+ "MLA",
581
+ "MLI",
582
+ "MOH",
583
+ "MPD",
584
+ "MRD",
585
+ "MSE",
586
+ "MYR",
587
+ "N",
588
+ "NA",
589
+ "NH2",
590
+ "NH4",
591
+ "NHE",
592
+ "NO3",
593
+ "O4B",
594
+ "OHE",
595
+ "OLA",
596
+ "OLC",
597
+ "OMB",
598
+ "OME",
599
+ "OXA",
600
+ "P6G",
601
+ "PE3",
602
+ "PE4",
603
+ "PEG",
604
+ "PEO",
605
+ "PEP",
606
+ "PG0",
607
+ "PG4",
608
+ "PGE",
609
+ "PGR",
610
+ "PLM",
611
+ "PO4",
612
+ "POL",
613
+ "POP",
614
+ "PVO",
615
+ "SAR",
616
+ "SCN",
617
+ "SEO",
618
+ "SEP",
619
+ "SIN",
620
+ "SO4",
621
+ "SPD",
622
+ "SPM",
623
+ "SR",
624
+ "STE",
625
+ "STO",
626
+ "STU",
627
+ "TAR",
628
+ "TBU",
629
+ "TME",
630
+ "TPO",
631
+ "TRS",
632
+ "UNK",
633
+ "UNL",
634
+ "UNX",
635
+ "UPL",
636
+ "URE",
637
+ }
638
+
639
+
640
+ ####################################################################################################
641
+ # TEMPLATES
642
+ ####################################################################################################
643
+
644
+ min_coverage_residues = 10
645
+ min_coverage_fraction = 0.1
646
+
647
+
648
+ ####################################################################################################
649
+ # Ambiguous atoms
650
+ ####################################################################################################
651
+
652
+ ambiguous_atoms = {
653
+ "CA": {
654
+ "*": "C",
655
+ "OEX": "CA",
656
+ "OEC": "CA",
657
+ "543": "CA",
658
+ "OC6": "CA",
659
+ "OC1": "CA",
660
+ "OC7": "CA",
661
+ "OEY": "CA",
662
+ "OC4": "CA",
663
+ "OC3": "CA",
664
+ "ICA": "CA",
665
+ "CA": "CA",
666
+ "OC2": "CA",
667
+ "OC5": "CA",
668
+ },
669
+ "CD": {"*": "C", "CD": "CD", "CD3": "CD", "CD5": "CD", "CD1": "CD"},
670
+ "BR": "BR",
671
+ "CL": {
672
+ "*": "CL",
673
+ "C8P": "C",
674
+ "L3T": "C",
675
+ "TLC": "C",
676
+ "TZ0": "C",
677
+ "471": "C",
678
+ "NLK": "C",
679
+ "PGM": "C",
680
+ "PNE": "C",
681
+ "RCY": "C",
682
+ "11F": "C",
683
+ "PII": "C",
684
+ "C1Q": "C",
685
+ "4MD": "C",
686
+ "R5A": "C",
687
+ "KW2": "C",
688
+ "I7M": "C",
689
+ "R48": "C",
690
+ "FC3": "C",
691
+ "55V": "C",
692
+ "KPF": "C",
693
+ "SPZ": "C",
694
+ "0TT": "C",
695
+ "R9A": "C",
696
+ "5NA": "C",
697
+ "C55": "C",
698
+ "NIX": "C",
699
+ "5PM": "C",
700
+ "PP8": "C",
701
+ "544": "C",
702
+ "812": "C",
703
+ "NPM": "C",
704
+ "KU8": "C",
705
+ "A1AMM": "C",
706
+ "4S0": "C",
707
+ "AQC": "C",
708
+ "2JK": "C",
709
+ "WJR": "C",
710
+ "A1AAW": "C",
711
+ "85E": "C",
712
+ "MB0": "C",
713
+ "ZAB": "C",
714
+ "85K": "C",
715
+ "GBP": "C",
716
+ "A1H80": "C",
717
+ "A1AFR": "C",
718
+ "L9M": "C",
719
+ "MYK": "C",
720
+ "MB9": "C",
721
+ "38R": "C",
722
+ "EKB": "C",
723
+ "NKF": "C",
724
+ "UMQ": "C",
725
+ "T4K": "C",
726
+ "3PT": "C",
727
+ "A1A7S": "C",
728
+ "1Q9": "C",
729
+ "11R": "C",
730
+ "D2V": "C",
731
+ "SM8": "C",
732
+ "IFC": "C",
733
+ "DB5": "C",
734
+ "L2T": "C",
735
+ "GNB": "C",
736
+ "PP7": "C",
737
+ "072": "C",
738
+ "P88": "C",
739
+ "DRL": "C",
740
+ "C9W": "C",
741
+ "NTP": "C",
742
+ "4HJ": "C",
743
+ "7NA": "C",
744
+ "LPC": "C",
745
+ "T8W": "C",
746
+ "63R": "C",
747
+ "570": "C",
748
+ "R4A": "C",
749
+ "3BG": "C",
750
+ "4RB": "C",
751
+ "GSO": "C",
752
+ "BQ6": "C",
753
+ "R4P": "C",
754
+ "5CP": "C",
755
+ "TTR": "C",
756
+ "6UZ": "C",
757
+ "SPJ": "C",
758
+ "0SA": "C",
759
+ "ZL1": "C",
760
+ "BYG": "C",
761
+ "F0E": "C",
762
+ "PC0": "C",
763
+ "B2Q": "C",
764
+ "KV6": "C",
765
+ "NTO": "C",
766
+ "CLG": "C",
767
+ "R7U": "C",
768
+ "SMQ": "C",
769
+ "GM2": "C",
770
+ "Z7P": "C",
771
+ "NXF": "C",
772
+ "C6Q": "C",
773
+ "A1G": "C",
774
+ "433": "C",
775
+ "L9N": "C",
776
+ "7OX": "C",
777
+ "A1H84": "C",
778
+ "97L": "C",
779
+ "HDV": "C",
780
+ "LUO": "C",
781
+ "R6A": "C",
782
+ "1PC": "C",
783
+ "4PT": "C",
784
+ "SBZ": "C",
785
+ "EAB": "C",
786
+ "FL4": "C",
787
+ "OPS": "C",
788
+ "C2X": "C",
789
+ "SLL": "C",
790
+ "BFC": "C",
791
+ "GIP": "C",
792
+ "7CP": "C",
793
+ "CLH": "C",
794
+ "34E": "C",
795
+ "5NE": "C",
796
+ "PBF": "C",
797
+ "ABD": "C",
798
+ "ABC": "C",
799
+ "LPF": "C",
800
+ "TIZ": "C",
801
+ "4HH": "C",
802
+ "AFC": "C",
803
+ "WQH": "C",
804
+ "9JL": "C",
805
+ "CS3": "C",
806
+ "NL0": "C",
807
+ "KPY": "C",
808
+ "DNA": "C",
809
+ "B3C": "C",
810
+ "TKL": "C",
811
+ "KVS": "C",
812
+ "HO6": "C",
813
+ "NLH": "C",
814
+ "1PB": "C",
815
+ "CYF": "C",
816
+ "G4M": "C",
817
+ "R5B": "C",
818
+ "N4S": "C",
819
+ "N11": "C",
820
+ "C8F": "C",
821
+ "PIJ": "C",
822
+ "WIN": "C",
823
+ "NT1": "C",
824
+ "WJW": "C",
825
+ "HF7": "C",
826
+ "TY1": "C",
827
+ "VM1": "C",
828
+ },
829
+ "OS": {"*": "O", "DWC": "OS", "OHX": "OS", "OS": "OS", "8WV": "OS", "OS4": "OS"},
830
+ "PB": {"*": "P", "ZN9": "PB", "ZN7": "PB", "PBM": "PB", "PB": "PB", "CSB": "PB"},
831
+ "CE": {"*": "C", "CE": "CE"},
832
+ "FE": {"*": "FE", "TFR": "F", "PF5": "F", "IFC": "F", "F5C": "F"},
833
+ "NA": {"*": "N", "CGO": "NA", "R2K": "NA", "LVQ": "NA", "NA": "NA"},
834
+ "ND": {"*": "N", "ND": "ND"},
835
+ "CF": {"*": "C", "CF": "CF"},
836
+ "RU": "RU",
837
+ "BRAF": "BR",
838
+ "EU": "EU",
839
+ "CLAA": "CL",
840
+ "CLBQ": "CL",
841
+ "CM": {"*": "C", "ZCM": "CM"},
842
+ "SN": {"*": "SN", "TAP": "S", "SND": "S", "TAD": "S", "XPT": "S"},
843
+ "AG": "AG",
844
+ "CLN": "CL",
845
+ "CLM": "CL",
846
+ "CLA": {"*": "CL", "PII": "C", "TDL": "C", "D0J": "C", "GM2": "C", "PIJ": "C"},
847
+ "CLB": {
848
+ "*": "CL",
849
+ "TD5": "C",
850
+ "PII": "C",
851
+ "TDL": "C",
852
+ "GM2": "C",
853
+ "TD7": "C",
854
+ "TD6": "C",
855
+ "PIJ": "C",
856
+ },
857
+ "CR": {
858
+ "*": "C",
859
+ "BW9": "CR",
860
+ "CQ4": "CR",
861
+ "AC9": "CR",
862
+ "TIL": "CR",
863
+ "J7U": "CR",
864
+ "CR": "CR",
865
+ },
866
+ "CLAY": "CL",
867
+ "CLBC": "CL",
868
+ "PD": {
869
+ "*": "P",
870
+ "F6Q": "PD",
871
+ "SVP": "PD",
872
+ "SXC": "PD",
873
+ "U5U": "PD",
874
+ "PD": "PD",
875
+ "PLL": "PD",
876
+ },
877
+ "CO": {
878
+ "*": "C",
879
+ "J1S": "CO",
880
+ "OCN": "CO",
881
+ "OL3": "CO",
882
+ "OL4": "CO",
883
+ "B12": "CO",
884
+ "XCO": "CO",
885
+ "UFU": "CO",
886
+ "CON": "CO",
887
+ "OL5": "CO",
888
+ "B13": "CO",
889
+ "7KI": "CO",
890
+ "PL1": "CO",
891
+ "OCO": "CO",
892
+ "J1R": "CO",
893
+ "COH": "CO",
894
+ "SIR": "CO",
895
+ "6KI": "CO",
896
+ "NCO": "CO",
897
+ "9CO": "CO",
898
+ "PC3": "CO",
899
+ "BWU": "CO",
900
+ "B1Z": "CO",
901
+ "J83": "CO",
902
+ "CO": "CO",
903
+ "COY": "CO",
904
+ "CNC": "CO",
905
+ "3CO": "CO",
906
+ "OCL": "CO",
907
+ "R5Q": "CO",
908
+ "X5Z": "CO",
909
+ "CBY": "CO",
910
+ "OLS": "CO",
911
+ "F0X": "CO",
912
+ "I2A": "CO",
913
+ "OCM": "CO",
914
+ },
915
+ "CU": {
916
+ "*": "C",
917
+ "8ZR": "CU",
918
+ "K7E": "CU",
919
+ "CU3": "CU",
920
+ "SI9": "CU",
921
+ "35N": "CU",
922
+ "C2O": "CU",
923
+ "SI7": "CU",
924
+ "B15": "CU",
925
+ "SI0": "CU",
926
+ "CUP": "CU",
927
+ "SQ1": "CU",
928
+ "CUK": "CU",
929
+ "CUL": "CU",
930
+ "SI8": "CU",
931
+ "IC4": "CU",
932
+ "CUM": "CU",
933
+ "MM2": "CU",
934
+ "B30": "CU",
935
+ "S32": "CU",
936
+ "V79": "CU",
937
+ "IMF": "CU",
938
+ "CUN": "CU",
939
+ "MM1": "CU",
940
+ "MP1": "CU",
941
+ "IME": "CU",
942
+ "B17": "CU",
943
+ "C2C": "CU",
944
+ "1CU": "CU",
945
+ "CU6": "CU",
946
+ "C1O": "CU",
947
+ "CU1": "CU",
948
+ "B22": "CU",
949
+ "CUS": "CU",
950
+ "RUQ": "CU",
951
+ "CUF": "CU",
952
+ "CUA": "CU",
953
+ "CU": "CU",
954
+ "CUO": "CU",
955
+ "0TE": "CU",
956
+ "SI4": "CU",
957
+ },
958
+ "CS": {"*": "C", "CS": "CS"},
959
+ "CLQ": "CL",
960
+ "CLR": "CL",
961
+ "CLU": "CL",
962
+ "TE": "TE",
963
+ "NI": {
964
+ "*": "N",
965
+ "USN": "NI",
966
+ "NFO": "NI",
967
+ "NI2": "NI",
968
+ "NFS": "NI",
969
+ "NFR": "NI",
970
+ "82N": "NI",
971
+ "R5N": "NI",
972
+ "NFU": "NI",
973
+ "A1ICD": "NI",
974
+ "NI3": "NI",
975
+ "M43": "NI",
976
+ "MM5": "NI",
977
+ "BF8": "NI",
978
+ "TCN": "NI",
979
+ "NIK": "NI",
980
+ "CUV": "NI",
981
+ "MM6": "NI",
982
+ "J52": "NI",
983
+ "NI": "NI",
984
+ "SNF": "NI",
985
+ "XCC": "NI",
986
+ "F0L": "NI",
987
+ "UWE": "NI",
988
+ "NFC": "NI",
989
+ "3NI": "NI",
990
+ "HNI": "NI",
991
+ "F43": "NI",
992
+ "RQM": "NI",
993
+ "NFE": "NI",
994
+ "NFB": "NI",
995
+ "B51": "NI",
996
+ "NI1": "NI",
997
+ "WCC": "NI",
998
+ "NUF": "NI",
999
+ },
1000
+ "SB": {"*": "S", "UJI": "SB", "SB": "SB", "118": "SB", "SBO": "SB", "3CG": "SB"},
1001
+ "MO": "MO",
1002
+ "SEG": "SE",
1003
+ "CLL": "CL",
1004
+ "CLAH": "CL",
1005
+ "CLC": {
1006
+ "*": "CL",
1007
+ "TD5": "C",
1008
+ "PII": "C",
1009
+ "TDL": "C",
1010
+ "GM2": "C",
1011
+ "TD7": "C",
1012
+ "TD6": "C",
1013
+ "PIJ": "C",
1014
+ },
1015
+ "CLD": {"*": "CL", "PII": "C", "GM2": "C", "PIJ": "C"},
1016
+ "CLAD": "CL",
1017
+ "CLAE": "CL",
1018
+ "LA": "LA",
1019
+ "RH": "RH",
1020
+ "BRAC": "BR",
1021
+ "BRAD": "BR",
1022
+ "CLBN": "CL",
1023
+ "CLAC": "CL",
1024
+ "BRAB": "BR",
1025
+ "BRAE": "BR",
1026
+ "MG": "MG",
1027
+ "IR": "IR",
1028
+ "SE": {
1029
+ "*": "SE",
1030
+ "HII": "S",
1031
+ "NT2": "S",
1032
+ "R2P": "S",
1033
+ "S2P": "S",
1034
+ "0IU": "S",
1035
+ "QMB": "S",
1036
+ "81S": "S",
1037
+ "0QB": "S",
1038
+ "UB4": "S",
1039
+ "OHS": "S",
1040
+ "Q78": "S",
1041
+ "0Y2": "S",
1042
+ "B3M": "S",
1043
+ "NT1": "S",
1044
+ "81R": "S",
1045
+ },
1046
+ "BRAG": "BR",
1047
+ "CLF": {"*": "CL", "PII": "C", "GM2": "C", "PIJ": "C"},
1048
+ "CLE": {"*": "CL", "PII": "C", "GM2": "C", "PIJ": "C"},
1049
+ "BRAX": "BR",
1050
+ "CLK": "CL",
1051
+ "ZN": "ZN",
1052
+ "AS": "AS",
1053
+ "AU": "AU",
1054
+ "PT": "PT",
1055
+ "CLAS": "CL",
1056
+ "MN": "MN",
1057
+ "CLBE": "CL",
1058
+ "CLBF": "CL",
1059
+ "CLAF": "CL",
1060
+ "NA'": {"*": "N", "CGO": "NA"},
1061
+ "BRAH": "BR",
1062
+ "BRAI": "BR",
1063
+ "BRA": "BR",
1064
+ "BRB": "BR",
1065
+ "BRAV": "BR",
1066
+ "HG": {
1067
+ "*": "HG",
1068
+ "BBA": "H",
1069
+ "MID": "H",
1070
+ "APM": "H",
1071
+ "4QQ": "H",
1072
+ "0ZG": "H",
1073
+ "APH": "H",
1074
+ },
1075
+ "AR": "AR",
1076
+ "D": "H",
1077
+ "CLAN": "CL",
1078
+ "SI": "SI",
1079
+ "CLS": "CL",
1080
+ "ZR": "ZR",
1081
+ "CLAR": {"*": "CL", "ZM4": "C"},
1082
+ "HO": "HO",
1083
+ "CLI": {"*": "CL", "GM2": "C"},
1084
+ "CLH": {"*": "CL", "GM2": "C"},
1085
+ "CLAP": "CL",
1086
+ "CLBL": "CL",
1087
+ "CLBM": "CL",
1088
+ "PR": {"*": "PR", "UF0": "P", "252": "P"},
1089
+ "IN": "IN",
1090
+ "CLJ": "CL",
1091
+ "BRU": "BR",
1092
+ "SC": {"*": "S", "SFL": "SC"},
1093
+ "CLG": {"*": "CL", "GM2": "C"},
1094
+ "BRAT": "BR",
1095
+ "BRAR": "BR",
1096
+ "CLAG": "CL",
1097
+ "CLAB": "CL",
1098
+ "CLV": "CL",
1099
+ "TI": "TI",
1100
+ "CLAX": "CL",
1101
+ "CLAJ": "CL",
1102
+ "CL'": {"*": "CL", "BNR": "C", "25A": "C", "BDA": "C"},
1103
+ "CLAW": "CL",
1104
+ "BRF": "BR",
1105
+ "BRE": "BR",
1106
+ "RE": "RE",
1107
+ "GD": "GD",
1108
+ "SM": {"*": "S", "SM": "SM"},
1109
+ "CLBH": "CL",
1110
+ "CLBI": "CL",
1111
+ "CLAI": "CL",
1112
+ "CLY": "CL",
1113
+ "CLZ": "CL",
1114
+ "AC": "AC",
1115
+ "BR'": "BR",
1116
+ "CLT": "CL",
1117
+ "CLO": "CL",
1118
+ "CLP": "CL",
1119
+ "LU": "LU",
1120
+ "BA": {"*": "B", "BA": "BA"},
1121
+ "CLAU": "CL",
1122
+ "RB": "RB",
1123
+ "LI": "LI",
1124
+ "MOM": "MO",
1125
+ "BRAQ": "BR",
1126
+ "SR": {"*": "S", "SR": "SR", "OER": "SR"},
1127
+ "CLAT": "CL",
1128
+ "BRAL": "BR",
1129
+ "SEB": "SE",
1130
+ "CLW": "CL",
1131
+ "CLX": "CL",
1132
+ "BE": "BE",
1133
+ "BRG": "BR",
1134
+ "SEA": "SE",
1135
+ "BRAW": "BR",
1136
+ "BRBB": "BR",
1137
+ "ER": "ER",
1138
+ "TH": "TH",
1139
+ "BRR": "BR",
1140
+ "CLBV": "CL",
1141
+ "AL": "AL",
1142
+ "CLAV": "CL",
1143
+ "BRH": "BR",
1144
+ "CLAQ": "CL",
1145
+ "GA": "GA",
1146
+ "X": "*",
1147
+ "TL": "TL",
1148
+ "CLBB": "CL",
1149
+ "TB": "TB",
1150
+ "CLAK": "CL",
1151
+ "XE": {"*": "*", "XE": "XE"},
1152
+ "SEL": "SE",
1153
+ "PU": {"*": "P", "4PU": "PU"},
1154
+ "CLAZ": "CL",
1155
+ "SE'": "SE",
1156
+ "CLBA": "CL",
1157
+ "SEN": "SE",
1158
+ "SNN": "SN",
1159
+ "MOB": "MO",
1160
+ "YB": "YB",
1161
+ "BRC": "BR",
1162
+ "BRD": "BR",
1163
+ "CLAM": "CL",
1164
+ "DA": "H",
1165
+ "DB": "H",
1166
+ "DC": "H",
1167
+ "DXT": "H",
1168
+ "DXU": "H",
1169
+ "DXX": "H",
1170
+ "DXY": "H",
1171
+ "DXZ": "H",
1172
+ "DY": "DY",
1173
+ "TA": "TA",
1174
+ "XD": "*",
1175
+ "SED": "SE",
1176
+ "CLAL": "CL",
1177
+ "BRAJ": "BR",
1178
+ "AM": "AM",
1179
+ "CLAO": "CL",
1180
+ "BI": "BI",
1181
+ "KR": "KR",
1182
+ "BRBJ": "BR",
1183
+ "UNK": "*",
1184
+ }
protify/FastPLMs/boltz/src/boltz/data/crop/__init__.py ADDED
File without changes
protify/FastPLMs/boltz/src/boltz/data/crop/affinity.py ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import replace
2
+ from typing import Optional
3
+
4
+ import numpy as np
5
+
6
+ from boltz.data import const
7
+ from boltz.data.crop.cropper import Cropper
8
+ from boltz.data.types import Tokenized
9
+
10
+
11
class AffinityCropper(Cropper):
    """Interpolate between contiguous and spatial crops."""

    def __init__(
        self,
        neighborhood_size: int = 10,
        max_tokens_protein: int = 200,
    ) -> None:
        """Initialize the cropper.

        Parameters
        ----------
        neighborhood_size : int
            Modulates the type of cropping to be performed.
            Smaller neighborhoods result in more spatial
            cropping. Larger neighborhoods result in more
            continuous cropping.
        max_tokens_protein : int
            Upper bound on the number of non-ligand tokens
            kept in the crop.

        """
        self.neighborhood_size = neighborhood_size
        self.max_tokens_protein = max_tokens_protein

    def crop(
        self,
        data: Tokenized,
        max_tokens: int,
        max_atoms: Optional[int] = None,
    ) -> Tokenized:
        """Crop the data to a maximum number of tokens.

        Tokens are visited in order of increasing distance to the
        affinity ligand, and a contiguous residue neighborhood is
        added around each visited token.

        Parameters
        ----------
        data : Tokenized
            The tokenized data.
        max_tokens : int
            The maximum number of tokens to crop.
        max_atoms : Optional[int]
            The maximum number of atoms to consider.

        Returns
        -------
        Tokenized
            The cropped data.

        """
        # Get token data
        token_data = data.tokens
        token_bonds = data.bonds

        # Filter to resolved tokens
        valid_tokens = token_data[token_data["resolved_mask"]]

        # Check if we have any valid tokens
        if not valid_tokens.size:
            msg = "No valid tokens in structure"
            raise ValueError(msg)

        # Compute minimum Euclidean distance from each valid token's
        # center to any ligand (affinity-masked) token center.
        ligand_coords = valid_tokens[valid_tokens["affinity_mask"]]["center_coords"]
        dists = np.min(
            np.sum(
                (valid_tokens["center_coords"][:, None] - ligand_coords[None]) ** 2,
                axis=-1,
            )
            ** 0.5,
            axis=1,
        )

        # Visit the tokens closest to the ligand first.
        indices = np.argsort(dists)

        # Select cropped indices
        cropped: set[int] = set()
        total_atoms = 0

        # protein tokens (i.e. anything that is not a NONPOLYMER token)
        cropped_protein: set[int] = set()
        ligand_ids = set(
            valid_tokens[
                valid_tokens["mol_type"] == const.chain_type_ids["NONPOLYMER"]
            ]["token_idx"]
        )

        for idx in indices:
            # Get the token
            token = valid_tokens[idx]

            # Get all tokens from this chain
            chain_tokens = token_data[token_data["asym_id"] == token["asym_id"]]

            # Pick the whole chain if possible, otherwise select
            # a contiguous subset centered at the query token
            if len(chain_tokens) <= self.neighborhood_size:
                new_tokens = chain_tokens
            else:
                # First limit to the maximum set of tokens, with the
                # neighborhood on both sides to handle edges. This
                # is mostly for efficiency with the while loop below.
                min_idx = token["res_idx"] - self.neighborhood_size
                max_idx = token["res_idx"] + self.neighborhood_size

                max_token_set = chain_tokens
                max_token_set = max_token_set[max_token_set["res_idx"] >= min_idx]
                max_token_set = max_token_set[max_token_set["res_idx"] <= max_idx]

                # Start by adding just the query token
                new_tokens = max_token_set[max_token_set["res_idx"] == token["res_idx"]]

                # Expand the neighborhood until we have enough tokens, one
                # by one to handle some edge cases with non-standard chains.
                # We switch to the res_idx instead of the token_idx to always
                # include all tokens from modified residues or from ligands.
                min_idx = max_idx = token["res_idx"]
                while new_tokens.size < self.neighborhood_size:
                    min_idx = min_idx - 1
                    max_idx = max_idx + 1
                    new_tokens = max_token_set
                    new_tokens = new_tokens[new_tokens["res_idx"] >= min_idx]
                    new_tokens = new_tokens[new_tokens["res_idx"] <= max_idx]

            # Compute new tokens and new atoms
            # NOTE(review): indexing token_data by token_idx values assumes
            # token_idx equals the positional index in token_data — confirm
            # against the tokenizer that produces these arrays.
            new_indices = set(new_tokens["token_idx"]) - cropped
            new_tokens = token_data[list(new_indices)]
            new_atoms = np.sum(new_tokens["atom_num"])

            # Stop if we exceed the max number of tokens or atoms, or the
            # protein-token budget (ligand tokens do not count against it).
            if (
                (len(new_indices) > (max_tokens - len(cropped)))
                or ((max_atoms is not None) and ((total_atoms + new_atoms) > max_atoms))
                or (
                    len(cropped_protein | new_indices - ligand_ids)
                    > self.max_tokens_protein
                )
            ):
                break

            # Add new indices
            cropped.update(new_indices)
            total_atoms += new_atoms

            # Add protein indices
            cropped_protein.update(new_indices - ligand_ids)

        # Get the cropped tokens sorted by index
        token_data = token_data[sorted(cropped)]

        # Only keep bonds within the cropped tokens
        indices = token_data["token_idx"]
        token_bonds = token_bonds[np.isin(token_bonds["token_1"], indices)]
        token_bonds = token_bonds[np.isin(token_bonds["token_2"], indices)]

        # Return the cropped tokens
        return replace(data, tokens=token_data, bonds=token_bonds)
protify/FastPLMs/boltz/src/boltz/data/crop/boltz.py ADDED
@@ -0,0 +1,296 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import replace
2
+ from typing import Optional
3
+
4
+ import numpy as np
5
+ from scipy.spatial.distance import cdist
6
+
7
+ from boltz.data import const
8
+ from boltz.data.crop.cropper import Cropper
9
+ from boltz.data.types import Tokenized
10
+
11
+
12
def pick_random_token(
    tokens: np.ndarray,
    random: np.random.RandomState,
) -> np.ndarray:
    """Draw one token uniformly at random.

    Parameters
    ----------
    tokens : np.ndarray
        The token data to sample from.
    random : np.random.RandomState
        The random state for reproducibility.

    Returns
    -------
    np.ndarray
        The selected token.

    """
    choice = random.randint(len(tokens))
    return tokens[choice]
32
+
33
+
34
def pick_chain_token(
    tokens: np.ndarray,
    chain_id: int,
    random: np.random.RandomState,
) -> np.ndarray:
    """Pick a random token from a chain, with a global fallback.

    Parameters
    ----------
    tokens : np.ndarray
        The token data.
    chain_id : int
        The chain ID.
    random : np.random.RandomState
        The random state for reproducibility.

    Returns
    -------
    np.ndarray
        The selected token.

    """
    # Restrict sampling to the requested chain; if it has no
    # tokens, fall back to the full token set.
    chain_subset = tokens[tokens["asym_id"] == chain_id]
    pool = chain_subset if chain_subset.size else tokens
    return pick_random_token(pool, random)
66
+
67
+
68
def pick_interface_token(
    tokens: np.ndarray,
    interface: np.ndarray,
    random: np.random.RandomState,
) -> np.ndarray:
    """Pick a random token from an interface, with chain fallbacks.

    Parameters
    ----------
    tokens : np.ndarray
        The token data.
    interface : np.ndarray
        The interface record, carrying ``chain_1`` and ``chain_2``.
    random : np.random.RandomState
        The random state for reproducibility.

    Returns
    -------
    np.ndarray
        The selected token.

    """
    chain_1 = int(interface["chain_1"])
    chain_2 = int(interface["chain_2"])

    tokens_1 = tokens[tokens["asym_id"] == chain_1]
    tokens_2 = tokens[tokens["asym_id"] == chain_2]

    # Degenerate cases: one or both chains contribute no tokens,
    # so fall back to chain-level or global sampling.
    if (not tokens_1.size) and (not tokens_2.size):
        return pick_random_token(tokens, random)
    if not tokens_2.size:
        return pick_random_token(tokens_1, random)
    if not tokens_1.size:
        return pick_random_token(tokens_2, random)

    # Both chains have tokens: keep only those actually in contact.
    pair_dists = cdist(tokens_1["center_coords"], tokens_2["center_coords"])
    in_contact = pair_dists < const.interface_cutoff

    # In rare cases, the interface cutoff is slightly too small;
    # expand it a bit when no contact is found.
    if not np.any(in_contact):
        in_contact = pair_dists < (const.interface_cutoff + 5.0)

    near_1 = tokens_1[np.any(in_contact, axis=1)]
    near_2 = tokens_2[np.any(in_contact, axis=0)]

    # Select a random token among the contacting ones.
    candidates = np.concatenate([near_1, near_2])
    return pick_random_token(candidates, random)
125
+
126
+
127
class BoltzCropper(Cropper):
    """Interpolate between contiguous and spatial crops."""

    def __init__(self, min_neighborhood: int = 0, max_neighborhood: int = 40) -> None:
        """Initialize the cropper.

        Modulates the type of cropping to be performed.
        Smaller neighborhoods result in more spatial
        cropping. Larger neighborhoods result in more
        continuous cropping. A mix can be achieved by
        providing a range over which to sample.

        Parameters
        ----------
        min_neighborhood : int
            The minimum neighborhood size, by default 0.
        max_neighborhood : int
            The maximum neighborhood size, by default 40.

        """
        # Even sizes only; one is sampled uniformly per crop() call.
        sizes = list(range(min_neighborhood, max_neighborhood + 1, 2))
        self.neighborhood_sizes = sizes

    def crop(  # noqa: PLR0915
        self,
        data: Tokenized,
        max_tokens: int,
        random: np.random.RandomState,
        max_atoms: Optional[int] = None,
        chain_id: Optional[int] = None,
        interface_id: Optional[int] = None,
    ) -> Tokenized:
        """Crop the data to a maximum number of tokens.

        Parameters
        ----------
        data : Tokenized
            The tokenized data.
        max_tokens : int
            The maximum number of tokens to crop.
        random : np.random.RandomState
            The random state for reproducibility.
        max_atoms : int, optional
            The maximum number of atoms to consider.
        chain_id : int, optional
            The chain ID to crop.
        interface_id : int, optional
            The interface ID to crop.

        Returns
        -------
        Tokenized
            The cropped data.

        Raises
        ------
        ValueError
            If both chain_id and interface_id are given, or if the
            structure has no resolved tokens.

        """
        # Check inputs
        if chain_id is not None and interface_id is not None:
            msg = "Only one of chain_id or interface_id can be provided."
            raise ValueError(msg)

        # Randomly select a neighborhood size
        neighborhood_size = random.choice(self.neighborhood_sizes)

        # Get token data
        token_data = data.tokens
        token_bonds = data.bonds
        mask = data.structure.mask
        chains = data.structure.chains
        interfaces = data.structure.interfaces

        # Filter to valid chains
        valid_chains = chains[mask]

        # Filter to valid interfaces (both endpoint chains must be valid)
        valid_interfaces = interfaces
        valid_interfaces = valid_interfaces[mask[valid_interfaces["chain_1"]]]
        valid_interfaces = valid_interfaces[mask[valid_interfaces["chain_2"]]]

        # Filter to resolved tokens
        valid_tokens = token_data[token_data["resolved_mask"]]

        # Check if we have any valid tokens
        if not valid_tokens.size:
            msg = "No valid tokens in structure"
            raise ValueError(msg)

        # Pick a query token: from the requested chain or interface if
        # given, otherwise from a random valid interface, otherwise from
        # a random valid chain.
        if chain_id is not None:
            query = pick_chain_token(valid_tokens, chain_id, random)
        elif interface_id is not None:
            interface = interfaces[interface_id]
            query = pick_interface_token(valid_tokens, interface, random)
        elif valid_interfaces.size:
            idx = random.randint(len(valid_interfaces))
            interface = valid_interfaces[idx]
            query = pick_interface_token(valid_tokens, interface, random)
        else:
            idx = random.randint(len(valid_chains))
            chain_id = valid_chains[idx]["asym_id"]
            query = pick_chain_token(valid_tokens, chain_id, random)

        # Sort all tokens by distance to query_coords
        dists = valid_tokens["center_coords"] - query["center_coords"]
        indices = np.argsort(np.linalg.norm(dists, axis=1))

        # Select cropped indices
        cropped: set[int] = set()
        total_atoms = 0
        for idx in indices:
            # Get the token
            token = valid_tokens[idx]

            # Get all tokens from this chain
            chain_tokens = token_data[token_data["asym_id"] == token["asym_id"]]

            # Pick the whole chain if possible, otherwise select
            # a contiguous subset centered at the query token
            if len(chain_tokens) <= neighborhood_size:
                new_tokens = chain_tokens
            else:
                # First limit to the maximum set of tokens, with the
                # neighborhood on both sides to handle edges. This
                # is mostly for efficiency with the while loop below.
                min_idx = token["res_idx"] - neighborhood_size
                max_idx = token["res_idx"] + neighborhood_size

                max_token_set = chain_tokens
                max_token_set = max_token_set[max_token_set["res_idx"] >= min_idx]
                max_token_set = max_token_set[max_token_set["res_idx"] <= max_idx]

                # Start by adding just the query token
                new_tokens = max_token_set[max_token_set["res_idx"] == token["res_idx"]]

                # Expand the neighborhood until we have enough tokens, one
                # by one to handle some edge cases with non-standard chains.
                # We switch to the res_idx instead of the token_idx to always
                # include all tokens from modified residues or from ligands.
                min_idx = max_idx = token["res_idx"]
                while new_tokens.size < neighborhood_size:
                    min_idx = min_idx - 1
                    max_idx = max_idx + 1
                    new_tokens = max_token_set
                    new_tokens = new_tokens[new_tokens["res_idx"] >= min_idx]
                    new_tokens = new_tokens[new_tokens["res_idx"] <= max_idx]

            # Compute new tokens and new atoms
            # NOTE(review): indexing token_data by token_idx values assumes
            # token_idx equals the positional index in token_data — confirm
            # against the tokenizer that produces these arrays.
            new_indices = set(new_tokens["token_idx"]) - cropped
            new_tokens = token_data[list(new_indices)]
            new_atoms = np.sum(new_tokens["atom_num"])

            # Stop if we exceed the max number of tokens or atoms
            if (len(new_indices) > (max_tokens - len(cropped))) or (
                (max_atoms is not None) and ((total_atoms + new_atoms) > max_atoms)
            ):
                break

            # Add new indices
            cropped.update(new_indices)
            total_atoms += new_atoms

        # Get the cropped tokens sorted by index
        token_data = token_data[sorted(cropped)]

        # Only keep bonds within the cropped tokens
        indices = token_data["token_idx"]
        token_bonds = token_bonds[np.isin(token_bonds["token_1"], indices)]
        token_bonds = token_bonds[np.isin(token_bonds["token_2"], indices)]

        # Return the cropped tokens
        return replace(data, tokens=token_data, bonds=token_bonds)
protify/FastPLMs/boltz/src/boltz/data/crop/cropper.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from abc import ABC, abstractmethod
2
+ from typing import Optional
3
+
4
+ import numpy as np
5
+
6
+ from boltz.data.types import Tokenized
7
+
8
+
9
class Cropper(ABC):
    """Interface for cropping strategies over tokenized structures."""

    @abstractmethod
    def crop(
        self,
        data: Tokenized,
        max_tokens: int,
        random: np.random.RandomState,
        max_atoms: Optional[int] = None,
        chain_id: Optional[int] = None,
        interface_id: Optional[int] = None,
    ) -> Tokenized:
        """Crop ``data`` down to at most ``max_tokens`` tokens.

        Parameters
        ----------
        data : Tokenized
            The tokenized data to crop.
        max_tokens : int
            Upper bound on the number of tokens kept.
        random : np.random.RandomState
            The random state for reproducibility.
        max_atoms : Optional[int]
            Optional upper bound on the number of atoms kept.
        chain_id : Optional[int]
            If given, center the crop on this chain.
        interface_id : Optional[int]
            If given, center the crop on this interface.

        Returns
        -------
        Tokenized
            The cropped data.

        """
        raise NotImplementedError
protify/FastPLMs/boltz/src/boltz/data/feature/__init__.py ADDED
File without changes
protify/FastPLMs/boltz/src/boltz/data/feature/featurizer.py ADDED
@@ -0,0 +1,1225 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import random
3
+ from typing import Optional
4
+ from collections import deque
5
+ import numba
6
+ import numpy as np
7
+ import numpy.typing as npt
8
+ import torch
9
+ from numba import types
10
+ from torch import Tensor, from_numpy
11
+ from torch.nn.functional import one_hot
12
+
13
+ from boltz.data import const
14
+ from boltz.data.feature.symmetry import (
15
+ get_amino_acids_symmetries,
16
+ get_chain_symmetries,
17
+ get_ligand_symmetries,
18
+ )
19
+ from boltz.data.pad import pad_dim
20
+ from boltz.data.types import (
21
+ MSA,
22
+ MSADeletion,
23
+ MSAResidue,
24
+ MSASequence,
25
+ Tokenized,
26
+ )
27
+ from boltz.model.modules.utils import center_random_augmentation
28
+
29
+ ####################################################################################################
30
+ # HELPERS
31
+ ####################################################################################################
32
+
33
+
34
def compute_frames_nonpolymer(
    data: Tokenized,
    coords: np.ndarray,
    resolved_mask: np.ndarray,
    atom_to_token: np.ndarray,
    frame_data: list,
    resolved_frame_data: list,
) -> tuple[np.ndarray, np.ndarray]:
    """Get the frames for non-polymer tokens.

    For each non-polymer chain with at least three atoms, every token's
    frame is replaced by (nearest resolved atom, the atom itself, second
    nearest resolved atom). Frames of other chains are left untouched.
    All frames are then invalidated if their spanning vectors are nearly
    collinear or degenerate.

    Parameters
    ----------
    data : Tokenized
        The tokenized data.
    coords : np.ndarray
        Atom coordinates; reshaped to (-1, 3) internally.
    resolved_mask : np.ndarray
        Per-atom resolved mask.
    atom_to_token : np.ndarray
        Owning token index for each atom.
    frame_data : list
        The frame data (one atom-index triplet per token).
    resolved_frame_data : list
        The resolved frame data.

    Returns
    -------
    tuple[np.ndarray, np.ndarray]
        The frame data and the resolved-frame mask AND-ed with the
        non-collinearity mask.

    """
    frame_data = np.array(frame_data)
    resolved_frame_data = np.array(resolved_frame_data)
    asym_id_token = data.tokens["asym_id"]
    asym_id_atom = data.tokens["asym_id"][atom_to_token]
    token_idx = 0
    atom_idx = 0
    # Walk chains in asym_id order, tracking running token/atom offsets.
    for id in np.unique(data.tokens["asym_id"]):
        mask_chain_token = asym_id_token == id
        mask_chain_atom = asym_id_atom == id
        num_tokens = mask_chain_token.sum()
        num_atoms = mask_chain_atom.sum()
        # Skip polymer chains and chains too small to define a frame.
        if (
            data.tokens[token_idx]["mol_type"] != const.chain_type_ids["NONPOLYMER"]
            or num_atoms < 3
        ):
            token_idx += num_tokens
            atom_idx += num_atoms
            continue
        # Pairwise Euclidean distances between the chain's atoms.
        dist_mat = (
            (
                coords.reshape(-1, 3)[mask_chain_atom][:, None, :]
                - coords.reshape(-1, 3)[mask_chain_atom][None, :, :]
            )
            ** 2
        ).sum(-1) ** 0.5
        # Push pairs involving an unresolved atom to infinite distance so
        # the argsort below prefers resolved atoms as frame neighbors.
        resolved_pair = 1 - (
            resolved_mask[mask_chain_atom][None, :]
            * resolved_mask[mask_chain_atom][:, None]
        ).astype(np.float32)
        resolved_pair[resolved_pair == 1] = math.inf
        indices = np.argsort(dist_mat + resolved_pair, axis=1)
        # Frame = (nearest neighbor, the atom itself, second neighbor),
        # shifted into global atom indexing via atom_idx.
        frames = (
            np.concatenate(
                [
                    indices[:, 1:2],
                    indices[:, 0:1],
                    indices[:, 2:3],
                ],
                axis=1,
            )
            + atom_idx
        )
        # NOTE(review): the row slice uses num_atoms on the token-indexed
        # frame arrays — this relies on non-polymer chains having exactly
        # one token per atom (num_tokens == num_atoms); confirm upstream.
        frame_data[token_idx : token_idx + num_atoms, :] = frames
        resolved_frame_data[token_idx : token_idx + num_atoms] = resolved_mask[
            frames
        ].all(axis=1)
        token_idx += num_tokens
        atom_idx += num_atoms
    frames_expanded = coords.reshape(-1, 3)[frame_data]

    # Invalidate frames whose two spanning vectors are nearly collinear
    # or degenerate (see compute_collinear_mask).
    mask_collinear = compute_collinear_mask(
        frames_expanded[:, 1] - frames_expanded[:, 0],
        frames_expanded[:, 1] - frames_expanded[:, 2],
    )
    return frame_data, resolved_frame_data & mask_collinear
114
+
115
+
116
def compute_collinear_mask(v1, v2):
    """Mask out frames whose spanning vectors are near-collinear or tiny.

    A frame is kept only if the absolute cosine between its two spanning
    vectors is below 0.9063 (roughly 25 degrees away from collinear) and
    both vectors are longer than 1e-2.
    """
    len1 = np.linalg.norm(v1, axis=1, keepdims=True)
    len2 = np.linalg.norm(v2, axis=1, keepdims=True)
    unit1 = v1 / (len1 + 1e-6)
    unit2 = v2 / (len2 + 1e-6)
    cos_angle = np.abs(np.sum(unit1 * unit2, axis=1))
    well_angled = cos_angle < 0.9063
    long_enough = (len1.reshape(-1) > 1e-2) & (len2.reshape(-1) > 1e-2)
    return well_angled & long_enough
125
+
126
+
127
def dummy_msa(residues: np.ndarray) -> MSA:
    """Create a single-sequence placeholder MSA for a chain.

    Parameters
    ----------
    residues : np.ndarray
        The residues for the chain.

    Returns
    -------
    MSA
        An MSA holding only the chain's own residue types, with no
        deletions and an unknown (-1) taxonomy.

    """
    # The only row is the chain's own residue-type sequence.
    res_types = [entry["res_type"] for entry in residues]
    # One sequence record spanning all residues, taxonomy -1.
    seq_records = [(0, -1, 0, len(res_types), 0, 0)]
    return MSA(
        residues=np.array(res_types, dtype=MSAResidue),
        deletions=np.array([], dtype=MSADeletion),
        sequences=np.array(seq_records, dtype=MSASequence),
    )
149
+
150
+
151
def construct_paired_msa(  # noqa: C901, PLR0915, PLR0912
    data: Tokenized,
    max_seqs: int,
    max_pairs: int = 8192,
    max_total: int = 16384,
    random_subset: bool = False,
) -> tuple[Tensor, Tensor, Tensor]:
    """Pair the MSA data across chains by taxonomy.

    Rows are built greedily: the query row first, then up to
    ``max_pairs`` taxonomy-paired rows, then unpaired rows up to
    ``max_total`` rows in total, finally downsampled to ``max_seqs``.

    Parameters
    ----------
    data : Tokenized
        The input data.
    max_seqs : int
        Maximum number of MSA rows kept after downsampling.
    max_pairs : int
        Maximum number of taxonomy-paired rows, by default 8192.
    max_total : int
        Maximum total number of rows, by default 16384.
    random_subset : bool
        Whether to sample a random subset of rows (always keeping the
        query row) instead of truncating, by default False.

    Returns
    -------
    Tensor
        The MSA data.
    Tensor
        The deletion data.
    Tensor
        Mask indicating paired sequences.

    """
    # Get unique chains (ensuring monotonicity in the order)
    assert np.all(np.diff(data.tokens["asym_id"], n=1) >= 0)
    chain_ids = np.unique(data.tokens["asym_id"])

    # Get relevant MSA, and create a dummy for chains without
    msa = {k: data.msa[k] for k in chain_ids if k in data.msa}
    for chain_id in chain_ids:
        if chain_id not in msa:
            chain = data.structure.chains[chain_id]
            res_start = chain["res_idx"]
            res_end = res_start + chain["res_num"]
            residues = data.structure.residues[res_start:res_end]
            msa[chain_id] = dummy_msa(residues)

    # Map taxonomies to (chain_id, seq_idx)
    taxonomy_map: dict[str, list] = {}
    for chain_id, chain_msa in msa.items():
        sequences = chain_msa.sequences
        sequences = sequences[sequences["taxonomy"] != -1]
        for sequence in sequences:
            seq_idx = sequence["seq_idx"]
            taxon = sequence["taxonomy"]
            taxonomy_map.setdefault(taxon, []).append((chain_id, seq_idx))

    # Remove taxonomies with only one sequence and sort by the
    # number of chain_id present in each of the taxonomies
    taxonomy_map = {k: v for k, v in taxonomy_map.items() if len(v) > 1}
    taxonomy_map = sorted(
        taxonomy_map.items(),
        key=lambda x: len({c for c, _ in x[1]}),
        reverse=True,
    )

    # Keep track of the sequences available per chain, keeping the original
    # order of the sequences in the MSA to favor the best matching sequences.
    # BUGFIX: each item is already a (chain_id, seq_idx) pair; the previous
    # code built (taxonomy, pair) tuples, so the membership test below never
    # matched and paired sequences could be re-used as unpaired rows.
    visited = {pair for _, pairs in taxonomy_map for pair in pairs}
    available = {}
    for c in chain_ids:
        available[c] = deque(
            i for i in range(1, len(msa[c].sequences)) if (c, i) not in visited
        )

    # Create sequence pairs
    is_paired = []
    pairing = []

    # Start with the first sequence for each chain
    is_paired.append({c: 1 for c in chain_ids})
    pairing.append({c: 0 for c in chain_ids})

    # Then add up to max_pairs - 1 taxonomy-paired rows
    for _, pairs in taxonomy_map:
        # Group occurences by chain_id in case we have multiple
        # sequences from the same chain and same taxonomy
        chain_occurences = {}
        for chain_id, seq_idx in pairs:
            chain_occurences.setdefault(chain_id, []).append(seq_idx)

        # We create as many pairings as the maximum number of occurences
        max_occurences = max(len(v) for v in chain_occurences.values())
        for i in range(max_occurences):
            row_pairing = {}
            row_is_paired = {}

            # Add the chains present in the taxonomy
            for chain_id, seq_idxs in chain_occurences.items():
                # Roll over the sequence index to maximize diversity
                idx = i % len(seq_idxs)
                seq_idx = seq_idxs[idx]

                # Add the sequence to the pairing
                row_pairing[chain_id] = seq_idx
                row_is_paired[chain_id] = 1

            # Add any missing chains
            for chain_id in chain_ids:
                if chain_id not in row_pairing:
                    row_is_paired[chain_id] = 0
                    if available[chain_id]:
                        # Add the next available sequence
                        row_pairing[chain_id] = available[chain_id].popleft()
                    else:
                        # No more sequences available, we place a gap
                        row_pairing[chain_id] = -1

            pairing.append(row_pairing)
            is_paired.append(row_is_paired)

            # Break if we have enough pairs
            if len(pairing) >= max_pairs:
                break

        # Break if we have enough pairs
        if len(pairing) >= max_pairs:
            break

    # Now add unpaired rows until we reach max_total rows in total
    max_left = max(len(v) for v in available.values())
    for _ in range(min(max_total - len(pairing), max_left)):
        row_pairing = {}
        row_is_paired = {}
        for chain_id in chain_ids:
            row_is_paired[chain_id] = 0
            if available[chain_id]:
                # Add the next available sequence
                row_pairing[chain_id] = available[chain_id].popleft()
            else:
                # No more sequences available, we place a gap
                row_pairing[chain_id] = -1

        pairing.append(row_pairing)
        is_paired.append(row_is_paired)

        # Break if we have enough sequences
        if len(pairing) >= max_total:
            break

    # Randomly sample a subset of the pairs
    # ensuring the first row is always present
    if random_subset:
        num_seqs = len(pairing)
        if num_seqs > max_seqs:
            indices = np.random.choice(  # noqa: NPY002
                list(range(1, num_seqs)), size=max_seqs - 1, replace=False
            )
            pairing = [pairing[0]] + [pairing[i] for i in indices]
            is_paired = [is_paired[0]] + [is_paired[i] for i in indices]
    else:
        # Deterministic downsample to max_seqs
        pairing = pairing[:max_seqs]
        is_paired = is_paired[:max_seqs]

    # Map (chain_id, seq_idx, res_idx) to deletion, using a numba typed
    # dict so the jitted inner routine can consume it.
    deletions = numba.typed.Dict.empty(
        key_type=numba.types.Tuple(
            [numba.types.int64, numba.types.int64, numba.types.int64]),
        value_type=numba.types.int64
    )
    for chain_id, chain_msa in msa.items():
        for sequence in chain_msa.sequences:
            seq_idx = sequence["seq_idx"]
            del_start = sequence["del_start"]
            del_end = sequence["del_end"]
            # BUGFIX: slice the chain's full deletion table for every
            # sequence. The previous code re-sliced the result of the
            # prior iteration, so the absolute [del_start, del_end)
            # offsets were wrong for every sequence after the first.
            seq_deletions = chain_msa.deletions[del_start:del_end]
            for deletion_data in seq_deletions:
                res_idx = deletion_data["res_idx"]
                deletion_values = deletion_data["deletion"]
                deletions[(chain_id, seq_idx, res_idx)] = deletion_values

    # Add all the token MSA data
    msa_data, del_data, paired_data = prepare_msa_arrays(
        data.tokens, pairing, is_paired, deletions, msa
    )

    msa_data = torch.tensor(msa_data, dtype=torch.long)
    del_data = torch.tensor(del_data, dtype=torch.float)
    paired_data = torch.tensor(paired_data, dtype=torch.float)

    return msa_data, del_data, paired_data
335
+
336
+
337
def prepare_msa_arrays(
    tokens,
    pairing: list[dict[int, int]],
    is_paired: list[dict[int, int]],
    deletions: dict[tuple[int, int, int], int],
    msa: dict[int, MSA],
) -> tuple[npt.NDArray[np.int64], npt.NDArray[np.int64], npt.NDArray[np.int64]]:
    """Reshape data to play nicely with numba jit.

    Converts the per-row pairing dictionaries and per-chain MSA objects
    into dense int64 arrays, then delegates to the jitted inner routine.
    """
    token_asym_ids_arr = np.array([t["asym_id"] for t in tokens], dtype=np.int64)
    token_res_idxs_arr = np.array([t["res_idx"] for t in tokens], dtype=np.int64)

    chain_ids = sorted(msa.keys())

    # chain_ids are not necessarily contiguous (e.g. they might be 0, 24, 25).
    # This allows us to look up a chain_id by its index in the chain_ids list.
    chain_id_to_idx = {chain_id: i for i, chain_id in enumerate(chain_ids)}
    token_asym_ids_idx_arr = np.array(
        [chain_id_to_idx[asym_id] for asym_id in token_asym_ids_arr], dtype=np.int64
    )

    # Densify the row -> chain pairing tables.
    pairing_arr = np.zeros((len(pairing), len(chain_ids)), dtype=np.int64)
    is_paired_arr = np.zeros((len(is_paired), len(chain_ids)), dtype=np.int64)

    for i, row_pairing in enumerate(pairing):
        for chain_id in chain_ids:
            pairing_arr[i, chain_id_to_idx[chain_id]] = row_pairing[chain_id]

    for i, row_is_paired in enumerate(is_paired):
        for chain_id in chain_ids:
            is_paired_arr[i, chain_id_to_idx[chain_id]] = row_is_paired[chain_id]

    max_seq_len = max(len(msa[chain_id].sequences) for chain_id in chain_ids)

    # we want res_start from sequences; -1 marks missing entries
    msa_sequences = np.full((len(chain_ids), max_seq_len), -1, dtype=np.int64)
    for chain_id in chain_ids:
        for i, seq in enumerate(msa[chain_id].sequences):
            msa_sequences[chain_id_to_idx[chain_id], i] = seq["res_start"]

    # Per-chain residue tables, padded with -1 to the longest chain.
    max_residues_len = max(len(msa[chain_id].residues) for chain_id in chain_ids)
    msa_residues = np.full((len(chain_ids), max_residues_len), -1, dtype=np.int64)
    for chain_id in chain_ids:
        residues = msa[chain_id].residues.astype(np.int64)
        idxs = np.arange(len(residues))
        chain_idx = chain_id_to_idx[chain_id]
        msa_residues[chain_idx, idxs] = residues

    return _prepare_msa_arrays_inner(
        token_asym_ids_arr,
        token_res_idxs_arr,
        token_asym_ids_idx_arr,
        pairing_arr,
        is_paired_arr,
        deletions,
        msa_sequences,
        msa_residues,
        const.token_ids["-"],
    )
395
+
396
+
397
# Numba typed-dict signature for the `deletions` argument of
# `_prepare_msa_arrays_inner`: (asym_id, seq_idx, res_idx) -> deletion count.
deletions_dict_type = types.DictType(types.UniTuple(types.int64, 3), types.int64)
398
+
399
+
400
@numba.njit(
    [
        types.Tuple(
            (
                types.int64[:, ::1],  # msa_data
                types.int64[:, ::1],  # del_data
                types.int64[:, ::1],  # paired_data
            )
        )(
            types.int64[::1],  # token_asym_ids
            types.int64[::1],  # token_res_idxs
            types.int64[::1],  # token_asym_ids_idx
            types.int64[:, ::1],  # pairing
            types.int64[:, ::1],  # is_paired
            deletions_dict_type,  # deletions
            types.int64[:, ::1],  # msa_sequences
            types.int64[:, ::1],  # msa_residues
            types.int64,  # gap_token
        )
    ],
    cache=True,
)
def _prepare_msa_arrays_inner(
    token_asym_ids: npt.NDArray[np.int64],
    token_res_idxs: npt.NDArray[np.int64],
    token_asym_ids_idx: npt.NDArray[np.int64],
    pairing: npt.NDArray[np.int64],
    is_paired: npt.NDArray[np.int64],
    deletions: dict[tuple[int, int, int], int],
    msa_sequences: npt.NDArray[np.int64],
    msa_residues: npt.NDArray[np.int64],
    gap_token: int,
) -> tuple[npt.NDArray[np.int64], npt.NDArray[np.int64], npt.NDArray[np.int64]]:
    """Fill per-token MSA, deletion, and paired matrices (numba kernel).

    msa_data defaults to `gap_token`; a cell is overwritten only when the
    pairing table assigns a real sequence (seq_idx != -1) to that token's
    chain for that MSA row.
    """
    n_tokens = len(token_asym_ids)
    n_pairs = len(pairing)
    msa_data = np.full((n_tokens, n_pairs), gap_token, dtype=np.int64)
    paired_data = np.zeros((n_tokens, n_pairs), dtype=np.int64)
    del_data = np.zeros((n_tokens, n_pairs), dtype=np.int64)

    # Add all the token MSA data
    for token_idx in range(n_tokens):
        chain_id_idx = token_asym_ids_idx[token_idx]
        chain_id = token_asym_ids[token_idx]
        res_idx = token_res_idxs[token_idx]

        for pair_idx in range(n_pairs):
            seq_idx = pairing[pair_idx, chain_id_idx]
            paired_data[token_idx, pair_idx] = is_paired[pair_idx, chain_id_idx]

            # Add residue type: res_start offsets this sequence inside the
            # chain's flat residue table built by prepare_msa_arrays.
            if seq_idx != -1:
                res_start = msa_sequences[chain_id_idx, seq_idx]
                res_type = msa_residues[chain_id_idx, res_start + res_idx]
                k = (chain_id, seq_idx, res_idx)
                if k in deletions:
                    del_data[token_idx, pair_idx] = deletions[k]
                msa_data[token_idx, pair_idx] = res_type

    return msa_data, del_data, paired_data
459
+
460
+
461
+ ####################################################################################################
462
+ # FEATURES
463
+ ####################################################################################################
464
+
465
+
466
def select_subset_from_mask(mask, p):
    """Randomly keep a subset of the True entries of ``mask``.

    The subset size is drawn as ``np.random.geometric(p) + 1`` and capped at
    the number of True entries. NOTE(review): since the geometric draw is
    already >= 1, the effective minimum size is 2 (when available) — the
    call-site comment says "one as minimum"; confirm which is intended.

    Parameters
    ----------
    mask : np.ndarray
        Boolean (or 0/1) mask to subsample.
    p : float
        Success probability of the geometric draw controlling subset size.

    Returns
    -------
    np.ndarray
        Mask of the same shape and dtype with only the sampled entries set.
    """
    available = np.sum(mask)
    # Same RNG call order as before: geometric draw first, then choice.
    sample_size = min(np.random.geometric(p) + 1, available)

    candidate_indices = np.where(mask)[0]
    chosen = np.random.choice(candidate_indices, size=sample_size, replace=False)

    subset_mask = np.zeros_like(mask)
    subset_mask[chosen] = 1
    return subset_mask
480
+
481
+
482
def process_token_features(
    data: Tokenized,
    max_tokens: Optional[int] = None,
    binder_pocket_conditioned_prop: Optional[float] = 0.0,
    binder_pocket_cutoff: Optional[float] = 6.0,
    binder_pocket_sampling_geometric_p: Optional[float] = 0.0,
    only_ligand_binder_pocket: Optional[bool] = False,
    inference_binder: Optional[list[int]] = None,
    inference_pocket: Optional[list[tuple[int, int]]] = None,
) -> dict[str, Tensor]:
    """Get the token features.

    Builds per-token index/type tensors, a dense token-bond matrix, and an
    (optionally sampled) pocket-conditioning feature, padding everything to
    `max_tokens` when given.

    Parameters
    ----------
    data : Tokenized
        The tokenized data.
    max_tokens : int
        The maximum number of tokens.
    binder_pocket_conditioned_prop : float
        Probability of sampling a pocket-conditioning signal at training time.
    binder_pocket_cutoff : float
        Distance cutoff (presumably Angstrom — confirm) for pocket membership.
    binder_pocket_sampling_geometric_p : float
        If > 0, subsample the pocket with a geometric-sized subset.
    only_ligand_binder_pocket : bool
        If True, never fall back to a protein chain as the binder.
    inference_binder : int, optional
        Chain asym_id to treat as the binder at inference time.
    inference_pocket : list[tuple[int, int]], optional
        (asym_id, res_idx) pairs marking pocket residues at inference time.

    Returns
    -------
    dict[str, Tensor]
        The token features.

    """
    # Token data
    token_data = data.tokens
    token_bonds = data.bonds

    # Token core features
    token_index = torch.arange(len(token_data), dtype=torch.long)
    residue_index = from_numpy(token_data["res_idx"].copy()).long()
    asym_id = from_numpy(token_data["asym_id"].copy()).long()
    entity_id = from_numpy(token_data["entity_id"].copy()).long()
    sym_id = from_numpy(token_data["sym_id"].copy()).long()
    mol_type = from_numpy(token_data["mol_type"].copy()).long()
    res_type = from_numpy(token_data["res_type"].copy()).long()
    res_type = one_hot(res_type, num_classes=const.num_tokens)
    disto_center = from_numpy(token_data["disto_coords"].copy())

    # Token mask features
    pad_mask = torch.ones(len(token_data), dtype=torch.float)
    resolved_mask = from_numpy(token_data["resolved_mask"].copy()).float()
    disto_mask = from_numpy(token_data["disto_mask"].copy()).float()
    cyclic_period = from_numpy(token_data["cyclic_period"].copy())

    # Token bond features: size the square bond matrix up-front so no
    # re-padding is needed later.
    if max_tokens is not None:
        pad_len = max_tokens - len(token_data)
        num_tokens = max_tokens if pad_len > 0 else len(token_data)
    else:
        num_tokens = len(token_data)

    # Bonds reference global token_idx values; remap to crop-local positions.
    tok_to_idx = {tok["token_idx"]: idx for idx, tok in enumerate(token_data)}
    bonds = torch.zeros(num_tokens, num_tokens, dtype=torch.float)
    for token_bond in token_bonds:
        token_1 = tok_to_idx[token_bond["token_1"]]
        token_2 = tok_to_idx[token_bond["token_2"]]
        bonds[token_1, token_2] = 1
        bonds[token_2, token_1] = 1

    bonds = bonds.unsqueeze(-1)

    # Pocket conditioned feature
    pocket_feature = (
        np.zeros(len(token_data)) + const.pocket_contact_info["UNSPECIFIED"]
    )
    if inference_binder is not None:
        # Inference path: binder chain and pocket residues supplied by caller.
        assert inference_pocket is not None
        pocket_residues = set(inference_pocket)
        for idx, token in enumerate(token_data):
            if token["asym_id"] == inference_binder:
                pocket_feature[idx] = const.pocket_contact_info["BINDER"]
            elif (token["asym_id"], token["res_idx"]) in pocket_residues:
                pocket_feature[idx] = const.pocket_contact_info["POCKET"]
            else:
                pocket_feature[idx] = const.pocket_contact_info["UNSELECTED"]
    elif (
        binder_pocket_conditioned_prop > 0.0
        and random.random() < binder_pocket_conditioned_prop
    ):
        # choose as binder a random ligand in the crop, if there are no ligands select a protein chain
        binder_asym_ids = np.unique(
            token_data["asym_id"][
                token_data["mol_type"] == const.chain_type_ids["NONPOLYMER"]
            ]
        )

        if len(binder_asym_ids) == 0:
            if not only_ligand_binder_pocket:
                binder_asym_ids = np.unique(token_data["asym_id"])

        if len(binder_asym_ids) > 0:
            pocket_asym_id = random.choice(binder_asym_ids)
            binder_mask = token_data["asym_id"] == pocket_asym_id

            binder_coords = []
            for token in token_data:
                if token["asym_id"] == pocket_asym_id:
                    binder_coords.append(
                        data.structure.atoms["coords"][
                            token["atom_idx"] : token["atom_idx"] + token["atom_num"]
                        ]
                    )
            binder_coords = np.concatenate(binder_coords, axis=0)

            # find the tokens in the pocket: min atom-atom distance from any
            # non-binder, resolved, polymer token to any binder atom.
            token_dist = np.zeros(len(token_data)) + 1000
            for i, token in enumerate(token_data):
                if (
                    token["mol_type"] != const.chain_type_ids["NONPOLYMER"]
                    and token["asym_id"] != pocket_asym_id
                    and token["resolved_mask"] == 1
                ):
                    token_coords = data.structure.atoms["coords"][
                        token["atom_idx"] : token["atom_idx"] + token["atom_num"]
                    ]

                    # find chain and apply chain transformation
                    # NOTE(review): `chain` found below is never used — no
                    # transformation is actually applied; looks like an
                    # unfinished/vestigial step. Confirm before removing.
                    for chain in data.structure.chains:
                        if chain["asym_id"] == token["asym_id"]:
                            break

                    token_dist[i] = np.min(
                        np.linalg.norm(
                            token_coords[:, None, :] - binder_coords[None, :, :],
                            axis=-1,
                        )
                    )

            pocket_mask = token_dist < binder_pocket_cutoff

            if np.sum(pocket_mask) > 0:
                pocket_feature = (
                    np.zeros(len(token_data)) + const.pocket_contact_info["UNSELECTED"]
                )
                pocket_feature[binder_mask] = const.pocket_contact_info["BINDER"]

                if binder_pocket_sampling_geometric_p > 0.0:
                    # select a subset of the pocket, according
                    # to a geometric distribution with one as minimum
                    pocket_mask = select_subset_from_mask(
                        pocket_mask, binder_pocket_sampling_geometric_p
                    )

                pocket_feature[pocket_mask] = const.pocket_contact_info["POCKET"]
    pocket_feature = from_numpy(pocket_feature).long()
    pocket_feature = one_hot(pocket_feature, num_classes=len(const.pocket_contact_info))

    # Pad to max tokens if given
    if max_tokens is not None:
        pad_len = max_tokens - len(token_data)
        if pad_len > 0:
            token_index = pad_dim(token_index, 0, pad_len)
            residue_index = pad_dim(residue_index, 0, pad_len)
            asym_id = pad_dim(asym_id, 0, pad_len)
            entity_id = pad_dim(entity_id, 0, pad_len)
            sym_id = pad_dim(sym_id, 0, pad_len)
            mol_type = pad_dim(mol_type, 0, pad_len)
            res_type = pad_dim(res_type, 0, pad_len)
            disto_center = pad_dim(disto_center, 0, pad_len)
            pad_mask = pad_dim(pad_mask, 0, pad_len)
            resolved_mask = pad_dim(resolved_mask, 0, pad_len)
            disto_mask = pad_dim(disto_mask, 0, pad_len)
            pocket_feature = pad_dim(pocket_feature, 0, pad_len)
            cyclic_period = pad_dim(cyclic_period, 0, pad_len)

    token_features = {
        "token_index": token_index,
        "residue_index": residue_index,
        "asym_id": asym_id,
        "entity_id": entity_id,
        "sym_id": sym_id,
        "mol_type": mol_type,
        "res_type": res_type,
        "disto_center": disto_center,
        "token_bonds": bonds,
        "token_pad_mask": pad_mask,
        "token_resolved_mask": resolved_mask,
        "token_disto_mask": disto_mask,
        "pocket_feature": pocket_feature,
        "cyclic_period": cyclic_period,
    }
    return token_features
666
+
667
+
668
def process_atom_features(
    data: Tokenized,
    atoms_per_window_queries: int = 32,
    min_dist: float = 2.0,
    max_dist: float = 22.0,
    num_bins: int = 64,
    max_atoms: Optional[int] = None,
    max_tokens: Optional[int] = None,
) -> dict[str, Tensor]:
    """Get the atom features.

    Gathers each token's atoms into flat arrays, computes the distogram
    target, per-token reference frames, atom/token index maps, and pads the
    atom dimension up to a multiple of `atoms_per_window_queries`.

    Parameters
    ----------
    data : Tokenized
        The tokenized data.
    atoms_per_window_queries : int
        Window size the padded atom count must be a multiple of.
    min_dist, max_dist : float
        Range of the distogram bin boundaries.
    num_bins : int
        Number of distogram bins.
    max_atoms : int, optional
        The maximum number of atoms (must divide by atoms_per_window_queries).
    max_tokens : int, optional
        The maximum number of tokens (pads token-indexed outputs).

    Returns
    -------
    dict[str, Tensor]
        The atom features.

    """
    # Filter to tokens' atoms
    atom_data = []
    ref_space_uid = []
    coord_data = []
    frame_data = []
    resolved_frame_data = []
    atom_to_token = []
    token_to_rep_atom = []  # index on cropped atom table
    r_set_to_rep_atom = []
    disto_coords = []
    atom_idx = 0

    # NOTE(review): `token_id` from this loop is used after it (one_hot
    # num_classes) — an empty data.tokens would raise NameError; presumably
    # callers guarantee at least one token.
    chain_res_ids = {}
    for token_id, token in enumerate(data.tokens):
        # Get the chain residue ids; (chain, residue) pairs share one
        # ref_space_uid so their reference conformers live in one frame.
        chain_idx, res_id = token["asym_id"], token["res_idx"]
        chain = data.structure.chains[chain_idx]

        if (chain_idx, res_id) not in chain_res_ids:
            new_idx = len(chain_res_ids)
            chain_res_ids[(chain_idx, res_id)] = new_idx
        else:
            new_idx = chain_res_ids[(chain_idx, res_id)]

        # Map atoms to token indices
        ref_space_uid.extend([new_idx] * token["atom_num"])
        atom_to_token.extend([token_id] * token["atom_num"])

        # Add atom data
        start = token["atom_idx"]
        end = token["atom_idx"] + token["atom_num"]
        token_atoms = data.structure.atoms[start:end]

        # Map token to representative atom (crop-local index)
        token_to_rep_atom.append(atom_idx + token["disto_idx"] - start)
        if (chain["mol_type"] != const.chain_type_ids["NONPOLYMER"]) and token[
            "resolved_mask"
        ]:
            r_set_to_rep_atom.append(atom_idx + token["center_idx"] - start)

        # Get token coordinates
        token_coords = np.array([token_atoms["coords"]])
        coord_data.append(token_coords)

        # Get frame data: backbone triplet (protein N/CA/C, nucleic
        # C1'/C3'/C4'), valid only if all three atoms are present.
        res_type = const.tokens[token["res_type"]]

        if token["atom_num"] < 3 or res_type in ["PAD", "UNK", "-"]:
            idx_frame_a, idx_frame_b, idx_frame_c = 0, 0, 0
            mask_frame = False
        elif (token["mol_type"] == const.chain_type_ids["PROTEIN"]) and (
            res_type in const.ref_atoms
        ):
            idx_frame_a, idx_frame_b, idx_frame_c = (
                const.ref_atoms[res_type].index("N"),
                const.ref_atoms[res_type].index("CA"),
                const.ref_atoms[res_type].index("C"),
            )
            mask_frame = (
                token_atoms["is_present"][idx_frame_a]
                and token_atoms["is_present"][idx_frame_b]
                and token_atoms["is_present"][idx_frame_c]
            )
        elif (
            token["mol_type"] == const.chain_type_ids["DNA"]
            or token["mol_type"] == const.chain_type_ids["RNA"]
        ) and (res_type in const.ref_atoms):
            idx_frame_a, idx_frame_b, idx_frame_c = (
                const.ref_atoms[res_type].index("C1'"),
                const.ref_atoms[res_type].index("C3'"),
                const.ref_atoms[res_type].index("C4'"),
            )
            mask_frame = (
                token_atoms["is_present"][idx_frame_a]
                and token_atoms["is_present"][idx_frame_b]
                and token_atoms["is_present"][idx_frame_c]
            )
        else:
            idx_frame_a, idx_frame_b, idx_frame_c = 0, 0, 0
            mask_frame = False
        frame_data.append(
            [idx_frame_a + atom_idx, idx_frame_b + atom_idx, idx_frame_c + atom_idx]
        )
        resolved_frame_data.append(mask_frame)

        # Get distogram coordinates
        disto_coords_tok = data.structure.atoms[token["disto_idx"]]["coords"]
        disto_coords.append(disto_coords_tok)

        # Update atom data. This is technically never used again (we rely on coord_data),
        # but we update for consistency and to make sure the Atom object has valid, transformed coordinates.
        token_atoms = token_atoms.copy()
        token_atoms["coords"] = token_coords[0]  # atom has a copy of first coords
        atom_data.append(token_atoms)
        atom_idx += len(token_atoms)

    disto_coords = np.array(disto_coords)

    # Compute distogram: bin index = number of boundaries below the distance.
    t_center = torch.Tensor(disto_coords)
    t_dists = torch.cdist(t_center, t_center)
    boundaries = torch.linspace(min_dist, max_dist, num_bins - 1)
    distogram = (t_dists.unsqueeze(-1) > boundaries).sum(dim=-1).long()
    disto_target = one_hot(distogram, num_classes=num_bins)

    atom_data = np.concatenate(atom_data)
    coord_data = np.concatenate(coord_data, axis=1)
    ref_space_uid = np.array(ref_space_uid)

    # Compute features
    ref_atom_name_chars = from_numpy(atom_data["name"]).long()
    ref_element = from_numpy(atom_data["element"]).long()
    ref_charge = from_numpy(atom_data["charge"])
    ref_pos = from_numpy(
        atom_data["conformer"].copy()
    )  # not sure why I need to copy here..
    ref_space_uid = from_numpy(ref_space_uid)
    coords = from_numpy(coord_data.copy())
    resolved_mask = from_numpy(atom_data["is_present"])
    pad_mask = torch.ones(len(atom_data), dtype=torch.float)
    atom_to_token = torch.tensor(atom_to_token, dtype=torch.long)
    token_to_rep_atom = torch.tensor(token_to_rep_atom, dtype=torch.long)
    r_set_to_rep_atom = torch.tensor(r_set_to_rep_atom, dtype=torch.long)
    frame_data, resolved_frame_data = compute_frames_nonpolymer(
        data,
        coord_data,
        atom_data["is_present"],
        atom_to_token,
        frame_data,
        resolved_frame_data,
    )  # Compute frames for NONPOLYMER tokens
    frames = from_numpy(frame_data.copy())
    frame_resolved_mask = from_numpy(resolved_frame_data.copy())
    # Convert to one-hot
    ref_atom_name_chars = one_hot(
        ref_atom_name_chars % num_bins, num_classes=num_bins
    )  # added for lower case letters
    ref_element = one_hot(ref_element, num_classes=const.num_elements)
    atom_to_token = one_hot(atom_to_token, num_classes=token_id + 1)
    token_to_rep_atom = one_hot(token_to_rep_atom, num_classes=len(atom_data))
    r_set_to_rep_atom = one_hot(r_set_to_rep_atom, num_classes=len(atom_data))

    # Center the ground truth coordinates (mean over resolved atoms only)
    center = (coords * resolved_mask[None, :, None]).sum(dim=1)
    center = center / resolved_mask.sum().clamp(min=1)
    coords = coords - center[:, None]

    # Apply random roto-translation to the input atoms
    ref_pos = center_random_augmentation(
        ref_pos[None], resolved_mask[None], centering=False
    )[0]

    # Compute padding and apply: atom count must be a multiple of
    # atoms_per_window_queries (round up when max_atoms is not given).
    if max_atoms is not None:
        assert max_atoms % atoms_per_window_queries == 0
        pad_len = max_atoms - len(atom_data)
    else:
        pad_len = (
            (len(atom_data) - 1) // atoms_per_window_queries + 1
        ) * atoms_per_window_queries - len(atom_data)

    if pad_len > 0:
        pad_mask = pad_dim(pad_mask, 0, pad_len)
        ref_pos = pad_dim(ref_pos, 0, pad_len)
        resolved_mask = pad_dim(resolved_mask, 0, pad_len)
        ref_element = pad_dim(ref_element, 0, pad_len)
        ref_charge = pad_dim(ref_charge, 0, pad_len)
        ref_atom_name_chars = pad_dim(ref_atom_name_chars, 0, pad_len)
        ref_space_uid = pad_dim(ref_space_uid, 0, pad_len)
        coords = pad_dim(coords, 1, pad_len)
        atom_to_token = pad_dim(atom_to_token, 0, pad_len)
        token_to_rep_atom = pad_dim(token_to_rep_atom, 1, pad_len)
        r_set_to_rep_atom = pad_dim(r_set_to_rep_atom, 1, pad_len)

    # Pad the token dimension of token-indexed outputs.
    if max_tokens is not None:
        pad_len = max_tokens - token_to_rep_atom.shape[0]
        if pad_len > 0:
            atom_to_token = pad_dim(atom_to_token, 1, pad_len)
            token_to_rep_atom = pad_dim(token_to_rep_atom, 0, pad_len)
            r_set_to_rep_atom = pad_dim(r_set_to_rep_atom, 0, pad_len)
            disto_target = pad_dim(pad_dim(disto_target, 0, pad_len), 1, pad_len)
            frames = pad_dim(frames, 0, pad_len)
            frame_resolved_mask = pad_dim(frame_resolved_mask, 0, pad_len)

    return {
        "ref_pos": ref_pos,
        "atom_resolved_mask": resolved_mask,
        "ref_element": ref_element,
        "ref_charge": ref_charge,
        "ref_atom_name_chars": ref_atom_name_chars,
        "ref_space_uid": ref_space_uid,
        "coords": coords,
        "atom_pad_mask": pad_mask,
        "atom_to_token": atom_to_token,
        "token_to_rep_atom": token_to_rep_atom,
        "r_set_to_rep_atom": r_set_to_rep_atom,
        "disto_target": disto_target,
        "frames_idx": frames,
        "frame_resolved_mask": frame_resolved_mask,
    }
892
+
893
+
894
def process_msa_features(
    data: Tokenized,
    max_seqs_batch: int,
    max_seqs: int,
    max_tokens: Optional[int] = None,
    pad_to_max_seqs: bool = False,
) -> dict[str, Tensor]:
    """Get the MSA features.

    Parameters
    ----------
    data : Tokenized
        The tokenized data.
    max_seqs_batch : int
        Number of MSA rows to build for this sample (randomized at training).
    max_seqs : int
        The maximum number of MSA sequences.
    max_tokens : int
        The maximum number of tokens.
    pad_to_max_seqs : bool
        Whether to pad to the maximum number of sequences.

    Returns
    -------
    dict[str, Tensor]
        The MSA features.

    """
    # Created paired MSA
    msa, deletion, paired = construct_paired_msa(data, max_seqs_batch)
    msa, deletion, paired = (
        msa.transpose(1, 0),
        deletion.transpose(1, 0),
        paired.transpose(1, 0),
    )  # (N_MSA, N_RES, N_AA)

    # Prepare features
    msa = torch.nn.functional.one_hot(msa, num_classes=const.num_tokens)
    msa_mask = torch.ones_like(msa[:, :, 0])
    profile = msa.float().mean(dim=0)
    has_deletion = deletion > 0
    # Squash raw deletion counts into a bounded value.
    # NOTE(review): AF-style squashing is (2/pi)*arctan(d/3) in [0, 1); this
    # uses pi/2 * arctan(d/3) — presumably intentional here, do not "fix"
    # without checking training checkpoints.
    deletion = np.pi / 2 * np.arctan(deletion / 3)
    deletion_mean = deletion.mean(axis=0)

    # Pad in the MSA dimension (dim=0)
    if pad_to_max_seqs:
        pad_len = max_seqs - msa.shape[0]
        if pad_len > 0:
            msa = pad_dim(msa, 0, pad_len, const.token_ids["-"])
            paired = pad_dim(paired, 0, pad_len)
            msa_mask = pad_dim(msa_mask, 0, pad_len)
            has_deletion = pad_dim(has_deletion, 0, pad_len)
            deletion = pad_dim(deletion, 0, pad_len)

    # Pad in the token dimension (dim=1)
    if max_tokens is not None:
        pad_len = max_tokens - msa.shape[1]
        if pad_len > 0:
            msa = pad_dim(msa, 1, pad_len, const.token_ids["-"])
            paired = pad_dim(paired, 1, pad_len)
            msa_mask = pad_dim(msa_mask, 1, pad_len)
            has_deletion = pad_dim(has_deletion, 1, pad_len)
            deletion = pad_dim(deletion, 1, pad_len)
            profile = pad_dim(profile, 0, pad_len)
            deletion_mean = pad_dim(deletion_mean, 0, pad_len)

    return {
        "msa": msa,
        "msa_paired": paired,
        "deletion_value": deletion,
        "has_deletion": has_deletion,
        "deletion_mean": deletion_mean,
        "profile": profile,
        "msa_mask": msa_mask,
    }
967
+
968
+
969
def process_symmetry_features(
    cropped: Tokenized, symmetries: dict
) -> dict[str, Tensor]:
    """Get the symmetry features.

    Merges chain-, amino-acid-, and ligand-level symmetry features into a
    single dict (later sources override earlier ones on key collision, as
    with dict.update).

    Parameters
    ----------
    cropped : Tokenized
        The cropped, tokenized data.
    symmetries : dict
        Precomputed ligand symmetry tables.

    Returns
    -------
    dict[str, Tensor]
        The symmetry features.

    """
    return {
        **get_chain_symmetries(cropped),
        **get_amino_acids_symmetries(cropped),
        **get_ligand_symmetries(cropped, symmetries),
    }
990
+
991
+
992
def _constraint_tensor(arr, field: str, dtype) -> Tensor:
    """Copy structured-array column ``arr[field]`` into a tensor of ``dtype``."""
    return torch.tensor(arr[field].copy(), dtype=dtype)


def _constraint_index(arr) -> Tensor:
    """Return the ``atom_idxs`` column of ``arr`` as a (k, n) long tensor."""
    return _constraint_tensor(arr, "atom_idxs", torch.long).T


def process_residue_constraint_features(
    data: Tokenized,
) -> dict[str, Tensor]:
    """Build stereochemistry constraint features from ``data.residue_constraints``.

    Converts RDKit distance bounds, chiral-center, stereo-bond, and planarity
    constraint tables into index tensors (shape (k, n), one column per
    constraint) and per-constraint mask/value tensors. When no constraints
    are attached, returns empty tensors with the correct leading arity
    (2 atoms per bound, 4 per chiral center / stereo bond, 6 per planar bond,
    5/6 per planar ring) so downstream code can still index/concatenate.

    Parameters
    ----------
    data : Tokenized
        The tokenized data; ``residue_constraints`` may be None.

    Returns
    -------
    dict[str, Tensor]
        The residue constraint features.
    """
    residue_constraints = data.residue_constraints
    if residue_constraints is not None:
        rdkit_bounds = residue_constraints.rdkit_bounds_constraints
        chiral_atoms = residue_constraints.chiral_atom_constraints
        stereo_bonds = residue_constraints.stereo_bond_constraints

        rdkit_bounds_index = _constraint_index(rdkit_bounds)
        rdkit_bounds_bond_mask = _constraint_tensor(rdkit_bounds, "is_bond", torch.bool)
        rdkit_bounds_angle_mask = _constraint_tensor(
            rdkit_bounds, "is_angle", torch.bool
        )
        rdkit_upper_bounds = _constraint_tensor(
            rdkit_bounds, "upper_bound", torch.float
        )
        rdkit_lower_bounds = _constraint_tensor(
            rdkit_bounds, "lower_bound", torch.float
        )

        chiral_atom_index = _constraint_index(chiral_atoms)
        chiral_reference_mask = _constraint_tensor(
            chiral_atoms, "is_reference", torch.bool
        )
        chiral_atom_orientations = _constraint_tensor(chiral_atoms, "is_r", torch.bool)

        stereo_bond_index = _constraint_index(stereo_bonds)
        stereo_reference_mask = _constraint_tensor(
            stereo_bonds, "is_reference", torch.bool
        )
        stereo_bond_orientations = _constraint_tensor(stereo_bonds, "is_e", torch.bool)

        planar_bond_index = _constraint_index(
            residue_constraints.planar_bond_constraints
        )
        planar_ring_5_index = _constraint_index(
            residue_constraints.planar_ring_5_constraints
        )
        planar_ring_6_index = _constraint_index(
            residue_constraints.planar_ring_6_constraints
        )
    else:
        # No constraints: empty tensors, keeping the per-constraint arity.
        rdkit_bounds_index = torch.empty((2, 0), dtype=torch.long)
        rdkit_bounds_bond_mask = torch.empty((0,), dtype=torch.bool)
        rdkit_bounds_angle_mask = torch.empty((0,), dtype=torch.bool)
        rdkit_upper_bounds = torch.empty((0,), dtype=torch.float)
        rdkit_lower_bounds = torch.empty((0,), dtype=torch.float)
        chiral_atom_index = torch.empty((4, 0), dtype=torch.long)
        chiral_reference_mask = torch.empty((0,), dtype=torch.bool)
        chiral_atom_orientations = torch.empty((0,), dtype=torch.bool)
        stereo_bond_index = torch.empty((4, 0), dtype=torch.long)
        stereo_reference_mask = torch.empty((0,), dtype=torch.bool)
        stereo_bond_orientations = torch.empty((0,), dtype=torch.bool)
        planar_bond_index = torch.empty((6, 0), dtype=torch.long)
        planar_ring_5_index = torch.empty((5, 0), dtype=torch.long)
        planar_ring_6_index = torch.empty((6, 0), dtype=torch.long)

    return {
        "rdkit_bounds_index": rdkit_bounds_index,
        "rdkit_bounds_bond_mask": rdkit_bounds_bond_mask,
        "rdkit_bounds_angle_mask": rdkit_bounds_angle_mask,
        "rdkit_upper_bounds": rdkit_upper_bounds,
        "rdkit_lower_bounds": rdkit_lower_bounds,
        "chiral_atom_index": chiral_atom_index,
        "chiral_reference_mask": chiral_reference_mask,
        "chiral_atom_orientations": chiral_atom_orientations,
        "stereo_bond_index": stereo_bond_index,
        "stereo_reference_mask": stereo_reference_mask,
        "stereo_bond_orientations": stereo_bond_orientations,
        "planar_bond_index": planar_bond_index,
        "planar_ring_5_index": planar_ring_5_index,
        "planar_ring_6_index": planar_ring_6_index,
    }
1087
+
1088
+
1089
def process_chain_feature_constraints(
    data: Tokenized,
) -> dict[str, Tensor]:
    """Extract chain-level connectivity and symmetry index features.

    Produces (2, n) long tensors: one pair of chain indices and one pair of
    atom indices per covalent inter-chain connection, plus every unordered
    pair of chains sharing an entity id. Each tensor is (2, 0) when empty.
    """
    structure = data.structure

    # Covalent connections between chains.
    if structure.connections.shape[0] > 0:
        chain_pairs = [
            [conn["chain_1"], conn["chain_2"]] for conn in structure.connections
        ]
        atom_pairs = [
            [conn["atom_1"], conn["atom_2"]] for conn in structure.connections
        ]
        connected_chain_index = torch.tensor(chain_pairs, dtype=torch.long).T
        connected_atom_index = torch.tensor(atom_pairs, dtype=torch.long).T
    else:
        connected_chain_index = torch.empty((2, 0), dtype=torch.long)
        connected_atom_index = torch.empty((2, 0), dtype=torch.long)

    # Chains sharing an entity id are symmetric copies; enumerate each
    # unordered pair (i < j) exactly once.
    sym_pairs = [
        [i, j]
        for i, chain_i in enumerate(structure.chains)
        for j, chain_j in enumerate(structure.chains)
        if i < j and chain_i["entity_id"] == chain_j["entity_id"]
    ]
    if sym_pairs:
        symmetric_chain_index = torch.tensor(sym_pairs, dtype=torch.long).T
    else:
        symmetric_chain_index = torch.empty((2, 0), dtype=torch.long)

    return {
        "connected_chain_index": connected_chain_index,
        "connected_atom_index": connected_atom_index,
        "symmetric_chain_index": symmetric_chain_index,
    }
1120
+
1121
+
1122
class BoltzFeaturizer:
    """Boltz featurizer.

    Orchestrates token, atom, MSA, symmetry, and constraint feature
    extraction for one tokenized sample and merges them into a single dict.
    """

    def process(
        self,
        data: Tokenized,
        training: bool,
        max_seqs: int = 4096,
        atoms_per_window_queries: int = 32,
        min_dist: float = 2.0,
        max_dist: float = 22.0,
        num_bins: int = 64,
        max_tokens: Optional[int] = None,
        max_atoms: Optional[int] = None,
        pad_to_max_seqs: bool = False,
        compute_symmetries: bool = False,
        symmetries: Optional[dict] = None,
        binder_pocket_conditioned_prop: Optional[float] = 0.0,
        binder_pocket_cutoff: Optional[float] = 6.0,
        binder_pocket_sampling_geometric_p: Optional[float] = 0.0,
        only_ligand_binder_pocket: Optional[bool] = False,
        inference_binder: Optional[int] = None,
        inference_pocket: Optional[list[tuple[int, int]]] = None,
        compute_constraint_features: bool = False,
    ) -> dict[str, Tensor]:
        """Compute features.

        Parameters
        ----------
        data : Tokenized
            The tokenized data.
        training : bool
            Whether the model is in training mode. When True, the number of
            MSA rows used is randomized in [1, max_seqs] per sample.
        max_tokens : int, optional
            The maximum number of tokens.
        max_atoms : int, optional
            The maximum number of atoms
        max_seqs : int, optional
            The maximum number of sequences.

        Returns
        -------
        dict[str, Tensor]
            The features for model training.

        """
        # Compute random number of sequences (MSA subsampling augmentation)
        if training and max_seqs is not None:
            max_seqs_batch = np.random.randint(1, max_seqs + 1)  # noqa: NPY002
        else:
            max_seqs_batch = max_seqs

        # Compute token features
        token_features = process_token_features(
            data,
            max_tokens,
            binder_pocket_conditioned_prop,
            binder_pocket_cutoff,
            binder_pocket_sampling_geometric_p,
            only_ligand_binder_pocket,
            inference_binder=inference_binder,
            inference_pocket=inference_pocket,
        )

        # Compute atom features
        atom_features = process_atom_features(
            data,
            atoms_per_window_queries,
            min_dist,
            max_dist,
            num_bins,
            max_atoms,
            max_tokens,
        )

        # Compute MSA features
        msa_features = process_msa_features(
            data,
            max_seqs_batch,
            max_seqs,
            max_tokens,
            pad_to_max_seqs,
        )

        # Compute symmetry features (optional)
        symmetry_features = {}
        if compute_symmetries:
            symmetry_features = process_symmetry_features(data, symmetries)

        # Compute constraint features (optional)
        residue_constraint_features = {}
        chain_constraint_features = {}
        if compute_constraint_features:
            residue_constraint_features = process_residue_constraint_features(data)
            chain_constraint_features = process_chain_feature_constraints(data)

        # Later dicts override earlier ones on any key collision.
        return {
            **token_features,
            **atom_features,
            **msa_features,
            **symmetry_features,
            **residue_constraint_features,
            **chain_constraint_features,
        }
protify/FastPLMs/boltz/src/boltz/data/feature/featurizerv2.py ADDED
@@ -0,0 +1,2354 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ from typing import Optional
3
+ from collections import deque
4
+ import numba
5
+ import numpy as np
6
+ import numpy.typing as npt
7
+ import rdkit.Chem.Descriptors
8
+ import torch
9
+ from numba import types
10
+ from rdkit.Chem import Mol
11
+ from scipy.spatial.distance import cdist
12
+ from torch import Tensor, from_numpy
13
+ from torch.nn.functional import one_hot
14
+
15
+ from boltz.data import const
16
+ from boltz.data.mol import (
17
+ get_amino_acids_symmetries,
18
+ get_chain_symmetries,
19
+ get_ligand_symmetries,
20
+ get_symmetries,
21
+ )
22
+ from boltz.data.pad import pad_dim
23
+ from boltz.data.types import (
24
+ MSA,
25
+ MSADeletion,
26
+ MSAResidue,
27
+ MSASequence,
28
+ TemplateInfo,
29
+ Tokenized,
30
+ )
31
+ from boltz.model.modules.utils import center_random_augmentation
32
+
33
+ ####################################################################################################
34
+ # HELPERS
35
+ ####################################################################################################
36
+
37
+
38
def convert_atom_name(name: str) -> tuple[int, int, int, int]:
    """Encode an atom name as four fixed-width integer character codes.

    Each character is mapped to ``ord(c) - 32`` (so a space maps to 0) and
    the sequence is right-padded with zeros to length four, for atom names
    of at most four characters.

    Parameters
    ----------
    name : str
        The atom name.

    Returns
    -------
    tuple[int, int, int, int]
        The converted atom name.

    """
    stripped = str(name).strip()
    codes = [ord(char) - 32 for char in stripped]
    padding = [0] * (4 - len(codes))
    return tuple(codes + padding)
56
+
57
+
58
def sample_d(
    min_d: float,
    max_d: float,
    n_samples: int,
    random: np.random.Generator,
) -> np.ndarray:
    """Generate samples from a 1/d distribution between min_d and max_d.

    Parameters
    ----------
    min_d : float
        Minimum value of d
    max_d : float
        Maximum value of d
    n_samples : int
        Number of samples to generate
    random : numpy.random.Generator
        Random number generator

    Returns
    -------
    numpy.ndarray
        Array of samples drawn from the distribution

    Notes
    -----
    The probability density function is:
    f(d) = 1/(d * ln(max_d/min_d)) for d in [min_d, max_d]

    The inverse CDF transform is:
    d = min_d * (max_d/min_d)**u where u ~ Uniform(0,1)

    """
    # Uniform draws in [0, 1), pushed through the inverse CDF.
    uniform = random.random(n_samples)
    ratio = max_d / min_d
    return min_d * ratio**uniform
95
+
96
+
97
def compute_frames_nonpolymer(
    data: Tokenized,
    coords,
    resolved_mask,
    atom_to_token,
    frame_data: list,
    resolved_frame_data: list,
) -> tuple[list, list]:
    """Get the frames for non-polymer tokens.

    For each non-polymer chain with at least 3 atoms, replaces the
    incoming per-token frames with atom triplets built from nearest
    resolved neighbors, then invalidates frames whose spanning vectors
    are (nearly) collinear or degenerate.

    Parameters
    ----------
    data : Tokenized
        The input data to the model.
    coords : np.ndarray
        Atom coordinates; reshaped to (-1, 3) internally.
    resolved_mask : np.ndarray
        Per-atom resolved flags (1 = resolved).
    atom_to_token : np.ndarray
        Map from atom index to token index.
    frame_data : list
        The frame data (one [a, b, c] atom-index triplet per token).
    resolved_frame_data : list
        The resolved frame data.

    Returns
    -------
    tuple[list, list]
        The frame data and resolved frame data.

    """
    frame_data = np.array(frame_data)
    resolved_frame_data = np.array(resolved_frame_data)
    asym_id_token = data.tokens["asym_id"]
    # Broadcast per-token chain ids onto atoms via the atom->token map.
    asym_id_atom = data.tokens["asym_id"][atom_to_token]
    token_idx = 0
    atom_idx = 0
    for id in np.unique(data.tokens["asym_id"]):
        mask_chain_token = asym_id_token == id
        mask_chain_atom = asym_id_atom == id
        num_tokens = mask_chain_token.sum()
        num_atoms = mask_chain_atom.sum()
        # Only rebuild frames for non-polymer chains with enough atoms
        # to form a triplet; otherwise keep the incoming frames.
        if (
            data.tokens[token_idx]["mol_type"] != const.chain_type_ids["NONPOLYMER"]
            or num_atoms < 3  # noqa: PLR2004
        ):
            token_idx += num_tokens
            atom_idx += num_atoms
            continue
        # Pairwise Euclidean distances among this chain's atoms.
        dist_mat = (
            (
                coords.reshape(-1, 3)[mask_chain_atom][:, None, :]
                - coords.reshape(-1, 3)[mask_chain_atom][None, :, :]
            )
            ** 2
        ).sum(-1) ** 0.5
        # Pairs involving any unresolved atom get an infinite penalty so
        # the argsort below prefers resolved neighbors.
        resolved_pair = 1 - (
            resolved_mask[mask_chain_atom][None, :]
            * resolved_mask[mask_chain_atom][:, None]
        ).astype(np.float32)
        resolved_pair[resolved_pair == 1] = math.inf
        indices = np.argsort(dist_mat + resolved_pair, axis=1)
        # Frame per atom: [nearest neighbor, the atom itself, second
        # nearest], shifted from chain-local to global atom indices.
        # (Column 0 of the argsort is the atom itself at distance 0.)
        frames = (
            np.concatenate(
                [
                    indices[:, 1:2],
                    indices[:, 0:1],
                    indices[:, 2:3],
                ],
                axis=1,
            )
            + atom_idx
        )
        # NOTE(review): the destination slice length uses num_atoms while
        # indexing token positions — this assumes one token per atom for
        # non-polymer chains (num_tokens == num_atoms). Confirm upstream
        # tokenization guarantees this.
        frame_data[token_idx : token_idx + num_atoms, :] = frames
        # A frame counts as resolved only if all three atoms are resolved.
        resolved_frame_data[token_idx : token_idx + num_atoms] = resolved_mask[
            frames
        ].all(axis=1)
        token_idx += num_tokens
        atom_idx += num_atoms
    # Gather the actual coordinates for every frame triplet.
    frames_expanded = coords.reshape(-1, 3)[frame_data]

    # Drop frames whose two spanning vectors are collinear or degenerate.
    mask_collinear = compute_collinear_mask(
        frames_expanded[:, 1] - frames_expanded[:, 0],
        frames_expanded[:, 1] - frames_expanded[:, 2],
    )
    return frame_data, resolved_frame_data & mask_collinear
177
+
178
+
179
def compute_collinear_mask(v1, v2):
    """Mask out degenerate frames from two batches of spanning vectors.

    A frame is kept when the angle between its two vectors is at least
    ~25 degrees (|cos| < 0.9063) and both vectors are longer than 1e-2.
    Returns a boolean array of shape (N,).
    """
    len1 = np.linalg.norm(v1, axis=1, keepdims=True)
    len2 = np.linalg.norm(v2, axis=1, keepdims=True)
    # Small epsilon keeps the normalization finite for zero vectors.
    unit1 = v1 / (len1 + 1e-6)
    unit2 = v2 / (len2 + 1e-6)
    cos_angle = np.sum(unit1 * unit2, axis=1)
    not_collinear = np.abs(cos_angle) < 0.9063
    long_enough1 = len1.reshape(-1) > 1e-2
    long_enough2 = len2.reshape(-1) > 1e-2
    return not_collinear & long_enough1 & long_enough2
188
+
189
+
190
def dummy_msa(residues: np.ndarray) -> MSA:
    """Create a dummy MSA for a chain.

    Builds a single-row MSA containing only the chain's own sequence,
    with no deletions and no taxonomy.

    Parameters
    ----------
    residues : np.ndarray
        The residues for the chain.

    Returns
    -------
    MSA
        The dummy MSA.

    """
    res_types = [residue["res_type"] for residue in residues]
    # One sequence record: (seq_idx, taxonomy, res_start, res_end,
    # del_start, del_end) covering the whole chain.
    single_sequence = [(0, -1, 0, len(res_types), 0, 0)]
    return MSA(
        residues=np.array(res_types, dtype=MSAResidue),
        deletions=np.array([], dtype=MSADeletion),
        sequences=np.array(single_sequence, dtype=MSASequence),
    )
212
+
213
+
214
def construct_paired_msa(  # noqa: C901, PLR0915, PLR0912
    data: Tokenized,
    random: np.random.Generator,
    max_seqs: int,
    max_pairs: int = 8192,
    max_total: int = 16384,
    random_subset: bool = False,
) -> tuple[Tensor, Tensor, Tensor]:
    """Pair the MSA data across chains by taxonomy.

    Parameters
    ----------
    data : Tokenized
        The input data to the model.
    random : np.random.Generator
        Random number generator.
    max_seqs : int
        Maximum number of MSA rows to keep.
    max_pairs : int, optional
        Maximum number of taxonomy-paired rows.
    max_total : int, optional
        Maximum total number of rows (paired + unpaired).
    random_subset : bool, optional
        If True, randomly subsample rows down to max_seqs (always
        keeping the query row); otherwise truncate deterministically.

    Returns
    -------
    Tensor
        The MSA data.
    Tensor
        The deletion data.
    Tensor
        Mask indicating paired sequences.

    """
    # Get unique chains (ensuring monotonicity in the order)
    assert np.all(np.diff(data.tokens["asym_id"], n=1) >= 0)
    chain_ids = np.unique(data.tokens["asym_id"])

    # Get relevant MSA, and create a dummy for chains without
    msa: dict[int, MSA] = {}
    for chain_id in chain_ids:
        # Get input sequence
        chain = data.structure.chains[chain_id]
        res_start = chain["res_idx"]
        res_end = res_start + chain["res_num"]
        residues = data.structure.residues[res_start:res_end]

        # Check if we have an MSA, and that the
        # first sequence matches the input sequence
        if chain_id in data.msa:
            # Set the MSA
            msa[chain_id] = data.msa[chain_id]

            # Run length and residue type checks
            first = data.msa[chain_id].sequences[0]
            first_start = first["res_start"]
            first_end = first["res_end"]
            msa_residues = data.msa[chain_id].residues
            first_residues = msa_residues[first_start:first_end]

            warning = "Warning: MSA does not match input sequence, creating dummy."
            if len(residues) == len(first_residues):
                # If there is a mismatch, check if it is between MET & UNK
                # If so, replace the first sequence with the input sequence.
                # Otherwise, replace with a dummy MSA for this chain.
                mismatches = residues["res_type"] != first_residues["res_type"]
                if mismatches.sum().item():
                    idx = np.where(mismatches)[0]
                    is_met = residues["res_type"][idx] == const.token_ids["MET"]
                    is_unk = residues["res_type"][idx] == const.token_ids["UNK"]
                    is_msa_unk = (
                        first_residues["res_type"][idx] == const.token_ids["UNK"]
                    )
                    if (np.all(is_met) and np.all(is_msa_unk)) or np.all(is_unk):
                        msa_residues[first_start:first_end]["res_type"] = residues[
                            "res_type"
                        ]
                    else:
                        print(
                            warning,
                            "1",
                            residues["res_type"],
                            first_residues["res_type"],
                            data.record.id,
                        )
                        msa[chain_id] = dummy_msa(residues)
            else:
                print(
                    warning,
                    "2",
                    residues["res_type"],
                    first_residues["res_type"],
                    data.record.id,
                )
                msa[chain_id] = dummy_msa(residues)
        else:
            msa[chain_id] = dummy_msa(residues)

    # Map taxonomies to (chain_id, seq_idx)
    taxonomy_map: dict[str, list] = {}
    for chain_id, chain_msa in msa.items():
        sequences = chain_msa.sequences
        sequences = sequences[sequences["taxonomy"] != -1]
        for sequence in sequences:
            seq_idx = sequence["seq_idx"]
            taxon = sequence["taxonomy"]
            taxonomy_map.setdefault(taxon, []).append((chain_id, seq_idx))

    # Remove taxonomies with only one sequence and sort by the
    # number of chain_id present in each of the taxonomies
    taxonomy_map = {k: v for k, v in taxonomy_map.items() if len(v) > 1}
    taxonomy_map = sorted(
        taxonomy_map.items(),
        key=lambda x: len({c for c, _ in x[1]}),
        reverse=True,
    )

    # Keep track of the sequences available per chain, keeping the original
    # order of the sequences in the MSA to favor the best matching sequences.
    # BUGFIX: visited must hold (chain_id, seq_idx) pairs. The previous
    # comprehension iterated `for c, items ... for s in items`, producing
    # (taxonomy, (chain_id, seq_idx)) tuples, so the `(c, i) not in visited`
    # test below never matched and paired sequences were re-used as unpaired.
    visited = {pair for _, items in taxonomy_map for pair in items}
    available = {}
    for c in chain_ids:
        available[c] = deque(
            i for i in range(1, len(msa[c].sequences)) if (c, i) not in visited
        )

    # Create sequence pairs
    is_paired = []
    pairing = []

    # Start with the first sequence for each chain
    is_paired.append({c: 1 for c in chain_ids})
    pairing.append({c: 0 for c in chain_ids})

    # Then add up to 8191 paired rows
    for _, pairs in taxonomy_map:
        # Group occurences by chain_id in case we have multiple
        # sequences from the same chain and same taxonomy
        chain_occurences = {}
        for chain_id, seq_idx in pairs:
            chain_occurences.setdefault(chain_id, []).append(seq_idx)

        # We create as many pairings as the maximum number of occurences
        max_occurences = max(len(v) for v in chain_occurences.values())
        for i in range(max_occurences):
            row_pairing = {}
            row_is_paired = {}

            # Add the chains present in the taxonomy
            for chain_id, seq_idxs in chain_occurences.items():
                # Roll over the sequence index to maximize diversity
                idx = i % len(seq_idxs)
                seq_idx = seq_idxs[idx]

                # Add the sequence to the pairing
                row_pairing[chain_id] = seq_idx
                row_is_paired[chain_id] = 1

            # Add any missing chains
            for chain_id in chain_ids:
                if chain_id not in row_pairing:
                    row_is_paired[chain_id] = 0
                    if available[chain_id]:
                        # Add the next available sequence
                        row_pairing[chain_id] = available[chain_id].popleft()
                    else:
                        # No more sequences available, we place a gap
                        row_pairing[chain_id] = -1

            pairing.append(row_pairing)
            is_paired.append(row_is_paired)

            # Break if we have enough pairs
            if len(pairing) >= max_pairs:
                break

        # Break if we have enough pairs
        if len(pairing) >= max_pairs:
            break

    # Now add up to 16384 unpaired rows total
    max_left = max(len(v) for v in available.values())
    for _ in range(min(max_total - len(pairing), max_left)):
        row_pairing = {}
        row_is_paired = {}
        for chain_id in chain_ids:
            row_is_paired[chain_id] = 0
            if available[chain_id]:
                # Add the next available sequence
                row_pairing[chain_id] = available[chain_id].popleft()
            else:
                # No more sequences available, we place a gap
                row_pairing[chain_id] = -1

        pairing.append(row_pairing)
        is_paired.append(row_is_paired)

        # Break if we have enough sequences
        if len(pairing) >= max_total:
            break

    # Randomly sample a subset of the pairs
    # ensuring the first row is always present
    if random_subset:
        num_seqs = len(pairing)
        if num_seqs > max_seqs:
            indices = random.choice(
                np.arange(1, num_seqs), size=max_seqs - 1, replace=False
            )  # noqa: NPY002
            pairing = [pairing[0]] + [pairing[i] for i in indices]
            is_paired = [is_paired[0]] + [is_paired[i] for i in indices]
    else:
        # Deterministic downsample to max_seqs
        pairing = pairing[:max_seqs]
        is_paired = is_paired[:max_seqs]

    # Map (chain_id, seq_idx, res_idx) to deletion
    deletions = numba.typed.Dict.empty(
        key_type=numba.types.Tuple(
            [numba.types.int64, numba.types.int64, numba.types.int64]),
        value_type=numba.types.int64
    )
    for chain_id, chain_msa in msa.items():
        chain_deletions = chain_msa.deletions
        for sequence in chain_msa.sequences:
            seq_idx = sequence["seq_idx"]
            del_start = sequence["del_start"]
            del_end = sequence["del_end"]
            # BUGFIX: slice the full per-chain deletion table for every
            # sequence. The previous code rebound `chain_deletions` to the
            # slice, so later sequences indexed into an already-sliced
            # array and read the wrong (or no) deletions.
            seq_deletions = chain_deletions[del_start:del_end]
            for deletion_data in seq_deletions:
                res_idx = deletion_data["res_idx"]
                deletion_values = deletion_data["deletion"]
                deletions[(chain_id, seq_idx, res_idx)] = deletion_values

    # Add all the token MSA data
    msa_data, del_data, paired_data = prepare_msa_arrays(
        data.tokens, pairing, is_paired, deletions, msa
    )

    msa_data = torch.tensor(msa_data, dtype=torch.long)
    del_data = torch.tensor(del_data, dtype=torch.float)
    paired_data = torch.tensor(paired_data, dtype=torch.float)

    return msa_data, del_data, paired_data
449
+
450
+
451
def prepare_msa_arrays(
    tokens,
    pairing: list[dict[int, int]],
    is_paired: list[dict[int, int]],
    deletions: dict[tuple[int, int, int], int],
    msa: dict[int, MSA],
) -> tuple[npt.NDArray[np.int64], npt.NDArray[np.int64], npt.NDArray[np.int64]]:
    """Reshape data to play nicely with numba jit.

    Packs the per-chain MSA dictionaries into dense int64 arrays
    (padded with -1 where chains have fewer sequences/residues than the
    longest chain) and dispatches to the jitted inner kernel.

    Parameters
    ----------
    tokens
        Structured token records with at least "asym_id" and "res_idx".
    pairing : list[dict[int, int]]
        Per-row mapping chain_id -> sequence index (-1 for a gap).
    is_paired : list[dict[int, int]]
        Per-row mapping chain_id -> 1 if taxonomy-paired, else 0.
    deletions : dict[tuple[int, int, int], int]
        (chain_id, seq_idx, res_idx) -> deletion count (numba typed dict).
    msa : dict[int, MSA]
        Per-chain MSA data.

    Returns
    -------
    tuple[npt.NDArray[np.int64], npt.NDArray[np.int64], npt.NDArray[np.int64]]
        Per-token MSA residue types, deletion counts, and paired flags,
        each of shape (n_tokens, n_rows).

    """
    token_asym_ids_arr = np.array([t["asym_id"] for t in tokens], dtype=np.int64)
    token_res_idxs_arr = np.array([t["res_idx"] for t in tokens], dtype=np.int64)

    chain_ids = sorted(msa.keys())

    # chain_ids are not necessarily contiguous (e.g. they might be 0, 24, 25).
    # This allows us to look up a chain_id by it's index in the chain_ids list.
    chain_id_to_idx = {chain_id: i for i, chain_id in enumerate(chain_ids)}
    token_asym_ids_idx_arr = np.array(
        [chain_id_to_idx[asym_id] for asym_id in token_asym_ids_arr], dtype=np.int64
    )

    # Densify the row dictionaries into (n_rows, n_chains) arrays.
    pairing_arr = np.zeros((len(pairing), len(chain_ids)), dtype=np.int64)
    is_paired_arr = np.zeros((len(is_paired), len(chain_ids)), dtype=np.int64)

    for i, row_pairing in enumerate(pairing):
        for chain_id in chain_ids:
            pairing_arr[i, chain_id_to_idx[chain_id]] = row_pairing[chain_id]

    for i, row_is_paired in enumerate(is_paired):
        for chain_id in chain_ids:
            is_paired_arr[i, chain_id_to_idx[chain_id]] = row_is_paired[chain_id]

    max_seq_len = max(len(msa[chain_id].sequences) for chain_id in chain_ids)

    # we want res_start from sequences
    msa_sequences = np.full((len(chain_ids), max_seq_len), -1, dtype=np.int64)
    for chain_id in chain_ids:
        for i, seq in enumerate(msa[chain_id].sequences):
            msa_sequences[chain_id_to_idx[chain_id], i] = seq["res_start"]

    # Dense residue table, padded with -1 past each chain's length.
    max_residues_len = max(len(msa[chain_id].residues) for chain_id in chain_ids)
    msa_residues = np.full((len(chain_ids), max_residues_len), -1, dtype=np.int64)
    for chain_id in chain_ids:
        residues = msa[chain_id].residues.astype(np.int64)
        idxs = np.arange(len(residues))
        chain_idx = chain_id_to_idx[chain_id]
        msa_residues[chain_idx, idxs] = residues

    return _prepare_msa_arrays_inner(
        token_asym_ids_arr,
        token_res_idxs_arr,
        token_asym_ids_idx_arr,
        pairing_arr,
        is_paired_arr,
        deletions,
        msa_sequences,
        msa_residues,
        const.token_ids["-"],
    )
509
+
510
+
511
+ deletions_dict_type = types.DictType(types.UniTuple(types.int64, 3), types.int64)
512
+
513
+
514
@numba.njit(
    [
        types.Tuple(
            (
                types.int64[:, ::1],  # msa_data
                types.int64[:, ::1],  # del_data
                types.int64[:, ::1],  # paired_data
            )
        )(
            types.int64[::1],  # token_asym_ids
            types.int64[::1],  # token_res_idxs
            types.int64[::1],  # token_asym_ids_idx
            types.int64[:, ::1],  # pairing
            types.int64[:, ::1],  # is_paired
            deletions_dict_type,  # deletions
            types.int64[:, ::1],  # msa_sequences
            types.int64[:, ::1],  # msa_residues
            types.int64,  # gap_token
        )
    ],
    cache=True,
)
def _prepare_msa_arrays_inner(
    token_asym_ids: npt.NDArray[np.int64],
    token_res_idxs: npt.NDArray[np.int64],
    token_asym_ids_idx: npt.NDArray[np.int64],
    pairing: npt.NDArray[np.int64],
    is_paired: npt.NDArray[np.int64],
    deletions: dict[tuple[int, int, int], int],
    msa_sequences: npt.NDArray[np.int64],
    msa_residues: npt.NDArray[np.int64],
    gap_token: int,
) -> tuple[npt.NDArray[np.int64], npt.NDArray[np.int64], npt.NDArray[np.int64]]:
    """Fill per-token MSA, deletion, and paired arrays (jitted kernel).

    For each (token, row) cell, looks up the row's sequence for the
    token's chain and copies the residue type and deletion count.
    Cells whose row has no sequence for the chain (seq_idx == -1) keep
    the gap token and zero deletion.
    """
    n_tokens = len(token_asym_ids)
    n_pairs = len(pairing)
    # Default: gap residue, unpaired, no deletion.
    msa_data = np.full((n_tokens, n_pairs), gap_token, dtype=np.int64)
    paired_data = np.zeros((n_tokens, n_pairs), dtype=np.int64)
    del_data = np.zeros((n_tokens, n_pairs), dtype=np.int64)

    # Add all the token MSA data
    for token_idx in range(n_tokens):
        chain_id_idx = token_asym_ids_idx[token_idx]
        chain_id = token_asym_ids[token_idx]
        res_idx = token_res_idxs[token_idx]

        for pair_idx in range(n_pairs):
            seq_idx = pairing[pair_idx, chain_id_idx]
            paired_data[token_idx, pair_idx] = is_paired[pair_idx, chain_id_idx]

            # Add residue type
            if seq_idx != -1:
                # res_start locates this sequence in the chain's dense
                # residue table; offset by the token's residue index.
                res_start = msa_sequences[chain_id_idx, seq_idx]
                res_type = msa_residues[chain_id_idx, res_start + res_idx]
                k = (chain_id, seq_idx, res_idx)
                if k in deletions:
                    del_data[token_idx, pair_idx] = deletions[k]
                msa_data[token_idx, pair_idx] = res_type

    return msa_data, del_data, paired_data
573
+
574
+
575
+ ####################################################################################################
576
+ # FEATURES
577
+ ####################################################################################################
578
+
579
+
580
def select_subset_from_mask(mask, p, random: np.random.Generator) -> np.ndarray:
    """Randomly keep a geometric-sized subset of the mask's true entries.

    The subset size is ``random.geometric(p) + 1`` (so at least two when
    enough entries exist), capped at the number of true entries. Returns
    a new mask of the same shape with only the kept entries set.
    """
    candidates = np.where(mask)[0]
    # Draw the subset size first to keep the RNG call order stable.
    sample_size = min(random.geometric(p) + 1, len(candidates))
    kept = random.choice(candidates, size=sample_size, replace=False)

    subset = np.zeros_like(mask)
    subset[kept] = 1
    return subset
594
+
595
+
596
def get_range_bin(value: float, range_dict: dict[tuple[float, float], int], default=0):
    """Return the bin index whose half-open interval [low, high) contains value.

    Keys equal to the string "other" are skipped; when no interval
    matches, ``default`` is returned.
    """
    target = float(value)
    for bounds, bin_idx in range_dict.items():
        if bounds == "other":
            continue
        low, high = bounds
        if low <= target < high:
            return bin_idx
    return default
606
+
607
+
608
+ def process_token_features( # noqa: C901, PLR0915, PLR0912
609
+ data: Tokenized,
610
+ random: np.random.Generator,
611
+ max_tokens: Optional[int] = None,
612
+ binder_pocket_conditioned_prop: Optional[float] = 0.0,
613
+ contact_conditioned_prop: Optional[float] = 0.0,
614
+ binder_pocket_cutoff_min: Optional[float] = 4.0,
615
+ binder_pocket_cutoff_max: Optional[float] = 20.0,
616
+ binder_pocket_sampling_geometric_p: Optional[float] = 0.0,
617
+ only_ligand_binder_pocket: Optional[bool] = False,
618
+ only_pp_contact: Optional[bool] = False,
619
+ inference_pocket_constraints: Optional[
620
+ list[tuple[int, list[tuple[int, int]], float]]
621
+ ] = False,
622
+ inference_contact_constraints: Optional[
623
+ list[tuple[tuple[int, int], tuple[int, int], float]]
624
+ ] = False,
625
+ override_method: Optional[str] = None,
626
+ ) -> dict[str, Tensor]:
627
+ """Get the token features.
628
+
629
+ Parameters
630
+ ----------
631
+ data : Tokenized
632
+ The input data to the model.
633
+ max_tokens : int
634
+ The maximum number of tokens.
635
+
636
+ Returns
637
+ -------
638
+ dict[str, Tensor]
639
+ The token features.
640
+
641
+ """
642
+ # Token data
643
+ token_data = data.tokens
644
+ token_bonds = data.bonds
645
+
646
+ # Token core features
647
+ token_index = torch.arange(len(token_data), dtype=torch.long)
648
+ residue_index = from_numpy(token_data["res_idx"]).long()
649
+ asym_id = from_numpy(token_data["asym_id"]).long()
650
+ entity_id = from_numpy(token_data["entity_id"]).long()
651
+ sym_id = from_numpy(token_data["sym_id"]).long()
652
+ mol_type = from_numpy(token_data["mol_type"]).long()
653
+ res_type = from_numpy(token_data["res_type"]).long()
654
+ res_type = one_hot(res_type, num_classes=const.num_tokens)
655
+ disto_center = from_numpy(token_data["disto_coords"])
656
+ modified = from_numpy(token_data["modified"]).long() # float()
657
+ cyclic_period = from_numpy(token_data["cyclic_period"].copy())
658
+ affinity_mask = from_numpy(token_data["affinity_mask"]).float()
659
+
660
+ ## Conditioning features ##
661
+ method = (
662
+ np.zeros(len(token_data))
663
+ + const.method_types_ids[
664
+ (
665
+ "x-ray diffraction"
666
+ if override_method is None
667
+ else override_method.lower()
668
+ )
669
+ ]
670
+ )
671
+ if data.record is not None:
672
+ if (
673
+ override_method is None
674
+ and data.record.structure.method is not None
675
+ and data.record.structure.method.lower() in const.method_types_ids
676
+ ):
677
+ method = (method * 0) + const.method_types_ids[
678
+ data.record.structure.method.lower()
679
+ ]
680
+
681
+ method_feature = from_numpy(method).long()
682
+
683
+ # Token mask features
684
+ pad_mask = torch.ones(len(token_data), dtype=torch.float)
685
+ resolved_mask = from_numpy(token_data["resolved_mask"]).float()
686
+ disto_mask = from_numpy(token_data["disto_mask"]).float()
687
+
688
+ # Token bond features
689
+ if max_tokens is not None:
690
+ pad_len = max_tokens - len(token_data)
691
+ num_tokens = max_tokens if pad_len > 0 else len(token_data)
692
+ else:
693
+ num_tokens = len(token_data)
694
+
695
+ tok_to_idx = {tok["token_idx"]: idx for idx, tok in enumerate(token_data)}
696
+ bonds = torch.zeros(num_tokens, num_tokens, dtype=torch.float)
697
+ bonds_type = torch.zeros(num_tokens, num_tokens, dtype=torch.long)
698
+ for token_bond in token_bonds:
699
+ token_1 = tok_to_idx[token_bond["token_1"]]
700
+ token_2 = tok_to_idx[token_bond["token_2"]]
701
+ bonds[token_1, token_2] = 1
702
+ bonds[token_2, token_1] = 1
703
+ bond_type = token_bond["type"]
704
+ bonds_type[token_1, token_2] = bond_type
705
+ bonds_type[token_2, token_1] = bond_type
706
+
707
+ bonds = bonds.unsqueeze(-1)
708
+
709
+ # Pocket conditioned feature
710
+ contact_conditioning = (
711
+ np.zeros((len(token_data), len(token_data)))
712
+ + const.contact_conditioning_info["UNSELECTED"]
713
+ )
714
+ contact_threshold = np.zeros((len(token_data), len(token_data)))
715
+
716
+ if inference_pocket_constraints is not None:
717
+ for binder, contacts, max_distance, force in inference_pocket_constraints:
718
+ binder_mask = token_data["asym_id"] == binder
719
+
720
+ for idx, token in enumerate(token_data):
721
+ if (
722
+ token["mol_type"] != const.chain_type_ids["NONPOLYMER"]
723
+ and (token["asym_id"], token["res_idx"]) in contacts
724
+ ) or (
725
+ token["mol_type"] == const.chain_type_ids["NONPOLYMER"]
726
+ and (token["asym_id"], token["atom_idx"]) in contacts
727
+ ):
728
+ contact_conditioning[binder_mask, idx] = (
729
+ const.contact_conditioning_info["BINDER>POCKET"]
730
+ )
731
+ contact_conditioning[idx, binder_mask] = (
732
+ const.contact_conditioning_info["POCKET>BINDER"]
733
+ )
734
+ contact_threshold[binder_mask, idx] = max_distance
735
+ contact_threshold[idx, binder_mask] = max_distance
736
+
737
+ if inference_contact_constraints is not None:
738
+ for token1, token2, max_distance, force in inference_contact_constraints:
739
+ for idx1, _token1 in enumerate(token_data):
740
+ if (
741
+ _token1["mol_type"] != const.chain_type_ids["NONPOLYMER"]
742
+ and (_token1["asym_id"], _token1["res_idx"]) == token1
743
+ ) or (
744
+ _token1["mol_type"] == const.chain_type_ids["NONPOLYMER"]
745
+ and (_token1["asym_id"], _token1["atom_idx"]) == token1
746
+ ):
747
+ for idx2, _token2 in enumerate(token_data):
748
+ if (
749
+ _token2["mol_type"] != const.chain_type_ids["NONPOLYMER"]
750
+ and (_token2["asym_id"], _token2["res_idx"]) == token2
751
+ ) or (
752
+ _token2["mol_type"] == const.chain_type_ids["NONPOLYMER"]
753
+ and (_token2["asym_id"], _token2["atom_idx"]) == token2
754
+ ):
755
+ contact_conditioning[idx1, idx2] = (
756
+ const.contact_conditioning_info["CONTACT"]
757
+ )
758
+ contact_conditioning[idx2, idx1] = (
759
+ const.contact_conditioning_info["CONTACT"]
760
+ )
761
+ contact_threshold[idx1, idx2] = max_distance
762
+ contact_threshold[idx2, idx1] = max_distance
763
+ break
764
+ break
765
+
766
+ if binder_pocket_conditioned_prop > 0.0:
767
+ # choose as binder a random ligand in the crop, if there are no ligands select a protein chain
768
+ binder_asym_ids = np.unique(
769
+ token_data["asym_id"][
770
+ token_data["mol_type"] == const.chain_type_ids["NONPOLYMER"]
771
+ ]
772
+ )
773
+
774
+ if len(binder_asym_ids) == 0:
775
+ if not only_ligand_binder_pocket:
776
+ binder_asym_ids = np.unique(token_data["asym_id"])
777
+
778
+ while random.random() < binder_pocket_conditioned_prop:
779
+ if len(binder_asym_ids) == 0:
780
+ break
781
+
782
+ pocket_asym_id = random.choice(binder_asym_ids)
783
+ binder_asym_ids = binder_asym_ids[binder_asym_ids != pocket_asym_id]
784
+
785
+ binder_pocket_cutoff = sample_d(
786
+ min_d=binder_pocket_cutoff_min,
787
+ max_d=binder_pocket_cutoff_max,
788
+ n_samples=1,
789
+ random=random,
790
+ )
791
+
792
+ binder_mask = token_data["asym_id"] == pocket_asym_id
793
+
794
+ binder_coords = []
795
+ for token in token_data:
796
+ if token["asym_id"] == pocket_asym_id:
797
+ _coords = data.structure.atoms["coords"][
798
+ token["atom_idx"] : token["atom_idx"] + token["atom_num"]
799
+ ]
800
+ _is_present = data.structure.atoms["is_present"][
801
+ token["atom_idx"] : token["atom_idx"] + token["atom_num"]
802
+ ]
803
+ binder_coords.append(_coords[_is_present])
804
+ binder_coords = np.concatenate(binder_coords, axis=0)
805
+
806
+ # find the tokens in the pocket
807
+ token_dist = np.zeros(len(token_data)) + 1000
808
+ for i, token in enumerate(token_data):
809
+ if (
810
+ token["mol_type"] != const.chain_type_ids["NONPOLYMER"]
811
+ and token["asym_id"] != pocket_asym_id
812
+ and token["resolved_mask"] == 1
813
+ ):
814
+ token_coords = data.structure.atoms["coords"][
815
+ token["atom_idx"] : token["atom_idx"] + token["atom_num"]
816
+ ]
817
+ token_is_present = data.structure.atoms["is_present"][
818
+ token["atom_idx"] : token["atom_idx"] + token["atom_num"]
819
+ ]
820
+ token_coords = token_coords[token_is_present]
821
+
822
+ # find chain and apply chain transformation
823
+ for chain in data.structure.chains:
824
+ if chain["asym_id"] == token["asym_id"]:
825
+ break
826
+
827
+ token_dist[i] = np.min(
828
+ np.linalg.norm(
829
+ token_coords[:, None, :] - binder_coords[None, :, :],
830
+ axis=-1,
831
+ )
832
+ )
833
+
834
+ pocket_mask = token_dist < binder_pocket_cutoff
835
+
836
+ if np.sum(pocket_mask) > 0:
837
+ if binder_pocket_sampling_geometric_p > 0.0:
838
+ # select a subset of the pocket, according
839
+ # to a geometric distribution with one as minimum
840
+ pocket_mask = select_subset_from_mask(
841
+ pocket_mask,
842
+ binder_pocket_sampling_geometric_p,
843
+ random,
844
+ )
845
+
846
+ contact_conditioning[np.ix_(binder_mask, pocket_mask)] = (
847
+ const.contact_conditioning_info["BINDER>POCKET"]
848
+ )
849
+ contact_conditioning[np.ix_(pocket_mask, binder_mask)] = (
850
+ const.contact_conditioning_info["POCKET>BINDER"]
851
+ )
852
+ contact_threshold[np.ix_(binder_mask, pocket_mask)] = (
853
+ binder_pocket_cutoff
854
+ )
855
+ contact_threshold[np.ix_(pocket_mask, binder_mask)] = (
856
+ binder_pocket_cutoff
857
+ )
858
+
859
+ # Contact conditioning feature
860
+ if contact_conditioned_prop > 0.0:
861
+ while random.random() < contact_conditioned_prop:
862
+ contact_cutoff = sample_d(
863
+ min_d=binder_pocket_cutoff_min,
864
+ max_d=binder_pocket_cutoff_max,
865
+ n_samples=1,
866
+ random=random,
867
+ )
868
+ if only_pp_contact:
869
+ chain_asym_ids = np.unique(
870
+ token_data["asym_id"][
871
+ token_data["mol_type"] == const.chain_type_ids["PROTEIN"]
872
+ ]
873
+ )
874
+ else:
875
+ chain_asym_ids = np.unique(token_data["asym_id"])
876
+
877
+ if len(chain_asym_ids) > 1:
878
+ chain_asym_id = random.choice(chain_asym_ids)
879
+
880
+ chain_coords = []
881
+ for token in token_data:
882
+ if token["asym_id"] == chain_asym_id:
883
+ _coords = data.structure.atoms["coords"][
884
+ token["atom_idx"] : token["atom_idx"] + token["atom_num"]
885
+ ]
886
+ _is_present = data.structure.atoms["is_present"][
887
+ token["atom_idx"] : token["atom_idx"] + token["atom_num"]
888
+ ]
889
+ chain_coords.append(_coords[_is_present])
890
+ chain_coords = np.concatenate(chain_coords, axis=0)
891
+
892
+ # find contacts in other chains
893
+ possible_other_chains = []
894
+ for other_chain_id in chain_asym_ids[chain_asym_ids != chain_asym_id]:
895
+ for token in token_data:
896
+ if token["asym_id"] == other_chain_id:
897
+ _coords = data.structure.atoms["coords"][
898
+ token["atom_idx"] : token["atom_idx"]
899
+ + token["atom_num"]
900
+ ]
901
+ _is_present = data.structure.atoms["is_present"][
902
+ token["atom_idx"] : token["atom_idx"]
903
+ + token["atom_num"]
904
+ ]
905
+ if _is_present.sum() == 0:
906
+ continue
907
+ token_coords = _coords[_is_present]
908
+
909
+ # check minimum distance
910
+ if (
911
+ np.min(cdist(chain_coords, token_coords))
912
+ < contact_cutoff
913
+ ):
914
+ possible_other_chains.append(other_chain_id)
915
+ break
916
+
917
+ if len(possible_other_chains) > 0:
918
+ other_chain_id = random.choice(possible_other_chains)
919
+
920
+ pairs = []
921
+ for token_1 in token_data:
922
+ if token_1["asym_id"] == chain_asym_id:
923
+ _coords = data.structure.atoms["coords"][
924
+ token_1["atom_idx"] : token_1["atom_idx"]
925
+ + token_1["atom_num"]
926
+ ]
927
+ _is_present = data.structure.atoms["is_present"][
928
+ token_1["atom_idx"] : token_1["atom_idx"]
929
+ + token_1["atom_num"]
930
+ ]
931
+ if _is_present.sum() == 0:
932
+ continue
933
+ token_1_coords = _coords[_is_present]
934
+
935
+ for token_2 in token_data:
936
+ if token_2["asym_id"] == other_chain_id:
937
+ _coords = data.structure.atoms["coords"][
938
+ token_2["atom_idx"] : token_2["atom_idx"]
939
+ + token_2["atom_num"]
940
+ ]
941
+ _is_present = data.structure.atoms["is_present"][
942
+ token_2["atom_idx"] : token_2["atom_idx"]
943
+ + token_2["atom_num"]
944
+ ]
945
+ if _is_present.sum() == 0:
946
+ continue
947
+ token_2_coords = _coords[_is_present]
948
+
949
+ if (
950
+ np.min(cdist(token_1_coords, token_2_coords))
951
+ < contact_cutoff
952
+ ):
953
+ pairs.append(
954
+ (token_1["token_idx"], token_2["token_idx"])
955
+ )
956
+
957
+ assert len(pairs) > 0
958
+
959
+ pair = random.choice(pairs)
960
+ token_1_mask = token_data["token_idx"] == pair[0]
961
+ token_2_mask = token_data["token_idx"] == pair[1]
962
+
963
+ contact_conditioning[np.ix_(token_1_mask, token_2_mask)] = (
964
+ const.contact_conditioning_info["CONTACT"]
965
+ )
966
+ contact_conditioning[np.ix_(token_2_mask, token_1_mask)] = (
967
+ const.contact_conditioning_info["CONTACT"]
968
+ )
969
+
970
+ elif not only_pp_contact:
971
+ # only one chain, find contacts within the chain with minimum residue distance
972
+ pairs = []
973
+ for token_1 in token_data:
974
+ _coords = data.structure.atoms["coords"][
975
+ token_1["atom_idx"] : token_1["atom_idx"] + token_1["atom_num"]
976
+ ]
977
+ _is_present = data.structure.atoms["is_present"][
978
+ token_1["atom_idx"] : token_1["atom_idx"] + token_1["atom_num"]
979
+ ]
980
+ if _is_present.sum() == 0:
981
+ continue
982
+ token_1_coords = _coords[_is_present]
983
+
984
+ for token_2 in token_data:
985
+ if np.abs(token_1["res_idx"] - token_2["res_idx"]) <= 8:
986
+ continue
987
+
988
+ _coords = data.structure.atoms["coords"][
989
+ token_2["atom_idx"] : token_2["atom_idx"]
990
+ + token_2["atom_num"]
991
+ ]
992
+ _is_present = data.structure.atoms["is_present"][
993
+ token_2["atom_idx"] : token_2["atom_idx"]
994
+ + token_2["atom_num"]
995
+ ]
996
+ if _is_present.sum() == 0:
997
+ continue
998
+ token_2_coords = _coords[_is_present]
999
+
1000
+ if (
1001
+ np.min(cdist(token_1_coords, token_2_coords))
1002
+ < contact_cutoff
1003
+ ):
1004
+ pairs.append((token_1["token_idx"], token_2["token_idx"]))
1005
+
1006
+ if len(pairs) > 0:
1007
+ pair = random.choice(pairs)
1008
+ token_1_mask = token_data["token_idx"] == pair[0]
1009
+ token_2_mask = token_data["token_idx"] == pair[1]
1010
+
1011
+ contact_conditioning[np.ix_(token_1_mask, token_2_mask)] = (
1012
+ const.contact_conditioning_info["CONTACT"]
1013
+ )
1014
+ contact_conditioning[np.ix_(token_2_mask, token_1_mask)] = (
1015
+ const.contact_conditioning_info["CONTACT"]
1016
+ )
1017
+
1018
+ if np.all(contact_conditioning == const.contact_conditioning_info["UNSELECTED"]):
1019
+ contact_conditioning = (
1020
+ contact_conditioning
1021
+ - const.contact_conditioning_info["UNSELECTED"]
1022
+ + const.contact_conditioning_info["UNSPECIFIED"]
1023
+ )
1024
+ contact_conditioning = from_numpy(contact_conditioning).long()
1025
+ contact_conditioning = one_hot(
1026
+ contact_conditioning, num_classes=len(const.contact_conditioning_info)
1027
+ )
1028
+ contact_threshold = from_numpy(contact_threshold).float()
1029
+
1030
+ # compute cyclic polymer mask
1031
+ cyclic_ids = {}
1032
+ for idx_chain, asym_id_iter in enumerate(data.structure.chains["asym_id"]):
1033
+ for connection in data.structure.bonds:
1034
+ if (
1035
+ idx_chain == connection["chain_1"] == connection["chain_2"]
1036
+ and data.structure.chains[connection["chain_1"]]["res_num"] > 2
1037
+ and connection["res_1"]
1038
+ != connection["res_2"] # Avoid same residue bonds!
1039
+ ):
1040
+ if (
1041
+ data.structure.chains[connection["chain_1"]]["res_num"]
1042
+ == (connection["res_2"] + 1)
1043
+ and connection["res_1"] == 0
1044
+ ) or (
1045
+ data.structure.chains[connection["chain_1"]]["res_num"]
1046
+ == (connection["res_1"] + 1)
1047
+ and connection["res_2"] == 0
1048
+ ):
1049
+ cyclic_ids[asym_id_iter] = data.structure.chains[
1050
+ connection["chain_1"]
1051
+ ]["res_num"]
1052
+ cyclic = from_numpy(
1053
+ np.array(
1054
+ [
1055
+ (cyclic_ids[asym_id_iter] if asym_id_iter in cyclic_ids else 0)
1056
+ for asym_id_iter in token_data["asym_id"]
1057
+ ]
1058
+ )
1059
+ ).float()
1060
+
1061
+ # cyclic period is either computed from the bonds or given as input flag
1062
+ cyclic_period = torch.maximum(cyclic, cyclic_period)
1063
+
1064
+ # Pad to max tokens if given
1065
+ if max_tokens is not None:
1066
+ pad_len = max_tokens - len(token_data)
1067
+ if pad_len > 0:
1068
+ token_index = pad_dim(token_index, 0, pad_len)
1069
+ residue_index = pad_dim(residue_index, 0, pad_len)
1070
+ asym_id = pad_dim(asym_id, 0, pad_len)
1071
+ entity_id = pad_dim(entity_id, 0, pad_len)
1072
+ sym_id = pad_dim(sym_id, 0, pad_len)
1073
+ mol_type = pad_dim(mol_type, 0, pad_len)
1074
+ res_type = pad_dim(res_type, 0, pad_len)
1075
+ disto_center = pad_dim(disto_center, 0, pad_len)
1076
+ pad_mask = pad_dim(pad_mask, 0, pad_len)
1077
+ resolved_mask = pad_dim(resolved_mask, 0, pad_len)
1078
+ disto_mask = pad_dim(disto_mask, 0, pad_len)
1079
+ contact_conditioning = pad_dim(contact_conditioning, 0, pad_len)
1080
+ contact_conditioning = pad_dim(contact_conditioning, 1, pad_len)
1081
+ contact_threshold = pad_dim(contact_threshold, 0, pad_len)
1082
+ contact_threshold = pad_dim(contact_threshold, 1, pad_len)
1083
+ method_feature = pad_dim(method_feature, 0, pad_len)
1084
+ modified = pad_dim(modified, 0, pad_len)
1085
+ cyclic_period = pad_dim(cyclic_period, 0, pad_len)
1086
+ affinity_mask = pad_dim(affinity_mask, 0, pad_len)
1087
+
1088
+ token_features = {
1089
+ "token_index": token_index,
1090
+ "residue_index": residue_index,
1091
+ "asym_id": asym_id,
1092
+ "entity_id": entity_id,
1093
+ "sym_id": sym_id,
1094
+ "mol_type": mol_type,
1095
+ "res_type": res_type,
1096
+ "disto_center": disto_center,
1097
+ "token_bonds": bonds,
1098
+ "type_bonds": bonds_type,
1099
+ "token_pad_mask": pad_mask,
1100
+ "token_resolved_mask": resolved_mask,
1101
+ "token_disto_mask": disto_mask,
1102
+ "contact_conditioning": contact_conditioning,
1103
+ "contact_threshold": contact_threshold,
1104
+ "method_feature": method_feature,
1105
+ "modified": modified,
1106
+ "cyclic_period": cyclic_period,
1107
+ "affinity_token_mask": affinity_mask,
1108
+ }
1109
+
1110
+ return token_features
1111
+
1112
+
1113
def process_atom_features(
    data: Tokenized,
    random: np.random.Generator,
    ensemble_features: dict,
    molecules: dict[str, Mol],
    atoms_per_window_queries: int = 32,
    min_dist: float = 2.0,
    max_dist: float = 22.0,
    num_bins: int = 64,
    max_atoms: Optional[int] = None,
    max_tokens: Optional[int] = None,
    disto_use_ensemble: Optional[bool] = False,
    override_bfactor: bool = False,
    compute_frames: bool = False,
    override_coords: Optional[Tensor] = None,
    bfactor_md_correction: bool = False,
) -> dict[str, Tensor]:
    """Get the atom features.

    Builds per-atom reference features (conformer positions, element, charge,
    chirality, name encoding), atom<->token mappings, ground-truth ensemble
    coordinates, the distogram target, and optionally per-token frames, then
    pads everything to the requested atom/token sizes.

    Parameters
    ----------
    data : Tokenized
        The input to the model.
    random : np.random.Generator
        RNG used to sample one conformer per residue and the random
        roto-translation of reference conformers.
    ensemble_features : dict
        Must contain "ensemble_ref_idxs", the indices of the sampled
        ensemble structures.
    molecules : dict[str, Mol]
        Mapping from residue name to its RDKit molecule (with conformers).
    atoms_per_window_queries : int
        Window size the padded atom count must be a multiple of.
    min_dist, max_dist, num_bins : float, float, int
        Distogram binning parameters.
    max_atoms : int, optional
        The maximum number of atoms (must be a multiple of
        atoms_per_window_queries when given).
    max_tokens : int, optional
        The maximum number of tokens to pad token-indexed features to.
    disto_use_ensemble : bool, optional
        If True, use all ensemble structures for the distogram target
        instead of only the sampled ones.
    override_bfactor : bool
        If True, zero out the b-factor feature.
    compute_frames : bool
        If True, also compute per-token frame indices and their masks.
    override_coords : Tensor, optional
        If given, replaces the ground-truth coordinates (single ensemble).
    bfactor_md_correction : bool
        If True and the structure method is "md", convert RMSF to b-factor.

    Returns
    -------
    dict[str, Tensor]
        The atom features.

    """
    # Filter to tokens' atoms
    atom_data = []
    atom_name = []
    atom_element = []
    atom_charge = []
    atom_conformer = []
    atom_chirality = []
    ref_space_uid = []
    coord_data = []
    if compute_frames:
        frame_data = []  # three atom indices per token defining its frame
        resolved_frame_data = []  # whether all three frame atoms are present
    atom_to_token = []
    token_to_rep_atom = []  # index on cropped atom table
    r_set_to_rep_atom = []
    disto_coords_ensemble = []
    backbone_feat_index = []
    token_to_center_atom = []

    e_offsets = data.structure.ensemble["atom_coord_idx"]
    atom_idx = 0

    # Start atom idx in full atom table for structures chosen. Up to num_ensembles points.
    ensemble_atom_starts = [
        data.structure.ensemble[idx]["atom_coord_idx"]
        for idx in ensemble_features["ensemble_ref_idxs"]
    ]

    # Set unk chirality id
    unk_chirality = const.chirality_type_ids[const.unk_chirality_type]

    chain_res_ids = {}  # (asym_id, res_idx) -> dense residue index (ref_space_uid)
    res_index_to_conf_id = {}  # (asym_id, res_idx) -> sampled RDKit conformer id
    for token_id, token in enumerate(data.tokens):
        # Get the chain residue ids
        chain_idx, res_id = token["asym_id"], token["res_idx"]
        chain = data.structure.chains[chain_idx]

        if (chain_idx, res_id) not in chain_res_ids:
            new_idx = len(chain_res_ids)
            chain_res_ids[(chain_idx, res_id)] = new_idx
        else:
            new_idx = chain_res_ids[(chain_idx, res_id)]

        # Get the molecule and conformer
        mol = molecules[token["res_name"]]
        atom_name_to_ref = {a.GetProp("name"): a for a in mol.GetAtoms()}

        # Sample a random conformer (once per residue, shared by its tokens)
        if (chain_idx, res_id) not in res_index_to_conf_id:
            conf_ids = [int(conf.GetId()) for conf in mol.GetConformers()]
            conf_id = int(random.choice(conf_ids))
            res_index_to_conf_id[(chain_idx, res_id)] = conf_id

        conf_id = res_index_to_conf_id[(chain_idx, res_id)]
        conformer = mol.GetConformer(conf_id)

        # Map atoms to token indices
        ref_space_uid.extend([new_idx] * token["atom_num"])
        atom_to_token.extend([token_id] * token["atom_num"])

        # Add atom data
        start = token["atom_idx"]
        end = token["atom_idx"] + token["atom_num"]
        token_atoms = data.structure.atoms[start:end]

        # Add atom ref data
        # element, charge, conformer, chirality
        token_atom_name = np.array([convert_atom_name(a["name"]) for a in token_atoms])
        token_atoms_ref = np.array([atom_name_to_ref[a["name"]] for a in token_atoms])
        token_atoms_element = np.array([a.GetAtomicNum() for a in token_atoms_ref])
        token_atoms_charge = np.array([a.GetFormalCharge() for a in token_atoms_ref])
        token_atoms_conformer = np.array(
            [
                (
                    conformer.GetAtomPosition(a.GetIdx()).x,
                    conformer.GetAtomPosition(a.GetIdx()).y,
                    conformer.GetAtomPosition(a.GetIdx()).z,
                )
                for a in token_atoms_ref
            ]
        )
        token_atoms_chirality = np.array(
            [
                const.chirality_type_ids.get(a.GetChiralTag().name, unk_chirality)
                for a in token_atoms_ref
            ]
        )

        # Map token to representative atom
        token_to_rep_atom.append(atom_idx + token["disto_idx"] - start)
        token_to_center_atom.append(atom_idx + token["center_idx"] - start)
        if (chain["mol_type"] != const.chain_type_ids["NONPOLYMER"]) and token[
            "resolved_mask"
        ]:
            r_set_to_rep_atom.append(atom_idx + token["center_idx"] - start)

        # Backbone atom class per atom: 0 = not backbone; protein backbone
        # atoms come first, nucleic backbone atoms are offset after them.
        if chain["mol_type"] == const.chain_type_ids["PROTEIN"]:
            backbone_index = [
                (
                    const.protein_backbone_atom_index[atom_name] + 1
                    if atom_name in const.protein_backbone_atom_index
                    else 0
                )
                for atom_name in token_atoms["name"]
            ]
        elif (
            chain["mol_type"] == const.chain_type_ids["DNA"]
            or chain["mol_type"] == const.chain_type_ids["RNA"]
        ):
            backbone_index = [
                (
                    const.nucleic_backbone_atom_index[atom_name]
                    + 1
                    + len(const.protein_backbone_atom_index)
                    if atom_name in const.nucleic_backbone_atom_index
                    else 0
                )
                for atom_name in token_atoms["name"]
            ]
        else:
            backbone_index = [0] * token["atom_num"]
        backbone_feat_index.extend(backbone_index)

        # Get token coordinates across sampled ensembles and apply transforms
        token_coords = np.array(
            [
                data.structure.coords[
                    ensemble_atom_start + start : ensemble_atom_start + end
                ]["coords"]
                for ensemble_atom_start in ensemble_atom_starts
            ]
        )
        coord_data.append(token_coords)

        if compute_frames:
            # Get frame data
            res_type = const.tokens[token["res_type"]]
            res_name = str(token["res_name"])

            if token["atom_num"] < 3 or res_type in ["PAD", "UNK", "-"]:
                # Too few atoms or unknown residue: no valid frame.
                idx_frame_a, idx_frame_b, idx_frame_c = 0, 0, 0
                mask_frame = False
            elif (token["mol_type"] == const.chain_type_ids["PROTEIN"]) and (
                res_name in const.ref_atoms
            ):
                # Standard protein residue: frame from N, CA, C.
                idx_frame_a, idx_frame_b, idx_frame_c = (
                    const.ref_atoms[res_name].index("N"),
                    const.ref_atoms[res_name].index("CA"),
                    const.ref_atoms[res_name].index("C"),
                )
                mask_frame = (
                    token_atoms["is_present"][idx_frame_a]
                    and token_atoms["is_present"][idx_frame_b]
                    and token_atoms["is_present"][idx_frame_c]
                )
            elif (
                token["mol_type"] == const.chain_type_ids["DNA"]
                or token["mol_type"] == const.chain_type_ids["RNA"]
            ) and (res_name in const.ref_atoms):
                # Standard nucleic residue: frame from C1', C3', C4'.
                idx_frame_a, idx_frame_b, idx_frame_c = (
                    const.ref_atoms[res_name].index("C1'"),
                    const.ref_atoms[res_name].index("C3'"),
                    const.ref_atoms[res_name].index("C4'"),
                )
                mask_frame = (
                    token_atoms["is_present"][idx_frame_a]
                    and token_atoms["is_present"][idx_frame_b]
                    and token_atoms["is_present"][idx_frame_c]
                )
            elif token["mol_type"] == const.chain_type_ids["PROTEIN"]:
                # Try to look for the atom names in the modified residue
                is_ca = token_atoms["name"] == "CA"
                idx_frame_a = is_ca.argmax()
                ca_present = (
                    token_atoms[idx_frame_a]["is_present"] if is_ca.any() else False
                )

                is_n = token_atoms["name"] == "N"
                idx_frame_b = is_n.argmax()
                n_present = (
                    token_atoms[idx_frame_b]["is_present"] if is_n.any() else False
                )

                is_c = token_atoms["name"] == "C"
                idx_frame_c = is_c.argmax()
                c_present = (
                    token_atoms[idx_frame_c]["is_present"] if is_c.any() else False
                )
                mask_frame = ca_present and n_present and c_present

            elif (token["mol_type"] == const.chain_type_ids["DNA"]) or (
                token["mol_type"] == const.chain_type_ids["RNA"]
            ):
                # Try to look for the atom names in the modified residue
                is_c1 = token_atoms["name"] == "C1'"
                idx_frame_a = is_c1.argmax()
                c1_present = (
                    token_atoms[idx_frame_a]["is_present"] if is_c1.any() else False
                )

                is_c3 = token_atoms["name"] == "C3'"
                idx_frame_b = is_c3.argmax()
                c3_present = (
                    token_atoms[idx_frame_b]["is_present"] if is_c3.any() else False
                )

                is_c4 = token_atoms["name"] == "C4'"
                idx_frame_c = is_c4.argmax()
                c4_present = (
                    token_atoms[idx_frame_c]["is_present"] if is_c4.any() else False
                )
                mask_frame = c1_present and c3_present and c4_present
            else:
                idx_frame_a, idx_frame_b, idx_frame_c = 0, 0, 0
                mask_frame = False
            # Frame indices are offset into the cropped atom table.
            frame_data.append(
                [
                    idx_frame_a + atom_idx,
                    idx_frame_b + atom_idx,
                    idx_frame_c + atom_idx,
                ]
            )
            resolved_frame_data.append(mask_frame)

        # Get distogram coordinates
        disto_coords_ensemble_tok = data.structure.coords[
            e_offsets + token["disto_idx"]
        ]["coords"]
        disto_coords_ensemble.append(disto_coords_ensemble_tok)

        # Update atom data. This is technically never used again (we rely on coord_data),
        # but we update for consistency and to make sure the Atom object has valid, transformed coordinates.
        token_atoms = token_atoms.copy()
        token_atoms["coords"] = token_coords[
            0
        ]  # atom has a copy of first coords in ensemble
        atom_data.append(token_atoms)
        atom_name.append(token_atom_name)
        atom_element.append(token_atoms_element)
        atom_charge.append(token_atoms_charge)
        atom_conformer.append(token_atoms_conformer)
        atom_chirality.append(token_atoms_chirality)
        atom_idx += len(token_atoms)

    disto_coords_ensemble = np.array(disto_coords_ensemble)  # (N_TOK, N_ENS, 3)

    # Compute ensemble distogram
    L = len(data.tokens)

    if disto_use_ensemble:
        # Use all available structures to create distogram
        idx_list = range(disto_coords_ensemble.shape[1])
    else:
        # Only use a sampled structures to create distogram
        idx_list = ensemble_features["ensemble_ref_idxs"]

    # Create distogram: one-hot binned pairwise distances per selected structure
    disto_target = torch.zeros(L, L, len(idx_list), num_bins)  # TODO1

    # disto_target = torch.zeros(L, L, num_bins)
    for i, e_idx in enumerate(idx_list):
        t_center = torch.Tensor(disto_coords_ensemble[:, e_idx, :])
        t_dists = torch.cdist(t_center, t_center)
        boundaries = torch.linspace(min_dist, max_dist, num_bins - 1)
        # Bin index = number of boundaries each distance exceeds.
        distogram = (t_dists.unsqueeze(-1) > boundaries).sum(dim=-1).long()
        # disto_target += one_hot(distogram, num_classes=num_bins)
        disto_target[:, :, i, :] = one_hot(distogram, num_classes=num_bins)  # TODO1

    # Normalize distogram
    # disto_target = disto_target / disto_target.sum(-1)[..., None] # remove TODO1
    atom_data = np.concatenate(atom_data)
    atom_name = np.concatenate(atom_name)
    atom_element = np.concatenate(atom_element)
    atom_charge = np.concatenate(atom_charge)
    atom_conformer = np.concatenate(atom_conformer)
    atom_chirality = np.concatenate(atom_chirality)
    coord_data = np.concatenate(coord_data, axis=1)  # (N_ENS, N_ATOMS, 3)
    ref_space_uid = np.array(ref_space_uid)

    # Compute features
    disto_coords_ensemble = from_numpy(disto_coords_ensemble)
    disto_coords_ensemble = disto_coords_ensemble[
        :, ensemble_features["ensemble_ref_idxs"]
    ].permute(1, 0, 2)
    backbone_feat_index = from_numpy(np.asarray(backbone_feat_index)).long()
    ref_atom_name_chars = from_numpy(atom_name).long()
    ref_element = from_numpy(atom_element).long()
    ref_charge = from_numpy(atom_charge).float()
    ref_pos = from_numpy(atom_conformer).float()
    ref_space_uid = from_numpy(ref_space_uid)
    ref_chirality = from_numpy(atom_chirality).long()
    coords = from_numpy(coord_data.copy())
    resolved_mask = from_numpy(atom_data["is_present"])
    pad_mask = torch.ones(len(atom_data), dtype=torch.float)
    atom_to_token = torch.tensor(atom_to_token, dtype=torch.long)
    token_to_rep_atom = torch.tensor(token_to_rep_atom, dtype=torch.long)
    r_set_to_rep_atom = torch.tensor(r_set_to_rep_atom, dtype=torch.long)
    token_to_center_atom = torch.tensor(token_to_center_atom, dtype=torch.long)
    bfactor = from_numpy(atom_data["bfactor"].copy())
    plddt = from_numpy(atom_data["plddt"].copy())
    if override_bfactor:
        bfactor = bfactor * 0.0

    if bfactor_md_correction and data.record.structure.method.lower() == "md":
        # MD bfactor was computed as RMSF
        # Convert to b-factor
        bfactor = 8 * (np.pi**2) * (bfactor**2)

    # We compute frames within ensemble
    if compute_frames:
        frames = []
        frame_resolved_mask = []
        for i in range(coord_data.shape[0]):
            frame_data_, resolved_frame_data_ = compute_frames_nonpolymer(
                data,
                coord_data[i],
                atom_data["is_present"],
                atom_to_token,
                frame_data,
                resolved_frame_data,
            )  # Compute frames for NONPOLYMER tokens
            frames.append(frame_data_.copy())
            frame_resolved_mask.append(resolved_frame_data_.copy())
        frames = from_numpy(np.stack(frames))  # (N_ENS, N_TOK, 3)
        frame_resolved_mask = from_numpy(np.stack(frame_resolved_mask))

    # Convert to one-hot
    backbone_feat_index = one_hot(
        backbone_feat_index,
        num_classes=1
        + len(const.protein_backbone_atom_index)
        + len(const.nucleic_backbone_atom_index),
    )
    ref_atom_name_chars = one_hot(ref_atom_name_chars, num_classes=64)
    ref_element = one_hot(ref_element, num_classes=const.num_elements)
    # NOTE: token_id is the last loop variable; num_classes is the token count.
    atom_to_token = one_hot(atom_to_token, num_classes=token_id + 1)
    token_to_rep_atom = one_hot(token_to_rep_atom, num_classes=len(atom_data))
    r_set_to_rep_atom = one_hot(r_set_to_rep_atom, num_classes=len(atom_data))
    token_to_center_atom = one_hot(token_to_center_atom, num_classes=len(atom_data))

    # Center the ground truth coordinates (mean over resolved atoms, per ensemble)
    center = (coords * resolved_mask[None, :, None]).sum(dim=1)
    center = center / resolved_mask.sum().clamp(min=1)
    coords = coords - center[:, None]

    if isinstance(override_coords, Tensor):
        coords = override_coords.unsqueeze(0)

    # Apply random roto-translation to the input conformers
    # NOTE(review): range(torch.max(ref_space_uid)) excludes the largest uid,
    # so the last residue's conformer is never augmented — confirm intentional.
    for i in range(torch.max(ref_space_uid)):
        included = ref_space_uid == i
        if torch.sum(included) > 0 and torch.any(resolved_mask[included]):
            ref_pos[included] = center_random_augmentation(
                ref_pos[included][None], resolved_mask[included][None], centering=True
            )[0]

    # Compute padding and apply
    if max_atoms is not None:
        assert max_atoms % atoms_per_window_queries == 0
        pad_len = max_atoms - len(atom_data)
    else:
        # Round up to the next multiple of atoms_per_window_queries.
        pad_len = (
            (len(atom_data) - 1) // atoms_per_window_queries + 1
        ) * atoms_per_window_queries - len(atom_data)

    if pad_len > 0:
        pad_mask = pad_dim(pad_mask, 0, pad_len)
        ref_pos = pad_dim(ref_pos, 0, pad_len)
        resolved_mask = pad_dim(resolved_mask, 0, pad_len)
        ref_atom_name_chars = pad_dim(ref_atom_name_chars, 0, pad_len)
        ref_element = pad_dim(ref_element, 0, pad_len)
        ref_charge = pad_dim(ref_charge, 0, pad_len)
        ref_chirality = pad_dim(ref_chirality, 0, pad_len)
        backbone_feat_index = pad_dim(backbone_feat_index, 0, pad_len)
        ref_space_uid = pad_dim(ref_space_uid, 0, pad_len)
        coords = pad_dim(coords, 1, pad_len)
        atom_to_token = pad_dim(atom_to_token, 0, pad_len)
        token_to_rep_atom = pad_dim(token_to_rep_atom, 1, pad_len)
        token_to_center_atom = pad_dim(token_to_center_atom, 1, pad_len)
        r_set_to_rep_atom = pad_dim(r_set_to_rep_atom, 1, pad_len)
        bfactor = pad_dim(bfactor, 0, pad_len)
        plddt = pad_dim(plddt, 0, pad_len)

    if max_tokens is not None:
        pad_len = max_tokens - token_to_rep_atom.shape[0]
        if pad_len > 0:
            atom_to_token = pad_dim(atom_to_token, 1, pad_len)
            token_to_rep_atom = pad_dim(token_to_rep_atom, 0, pad_len)
            r_set_to_rep_atom = pad_dim(r_set_to_rep_atom, 0, pad_len)
            token_to_center_atom = pad_dim(token_to_center_atom, 0, pad_len)
            disto_target = pad_dim(pad_dim(disto_target, 0, pad_len), 1, pad_len)
            disto_coords_ensemble = pad_dim(disto_coords_ensemble, 1, pad_len)

            if compute_frames:
                frames = pad_dim(frames, 1, pad_len)
                frame_resolved_mask = pad_dim(frame_resolved_mask, 1, pad_len)

    atom_features = {
        "ref_pos": ref_pos,
        "atom_resolved_mask": resolved_mask,
        "ref_atom_name_chars": ref_atom_name_chars,
        "ref_element": ref_element,
        "ref_charge": ref_charge,
        "ref_chirality": ref_chirality,
        "atom_backbone_feat": backbone_feat_index,
        "ref_space_uid": ref_space_uid,
        "coords": coords,
        "atom_pad_mask": pad_mask,
        "atom_to_token": atom_to_token,
        "token_to_rep_atom": token_to_rep_atom,
        "r_set_to_rep_atom": r_set_to_rep_atom,
        "token_to_center_atom": token_to_center_atom,
        "disto_target": disto_target,
        "disto_coords_ensemble": disto_coords_ensemble,
        "bfactor": bfactor,
        "plddt": plddt,
    }

    if compute_frames:
        atom_features["frames_idx"] = frames
        atom_features["frame_resolved_mask"] = frame_resolved_mask

    return atom_features
1569
+
1570
+
1571
def process_msa_features(
    data: Tokenized,
    random: np.random.Generator,
    max_seqs_batch: int,
    max_seqs: int,
    max_tokens: Optional[int] = None,
    pad_to_max_seqs: bool = False,
    msa_sampling: bool = False,
    affinity: bool = False,
) -> dict[str, Tensor]:
    """Get the MSA features.

    Builds the (paired) MSA tensors, derives the per-position profile and
    deletion statistics, and pads along the sequence and token dimensions.

    Parameters
    ----------
    data : Tokenized
        The input to the model.
    random : np.random.Generator
        The random number generator.
    max_seqs_batch : int
        The number of MSA sequences to construct for this batch
        (passed to construct_paired_msa).
    max_seqs : int
        The maximum number of MSA sequences (padding target when
        pad_to_max_seqs is True).
    max_tokens : int
        The maximum number of tokens.
    pad_to_max_seqs : bool
        Whether to pad to the maximum number of sequences.
    msa_sampling : bool
        Whether to sample the MSA.
    affinity : bool
        If True, return only the profile and mean-deletion features
        (keyed with an "_affinity" suffix).

    Returns
    -------
    dict[str, Tensor]
        The MSA features.

    """
    # Created paired MSA
    msa, deletion, paired = construct_paired_msa(
        data=data,
        random=random,
        max_seqs=max_seqs_batch,
        random_subset=msa_sampling,
    )
    msa, deletion, paired = (
        msa.transpose(1, 0),
        deletion.transpose(1, 0),
        paired.transpose(1, 0),
    )  # (N_MSA, N_RES, N_AA)

    # Prepare features
    assert torch.all(msa >= 0) and torch.all(msa < const.num_tokens)
    msa_one_hot = torch.nn.functional.one_hot(msa, num_classes=const.num_tokens)
    msa_mask = torch.ones_like(msa)
    # Profile = per-position residue-type frequency over the MSA.
    profile = msa_one_hot.float().mean(dim=0)
    has_deletion = deletion > 0
    # Squash raw deletion counts through arctan.
    # NOTE(review): AlphaFold-style featurization uses (2/pi)*arctan(d/3),
    # which maps to [0, 1); here the factor is pi/2 — confirm this matches
    # what the trained weights expect before changing it.
    deletion = np.pi / 2 * np.arctan(deletion / 3)
    deletion_mean = deletion.mean(axis=0)

    # Pad in the MSA dimension (dim=0)
    if pad_to_max_seqs:
        pad_len = max_seqs - msa.shape[0]
        if pad_len > 0:
            # Pad sequences with gap tokens; masks/values pad with zeros.
            msa = pad_dim(msa, 0, pad_len, const.token_ids["-"])
            paired = pad_dim(paired, 0, pad_len)
            msa_mask = pad_dim(msa_mask, 0, pad_len)
            has_deletion = pad_dim(has_deletion, 0, pad_len)
            deletion = pad_dim(deletion, 0, pad_len)

    # Pad in the token dimension (dim=1)
    if max_tokens is not None:
        pad_len = max_tokens - msa.shape[1]
        if pad_len > 0:
            msa = pad_dim(msa, 1, pad_len, const.token_ids["-"])
            paired = pad_dim(paired, 1, pad_len)
            msa_mask = pad_dim(msa_mask, 1, pad_len)
            has_deletion = pad_dim(has_deletion, 1, pad_len)
            deletion = pad_dim(deletion, 1, pad_len)
            profile = pad_dim(profile, 0, pad_len)
            deletion_mean = pad_dim(deletion_mean, 0, pad_len)
    if affinity:
        # Affinity head only consumes the summary features.
        return {
            "deletion_mean_affinity": deletion_mean,
            "profile_affinity": profile,
        }
    else:
        return {
            "msa": msa,
            "msa_paired": paired,
            "deletion_value": deletion,
            "has_deletion": has_deletion,
            "deletion_mean": deletion_mean,
            "profile": profile,
            "msa_mask": msa_mask,
        }
1662
+
1663
+
1664
def load_dummy_templates_features(tdim: int, num_tokens: int) -> dict:
    """Build all-zero (dummy) template features for v2.

    Parameters
    ----------
    tdim : int
        Number of template slots.
    num_tokens : int
        Number of tokens per template.

    Returns
    -------
    dict
        Zero-filled template feature tensors; the residue types are
        one-hot encoded (all pointing at class 0), and every mask is zero
        so the templates are fully ignored downstream.

    """

    def _zeros(shape: tuple, dtype) -> torch.Tensor:
        # All features start as zero-filled numpy arrays converted to tensors.
        return torch.from_numpy(np.zeros(shape, dtype=dtype))

    flat = (tdim, num_tokens)

    # Residue types: zero int indices expanded to a one-hot vocabulary axis.
    restype_onehot = one_hot(_zeros(flat, np.int64), num_classes=const.num_tokens)

    return {
        "template_restype": restype_onehot,
        "template_frame_rot": _zeros((tdim, num_tokens, 3, 3), np.float32),
        "template_frame_t": _zeros((tdim, num_tokens, 3), np.float32),
        "template_cb": _zeros((tdim, num_tokens, 3), np.float32),
        "template_ca": _zeros((tdim, num_tokens, 3), np.float32),
        "template_mask_cb": _zeros(flat, np.float32),
        "template_mask_frame": _zeros(flat, np.float32),
        "template_mask": _zeros(flat, np.float32),
        "query_to_template": _zeros(flat, np.int64),
        "visibility_ids": _zeros(flat, np.float32),
    }
1694
+
1695
+
1696
def compute_template_features(
    query_tokens: Tokenized,
    tmpl_tokens: list[dict],
    num_tokens: int,
) -> dict:
    """Compute the template features.

    Parameters
    ----------
    query_tokens : Tokenized
        The tokenized query structure.
    tmpl_tokens : list[dict]
        One entry per templated query position, each with keys
        ``token`` (the template token record), ``pdb_id`` (the template
        index) and ``q_idx`` (the query token index it maps to).
    num_tokens : int
        The (padded) number of query tokens.

    Returns
    -------
    dict
        The per-token template feature tensors.

    """
    # Allocate features
    res_type = np.zeros((num_tokens,), dtype=np.int64)
    frame_rot = np.zeros((num_tokens, 3, 3), dtype=np.float32)
    frame_t = np.zeros((num_tokens, 3), dtype=np.float32)
    cb_coords = np.zeros((num_tokens, 3), dtype=np.float32)
    ca_coords = np.zeros((num_tokens, 3), dtype=np.float32)
    frame_mask = np.zeros((num_tokens,), dtype=np.float32)
    cb_mask = np.zeros((num_tokens,), dtype=np.float32)
    template_mask = np.zeros((num_tokens,), dtype=np.float32)
    # query_to_template is allocated but never written below: it is
    # returned as all zeros.
    query_to_template = np.zeros((num_tokens,), dtype=np.int64)
    visibility_ids = np.zeros((num_tokens,), dtype=np.float32)

    # Now create features per token
    asym_id_to_pdb_id = {}

    for token_dict in tmpl_tokens:
        idx = token_dict["q_idx"]
        pdb_id = token_dict["pdb_id"]
        token = token_dict["token"]
        query_token = query_tokens.tokens[idx]
        # Remember which template filled each query chain (used for
        # visibility ids below).
        asym_id_to_pdb_id[query_token["asym_id"]] = pdb_id
        res_type[idx] = token["res_type"]
        frame_rot[idx] = token["frame_rot"].reshape(3, 3)
        frame_t[idx] = token["frame_t"]
        cb_coords[idx] = token["disto_coords"]
        ca_coords[idx] = token["center_coords"]
        cb_mask[idx] = token["disto_mask"]
        frame_mask[idx] = token["frame_mask"]
        template_mask[idx] = 1.0

    # Set visibility_id for templated chains
    for asym_id, pdb_id in asym_id_to_pdb_id.items():
        indices = (query_tokens.tokens["asym_id"] == asym_id).nonzero()
        visibility_ids[indices] = pdb_id

    # Set visibility for non templated chain + olygomerics
    for asym_id in np.unique(query_tokens.structure.chains["asym_id"]):
        if asym_id not in asym_id_to_pdb_id:
            # We hack the chain id to be negative to not overlap with the above
            indices = (query_tokens.tokens["asym_id"] == asym_id).nonzero()
            visibility_ids[indices] = -1 - asym_id

    # Convert to one-hot
    res_type = torch.from_numpy(res_type)
    res_type = one_hot(res_type, num_classes=const.num_tokens)

    return {
        "template_restype": res_type,
        "template_frame_rot": torch.from_numpy(frame_rot),
        "template_frame_t": torch.from_numpy(frame_t),
        "template_cb": torch.from_numpy(cb_coords),
        "template_ca": torch.from_numpy(ca_coords),
        "template_mask_cb": torch.from_numpy(cb_mask),
        "template_mask_frame": torch.from_numpy(frame_mask),
        "template_mask": torch.from_numpy(template_mask),
        "query_to_template": torch.from_numpy(query_to_template),
        "visibility_ids": torch.from_numpy(visibility_ids),
    }
1760
+
1761
+
1762
def process_template_features(
    data: Tokenized,
    max_tokens: int,
) -> dict[str, torch.Tensor]:
    """Compute template features for all templates attached to the input.

    Templates are grouped by name; each group produces one feature row,
    and all rows are stacked along a new leading template dimension.

    Parameters
    ----------
    data : Tokenized
        The input to the model.
    max_tokens : int
        The maximum number of tokens.

    Returns
    -------
    dict[str, torch.Tensor]
        The loaded template features.

    """
    # Group templates by name
    name_to_templates: dict[str, list[TemplateInfo]] = {}
    for template_info in data.record.templates:
        name_to_templates.setdefault(template_info.name, []).append(template_info)

    # Map chain name to asym_id
    chain_name_to_asym_id = {}
    for chain in data.structure.chains:
        chain_name_to_asym_id[chain["name"]] = chain["asym_id"]

    # Compute the offset
    template_features = []
    for template_id, (template_name, templates) in enumerate(name_to_templates.items()):
        row_tokens = []
        template_structure = data.templates[template_name]
        template_tokens = data.template_tokens[template_name]
        tmpl_chain_name_to_asym_id = {}
        for chain in template_structure.chains:
            tmpl_chain_name_to_asym_id[chain["name"]] = chain["asym_id"]

        for template in templates:
            # Residue-index offset between template and query numbering.
            offset = template.template_st - template.query_st

            # Get query and template tokens to map residues
            query_tokens = data.tokens
            chain_id = chain_name_to_asym_id[template.query_chain]
            q_tokens = query_tokens[query_tokens["asym_id"] == chain_id]
            q_indices = dict(zip(q_tokens["res_idx"], q_tokens["token_idx"]))

            # Get the template tokens at the query residues
            chain_id = tmpl_chain_name_to_asym_id[template.template_chain]
            toks = template_tokens[template_tokens["asym_id"] == chain_id]
            toks = [t for t in toks if t["res_idx"] - offset in q_indices]
            for t in toks:
                q_idx = q_indices[t["res_idx"] - offset]
                row_tokens.append(
                    {
                        "token": t,
                        "pdb_id": template_id,
                        "q_idx": q_idx,
                    }
                )

        # Compute template features for each row
        row_features = compute_template_features(data, row_tokens, max_tokens)
        # NOTE(review): `template` here is the *last* TemplateInfo of the
        # inner loop above — force/threshold for the whole group are taken
        # from that single entry. Confirm this is intended when a group
        # contains multiple templates with differing force settings.
        row_features["template_force"] = torch.tensor(template.force)
        row_features["template_force_threshold"] = torch.tensor(
            template.threshold if template.threshold is not None else float("inf"),
            dtype=torch.float32,
        )
        template_features.append(row_features)

    # Stack each feature
    # NOTE(review): assumes at least one template group exists; callers
    # guard on `data.templates` before invoking this function.
    out = {}
    for k in template_features[0]:
        out[k] = torch.stack([f[k] for f in template_features])
    return out
1838
+
1839
+
1840
def process_symmetry_features(
    cropped: Tokenized, symmetries: dict
) -> dict[str, Tensor]:
    """Assemble chain-, residue- and ligand-level symmetry features.

    Parameters
    ----------
    cropped : Tokenized
        The cropped input to the model.
    symmetries : dict
        Precomputed ligand symmetry tables.

    Returns
    -------
    dict[str, Tensor]
        The merged symmetry features.

    """
    return {
        **get_chain_symmetries(cropped),
        **get_amino_acids_symmetries(cropped),
        **get_ligand_symmetries(cropped, symmetries),
    }
1861
+
1862
+
1863
def process_ensemble_features(
    data: Tokenized,
    random: np.random.Generator,
    num_ensembles: int,
    ensemble_sample_replacement: bool,
    fix_single_ensemble: bool,
) -> dict[str, Tensor]:
    """Select which structure conformers to use as ensemble references.

    Parameters
    ----------
    data : Tokenized
        The input to the model.
    random : np.random.Generator
        The random number generator.
    num_ensembles : int
        The maximum number of conformers to sample.
    ensemble_sample_replacement : bool
        Whether to sample with replacement (training behaviour).
    fix_single_ensemble : bool
        Always pick the first conformer (requires ``num_ensembles == 1``).

    Returns
    -------
    dict[str, Tensor]
        A dict with the sampled conformer indices under
        ``"ensemble_ref_idxs"``.

    """
    assert num_ensembles > 0, "Number of conformers sampled must be greater than 0."

    # Number of conformers available in the structure.
    available = len(data.structure.ensemble)

    if fix_single_ensemble:
        # Deterministic path: always the first conformer.
        assert num_ensembles == 1, (
            "Number of conformers sampled must be 1 with fix_single_ensemble=True."
        )
        selected = np.array([0])
    elif ensemble_sample_replacement:
        # Training: sample with replacement.
        selected = random.integers(0, available, (num_ensembles,))
    elif available < num_ensembles:
        # Validation with too few conformers: take them all, in order.
        selected = np.arange(0, available)
    else:
        # Validation: sample without replacement.
        selected = random.choice(available, num_ensembles, replace=False)

    return {
        "ensemble_ref_idxs": torch.Tensor(selected).long(),
    }
1921
+
1922
+
1923
def process_residue_constraint_features(data: Tokenized) -> dict[str, Tensor]:
    """Extract residue-level geometric constraint tensors.

    Falls back to empty tensors (with the correct leading index arity)
    when the tokenized input carries no residue constraints.

    Parameters
    ----------
    data : Tokenized
        The input to the model.

    Returns
    -------
    dict[str, Tensor]
        Constraint index, mask and bound tensors.

    """
    rc = data.residue_constraints
    if rc is None:
        # No constraints available: emit empty, correctly-shaped tensors.
        return {
            "rdkit_bounds_index": torch.empty((2, 0), dtype=torch.long),
            "rdkit_bounds_bond_mask": torch.empty((0,), dtype=torch.bool),
            "rdkit_bounds_angle_mask": torch.empty((0,), dtype=torch.bool),
            "rdkit_upper_bounds": torch.empty((0,), dtype=torch.float),
            "rdkit_lower_bounds": torch.empty((0,), dtype=torch.float),
            "chiral_atom_index": torch.empty((4, 0), dtype=torch.long),
            "chiral_reference_mask": torch.empty((0,), dtype=torch.bool),
            "chiral_atom_orientations": torch.empty((0,), dtype=torch.bool),
            "stereo_bond_index": torch.empty((4, 0), dtype=torch.long),
            "stereo_reference_mask": torch.empty((0,), dtype=torch.bool),
            "stereo_bond_orientations": torch.empty((0,), dtype=torch.bool),
            "planar_bond_index": torch.empty((6, 0), dtype=torch.long),
            "planar_ring_5_index": torch.empty((5, 0), dtype=torch.long),
            "planar_ring_6_index": torch.empty((6, 0), dtype=torch.long),
        }

    def index_tensor(constraints) -> Tensor:
        # Atom-index arrays are stored row-major; transpose to (arity, n).
        return torch.tensor(constraints["atom_idxs"].copy(), dtype=torch.long).T

    def field_tensor(constraints, field: str, dtype) -> Tensor:
        return torch.tensor(constraints[field].copy(), dtype=dtype)

    rdkit = rc.rdkit_bounds_constraints
    chiral = rc.chiral_atom_constraints
    stereo = rc.stereo_bond_constraints

    return {
        "rdkit_bounds_index": index_tensor(rdkit),
        "rdkit_bounds_bond_mask": field_tensor(rdkit, "is_bond", torch.bool),
        "rdkit_bounds_angle_mask": field_tensor(rdkit, "is_angle", torch.bool),
        "rdkit_upper_bounds": field_tensor(rdkit, "upper_bound", torch.float),
        "rdkit_lower_bounds": field_tensor(rdkit, "lower_bound", torch.float),
        "chiral_atom_index": index_tensor(chiral),
        "chiral_reference_mask": field_tensor(chiral, "is_reference", torch.bool),
        "chiral_atom_orientations": field_tensor(chiral, "is_r", torch.bool),
        "stereo_bond_index": index_tensor(stereo),
        "stereo_reference_mask": field_tensor(stereo, "is_reference", torch.bool),
        "stereo_bond_orientations": field_tensor(stereo, "is_e", torch.bool),
        "planar_bond_index": index_tensor(rc.planar_bond_constraints),
        "planar_ring_5_index": index_tensor(rc.planar_ring_5_constraints),
        "planar_ring_6_index": index_tensor(rc.planar_ring_6_constraints),
    }
2016
+
2017
+
2018
def process_chain_feature_constraints(data: Tokenized) -> dict[str, Tensor]:
    """Build inter-chain connectivity and chain-symmetry index tensors.

    Parameters
    ----------
    data : Tokenized
        The input to the model.

    Returns
    -------
    dict[str, Tensor]
        Connected chain/atom pair indices and symmetric chain pairs,
        each shaped ``(2, n)``.

    """
    structure = data.structure

    # Inter-chain covalent connections; intra-chain bonds are skipped.
    chain_pairs, atom_pairs = [], []
    if structure.bonds.shape[0] > 0:
        for bond in structure.bonds:
            if bond["chain_1"] == bond["chain_2"]:
                continue
            chain_pairs.append([bond["chain_1"], bond["chain_2"]])
            atom_pairs.append([bond["atom_1"], bond["atom_2"]])
    if chain_pairs:
        connected_chain_index = torch.tensor(chain_pairs, dtype=torch.long).T
        connected_atom_index = torch.tensor(atom_pairs, dtype=torch.long).T
    else:
        connected_chain_index = torch.empty((2, 0), dtype=torch.long)
        connected_atom_index = torch.empty((2, 0), dtype=torch.long)

    # Unordered pairs of chains sharing an entity id (identical entities).
    sym_pairs = [
        [i, j]
        for i, chain_i in enumerate(structure.chains)
        for j, chain_j in enumerate(structure.chains)
        if j > i and chain_i["entity_id"] == chain_j["entity_id"]
    ]
    if sym_pairs:
        symmetric_chain_index = torch.tensor(sym_pairs, dtype=torch.long).T
    else:
        symmetric_chain_index = torch.empty((2, 0), dtype=torch.long)

    return {
        "connected_chain_index": connected_chain_index,
        "connected_atom_index": connected_atom_index,
        "symmetric_chain_index": symmetric_chain_index,
    }
2057
+
2058
+
2059
def process_contact_feature_constraints(
    data: Tokenized,
    inference_pocket_constraints: list[tuple[int, list[tuple[int, int]], float]],
    inference_contact_constraints: list[tuple[tuple[int, int], tuple[int, int], float]],
):
    """Build atom-pair contact constraint tensors for inference.

    Each forced constraint is expanded into a cartesian product of
    candidate atom pairs; pairs sharing a ``union_idx`` form one "union"
    group (alternatives for satisfying the same constraint).

    Parameters
    ----------
    data : Tokenized
        The input to the model.
    inference_pocket_constraints : list
        Tuples of (binder chain index, contact list, max distance, force).
    inference_contact_constraints : list
        Tuples of (token1 key, token2 key, max distance, force).

    Returns
    -------
    dict
        Pair indices, union group indices, negation mask and distance
        thresholds as tensors.

    """
    token_data = data.tokens
    union_idx = 0
    pair_index, union_index, negation_mask, thresholds = [], [], [], []
    for binder, contacts, max_distance, force in inference_pocket_constraints:
        # Only hard (forced) constraints become potential-style features.
        if not force:
            continue

        binder_chain = data.structure.chains[binder]
        for token in token_data:
            # Polymer contacts are keyed by (asym_id, res_idx); ligand
            # (NONPOLYMER) contacts are keyed by (asym_id, atom_idx).
            if (
                token["mol_type"] != const.chain_type_ids["NONPOLYMER"]
                and (token["asym_id"], token["res_idx"]) in contacts
            ) or (
                token["mol_type"] == const.chain_type_ids["NONPOLYMER"]
                and (token["asym_id"], token["atom_idx"]) in contacts
            ):
                # All (binder atom, contact-token atom) pairs.
                atom_idx_pairs = torch.cartesian_prod(
                    torch.arange(
                        binder_chain["atom_idx"],
                        binder_chain["atom_idx"] + binder_chain["atom_num"],
                    ),
                    torch.arange(
                        token["atom_idx"], token["atom_idx"] + token["atom_num"]
                    ),
                ).T
                pair_index.append(atom_idx_pairs)
                union_index.append(torch.full((atom_idx_pairs.shape[1],), union_idx))
                negation_mask.append(
                    torch.ones((atom_idx_pairs.shape[1],), dtype=torch.bool)
                )
                thresholds.append(torch.full((atom_idx_pairs.shape[1],), max_distance))
                # One union group per matched contact token: each contact
                # must be satisfied by at least one of its atom pairs.
                union_idx += 1

    for token1, token2, max_distance, force in inference_contact_constraints:
        if not force:
            continue

        for idx1, _token1 in enumerate(token_data):
            if (
                _token1["mol_type"] != const.chain_type_ids["NONPOLYMER"]
                and (_token1["asym_id"], _token1["res_idx"]) == token1
            ) or (
                _token1["mol_type"] == const.chain_type_ids["NONPOLYMER"]
                and (_token1["asym_id"], _token1["atom_idx"]) == token1
            ):
                for idx2, _token2 in enumerate(token_data):
                    if (
                        _token2["mol_type"] != const.chain_type_ids["NONPOLYMER"]
                        and (_token2["asym_id"], _token2["res_idx"]) == token2
                    ) or (
                        _token2["mol_type"] == const.chain_type_ids["NONPOLYMER"]
                        and (_token2["asym_id"], _token2["atom_idx"]) == token2
                    ):
                        atom_idx_pairs = torch.cartesian_prod(
                            torch.arange(
                                _token1["atom_idx"],
                                _token1["atom_idx"] + _token1["atom_num"],
                            ),
                            torch.arange(
                                _token2["atom_idx"],
                                _token2["atom_idx"] + _token2["atom_num"],
                            ),
                        ).T
                        pair_index.append(atom_idx_pairs)
                        union_index.append(
                            torch.full((atom_idx_pairs.shape[1],), union_idx)
                        )
                        negation_mask.append(
                            torch.ones((atom_idx_pairs.shape[1],), dtype=torch.bool)
                        )
                        thresholds.append(
                            torch.full((atom_idx_pairs.shape[1],), max_distance)
                        )
                        union_idx += 1
                        # Only the first matching token pair is used.
                        break
                break

    if len(pair_index) > 0:
        pair_index = torch.cat(pair_index, dim=1)
        union_index = torch.cat(union_index)
        negation_mask = torch.cat(negation_mask)
        thresholds = torch.cat(thresholds)
    else:
        # No forced constraints: empty, correctly-shaped tensors.
        pair_index = torch.empty((2, 0), dtype=torch.long)
        union_index = torch.empty((0,), dtype=torch.long)
        negation_mask = torch.empty((0,), dtype=torch.bool)
        thresholds = torch.empty((0,), dtype=torch.float32)

    return {
        "contact_pair_index": pair_index,
        "contact_union_index": union_index,
        "contact_negation_mask": negation_mask,
        "contact_thresholds": thresholds,
    }
2158
+
2159
+
2160
class Boltz2Featurizer:
    """Boltz2 featurizer."""

    def process(
        self,
        data: Tokenized,
        random: np.random.Generator,
        molecules: dict[str, Mol],
        training: bool,
        max_seqs: int,
        atoms_per_window_queries: int = 32,
        min_dist: float = 2.0,
        max_dist: float = 22.0,
        num_bins: int = 64,
        num_ensembles: int = 1,
        ensemble_sample_replacement: bool = False,
        disto_use_ensemble: Optional[bool] = False,
        fix_single_ensemble: Optional[bool] = True,
        max_tokens: Optional[int] = None,
        max_atoms: Optional[int] = None,
        pad_to_max_seqs: bool = False,
        compute_symmetries: bool = False,
        binder_pocket_conditioned_prop: Optional[float] = 0.0,
        contact_conditioned_prop: Optional[float] = 0.0,
        binder_pocket_cutoff_min: Optional[float] = 4.0,
        binder_pocket_cutoff_max: Optional[float] = 20.0,
        binder_pocket_sampling_geometric_p: Optional[float] = 0.0,
        only_ligand_binder_pocket: Optional[bool] = False,
        only_pp_contact: Optional[bool] = False,
        single_sequence_prop: Optional[float] = 0.0,
        msa_sampling: bool = False,
        # NOTE(review): annotated `float` but defaults to False — this is
        # presumably a bool flag; confirm against process_atom_features.
        override_bfactor: float = False,
        override_method: Optional[str] = None,
        compute_frames: bool = False,
        override_coords: Optional[Tensor] = None,
        bfactor_md_correction: bool = False,
        compute_constraint_features: bool = False,
        inference_pocket_constraints: Optional[
            list[tuple[int, list[tuple[int, int]], float]]
        ] = None,
        inference_contact_constraints: Optional[
            list[tuple[tuple[int, int], tuple[int, int], float]]
        ] = None,
        compute_affinity: bool = False,
    ) -> dict[str, Tensor]:
        """Compute features.

        Orchestrates all per-feature processors (ensemble, token, atom,
        MSA, template, symmetry, constraint) and merges their outputs
        into a single feature dictionary.

        Parameters
        ----------
        data : Tokenized
            The input to the model.
        random : np.random.Generator
            The random number generator.
        molecules : dict[str, Mol]
            CCD molecules keyed by component name.
        training : bool
            Whether the model is in training mode.
        max_seqs : int
            The maximum number of MSA sequences.
        max_tokens : int, optional
            The maximum number of tokens.
        max_atoms : int, optional
            The maximum number of atoms.
        compute_affinity : bool
            Also compute single-sequence affinity MSA features and the
            affinity ligand molecular weight.

        Returns
        -------
        dict[str, Tensor]
            The features for model training.

        """
        # Compute random number of sequences: during training the MSA
        # depth is randomized, and with prob. `single_sequence_prop` the
        # MSA is collapsed to the query sequence alone.
        if training and max_seqs is not None:
            if random.random() > single_sequence_prop:
                max_seqs_batch = random.integers(1, max_seqs + 1)
            else:
                max_seqs_batch = 1
        else:
            max_seqs_batch = max_seqs

        # Compute ensemble features
        ensemble_features = process_ensemble_features(
            data=data,
            random=random,
            num_ensembles=num_ensembles,
            ensemble_sample_replacement=ensemble_sample_replacement,
            fix_single_ensemble=fix_single_ensemble,
        )

        # Compute token features
        token_features = process_token_features(
            data=data,
            random=random,
            max_tokens=max_tokens,
            binder_pocket_conditioned_prop=binder_pocket_conditioned_prop,
            contact_conditioned_prop=contact_conditioned_prop,
            binder_pocket_cutoff_min=binder_pocket_cutoff_min,
            binder_pocket_cutoff_max=binder_pocket_cutoff_max,
            binder_pocket_sampling_geometric_p=binder_pocket_sampling_geometric_p,
            only_ligand_binder_pocket=only_ligand_binder_pocket,
            only_pp_contact=only_pp_contact,
            override_method=override_method,
            inference_pocket_constraints=inference_pocket_constraints,
            inference_contact_constraints=inference_contact_constraints,
        )

        # Compute atom features
        atom_features = process_atom_features(
            data=data,
            random=random,
            molecules=molecules,
            ensemble_features=ensemble_features,
            atoms_per_window_queries=atoms_per_window_queries,
            min_dist=min_dist,
            max_dist=max_dist,
            num_bins=num_bins,
            max_atoms=max_atoms,
            max_tokens=max_tokens,
            disto_use_ensemble=disto_use_ensemble,
            override_bfactor=override_bfactor,
            compute_frames=compute_frames,
            override_coords=override_coords,
            bfactor_md_correction=bfactor_md_correction,
        )

        # Compute MSA features
        msa_features = process_msa_features(
            data=data,
            random=random,
            max_seqs_batch=max_seqs_batch,
            max_seqs=max_seqs,
            max_tokens=max_tokens,
            pad_to_max_seqs=pad_to_max_seqs,
            msa_sampling=training and msa_sampling,
        )

        # Compute single-sequence MSA features for the affinity head
        msa_features_affinity = {}
        if compute_affinity:
            msa_features_affinity = process_msa_features(
                data=data,
                random=random,
                max_seqs_batch=1,
                max_seqs=1,
                max_tokens=max_tokens,
                pad_to_max_seqs=pad_to_max_seqs,
                msa_sampling=training and msa_sampling,
                affinity=True,
            )

        # Compute affinity ligand Molecular Weight
        ligand_to_mw = {}
        if compute_affinity:
            ligand_to_mw["affinity_mw"] = data.record.affinity.mw

        # Compute template features (dummy all-zero features when no
        # templates exist or when computing affinity)
        num_tokens = data.tokens.shape[0] if max_tokens is None else max_tokens
        if data.templates and not compute_affinity:
            template_features = process_template_features(
                data=data,
                max_tokens=num_tokens,
            )
        else:
            template_features = load_dummy_templates_features(
                tdim=1,
                num_tokens=num_tokens,
            )

        # Compute symmetry features
        symmetry_features = {}
        if compute_symmetries:
            symmetries = get_symmetries(molecules)
            symmetry_features = process_symmetry_features(data, symmetries)

        # Compute constraint features
        residue_constraint_features = {}
        chain_constraint_features = {}
        contact_constraint_features = {}
        if compute_constraint_features:
            residue_constraint_features = process_residue_constraint_features(data)
            chain_constraint_features = process_chain_feature_constraints(data)
            contact_constraint_features = process_contact_feature_constraints(
                data=data,
                inference_pocket_constraints=inference_pocket_constraints if inference_pocket_constraints else [],
                inference_contact_constraints=inference_contact_constraints if inference_contact_constraints else [],
            )

        # Merge all feature dictionaries (later entries win on key clash).
        return {
            **token_features,
            **atom_features,
            **msa_features,
            **msa_features_affinity,
            **template_features,
            **symmetry_features,
            **ensemble_features,
            **residue_constraint_features,
            **chain_constraint_features,
            **contact_constraint_features,
            **ligand_to_mw,
        }
protify/FastPLMs/boltz/src/boltz/data/feature/symmetry.py ADDED
@@ -0,0 +1,602 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import itertools
2
+ import pickle
3
+ import random
4
+ from pathlib import Path
5
+
6
+ import numpy as np
7
+ import torch
8
+
9
+ from boltz.data import const
10
+ from boltz.data.pad import pad_dim
11
+ from boltz.model.loss.confidence import lddt_dist
12
+ from boltz.model.loss.validation import weighted_minimum_rmsd_single
13
+
14
+
15
def convert_atom_name(name: str) -> tuple[int, int, int, int]:
    """Encode an atom name as four integers.

    Each character is mapped to ``ord(c) - 32`` and the result is
    zero-padded on the right to length four.

    Parameters
    ----------
    name : str
        The atom name (surrounding whitespace is ignored).

    Returns
    -------
    Tuple[int, int, int, int]
        The converted atom name.

    """
    codes = [ord(char) - 32 for char in name.strip()]
    codes += [0] * (4 - len(codes))
    return tuple(codes)
33
+
34
+
35
def get_symmetries(path: str) -> dict:
    """Create a dictionary for the ligand symmetries.

    Parameters
    ----------
    path : str
        The path to the pickled ligand symmetries.

    Returns
    -------
    dict
        Mapping from component key to (symmetry permutations, atom names).

    """
    with Path(path).open("rb") as handle:
        molecules: dict = pickle.load(handle)  # noqa: S301

    symmetries = {}
    for key, mol in molecules.items():
        try:
            # Symmetry permutations are stored hex-encoded on the molecule.
            sym = pickle.loads(bytes.fromhex(mol.GetProp("symmetries")))  # noqa: S301
            atom_names = [
                convert_atom_name(atom.GetProp("name")) for atom in mol.GetAtoms()
            ]
        except Exception:  # noqa: BLE001
            # Molecules without usable symmetry metadata are skipped.
            continue
        symmetries[key] = (sym, atom_names)

    return symmetries
68
+
69
+
70
def compute_symmetry_idx_dictionary(data):
    """Assign atom start offsets and collect all atom coordinates.

    Side effects: sets ``chain.start_idx`` (global atom offset of the
    chain) and ``token.start_idx`` (atom offset of the token within its
    chain) on the input objects.

    Parameters
    ----------
    data
        An object with ``chains``, each chain with ``tokens``, each token
        with ``atoms`` carrying ``coords.x/y/z``.

    Returns
    -------
    list[list[float]]
        Flat list of ``[x, y, z]`` coordinates for every atom, in order.

    """
    total_count = 0
    all_coords = []
    # Fix: drop unused enumerate indices from the original loops.
    for chain in data.chains:
        chain.start_idx = total_count
        for token in chain.tokens:
            token.start_idx = total_count - chain.start_idx
            all_coords.extend(
                [atom.coords.x, atom.coords.y, atom.coords.z] for atom in token.atoms
            )
            total_count += len(token.atoms)
    return all_coords
83
+
84
+
85
def get_current_idx_list(data):
    """Return global atom indices for every token currently in the crop.

    Requires ``compute_symmetry_idx_dictionary`` (or equivalent) to have
    populated ``start_idx`` on chains and tokens beforehand.
    """
    indices = []
    for chain in data.chains:
        if not chain.in_crop:
            continue
        for token in chain.tokens:
            if token.in_crop:
                base = chain.start_idx + token.start_idx
                indices.extend(range(base, base + len(token.atoms)))
    return indices
98
+
99
+
100
def all_different_after_swap(l):
    """Return True if the last element of each sub-sequence is unique."""
    last_items = [entry[-1] for entry in l]
    return len(set(last_items)) == len(last_items)
103
+
104
+
105
+ def minimum_symmetry_coords(
106
+ coords: torch.Tensor,
107
+ feats: dict,
108
+ index_batch: int,
109
+ **args_rmsd,
110
+ ):
111
+ all_coords = feats["all_coords"][index_batch].unsqueeze(0).to(coords)
112
+ all_resolved_mask = (
113
+ feats["all_resolved_mask"][index_batch].to(coords).to(torch.bool)
114
+ )
115
+ crop_to_all_atom_map = (
116
+ feats["crop_to_all_atom_map"][index_batch].to(coords).to(torch.long)
117
+ )
118
+ chain_symmetries = feats["chain_symmetries"][index_batch]
119
+ amino_acids_symmetries = feats["amino_acids_symmetries"][index_batch]
120
+ ligand_symmetries = feats["ligand_symmetries"][index_batch]
121
+
122
+ # Check best symmetry on chain swap
123
+ best_true_coords = None
124
+ best_rmsd = float("inf")
125
+ best_align_weights = None
126
+ for c in chain_symmetries:
127
+ true_all_coords = all_coords.clone()
128
+ true_all_resolved_mask = all_resolved_mask.clone()
129
+ for start1, end1, start2, end2, chainidx1, chainidx2 in c:
130
+ true_all_coords[:, start1:end1] = all_coords[:, start2:end2]
131
+ true_all_resolved_mask[start1:end1] = all_resolved_mask[start2:end2]
132
+ true_coords = true_all_coords[:, crop_to_all_atom_map]
133
+ true_resolved_mask = true_all_resolved_mask[crop_to_all_atom_map]
134
+ true_coords = pad_dim(true_coords, 1, coords.shape[1] - true_coords.shape[1])
135
+ true_resolved_mask = pad_dim(
136
+ true_resolved_mask,
137
+ 0,
138
+ coords.shape[1] - true_resolved_mask.shape[0],
139
+ )
140
+ try:
141
+ rmsd, aligned_coords, align_weights = weighted_minimum_rmsd_single(
142
+ coords,
143
+ true_coords,
144
+ atom_mask=true_resolved_mask,
145
+ atom_to_token=feats["atom_to_token"][index_batch : index_batch + 1],
146
+ mol_type=feats["mol_type"][index_batch : index_batch + 1],
147
+ **args_rmsd,
148
+ )
149
+ except:
150
+ print("Warning: error in rmsd computation inside symmetry code")
151
+ continue
152
+ rmsd = rmsd.item()
153
+
154
+ if rmsd < best_rmsd:
155
+ best_rmsd = rmsd
156
+ best_true_coords = aligned_coords
157
+ best_align_weights = align_weights
158
+ best_true_resolved_mask = true_resolved_mask
159
+
160
+ # atom symmetries (nucleic acid and protein residues), resolved greedily without recomputing alignment
161
+ true_coords = best_true_coords.clone()
162
+ true_resolved_mask = best_true_resolved_mask.clone()
163
+ for symmetric_amino in amino_acids_symmetries:
164
+ for c in symmetric_amino:
165
+ # starting from greedy best, try to swap the atoms
166
+ new_true_coords = true_coords.clone()
167
+ new_true_resolved_mask = true_resolved_mask.clone()
168
+ for i, j in c:
169
+ new_true_coords[:, i] = true_coords[:, j]
170
+ new_true_resolved_mask[i] = true_resolved_mask[j]
171
+
172
+ # compute squared distance, for efficiency we do not recompute the alignment
173
+ best_mse_loss = torch.sum(
174
+ ((coords - best_true_coords) ** 2).sum(dim=-1)
175
+ * best_align_weights
176
+ * best_true_resolved_mask,
177
+ dim=-1,
178
+ ) / torch.sum(best_align_weights * best_true_resolved_mask, dim=-1)
179
+ new_mse_loss = torch.sum(
180
+ ((coords - new_true_coords) ** 2).sum(dim=-1)
181
+ * best_align_weights
182
+ * new_true_resolved_mask,
183
+ dim=-1,
184
+ ) / torch.sum(best_align_weights * new_true_resolved_mask, dim=-1)
185
+
186
+ if best_mse_loss > new_mse_loss:
187
+ best_true_coords = new_true_coords
188
+ best_true_resolved_mask = new_true_resolved_mask
189
+
190
+ # greedily update best coordinates after each amino acid
191
+ true_coords = best_true_coords.clone()
192
+ true_resolved_mask = best_true_resolved_mask.clone()
193
+
194
+ # Recomputing alignment
195
+ rmsd, true_coords, best_align_weights = weighted_minimum_rmsd_single(
196
+ coords,
197
+ true_coords,
198
+ atom_mask=true_resolved_mask,
199
+ atom_to_token=feats["atom_to_token"][index_batch : index_batch + 1],
200
+ mol_type=feats["mol_type"][index_batch : index_batch + 1],
201
+ **args_rmsd,
202
+ )
203
+ best_rmsd = rmsd.item()
204
+
205
+ # atom symmetries (ligand and non-standard), resolved greedily recomputing alignment
206
+ for symmetric_ligand in ligand_symmetries:
207
+ for c in symmetric_ligand:
208
+ new_true_coords = true_coords.clone()
209
+ new_true_resolved_mask = true_resolved_mask.clone()
210
+ for i, j in c:
211
+ new_true_coords[:, j] = true_coords[:, i]
212
+ new_true_resolved_mask[j] = true_resolved_mask[i]
213
+ try:
214
+ # TODO if this is too slow maybe we can get away with not recomputing alignment
215
+ rmsd, aligned_coords, align_weights = weighted_minimum_rmsd_single(
216
+ coords,
217
+ new_true_coords,
218
+ atom_mask=new_true_resolved_mask,
219
+ atom_to_token=feats["atom_to_token"][index_batch : index_batch + 1],
220
+ mol_type=feats["mol_type"][index_batch : index_batch + 1],
221
+ **args_rmsd,
222
+ )
223
+ except Exception as e:
224
+ raise e
225
+ print(e)
226
+ continue
227
+ rmsd = rmsd.item()
228
+ if rmsd < best_rmsd:
229
+ best_true_coords = aligned_coords
230
+ best_rmsd = rmsd
231
+ best_true_resolved_mask = new_true_resolved_mask
232
+
233
+ true_coords = best_true_coords.clone()
234
+ true_resolved_mask = best_true_resolved_mask.clone()
235
+
236
+ return best_true_coords, best_rmsd, best_true_resolved_mask.unsqueeze(0)
237
+
238
+
239
def minimum_lddt_symmetry_coords(
    coords: torch.Tensor,
    feats: dict,
    index_batch: int,
    **args_rmsd,
):
    """Pick the symmetry assignment of the ground truth that best matches
    the prediction under lDDT, then align it to the prediction.

    Chain-swap symmetries are scored exhaustively by lDDT. Residue- and
    ligand-level atom swaps are then resolved greedily on local distance
    submatrices (no realignment per swap). Finally, a single weighted
    RMSD alignment is computed on the winning assignment.

    Parameters
    ----------
    coords : torch.Tensor
        Predicted coordinates with a leading batch dim of 1,
        shape (1, num_atoms_padded, 3).
    feats : dict
        Feature dict holding ground-truth coordinates, resolved masks and
        precomputed symmetry tables for each batch element.
    index_batch : int
        Index of the batch element to process.
    **args_rmsd
        Extra keyword arguments forwarded to
        ``weighted_minimum_rmsd_single``.

    Returns
    -------
    tuple
        ``(true_coords, best_rmsd, true_resolved_mask)`` where the mask
        carries a leading batch dimension. ``best_rmsd`` is set to 1000
        if the final alignment fails.
    """
    all_coords = feats["all_coords"][index_batch].unsqueeze(0).to(coords)
    all_resolved_mask = (
        feats["all_resolved_mask"][index_batch].to(coords).to(torch.bool)
    )
    # Index map: move to the right device but do NOT round-trip through the
    # float dtype of `coords` (the original `.to(coords)` did) — float32
    # cannot represent integers above 2**24 exactly, which would silently
    # corrupt atom indices for very large structures.
    crop_to_all_atom_map = (
        feats["crop_to_all_atom_map"][index_batch].to(coords.device).to(torch.long)
    )
    chain_symmetries = feats["chain_symmetries"][index_batch]
    amino_acids_symmetries = feats["amino_acids_symmetries"][index_batch]
    ligand_symmetries = feats["ligand_symmetries"][index_batch]

    dmat_predicted = torch.cdist(
        coords[:, : len(crop_to_all_atom_map)], coords[:, : len(crop_to_all_atom_map)]
    )

    # Check best symmetry on chain swap.
    best_true_coords = None
    best_lddt = 0
    for c in chain_symmetries:
        true_all_coords = all_coords.clone()
        true_all_resolved_mask = all_resolved_mask.clone()
        for start1, end1, start2, end2, chainidx1, chainidx2 in c:
            true_all_coords[:, start1:end1] = all_coords[:, start2:end2]
            true_all_resolved_mask[start1:end1] = all_resolved_mask[start2:end2]
        true_coords = true_all_coords[:, crop_to_all_atom_map]
        true_resolved_mask = true_all_resolved_mask[crop_to_all_atom_map]
        dmat_true = torch.cdist(true_coords, true_coords)
        # Pairwise validity: both atoms resolved, self-pairs excluded.
        pair_mask = (
            true_resolved_mask[:, None]
            * true_resolved_mask[None, :]
            * (1 - torch.eye(len(true_resolved_mask))).to(true_resolved_mask)
        )

        lddt = lddt_dist(
            dmat_predicted, dmat_true, pair_mask, cutoff=15.0, per_atom=False
        )[0]
        lddt = lddt.item()

        if lddt > best_lddt:
            best_lddt = lddt
            best_true_coords = true_coords
            best_true_resolved_mask = true_resolved_mask

    # Atom symmetries (nucleic acid / protein residues and ligands),
    # resolved greedily without recomputing any alignment.
    true_coords = best_true_coords.clone()
    true_resolved_mask = best_true_resolved_mask.clone()
    for symmetric_amino_or_lig in amino_acids_symmetries + ligand_symmetries:
        for c in symmetric_amino_or_lig:
            # Starting from the current greedy best, try to swap the atoms.
            new_true_coords = true_coords.clone()
            new_true_resolved_mask = true_resolved_mask.clone()
            indices = []
            for i, j in c:
                new_true_coords[:, i] = true_coords[:, j]
                new_true_resolved_mask[i] = true_resolved_mask[j]
                indices.append(i)

            indices = (
                torch.from_numpy(np.asarray(indices)).to(new_true_coords.device).long()
            )

            # Score only the swapped columns against all atoms: cheaper than
            # recomputing the full distance matrix per candidate swap.
            pred_coords_subset = coords[:, : len(crop_to_all_atom_map)][:, indices]
            true_coords_subset = true_coords[:, indices]
            new_true_coords_subset = new_true_coords[:, indices]

            sub_dmat_pred = torch.cdist(
                coords[:, : len(crop_to_all_atom_map)], pred_coords_subset
            )
            sub_dmat_true = torch.cdist(true_coords, true_coords_subset)
            sub_dmat_new_true = torch.cdist(new_true_coords, new_true_coords_subset)

            sub_true_pair_lddt = (
                true_resolved_mask[:, None] * true_resolved_mask[None, indices]
            )
            sub_true_pair_lddt[indices] = (
                sub_true_pair_lddt[indices]
                * (1 - torch.eye(len(indices))).to(sub_true_pair_lddt).bool()
            )

            sub_new_true_pair_lddt = (
                new_true_resolved_mask[:, None] * new_true_resolved_mask[None, indices]
            )
            sub_new_true_pair_lddt[indices] = (
                sub_new_true_pair_lddt[indices]
                * (1 - torch.eye(len(indices))).to(sub_true_pair_lddt).bool()
            )

            lddt = lddt_dist(
                sub_dmat_pred,
                sub_dmat_true,
                sub_true_pair_lddt,
                cutoff=15.0,
                per_atom=False,
            )[0]
            new_lddt = lddt_dist(
                sub_dmat_pred,
                sub_dmat_new_true,
                sub_new_true_pair_lddt,
                cutoff=15.0,
                per_atom=False,
            )[0]

            if new_lddt > lddt:
                best_true_coords = new_true_coords
                best_true_resolved_mask = new_true_resolved_mask

            # Greedily accept the best assignment after each residue/ligand.
            true_coords = best_true_coords.clone()
            true_resolved_mask = best_true_resolved_mask.clone()

    # Final alignment of the winning assignment to the prediction.
    true_coords = pad_dim(true_coords, 1, coords.shape[1] - true_coords.shape[1])
    true_resolved_mask = pad_dim(
        true_resolved_mask,
        0,
        coords.shape[1] - true_resolved_mask.shape[0],
    )

    try:
        rmsd, true_coords, _ = weighted_minimum_rmsd_single(
            coords,
            true_coords,
            atom_mask=true_resolved_mask,
            atom_to_token=feats["atom_to_token"][index_batch : index_batch + 1],
            mol_type=feats["mol_type"][index_batch : index_batch + 1],
            **args_rmsd,
        )
        best_rmsd = rmsd.item()
    except Exception as e:
        print("Failed proper RMSD computation, returning inf. Error: ", e)
        best_rmsd = 1000

    return true_coords, best_rmsd, true_resolved_mask.unsqueeze(0)
378
+
379
+
380
def compute_all_coords_mask(structure):
    """Flatten per-token atom data into whole-structure parallel lists.

    Walks every chain and token of ``structure``, collecting each atom's
    xyz coordinates plus two per-atom masks: whether the owning token is
    inside the crop, and whether it is resolved (present). As a side
    effect, annotates each chain with ``start_idx`` (index of its first
    atom in the flattened arrays) and each token with ``start_idx``
    (its first atom's index relative to the chain start).

    Parameters
    ----------
    structure
        Object exposing ``chains``, each with ``tokens``; each token has
        ``atoms`` (with ``.coords.x/.y/.z``), an ``in_crop`` flag and an
        ``is_present`` flag.

    Returns
    -------
    tuple[list, list, list]
        ``(all_coords, all_coords_crop_mask, all_resolved_mask)``, one
        entry per atom; all three lists have equal length by
        construction (the original dead length-check was removed).
    """
    total_count = 0
    all_coords = []
    all_coords_crop_mask = []
    all_resolved_mask = []
    for chain in structure.chains:
        chain.start_idx = total_count
        for token in chain.tokens:
            token.start_idx = total_count - chain.start_idx
            all_coords.extend(
                [atom.coords.x, atom.coords.y, atom.coords.z]
                for atom in token.atoms
            )
            num_atoms = len(token.atoms)
            all_coords_crop_mask.extend([token.in_crop] * num_atoms)
            all_resolved_mask.extend([token.is_present] * num_atoms)
            total_count += num_atoms
    return all_coords, all_coords_crop_mask, all_resolved_mask
403
+
404
+
405
def get_chain_symmetries(cropped, max_n_symmetries=100):
    """Enumerate chain-swap symmetries for a cropped structure.

    Concatenates the ground-truth coordinates and resolved masks of every
    chain, builds the crop-to-all-atom index map, and lists up to
    ``max_n_symmetries`` valid chain permutations. A chain may only swap
    with a chain of the same entity and identical atom count.

    Parameters
    ----------
    cropped
        Cropped structure with ``structure`` and ``tokens`` attributes.
    max_n_symmetries : int
        Upper bound on the number of permutations returned.

    Returns
    -------
    dict
        Keys ``all_coords``, ``all_resolved_mask``,
        ``crop_to_all_atom_map`` (tensors) and ``chain_symmetries``
        (list of swap tuples; contains one empty entry if no valid
        permutation was found).
    """
    structure = cropped.structure

    all_coords = []
    all_resolved_mask = []
    original_atom_idx = []
    chain_atom_idx = []
    chain_atom_num = []
    chain_in_crop = []
    chain_asym_id = []
    new_atom_idx = 0

    for chain in structure.chains:
        atom_idx = chain["atom_idx"]
        atom_num = chain["atom_num"]

        # Slice out this chain's coordinates and resolved flags.
        resolved_mask = structure.atoms["is_present"][atom_idx : atom_idx + atom_num]
        coords = structure.atoms["coords"][atom_idx : atom_idx + atom_num]

        # A chain counts as "in crop" if any cropped token references it.
        in_crop = any(
            token["asym_id"] == chain["asym_id"] for token in cropped.tokens
        )

        all_coords.append(coords)
        all_resolved_mask.append(resolved_mask)
        original_atom_idx.append(atom_idx)
        chain_atom_idx.append(new_atom_idx)
        chain_atom_num.append(atom_num)
        chain_in_crop.append(in_crop)
        chain_asym_id.append(chain["asym_id"])

        new_atom_idx += atom_num

    # Backmapping from each cropped token to the concatenated atom array.
    crop_to_all_atom_map = []
    for token in cropped.tokens:
        chain_idx = chain_asym_id.index(token["asym_id"])
        start = (
            chain_atom_idx[chain_idx] - original_atom_idx[chain_idx] + token["atom_idx"]
        )
        crop_to_all_atom_map.append(np.arange(start, start + token["atom_num"]))

    # Candidate swaps: for every in-crop chain, every same-entity chain of
    # identical length is a legal replacement (including itself).
    swaps = []
    for i, chain in enumerate(structure.chains):
        if not chain_in_crop[i]:
            continue
        start = chain_atom_idx[i]
        end = start + chain_atom_num[i]
        possible_swaps = []
        for j, chain2 in enumerate(structure.chains):
            start2 = chain_atom_idx[j]
            end2 = start2 + chain_atom_num[j]
            same_entity = chain["entity_id"] == chain2["entity_id"]
            same_length = (end - start) == (end2 - start2)
            if same_entity and same_length:
                possible_swaps.append((start, end, start2, end2, i, j))
        swaps.append(possible_swaps)

    # Bound how many combinations are even considered, to avoid a
    # combinatorial explosion for highly symmetric assemblies.
    combinations = itertools.islice(itertools.product(*swaps), max_n_symmetries * 10)
    # Keep only permutations where every chain gets a distinct assignment.
    combinations = [c for c in combinations if all_different_after_swap(c)]

    if len(combinations) > max_n_symmetries:
        combinations = random.sample(combinations, max_n_symmetries)
    if len(combinations) == 0:
        combinations.append([])

    features = {
        "all_coords": torch.Tensor(
            np.concatenate(all_coords, axis=0)
        ),  # axis=1 with ensemble
        "all_resolved_mask": torch.Tensor(np.concatenate(all_resolved_mask, axis=0)),
        "crop_to_all_atom_map": torch.Tensor(
            np.concatenate(crop_to_all_atom_map, axis=0)
        ),
        "chain_symmetries": combinations,
    }
    return features
500
+
501
+
502
def get_amino_acids_symmetries(cropped):
    """Collect intra-residue atom-swap symmetries for standard residues.

    For each token whose residue type has known reference symmetries,
    offsets the per-residue (i, j) swap pairs by the token's position in
    the flattened crop atom order.

    Parameters
    ----------
    cropped
        Cropped structure exposing ``tokens`` with ``res_type`` and
        ``atom_num`` fields.

    Returns
    -------
    dict
        ``{"amino_acids_symmetries": swaps}`` — one entry per symmetric
        residue, each a list of swap lists of global (i, j) index pairs.
    """
    swaps = []
    offset = 0
    for token in cropped.tokens:
        symmetries = const.ref_symmetries.get(const.tokens[token["res_type"]], [])
        if symmetries:
            residue_swaps = [
                [(i + offset, j + offset) for i, j in sym] for sym in symmetries
            ]
            swaps.append(residue_swaps)
        offset += token["atom_num"]

    return {"amino_acids_symmetries": swaps}
520
+
521
+
522
def get_ligand_symmetries(cropped, symmetries):
    """Compute ligand and non-standard residue atom-swap symmetries.

    Matches each cropped molecule's atom names against the CCD reference
    ordering in ``symmetries``, translates the CCD symmetry permutations
    into indices over the crop's atom order, and keeps only non-identity
    swaps.

    Parameters
    ----------
    cropped
        Cropped structure with ``structure`` and ``tokens``.
    symmetries : dict
        Mapping of molecule name to ``(syms_ccd, mol_atom_names_ccd)``.

    Returns
    -------
    dict
        ``{"ligand_symmetries": molecule_symmetries}``.

    Raises
    ------
    Exception
        If a translated permutation's length disagrees with the number of
        atoms recorded for the molecule.
    """
    structure = cropped.structure

    # Identify each molecule once, keyed by (asym_id, res_idx), and track
    # its total atom count across tokens.
    added_molecules = {}
    index_mols = []
    atom_count = 0
    for token in cropped.tokens:
        atom_count += token["atom_num"]
        mol_id = (token["asym_id"], token["res_idx"])
        if mol_id in added_molecules:
            added_molecules[mol_id] += token["atom_num"]
            continue
        added_molecules[mol_id] = token["atom_num"]

        # Get the molecule name and its atom names from the structure.
        residue_idx = token["res_idx"] + structure.chains[token["asym_id"]]["res_idx"]
        mol_name = structure.residues[residue_idx]["name"]
        atom_idx = structure.residues[residue_idx]["atom_idx"]
        mol_atom_names = structure.atoms[
            atom_idx : atom_idx + structure.residues[residue_idx]["atom_num"]
        ]["name"]
        mol_atom_names = [tuple(m) for m in mol_atom_names]
        # Standard residues are handled by get_amino_acids_symmetries.
        if mol_name not in const.ref_symmetries:
            index_mols.append(
                (mol_name, atom_count - token["atom_num"], mol_id, mol_atom_names)
            )

    # For each molecule, translate its CCD symmetries.
    molecule_symmetries = []
    for mol_name, start_mol, mol_id, mol_atom_names in index_mols:
        if mol_name not in symmetries:
            continue
        swaps = []
        syms_ccd, mol_atom_names_ccd = symmetries[mol_name]
        # First occurrence of each CCD atom name -> its CCD index, built
        # in one pass instead of calling list.index per atom (O(n) vs O(n^2)).
        ccd_name_to_idx = {}
        for k, name in enumerate(mol_atom_names_ccd):
            ccd_name_to_idx.setdefault(name, k)
        ccd_to_valid_ids = {
            ccd_name_to_idx[name]: i for i, name in enumerate(mol_atom_names)
        }
        ccd_valid_ids = set(ccd_to_valid_ids.keys())

        # Keep only permutations that stay within the atoms present here.
        syms = []
        for sym_ccd in syms_ccd:
            sym_dict = {}
            bool_add = True
            for i, j in enumerate(sym_ccd):
                if i in ccd_valid_ids:
                    if j in ccd_valid_ids:
                        i_true = ccd_to_valid_ids[i]
                        j_true = ccd_to_valid_ids[j]
                        sym_dict[i_true] = j_true
                    else:
                        bool_add = False
                        break
            if bool_add:
                syms.append([sym_dict[i] for i in range(len(ccd_valid_ids))])

        for sym in syms:
            if len(sym) != added_molecules[mol_id]:
                raise Exception(
                    f"Symmetry length mismatch {len(sym)} {added_molecules[mol_id]}"
                )
            # Keep only the moving pairs, offset into crop atom order.
            sym_new_idx = []
            for i, j in enumerate(sym):
                if i != int(j):
                    sym_new_idx.append((i + start_mol, int(j) + start_mol))
            if len(sym_new_idx) > 0:
                swaps.append(sym_new_idx)
        if len(swaps) > 0:
            molecule_symmetries.append(swaps)

    return {"ligand_symmetries": molecule_symmetries}
protify/FastPLMs/boltz/src/boltz/data/filter/__init__.py ADDED
File without changes
protify/FastPLMs/boltz/src/boltz/data/filter/dynamic/__init__.py ADDED
File without changes
protify/FastPLMs/boltz/src/boltz/data/filter/dynamic/date.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datetime import datetime
2
+ from typing import Literal
3
+
4
+ from boltz.data.types import Record
5
+ from boltz.data.filter.dynamic.filter import DynamicFilter
6
+
7
+
8
class DateFilter(DynamicFilter):
    """A filter that filters complexes based on their date.

    The date can be the deposition, release, or revision date. If the
    requested date is missing on a record, an earlier one is used as a
    fallback (revised -> released -> deposited). If no date is available
    at all, the complex is rejected.

    """

    def __init__(
        self,
        date: str,
        ref: Literal["deposited", "revised", "released"],
    ) -> None:
        """Initialize the filter.

        Parameters
        ----------
        date : str
            The maximum date (ISO format) of PDB entries to keep.
        ref : Literal["deposited", "revised", "released"]
            The reference date to use.

        Raises
        ------
        ValueError
            If `ref` is not one of the accepted values.

        """
        self.filter_date = datetime.fromisoformat(date)
        self.ref = ref

        if ref not in ["deposited", "revised", "released"]:
            # Implicit string concatenation — the original used commas
            # here, which made `msg` a tuple rather than a string.
            msg = (
                "Invalid reference date. Must be "
                "deposited, revised, or released"
            )
            raise ValueError(msg)

    def filter(self, record: Record) -> bool:
        """Filter a record based on its date.

        Parameters
        ----------
        record : Record
            The record to filter.

        Returns
        -------
        bool
            True if the record's date is on or before the cutoff.

        """
        structure = record.structure

        # Resolve the reference date, falling back to earlier dates
        # when the requested one is missing.
        if self.ref == "deposited":
            date = structure.deposited
        elif self.ref == "released":
            date = structure.released
            if not date:
                date = structure.deposited
        elif self.ref == "revised":
            date = structure.revised
            if not date and structure.released:
                date = structure.released
            elif not date:
                date = structure.deposited

        if date is None or date == "":
            return False

        date = datetime.fromisoformat(date)
        return date <= self.filter_date
protify/FastPLMs/boltz/src/boltz/data/filter/dynamic/filter.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from abc import ABC, abstractmethod
2
+
3
+ from boltz.data.types import Record
4
+
5
+
6
class DynamicFilter(ABC):
    """Abstract base class for record-level (dynamic) data filters."""

    @abstractmethod
    def filter(self, record: Record) -> bool:
        """Decide whether a data record passes this filter.

        Parameters
        ----------
        record : Record
            The object to consider filtering in / out.

        Returns
        -------
        bool
            True if the data passes the filter, False otherwise.

        """
        raise NotImplementedError
protify/FastPLMs/boltz/src/boltz/data/filter/dynamic/max_residues.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from boltz.data.types import Record
2
+ from boltz.data.filter.dynamic.filter import DynamicFilter
3
+
4
+
5
class MaxResiduesFilter(DynamicFilter):
    """A filter that filters records by their total residue count."""

    def __init__(self, min_residues: int = 1, max_residues: int = 500) -> None:
        """Initialize the filter.

        Parameters
        ----------
        min_residues : int
            The minimum number of residues allowed.
        max_residues : int
            The maximum number of residues allowed.

        """
        self.min_residues = min_residues
        self.max_residues = max_residues

    def filter(self, record: Record) -> bool:
        """Filter records based on their total residue count.

        Parameters
        ----------
        record : Record
            The record to filter.

        Returns
        -------
        bool
            True if the summed residue count over all chains lies within
            [min_residues, max_residues].

        """
        num_residues = sum(chain.num_residues for chain in record.chains)
        return self.min_residues <= num_residues <= self.max_residues
protify/FastPLMs/boltz/src/boltz/data/filter/dynamic/resolution.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from boltz.data.types import Record
2
+ from boltz.data.filter.dynamic.filter import DynamicFilter
3
+
4
+
5
class ResolutionFilter(DynamicFilter):
    """A filter that filters complexes based on their resolution."""

    def __init__(self, resolution: float = 9.0) -> None:
        """Initialize the filter.

        Parameters
        ----------
        resolution : float, optional
            The maximum allowed resolution (in Angstroms, lower is better).

        """
        self.resolution = resolution

    def filter(self, record: Record) -> bool:
        """Filter complexes based on their resolution.

        Parameters
        ----------
        record : Record
            The record to filter.

        Returns
        -------
        bool
            True if the structure's resolution is within the allowed maximum.

        """
        return record.structure.resolution <= self.resolution
protify/FastPLMs/boltz/src/boltz/data/filter/dynamic/size.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from boltz.data.types import Record
2
+ from boltz.data.filter.dynamic.filter import DynamicFilter
3
+
4
+
5
class SizeFilter(DynamicFilter):
    """A filter that filters structures based on their size."""

    def __init__(self, min_chains: int = 1, max_chains: int = 300) -> None:
        """Initialize the filter.

        Parameters
        ----------
        min_chains : int
            The minimum number of valid chains required.
        max_chains : int
            The maximum number of total chains allowed.

        """
        self.min_chains = min_chains
        self.max_chains = max_chains

    def filter(self, record: Record) -> bool:
        """Filter structures based on their chain counts.

        The total chain count must not exceed ``max_chains``, and at
        least ``min_chains`` of the record's chains must be valid.

        Parameters
        ----------
        record : Record
            The record to filter.

        Returns
        -------
        bool
            True if the record passes the size constraints.

        """
        num_chains = record.structure.num_chains
        num_valid = sum(1 for chain in record.chains if chain.valid)
        return num_chains <= self.max_chains and num_valid >= self.min_chains
protify/FastPLMs/boltz/src/boltz/data/filter/dynamic/subset.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+
3
+ from boltz.data.types import Record
4
+ from boltz.data.filter.dynamic.filter import DynamicFilter
5
+
6
+
7
class SubsetFilter(DynamicFilter):
    """Filter a data record based on a subset of the data."""

    def __init__(self, subset: str, reverse: bool = False) -> None:
        """Initialize the filter.

        Parameters
        ----------
        subset : str
            Path to a file listing the subset of record ids to keep,
            one per line.
        reverse : bool, optional
            If True, keep records NOT in the subset instead.

        """
        # Read the id list once; comparisons are case-insensitive.
        with Path(subset).open("r") as f:
            entries = f.read().splitlines()

        self.subset = {s.lower() for s in entries}
        self.reverse = reverse

    def filter(self, record: Record) -> bool:
        """Filter a data record by subset membership.

        Parameters
        ----------
        record : Record
            The object to consider filtering in / out.

        Returns
        -------
        bool
            True if the data passes the filter, False otherwise.

        """
        if self.reverse:
            return record.id.lower() not in self.subset
        else:  # noqa: RET505
            return record.id.lower() in self.subset
protify/FastPLMs/boltz/src/boltz/data/filter/static/__init__.py ADDED
File without changes
protify/FastPLMs/boltz/src/boltz/data/filter/static/filter.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from abc import ABC, abstractmethod
2
+
3
+ import numpy as np
4
+
5
+ from boltz.data.types import Structure
6
+
7
+
8
class StaticFilter(ABC):
    """Abstract base class for chain-level (static) structure filters."""

    @abstractmethod
    def filter(self, structure: Structure) -> np.ndarray:
        """Filter chains in a structure.

        Parameters
        ----------
        structure : Structure
            The structure to filter chains from.

        Returns
        -------
        np.ndarray
            The chains to keep, as a boolean mask.

        """
        raise NotImplementedError
protify/FastPLMs/boltz/src/boltz/data/filter/static/ligand.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+
3
+ from boltz.data import const
4
+ from boltz.data.filter.static.filter import StaticFilter
5
+ from boltz.data.types import Structure
6
+
7
+
8
class ExcludedLigands(StaticFilter):
    """Filter out non-polymer chains containing excluded ligands."""

    def filter(self, structure: Structure) -> np.ndarray:
        """Mark non-polymer chains with excluded ligands as invalid.

        Parameters
        ----------
        structure : Structure
            The structure to filter chains from.

        Returns
        -------
        np.ndarray
            The chains to keep, as a boolean mask.

        """
        keep = np.ones(len(structure.chains), dtype=bool)

        for idx, chain in enumerate(structure.chains):
            # Only non-polymer (ligand) chains are candidates.
            if chain["mol_type"] != const.chain_type_ids["NONPOLYMER"]:
                continue

            first = chain["res_idx"]
            residues = structure.residues[first : first + chain["res_num"]]
            if any(res["name"] in const.ligand_exclusion for res in residues):
                keep[idx] = 0

        return keep
protify/FastPLMs/boltz/src/boltz/data/filter/static/polymer.py ADDED
@@ -0,0 +1,299 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import itertools
2
+ from dataclasses import dataclass
3
+
4
+ import numpy as np
5
+ from sklearn.neighbors import KDTree
6
+
7
+ from boltz.data import const
8
+ from boltz.data.filter.static.filter import StaticFilter
9
+ from boltz.data.types import Structure
10
+
11
+
12
class MinimumLengthFilter(StaticFilter):
    """Filter polymers based on their length.

    Both bounds are checked against the number of *resolved* residues.
    (NOTE(review): an earlier docstring claimed the maximum used the full
    sequence length, but the code has always compared the resolved count
    against both bounds — the doc now matches the code.)

    """

    def __init__(self, min_len: int = 4, max_len: int = 5000) -> None:
        """Initialize the filter.

        Parameters
        ----------
        min_len : int, optional
            The minimum allowed number of resolved residues.
        max_len : int, optional
            The maximum allowed number of resolved residues.

        """
        self._min = min_len
        self._max = max_len

    def filter(self, structure: Structure) -> np.ndarray:
        """Filter chains based on their length.

        Non-polymer chains always pass.

        Parameters
        ----------
        structure : Structure
            The structure to filter chains from.

        Returns
        -------
        np.ndarray
            The chains to keep, as a boolean mask.

        """
        valid = np.ones(len(structure.chains), dtype=bool)

        for i, chain in enumerate(structure.chains):
            if chain["mol_type"] == const.chain_type_ids["NONPOLYMER"]:
                continue

            res_start = chain["res_idx"]
            res_end = res_start + chain["res_num"]
            residues = structure.residues[res_start:res_end]
            resolved = residues["is_present"].sum()

            if (resolved < self._min) or (resolved > self._max):
                valid[i] = 0

        return valid
63
+
64
+
65
class UnknownFilter(StaticFilter):
    """Filter out polymer chains whose residues are all unknown."""

    def filter(self, structure: Structure) -> np.ndarray:
        """Invalidate chains consisting entirely of unknown residues.

        Parameters
        ----------
        structure : Structure
            The structure to filter chains from.

        Returns
        -------
        np.ndarray
            The chains to keep, as a boolean mask.

        """
        # Unknown-residue token id per polymer type.
        unk_toks = {
            const.chain_type_ids["PROTEIN"]: const.unk_token_ids["PROTEIN"],
            const.chain_type_ids["DNA"]: const.unk_token_ids["DNA"],
            const.chain_type_ids["RNA"]: const.unk_token_ids["RNA"],
        }

        keep = np.ones(len(structure.chains), dtype=bool)
        for idx, chain in enumerate(structure.chains):
            # Ligand chains have no notion of unknown residues here.
            if chain["mol_type"] == const.chain_type_ids["NONPOLYMER"]:
                continue

            first = chain["res_idx"]
            residues = structure.residues[first : first + chain["res_num"]]

            if np.all(residues["res_type"] == unk_toks[chain["mol_type"]]):
                keep[idx] = 0

        return keep
102
+
103
+
104
class ConsecutiveCA(StaticFilter):
    """Filter proteins with consecutive CA atoms above a threshold."""

    def __init__(self, max_dist: float = 10.0) -> None:
        """Initialize the filter.

        Parameters
        ----------
        max_dist : float, optional
            The maximum allowed distance between consecutive C-alphas.
            (Annotation fixed: the default is a float, not an int.)

        """
        self._max_dist = max_dist

    def filter(self, structure: Structure) -> np.ndarray:
        """Invalidate protein chains with a too-large CA-CA gap.

        Parameters
        ----------
        structure : Structure
            The structure to filter chains from.

        Returns
        -------
        np.ndarray
            The chains to keep, as a boolean mask.

        """
        valid = np.ones(len(structure.chains), dtype=bool)

        # Remove chain if consecutive CA atoms are above threshold
        for i, chain in enumerate(structure.chains):
            # Skip non-protein chains
            if chain["mol_type"] != const.chain_type_ids["PROTEIN"]:
                continue

            # Get residues
            res_start = chain["res_idx"]
            res_end = res_start + chain["res_num"]
            residues = structure.residues[res_start:res_end]

            # Get c-alphas (the per-residue center atom)
            ca_ids = residues["atom_center"]
            ca_atoms = structure.atoms[ca_ids]

            res_valid = residues["is_present"]
            ca_valid = ca_atoms["is_present"] & res_valid
            ca_coords = ca_atoms["coords"]

            # Distances between consecutive CAs, then keep only pairs where
            # both endpoints are resolved before thresholding.
            dist = np.linalg.norm(ca_coords[1:] - ca_coords[:-1], axis=1)
            dist = dist > self._max_dist
            dist = dist[ca_valid[1:] & ca_valid[:-1]]

            # Remove the chain if any valid pair is above threshold
            if np.any(dist):
                valid[i] = 0

        return valid
163
+
164
+
165
+ @dataclass(frozen=True)
166
+ class Clash:
167
+ """A clash between two chains."""
168
+
169
+ chain: int
170
+ other: int
171
+ num_atoms: int
172
+ num_clashes: int
173
+
174
+
175
class ClashingChainsFilter(StaticFilter):
    """A filter that filters clashing chains.

    Clashing chains are defined as those with >30% of atoms within
    1.7 Å of an atom in another chain. If two chains clash with each
    other, the chain with the greater fraction of clashing atoms is
    removed; ties go to the chain with fewer total atoms, and then to
    the chain with the larger chain id.

    """

    def __init__(self, dist: float = 1.7, freq: float = 0.3) -> None:
        """Initialize the filter.

        Parameters
        ----------
        dist : float, optional
            The maximum distance for a clash.
        freq : float, optional
            The maximum allowed frequency of clashes.

        """
        self._dist = dist
        self._freq = freq

    def filter(self, structure: Structure) -> np.ndarray:  # noqa: PLR0912, C901
        """Filter out clashing chains.

        Parameters
        ----------
        structure : Structure
            The structure to filter chains from.

        Returns
        -------
        np.ndarray
            The chains to keep, as a boolean mask.

        """
        num_chains = len(structure.chains)
        if num_chains < 2:  # noqa: PLR2004
            return np.ones(num_chains, dtype=bool)

        # Detect clashes over every unordered chain pair.
        clashes: list[Clash] = []
        for i, j in itertools.combinations(range(num_chains), 2):
            c1 = structure.chains[i]
            c2 = structure.chains[j]

            # Slice out each chain's present atoms.
            c1_start = c1["atom_idx"]
            c2_start = c2["atom_idx"]
            atoms1 = structure.atoms[c1_start : c1_start + c1["atom_num"]]
            atoms2 = structure.atoms[c2_start : c2_start + c2["atom_num"]]
            atoms1 = atoms1[atoms1["is_present"]]
            atoms2 = atoms2[atoms2["is_present"]]

            # Nothing to compare if either chain is fully unresolved.
            if len(atoms1) == 0 or len(atoms2) == 0:
                continue

            # Count atoms within clash distance using a KD-tree radius
            # query rather than a full distance matrix.
            tree = KDTree(atoms1["coords"], metric="euclidean")
            query = tree.query_radius(atoms2["coords"], self._dist)

            c2_clashes = sum(len(neighbors) > 0 for neighbors in query)
            c1_clashes = len(set(itertools.chain.from_iterable(query)))

            # Record a Clash for each chain that exceeds the frequency.
            if (c1_clashes / len(atoms1)) > self._freq:
                clashes.append(Clash(i, j, len(atoms1), c1_clashes))
            if (c2_clashes / len(atoms2)) > self._freq:
                clashes.append(Clash(j, i, len(atoms2), c2_clashes))

        # Resolve which chain of each clashing pair to drop.
        removed = set()
        ids_to_clash = {(c.chain, c.other): c for c in clashes}

        for clash in clashes:
            # Skip pairs already settled by an earlier decision.
            if clash.chain in removed or clash.other in removed:
                continue

            mutual = ids_to_clash.get((clash.other, clash.chain))
            if mutual is not None:
                # Mutual clash: drop by frequency, then size, then id.
                freq_a = clash.num_clashes / clash.num_atoms
                freq_b = mutual.num_clashes / mutual.num_atoms
                if freq_a > freq_b:
                    removed.add(clash.chain)
                elif freq_a < freq_b:
                    removed.add(clash.other)
                elif clash.num_atoms < mutual.num_atoms:
                    removed.add(clash.chain)
                elif clash.num_atoms > mutual.num_atoms:
                    removed.add(clash.other)
                else:
                    removed.add(max(clash.chain, clash.other))
            else:
                # One-sided clash: drop the offending chain.
                removed.add(clash.chain)

        # Build the boolean keep-mask.
        valid = np.ones(len(structure.chains), dtype=bool)
        for i in removed:
            valid[i] = 0

        return valid
protify/FastPLMs/boltz/src/boltz/data/module/__init__.py ADDED
File without changes
protify/FastPLMs/boltz/src/boltz/data/module/inference.py ADDED
@@ -0,0 +1,310 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from typing import Optional
3
+
4
+ import numpy as np
5
+ import pytorch_lightning as pl
6
+ import torch
7
+ from torch import Tensor
8
+ from torch.utils.data import DataLoader
9
+
10
+ from boltz.data import const
11
+ from boltz.data.feature.featurizer import BoltzFeaturizer
12
+ from boltz.data.pad import pad_to_max
13
+ from boltz.data.tokenize.boltz import BoltzTokenizer
14
+ from boltz.data.types import (
15
+ MSA,
16
+ Connection,
17
+ Input,
18
+ Manifest,
19
+ Record,
20
+ ResidueConstraints,
21
+ Structure,
22
+ )
23
+
24
+
25
def load_input(
    record: Record,
    target_dir: Path,
    msa_dir: Path,
    constraints_dir: Optional[Path] = None,
) -> Input:
    """Load the structure, MSAs and optional constraints for one record.

    Parameters
    ----------
    record : Record
        The record to load.
    target_dir : Path
        The path to the data directory.
    msa_dir : Path
        The path to msa directory.
    constraints_dir : Optional[Path]
        The path to the constraints directory, if any.

    Returns
    -------
    Input
        The loaded input.

    """
    # Read the raw structure arrays and wrap them in a Structure
    arrays = np.load(target_dir / f"{record.id}.npz")
    structure = Structure(
        atoms=arrays["atoms"],
        bonds=arrays["bonds"],
        residues=arrays["residues"],
        chains=arrays["chains"],
        connections=arrays["connections"].astype(Connection),
        interfaces=arrays["interfaces"],
        mask=arrays["mask"],
    )

    # Gather the MSA for every chain that has one (-1 marks "no MSA")
    msas = {
        chain.chain_id: MSA(**np.load(msa_dir / f"{chain.msa_id}.npz"))
        for chain in record.chains
        if chain.msa_id != -1
    }

    # Residue constraints are optional
    if constraints_dir is None:
        residue_constraints = None
    else:
        constraint_path = constraints_dir / f"{record.id}.npz"
        residue_constraints = ResidueConstraints.load(constraint_path)

    return Input(structure, msas, record, residue_constraints)
75
+
76
+
77
def collate(data: list[dict[str, Tensor]]) -> dict[str, Tensor]:
    """Collate a list of per-sample feature dicts into a single batch.

    Parameters
    ----------
    data : List[Dict[str, Tensor]]
        The data to collate.

    Returns
    -------
    Dict[str, Tensor]
        The collated data.

    """
    # Keys whose values are kept as plain python lists rather than being
    # stacked into a batch tensor.
    keep_as_list = {
        "all_coords",
        "all_resolved_mask",
        "crop_to_all_atom_map",
        "chain_symmetries",
        "amino_acids_symmetries",
        "ligand_symmetries",
        "record",
    }

    batch = {}
    for key in data[0]:
        values = [item[key] for item in data]
        if key not in keep_as_list:
            # Stack directly when shapes agree, otherwise pad to a
            # common shape first.
            first_shape = values[0].shape
            if all(v.shape == first_shape for v in values):
                values = torch.stack(values, dim=0)
            else:
                values, _ = pad_to_max(values, 0)
        batch[key] = values

    return batch
119
+
120
+
121
class PredictionDataset(torch.utils.data.Dataset):
    """Prediction dataset for Boltz inference.

    Yields featurized model inputs, one manifest record per item. When a
    record cannot be processed, the error is logged and the first record
    is substituted so a single bad input does not abort the whole run.
    """

    def __init__(
        self,
        manifest: Manifest,
        target_dir: Path,
        msa_dir: Path,
        constraints_dir: Optional[Path] = None,
    ) -> None:
        """Initialize the prediction dataset.

        Parameters
        ----------
        manifest : Manifest
            The manifest to load data from.
        target_dir : Path
            The path to the target directory.
        msa_dir : Path
            The path to the msa directory.
        constraints_dir : Optional[Path]
            The path to the constraints directory, if any.

        """
        super().__init__()
        self.manifest = manifest
        self.target_dir = target_dir
        self.msa_dir = msa_dir
        self.constraints_dir = constraints_dir
        self.tokenizer = BoltzTokenizer()
        self.featurizer = BoltzFeaturizer()

    def _skip_to_first(self, idx: int, error: Exception) -> dict:
        """Fall back to item 0 after a processing failure.

        The previous implementation recursed into ``__getitem__(0)``
        unconditionally, which loops until RecursionError when record 0
        is itself broken. Raise an explicit error in that case instead.

        Parameters
        ----------
        idx : int
            The index that failed to process.
        error : Exception
            The original failure, chained into the raised error.

        Returns
        -------
        dict
            The features of item 0.

        Raises
        ------
        RuntimeError
            If item 0 itself is the item that failed.

        """
        if idx == 0:
            msg = "Fallback record 0 could not be processed."
            raise RuntimeError(msg) from error
        return self.__getitem__(0)

    def __getitem__(self, idx: int) -> dict:
        """Get an item from the dataset.

        Returns
        -------
        Dict[str, Tensor]
            The sampled data features.

        """
        # Get a sample from the dataset
        record = self.manifest.records[idx]

        # Get the structure
        try:
            input_data = load_input(
                record,
                self.target_dir,
                self.msa_dir,
                self.constraints_dir,
            )
        except Exception as e:  # noqa: BLE001
            print(f"Failed to load input for {record.id} with error {e}. Skipping.")  # noqa: T201
            return self._skip_to_first(idx, e)

        # Tokenize structure
        try:
            tokenized = self.tokenizer.tokenize(input_data)
        except Exception as e:  # noqa: BLE001
            print(f"Tokenizer failed on {record.id} with error {e}. Skipping.")  # noqa: T201
            return self._skip_to_first(idx, e)

        # Inference specific options: only the first (binder, pocket)
        # constraint pair is used, when present.
        options = record.inference_options
        if options is None or len(options.pocket_constraints) == 0:
            binder, pocket = None, None
        else:
            binder, pocket = (
                options.pocket_constraints[0][0],
                options.pocket_constraints[0][1],
            )

        # Compute features
        try:
            features = self.featurizer.process(
                tokenized,
                training=False,
                max_atoms=None,
                max_tokens=None,
                max_seqs=const.max_msa_seqs,
                pad_to_max_seqs=False,
                symmetries={},
                compute_symmetries=False,
                inference_binder=binder,
                inference_pocket=pocket,
                compute_constraint_features=True,
            )
        except Exception as e:  # noqa: BLE001
            print(f"Featurizer failed on {record.id} with error {e}. Skipping.")  # noqa: T201
            return self._skip_to_first(idx, e)

        features["record"] = record
        return features

    def __len__(self) -> int:
        """Get the length of the dataset.

        Returns
        -------
        int
            The length of the dataset.

        """
        return len(self.manifest.records)
224
+
225
+
226
class BoltzInferenceDataModule(pl.LightningDataModule):
    """DataModule for Boltz inference."""

    def __init__(
        self,
        manifest: Manifest,
        target_dir: Path,
        msa_dir: Path,
        num_workers: int,
        constraints_dir: Optional[Path] = None,
    ) -> None:
        """Initialize the DataModule.

        Parameters
        ----------
        manifest : Manifest
            The manifest to load data from.
        target_dir : Path
            The path to the target directory.
        msa_dir : Path
            The path to the msa directory.
        num_workers : int
            The number of dataloader workers.
        constraints_dir : Optional[Path]
            The path to the constraints directory, if any.

        """
        super().__init__()
        self.manifest = manifest
        self.target_dir = target_dir
        self.msa_dir = msa_dir
        self.constraints_dir = constraints_dir
        self.num_workers = num_workers

    def predict_dataloader(self) -> DataLoader:
        """Build the prediction dataloader.

        Returns
        -------
        DataLoader
            A one-record-per-batch loader over the manifest, in order.

        """
        dataset = PredictionDataset(
            manifest=self.manifest,
            target_dir=self.target_dir,
            msa_dir=self.msa_dir,
            constraints_dir=self.constraints_dir,
        )
        loader_kwargs = {
            "batch_size": 1,
            "num_workers": self.num_workers,
            "pin_memory": True,
            "shuffle": False,
            "collate_fn": collate,
        }
        return DataLoader(dataset, **loader_kwargs)

    def transfer_batch_to_device(
        self,
        batch: dict,
        device: torch.device,
        dataloader_idx: int,  # noqa: ARG002
    ) -> dict:
        """Transfer a batch to the given device.

        Parameters
        ----------
        batch : Dict
            The batch to transfer.
        device : torch.device
            The device to transfer to.
        dataloader_idx : int
            The dataloader index (unused).

        Returns
        -------
        dict
            The batch with tensor entries moved to the device.

        """
        # These entries are collated as plain lists (see `collate`) and
        # therefore cannot be moved with a single `.to(device)` call.
        host_only = {
            "all_coords",
            "all_resolved_mask",
            "crop_to_all_atom_map",
            "chain_symmetries",
            "amino_acids_symmetries",
            "ligand_symmetries",
            "record",
        }
        for key, value in batch.items():
            if key not in host_only:
                batch[key] = value.to(device)
        return batch
protify/FastPLMs/boltz/src/boltz/data/module/inferencev2.py ADDED
@@ -0,0 +1,433 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pickle
2
+ from pathlib import Path
3
+ from typing import Optional
4
+
5
+ import numpy as np
6
+ import pytorch_lightning as pl
7
+ import torch
8
+ from torch import Tensor
9
+ from torch.utils.data import DataLoader
10
+
11
+ from boltz.data import const
12
+ from boltz.data.crop.affinity import AffinityCropper
13
+ from boltz.data.feature.featurizerv2 import Boltz2Featurizer
14
+ from boltz.data.mol import load_canonicals, load_molecules
15
+ from boltz.data.pad import pad_to_max
16
+ from boltz.data.tokenize.boltz2 import Boltz2Tokenizer
17
+ from boltz.data.types import (
18
+ MSA,
19
+ Input,
20
+ Manifest,
21
+ Record,
22
+ ResidueConstraints,
23
+ StructureV2,
24
+ )
25
+
26
+
27
def load_input(
    record: Record,
    target_dir: Path,
    msa_dir: Path,
    constraints_dir: Optional[Path] = None,
    template_dir: Optional[Path] = None,
    extra_mols_dir: Optional[Path] = None,
    affinity: bool = False,
) -> Input:
    """Load the structure, MSAs, templates and constraints for one record.

    Parameters
    ----------
    record : Record
        The record to load.
    target_dir : Path
        The path to the data directory.
    msa_dir : Path
        The path to msa directory.
    constraints_dir : Optional[Path]
        The path to the constraints directory.
    template_dir : Optional[Path]
        The path to the template directory.
    extra_mols_dir : Optional[Path]
        The path to the extra molecules directory.
    affinity : bool
        Whether to load the affinity data.

    Returns
    -------
    Input
        The loaded input.

    """
    # The affinity workflow reads a pre-processed structure from a
    # per-record sub-directory; the default workflow reads it directly.
    if affinity:
        structure_path = target_dir / record.id / f"pre_affinity_{record.id}.npz"
    else:
        structure_path = target_dir / f"{record.id}.npz"
    structure = StructureV2.load(structure_path)

    # One MSA per chain; -1 marks chains without an MSA
    msas = {
        chain.chain_id: MSA.load(msa_dir / f"{chain.msa_id}.npz")
        for chain in record.chains
        if chain.msa_id != -1
    }

    # Templates are optional and keyed by template name
    templates = None
    if record.templates and template_dir is not None:
        templates = {}
        for info in record.templates:
            template_path = template_dir / f"{record.id}_{info.name}.npz"
            templates[info.name] = StructureV2.load(template_path)

    # Residue constraints are optional
    residue_constraints = None
    if constraints_dir is not None:
        residue_constraints = ResidueConstraints.load(
            constraints_dir / f"{record.id}.npz"
        )

    # Extra molecules are optional and may be absent for this record
    extra_mols = {}
    if extra_mols_dir is not None:
        extra_mol_path = extra_mols_dir / f"{record.id}.pkl"
        if extra_mol_path.exists():
            with extra_mol_path.open("rb") as handle:
                extra_mols = pickle.load(handle)  # noqa: S301

    return Input(
        structure,
        msas,
        record=record,
        residue_constraints=residue_constraints,
        templates=templates,
        extra_mols=extra_mols,
    )
110
+
111
+
112
def collate(data: list[dict[str, Tensor]]) -> dict[str, Tensor]:
    """Collate a list of per-sample feature dicts into a single batch.

    Parameters
    ----------
    data : List[Dict[str, Tensor]]
        The data to collate.

    Returns
    -------
    Dict[str, Tensor]
        The collated data.

    """
    # Keys whose values are kept as plain python lists rather than being
    # stacked into a batch tensor.
    keep_as_list = {
        "all_coords",
        "all_resolved_mask",
        "crop_to_all_atom_map",
        "chain_symmetries",
        "amino_acids_symmetries",
        "ligand_symmetries",
        "record",
        "affinity_mw",
    }

    batch = {}
    for key in data[0]:
        values = [item[key] for item in data]
        if key not in keep_as_list:
            # Stack directly when shapes agree, otherwise pad to a
            # common shape first.
            first_shape = values[0].shape
            if all(v.shape == first_shape for v in values):
                values = torch.stack(values, dim=0)
            else:
                values, _ = pad_to_max(values, 0)
        batch[key] = values

    return batch
155
+
156
+
157
class PredictionDataset(torch.utils.data.Dataset):
    """Prediction dataset for Boltz2 inference.

    Produces one featurized input per manifest record. Records that fail
    a processing stage are logged and replaced by record 0.
    """

    def __init__(
        self,
        manifest: Manifest,
        target_dir: Path,
        msa_dir: Path,
        mol_dir: Path,
        constraints_dir: Optional[Path] = None,
        template_dir: Optional[Path] = None,
        extra_mols_dir: Optional[Path] = None,
        override_method: Optional[str] = None,
        affinity: bool = False,
    ) -> None:
        """Initialize the prediction dataset.

        Parameters
        ----------
        manifest : Manifest
            The manifest to load data from.
        target_dir : Path
            The path to the target directory.
        msa_dir : Path
            The path to the msa directory.
        mol_dir : Path
            The path to the moldir.
        constraints_dir : Optional[Path]
            The path to the constraints directory.
        template_dir : Optional[Path]
            The path to the template directory.
        extra_mols_dir : Optional[Path]
            The path to the extra molecules directory.
        override_method : Optional[str]
            Method override forwarded to the featurizer — TODO confirm
            accepted values against Boltz2Featurizer.process.
        affinity : bool
            Whether to run the affinity workflow (adds cropping).

        """
        super().__init__()
        self.manifest = manifest
        self.target_dir = target_dir
        self.msa_dir = msa_dir
        self.mol_dir = mol_dir
        self.constraints_dir = constraints_dir
        self.template_dir = template_dir
        self.tokenizer = Boltz2Tokenizer()
        self.featurizer = Boltz2Featurizer()
        # Canonical CCD molecules are loaded once and shared by all items
        self.canonicals = load_canonicals(self.mol_dir)
        self.extra_mols_dir = extra_mols_dir
        self.override_method = override_method
        self.affinity = affinity
        # The cropper is only needed (and only constructed) for affinity
        if self.affinity:
            self.cropper = AffinityCropper()

    def __getitem__(self, idx: int) -> dict:
        """Get an item from the dataset.

        Returns
        -------
        Dict[str, Tensor]
            The sampled data features.

        """
        # Get record
        record = self.manifest.records[idx]

        # Load the input data.
        # NOTE(review): unlike the stages below, this load is not wrapped
        # in a try/except, so a missing or corrupt file raises here.
        input_data = load_input(
            record=record,
            target_dir=self.target_dir,
            msa_dir=self.msa_dir,
            constraints_dir=self.constraints_dir,
            template_dir=self.template_dir,
            extra_mols_dir=self.extra_mols_dir,
            affinity=self.affinity,
        )

        # Tokenize structure.
        # NOTE(review): every failure path below falls back to item 0 by
        # recursion; if record 0 itself fails this recurses forever.
        try:
            tokenized = self.tokenizer.tokenize(input_data)
        except Exception as e:  # noqa: BLE001
            print(  # noqa: T201
                f"Tokenizer failed on {record.id} with error {e}. Skipping."
            )
            return self.__getitem__(0)

        # For affinity, crop the tokenized structure to a fixed budget
        if self.affinity:
            try:
                tokenized = self.cropper.crop(
                    tokenized,
                    max_tokens=256,
                    max_atoms=2048,
                )
            except Exception as e:  # noqa: BLE001
                print(f"Cropper failed on {record.id} with error {e}. Skipping.")  # noqa: T201
                return self.__getitem__(0)

        # Collect conformers: canonical molecules and per-record extras
        # first, then fetch any residue names still missing from mol_dir.
        try:
            molecules = {}
            molecules.update(self.canonicals)
            molecules.update(input_data.extra_mols)
            mol_names = set(tokenized.tokens["res_name"].tolist())
            mol_names = mol_names - set(molecules.keys())
            molecules.update(load_molecules(self.mol_dir, mol_names))
        except Exception as e:  # noqa: BLE001
            print(f"Molecule loading failed for {record.id} with error {e}. Skipping.")
            return self.__getitem__(0)

        # Inference specific options
        options = record.inference_options
        if options is None:
            pocket_constraints, contact_constraints = None, None
        else:
            pocket_constraints, contact_constraints = (
                options.pocket_constraints,
                options.contact_constraints,
            )

        # Fixed seed so featurization is deterministic across runs
        seed = 42
        random = np.random.default_rng(seed)

        # Compute features
        try:
            features = self.featurizer.process(
                tokenized,
                molecules=molecules,
                random=random,
                training=False,
                max_atoms=None,
                max_tokens=None,
                max_seqs=const.max_msa_seqs,
                pad_to_max_seqs=False,
                single_sequence_prop=0.0,
                compute_frames=True,
                inference_pocket_constraints=pocket_constraints,
                inference_contact_constraints=contact_constraints,
                compute_constraint_features=True,
                override_method=self.override_method,
                compute_affinity=self.affinity,
            )
        except Exception as e:  # noqa: BLE001
            import traceback

            traceback.print_exc()
            print(f"Featurizer failed on {record.id} with error {e}. Skipping.")  # noqa: T201
            return self.__getitem__(0)

        # Attach the record so downstream writers know what was predicted
        features["record"] = record
        return features

    def __len__(self) -> int:
        """Get the length of the dataset.

        Returns
        -------
        int
            The length of the dataset.

        """
        return len(self.manifest.records)
315
+
316
+
317
class Boltz2InferenceDataModule(pl.LightningDataModule):
    """DataModule for Boltz2 inference."""

    def __init__(
        self,
        manifest: Manifest,
        target_dir: Path,
        msa_dir: Path,
        mol_dir: Path,
        num_workers: int,
        constraints_dir: Optional[Path] = None,
        template_dir: Optional[Path] = None,
        extra_mols_dir: Optional[Path] = None,
        override_method: Optional[str] = None,
        affinity: bool = False,
    ) -> None:
        """Initialize the DataModule.

        Parameters
        ----------
        manifest : Manifest
            The manifest to load data from.
        target_dir : Path
            The path to the target directory.
        msa_dir : Path
            The path to the msa directory.
        mol_dir : Path
            The path to the moldir.
        num_workers : int
            The number of workers to use.
        constraints_dir : Optional[Path]
            The path to the constraints directory.
        template_dir : Optional[Path]
            The path to the template directory.
        extra_mols_dir : Optional[Path]
            The path to the extra molecules directory.
        override_method : Optional[str]
            The method to override.
        affinity : bool
            Whether to run the affinity workflow.

        """
        super().__init__()
        self.num_workers = num_workers
        self.manifest = manifest
        self.target_dir = target_dir
        self.msa_dir = msa_dir
        self.mol_dir = mol_dir
        self.constraints_dir = constraints_dir
        self.template_dir = template_dir
        self.extra_mols_dir = extra_mols_dir
        self.override_method = override_method
        self.affinity = affinity

    def predict_dataloader(self) -> DataLoader:
        """Build the prediction dataloader.

        Returns
        -------
        DataLoader
            A one-record-per-batch loader over the manifest, in order.

        """
        dataset = PredictionDataset(
            manifest=self.manifest,
            target_dir=self.target_dir,
            msa_dir=self.msa_dir,
            mol_dir=self.mol_dir,
            constraints_dir=self.constraints_dir,
            template_dir=self.template_dir,
            extra_mols_dir=self.extra_mols_dir,
            override_method=self.override_method,
            affinity=self.affinity,
        )
        return DataLoader(
            dataset,
            batch_size=1,
            num_workers=self.num_workers,
            pin_memory=True,
            shuffle=False,
            collate_fn=collate,
        )

    def transfer_batch_to_device(
        self,
        batch: dict,
        device: torch.device,
        dataloader_idx: int,  # noqa: ARG002
    ) -> dict:
        """Transfer a batch to the given device.

        Parameters
        ----------
        batch : Dict
            The batch to transfer.
        device : torch.device
            The device to transfer to.
        dataloader_idx : int
            The dataloader index (unused).

        Returns
        -------
        dict
            The batch with tensor entries moved to the device.

        """
        # Entries collated as plain lists (see `collate`) stay on the host
        for key in batch:
            if key not in [
                "all_coords",
                "all_resolved_mask",
                "crop_to_all_atom_map",
                "chain_symmetries",
                "amino_acids_symmetries",
                "ligand_symmetries",
                "record",
                "affinity_mw",
            ]:
                batch[key] = batch[key].to(device)
        return batch
protify/FastPLMs/boltz/src/boltz/data/module/training.py ADDED
@@ -0,0 +1,687 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+ from pathlib import Path
3
+ from typing import Optional
4
+
5
+ import numpy as np
6
+ import pytorch_lightning as pl
7
+ import torch
8
+ from torch import Tensor
9
+ from torch.utils.data import DataLoader
10
+
11
+ from boltz.data.crop.cropper import Cropper
12
+ from boltz.data.feature.featurizer import BoltzFeaturizer
13
+ from boltz.data.feature.symmetry import get_symmetries
14
+ from boltz.data.filter.dynamic.filter import DynamicFilter
15
+ from boltz.data.pad import pad_to_max
16
+ from boltz.data.sample.sampler import Sample, Sampler
17
+ from boltz.data.tokenize.tokenizer import Tokenizer
18
+ from boltz.data.types import MSA, Connection, Input, Manifest, Record, Structure
19
+
20
+
21
@dataclass
class DatasetConfig:
    """Dataset configuration."""

    # Directory with the processed structures for this dataset
    target_dir: str
    # Directory with the processed MSAs
    msa_dir: str
    # Sampling probability of this dataset relative to the others
    prob: float
    # Strategy that yields training samples from the manifest records
    sampler: Sampler
    # Strategy that crops a tokenized structure to the size budget
    cropper: Cropper
    # Optional dataset-specific dynamic filters
    filters: Optional[list] = None
    # Optional split name — presumably train/val; TODO confirm usage
    split: Optional[str] = None
    # Optional explicit manifest path overriding the default location
    manifest_path: Optional[str] = None
33
+
34
+
35
@dataclass
class DataConfig:
    """Data configuration."""

    # Per-dataset configurations and globally applied dynamic filters
    datasets: list[DatasetConfig]
    filters: list[DynamicFilter]
    # Featurization / tokenization components shared by all datasets
    featurizer: BoltzFeaturizer
    tokenizer: Tokenizer
    # Size budgets for a training example
    max_atoms: int
    max_tokens: int
    max_seqs: int
    # Epoch / dataloader settings
    samples_per_epoch: int
    batch_size: int
    num_workers: int
    random_seed: int
    pin_memory: bool
    # Path to precomputed symmetry data — TODO confirm file format
    symmetries: str
    # Atom-attention windowing and distance binning parameters,
    # forwarded to the featurizer
    atoms_per_window_queries: int
    min_dist: float
    max_dist: float
    num_bins: int
    # If set, restrict each dataset to its first `overfit` records
    overfit: Optional[int] = None
    # Padding behaviour for tokens / atoms / MSA sequences
    pad_to_max_tokens: bool = False
    pad_to_max_atoms: bool = False
    pad_to_max_seqs: bool = False
    # Whether validation samples are cropped like training samples
    crop_validation: bool = False
    # Whether symmetry features are returned for train / val examples
    return_train_symmetries: bool = False
    return_val_symmetries: bool = True
    # Pocket-conditioning sampling parameters for train / val
    train_binder_pocket_conditioned_prop: float = 0.0
    val_binder_pocket_conditioned_prop: float = 0.0
    binder_pocket_cutoff: float = 6.0
    binder_pocket_sampling_geometric_p: float = 0.0
    val_batch_size: int = 1
    # Whether constraint features are computed by the featurizer
    compute_constraint_features: bool = False
69
+
70
+
71
@dataclass
class Dataset:
    """Data holder."""

    # Directory containing the processed structures
    target_dir: Path
    # Directory containing the processed MSAs
    msa_dir: Path
    # Manifest of the records in this dataset
    manifest: Manifest
    # Sampling probability relative to the other datasets
    prob: float
    # Sampling / cropping strategies for this dataset
    sampler: Sampler
    cropper: Cropper
    # Shared tokenizer / featurizer instances
    tokenizer: Tokenizer
    featurizer: BoltzFeaturizer
83
+
84
+
85
def load_input(record: Record, target_dir: Path, msa_dir: Path) -> Input:
    """Load the given input data.

    Parameters
    ----------
    record : Record
        The record to load.
    target_dir : Path
        The path to the data directory.
    msa_dir : Path
        The path to msa directory.

    Returns
    -------
    Input
        The loaded input.

    """
    # Load the structure arrays from the per-record npz file
    structure = np.load(target_dir / "structures" / f"{record.id}.npz")

    # Backward compatibility: older processed files lack the per-chain
    # "cyclic_period" field, so rebuild the structured array with it.
    chains = structure["chains"]
    if "cyclic_period" not in chains.dtype.names:
        # Extend the structured dtype with the missing int32 field
        new_dtype = chains.dtype.descr + [("cyclic_period", "i4")]
        new_chains = np.empty(chains.shape, dtype=new_dtype)
        # Copy over existing fields
        for name in chains.dtype.names:
            new_chains[name] = chains[name]
        # Default of 0 — presumably means "not cyclic"; TODO confirm
        new_chains["cyclic_period"] = 0
        # Replace old chains array with new one
        chains = new_chains

    structure = Structure(
        atoms=structure["atoms"],
        bonds=structure["bonds"],
        residues=structure["residues"],
        chains=chains,  # chains var accounting for missing cyclic_period
        connections=structure["connections"].astype(Connection),
        interfaces=structure["interfaces"],
        mask=structure["mask"],
    )

    msas = {}
    for chain in record.chains:
        msa_id = chain.msa_id
        # Load the MSA for this chain, if any (-1 or "" marks "no MSA")
        if msa_id != -1 and msa_id != "":
            msa = np.load(msa_dir / f"{msa_id}.npz")
            msas[chain.chain_id] = MSA(**msa)

    return Input(structure, msas)
142
+
143
+
144
def collate(data: list[dict[str, Tensor]]) -> dict[str, Tensor]:
    """Collate a list of per-sample feature dicts into a single batch.

    Parameters
    ----------
    data : list[dict[str, Tensor]]
        The data to collate.

    Returns
    -------
    dict[str, Tensor]
        The collated data.

    """
    # Keys whose values are kept as plain python lists rather than being
    # stacked into a batch tensor.
    keep_as_list = {
        "all_coords",
        "all_resolved_mask",
        "crop_to_all_atom_map",
        "chain_symmetries",
        "amino_acids_symmetries",
        "ligand_symmetries",
    }

    batch = {}
    for key in data[0]:
        values = [item[key] for item in data]
        if key not in keep_as_list:
            # Stack directly when shapes agree, otherwise pad to a
            # common shape first.
            first_shape = values[0].shape
            if all(v.shape == first_shape for v in values):
                values = torch.stack(values, dim=0)
            else:
                values, _ = pad_to_max(values, 0)
        batch[key] = values

    return batch
185
+
186
+
187
+ class TrainingDataset(torch.utils.data.Dataset):
188
+ """Base iterable dataset."""
189
+
190
+ def __init__(
191
+ self,
192
+ datasets: list[Dataset],
193
+ samples_per_epoch: int,
194
+ symmetries: dict,
195
+ max_atoms: int,
196
+ max_tokens: int,
197
+ max_seqs: int,
198
+ pad_to_max_atoms: bool = False,
199
+ pad_to_max_tokens: bool = False,
200
+ pad_to_max_seqs: bool = False,
201
+ atoms_per_window_queries: int = 32,
202
+ min_dist: float = 2.0,
203
+ max_dist: float = 22.0,
204
+ num_bins: int = 64,
205
+ overfit: Optional[int] = None,
206
+ binder_pocket_conditioned_prop: Optional[float] = 0.0,
207
+ binder_pocket_cutoff: Optional[float] = 6.0,
208
+ binder_pocket_sampling_geometric_p: Optional[float] = 0.0,
209
+ return_symmetries: Optional[bool] = False,
210
+ compute_constraint_features: bool = False,
211
+ ) -> None:
212
+ """Initialize the training dataset."""
213
+ super().__init__()
214
+ self.datasets = datasets
215
+ self.probs = [d.prob for d in datasets]
216
+ self.samples_per_epoch = samples_per_epoch
217
+ self.symmetries = symmetries
218
+ self.max_tokens = max_tokens
219
+ self.max_seqs = max_seqs
220
+ self.max_atoms = max_atoms
221
+ self.pad_to_max_tokens = pad_to_max_tokens
222
+ self.pad_to_max_atoms = pad_to_max_atoms
223
+ self.pad_to_max_seqs = pad_to_max_seqs
224
+ self.atoms_per_window_queries = atoms_per_window_queries
225
+ self.min_dist = min_dist
226
+ self.max_dist = max_dist
227
+ self.num_bins = num_bins
228
+ self.binder_pocket_conditioned_prop = binder_pocket_conditioned_prop
229
+ self.binder_pocket_cutoff = binder_pocket_cutoff
230
+ self.binder_pocket_sampling_geometric_p = binder_pocket_sampling_geometric_p
231
+ self.return_symmetries = return_symmetries
232
+ self.compute_constraint_features = compute_constraint_features
233
+ self.samples = []
234
+ for dataset in datasets:
235
+ records = dataset.manifest.records
236
+ if overfit is not None:
237
+ records = records[:overfit]
238
+ iterator = dataset.sampler.sample(records, np.random)
239
+ self.samples.append(iterator)
240
+
241
+ def __getitem__(self, idx: int) -> dict[str, Tensor]:
242
+ """Get an item from the dataset.
243
+
244
+ Parameters
245
+ ----------
246
+ idx : int
247
+ The data index.
248
+
249
+ Returns
250
+ -------
251
+ dict[str, Tensor]
252
+ The sampled data features.
253
+
254
+ """
255
+ # Pick a random dataset
256
+ dataset_idx = np.random.choice(
257
+ len(self.datasets),
258
+ p=self.probs,
259
+ )
260
+ dataset = self.datasets[dataset_idx]
261
+
262
+ # Get a sample from the dataset
263
+ sample: Sample = next(self.samples[dataset_idx])
264
+
265
+ # Get the structure
266
+ try:
267
+ input_data = load_input(sample.record, dataset.target_dir, dataset.msa_dir)
268
+ except Exception as e:
269
+ print(
270
+ f"Failed to load input for {sample.record.id} with error {e}. Skipping."
271
+ )
272
+ return self.__getitem__(idx)
273
+
274
+ # Tokenize structure
275
+ try:
276
+ tokenized = dataset.tokenizer.tokenize(input_data)
277
+ except Exception as e:
278
+ print(f"Tokenizer failed on {sample.record.id} with error {e}. Skipping.")
279
+ return self.__getitem__(idx)
280
+
281
+ # Compute crop
282
+ try:
283
+ if self.max_tokens is not None:
284
+ tokenized = dataset.cropper.crop(
285
+ tokenized,
286
+ max_atoms=self.max_atoms,
287
+ max_tokens=self.max_tokens,
288
+ random=np.random,
289
+ chain_id=sample.chain_id,
290
+ interface_id=sample.interface_id,
291
+ )
292
+ except Exception as e:
293
+ print(f"Cropper failed on {sample.record.id} with error {e}. Skipping.")
294
+ return self.__getitem__(idx)
295
+
296
+ # Check if there are tokens
297
+ if len(tokenized.tokens) == 0:
298
+ msg = "No tokens in cropped structure."
299
+ raise ValueError(msg)
300
+
301
+ # Compute features
302
+ try:
303
+ features = dataset.featurizer.process(
304
+ tokenized,
305
+ training=True,
306
+ max_atoms=self.max_atoms if self.pad_to_max_atoms else None,
307
+ max_tokens=self.max_tokens if self.pad_to_max_tokens else None,
308
+ max_seqs=self.max_seqs,
309
+ pad_to_max_seqs=self.pad_to_max_seqs,
310
+ symmetries=self.symmetries,
311
+ atoms_per_window_queries=self.atoms_per_window_queries,
312
+ min_dist=self.min_dist,
313
+ max_dist=self.max_dist,
314
+ num_bins=self.num_bins,
315
+ compute_symmetries=self.return_symmetries,
316
+ binder_pocket_conditioned_prop=self.binder_pocket_conditioned_prop,
317
+ binder_pocket_cutoff=self.binder_pocket_cutoff,
318
+ binder_pocket_sampling_geometric_p=self.binder_pocket_sampling_geometric_p,
319
+ compute_constraint_features=self.compute_constraint_features,
320
+ )
321
+ except Exception as e:
322
+ print(f"Featurizer failed on {sample.record.id} with error {e}. Skipping.")
323
+ return self.__getitem__(idx)
324
+
325
+ return features
326
+
327
    def __len__(self) -> int:
        """Get the length of the dataset.

        Returns
        -------
        int
            The length of the dataset.

        """
        # The dataset is virtual: its length is the configured number of
        # samples drawn per epoch, not the number of underlying records.
        return self.samples_per_epoch
337
+
338
+
339
class ValidationDataset(torch.utils.data.Dataset):
    """Map-style dataset over the concatenated validation records.

    Indexes sequentially into the records of each sub-dataset, in the
    order the sub-datasets were given.
    """

    def __init__(
        self,
        datasets: list[Dataset],
        seed: int,
        symmetries: dict,
        max_atoms: Optional[int] = None,
        max_tokens: Optional[int] = None,
        max_seqs: Optional[int] = None,
        pad_to_max_atoms: bool = False,
        pad_to_max_tokens: bool = False,
        pad_to_max_seqs: bool = False,
        atoms_per_window_queries: int = 32,
        min_dist: float = 2.0,
        max_dist: float = 22.0,
        num_bins: int = 64,
        overfit: Optional[int] = None,
        crop_validation: bool = False,
        return_symmetries: Optional[bool] = False,
        binder_pocket_conditioned_prop: Optional[float] = 0.0,
        binder_pocket_cutoff: Optional[float] = 6.0,
        compute_constraint_features: bool = False,
    ) -> None:
        """Initialize the validation dataset."""
        super().__init__()
        self.datasets = datasets
        self.max_atoms = max_atoms
        self.max_tokens = max_tokens
        self.max_seqs = max_seqs
        self.seed = seed
        self.symmetries = symmetries
        # NOTE(review): truthiness check — overfit=0 selects the seeded RNG
        # like overfit=None, unlike the `is not None` checks used elsewhere.
        self.random = np.random if overfit else np.random.RandomState(self.seed)
        self.pad_to_max_tokens = pad_to_max_tokens
        self.pad_to_max_atoms = pad_to_max_atoms
        self.pad_to_max_seqs = pad_to_max_seqs
        self.overfit = overfit
        self.crop_validation = crop_validation
        self.atoms_per_window_queries = atoms_per_window_queries
        self.min_dist = min_dist
        self.max_dist = max_dist
        self.num_bins = num_bins
        self.return_symmetries = return_symmetries
        self.binder_pocket_conditioned_prop = binder_pocket_conditioned_prop
        self.binder_pocket_cutoff = binder_pocket_cutoff
        self.compute_constraint_features = compute_constraint_features

    def __getitem__(self, idx: int) -> dict[str, Tensor]:
        """Get an item from the dataset.

        Parameters
        ----------
        idx : int
            The data index.

        Returns
        -------
        dict[str, Tensor]
            The sampled data features.

        """
        # Pick dataset based on idx: walk the sub-datasets, subtracting each
        # one's (possibly overfit-truncated) size until idx falls inside one.
        for dataset in self.datasets:
            size = len(dataset.manifest.records)
            if self.overfit is not None:
                size = min(size, self.overfit)
            if idx < size:
                break
            idx -= size

        # Get a sample from the dataset
        record = dataset.manifest.records[idx]

        # Get the structure.
        # NOTE(review): all failure paths below fall back to item 0; if
        # item 0 itself is broken this recurses until RecursionError.
        try:
            input_data = load_input(record, dataset.target_dir, dataset.msa_dir)
        except Exception as e:
            print(f"Failed to load input for {record.id} with error {e}. Skipping.")
            return self.__getitem__(0)

        # Tokenize structure
        try:
            tokenized = dataset.tokenizer.tokenize(input_data)
        except Exception as e:
            print(f"Tokenizer failed on {record.id} with error {e}. Skipping.")
            return self.__getitem__(0)

        # Compute crop (validation is only cropped when explicitly enabled)
        try:
            if self.crop_validation and (self.max_tokens is not None):
                tokenized = dataset.cropper.crop(
                    tokenized,
                    max_tokens=self.max_tokens,
                    random=self.random,
                    max_atoms=self.max_atoms,
                )
        except Exception as e:
            print(f"Cropper failed on {record.id} with error {e}. Skipping.")
            return self.__getitem__(0)

        # Check if there are tokens — unlike the other failures this one is
        # deliberately fatal rather than retried.
        if len(tokenized.tokens) == 0:
            msg = "No tokens in cropped structure."
            raise ValueError(msg)

        # Compute features
        try:
            # Only pad when cropping, otherwise validation keeps native sizes.
            pad_atoms = self.crop_validation and self.pad_to_max_atoms
            pad_tokens = self.crop_validation and self.pad_to_max_tokens

            features = dataset.featurizer.process(
                tokenized,
                training=False,
                max_atoms=self.max_atoms if pad_atoms else None,
                max_tokens=self.max_tokens if pad_tokens else None,
                max_seqs=self.max_seqs,
                pad_to_max_seqs=self.pad_to_max_seqs,
                symmetries=self.symmetries,
                atoms_per_window_queries=self.atoms_per_window_queries,
                min_dist=self.min_dist,
                max_dist=self.max_dist,
                num_bins=self.num_bins,
                compute_symmetries=self.return_symmetries,
                binder_pocket_conditioned_prop=self.binder_pocket_conditioned_prop,
                binder_pocket_cutoff=self.binder_pocket_cutoff,
                binder_pocket_sampling_geometric_p=1.0,  # this will only sample a single pocket token
                only_ligand_binder_pocket=True,
                compute_constraint_features=self.compute_constraint_features,
            )
        except Exception as e:
            print(f"Featurizer failed on {record.id} with error {e}. Skipping.")
            return self.__getitem__(0)

        return features

    def __len__(self) -> int:
        """Get the length of the dataset.

        Returns
        -------
        int
            The length of the dataset.

        """
        # Sum of per-dataset record counts, each capped by `overfit` if set.
        if self.overfit is not None:
            length = sum(len(d.manifest.records[: self.overfit]) for d in self.datasets)
        else:
            length = sum(len(d.manifest.records) for d in self.datasets)

        return length
490
+
491
+
492
class BoltzTrainingDataModule(pl.LightningDataModule):
    """DataModule for boltz.

    Loads each configured dataset's manifest, splits/filters its records,
    and wraps the results in a TrainingDataset / ValidationDataset pair.
    """

    def __init__(self, cfg: DataConfig) -> None:
        """Initialize the DataModule.

        Parameters
        ----------
        cfg : DataConfig
            The data configuration.

        """
        super().__init__()
        self.cfg = cfg

        assert self.cfg.val_batch_size == 1, "Validation only works with batch size=1."

        # Load symmetries
        symmetries = get_symmetries(cfg.symmetries)

        # Load datasets
        train: list[Dataset] = []
        val: list[Dataset] = []

        for data_config in cfg.datasets:
            # Set target_dir
            target_dir = Path(data_config.target_dir)
            msa_dir = Path(data_config.msa_dir)

            # Load manifest (explicit path wins over the default location)
            if data_config.manifest_path is not None:
                path = Path(data_config.manifest_path)
            else:
                path = target_dir / "manifest.json"
            manifest: Manifest = Manifest.load(path)

            # Split records if given: ids listed in the split file (matched
            # case-insensitively) go to validation, the rest to training.
            if data_config.split is not None:
                with Path(data_config.split).open("r") as f:
                    split = {x.lower() for x in f.read().splitlines()}

                train_records = []
                val_records = []
                for record in manifest.records:
                    if record.id.lower() in split:
                        val_records.append(record)
                    else:
                        train_records.append(record)
            else:
                train_records = manifest.records
                val_records = []

            # Filter training records with the global filters
            train_records = [
                record
                for record in train_records
                if all(f.filter(record) for f in cfg.filters)
            ]
            # Apply dataset-specific filters, if any
            if data_config.filters is not None:
                train_records = [
                    record
                    for record in train_records
                    if all(f.filter(record) for f in data_config.filters)
                ]

            # Create train dataset
            train_manifest = Manifest(train_records)
            train.append(
                Dataset(
                    target_dir,
                    msa_dir,
                    train_manifest,
                    data_config.prob,
                    data_config.sampler,
                    data_config.cropper,
                    cfg.tokenizer,
                    cfg.featurizer,
                )
            )

            # Create validation dataset
            if val_records:
                val_manifest = Manifest(val_records)
                val.append(
                    Dataset(
                        target_dir,
                        msa_dir,
                        val_manifest,
                        data_config.prob,
                        data_config.sampler,
                        data_config.cropper,
                        cfg.tokenizer,
                        cfg.featurizer,
                    )
                )

        # Print dataset sizes
        for dataset in train:
            dataset: Dataset
            print(f"Training dataset size: {len(dataset.manifest.records)}")

        for dataset in val:
            dataset: Dataset
            print(f"Validation dataset size: {len(dataset.manifest.records)}")

        # Create wrapper datasets
        self._train_set = TrainingDataset(
            datasets=train,
            samples_per_epoch=cfg.samples_per_epoch,
            max_atoms=cfg.max_atoms,
            max_tokens=cfg.max_tokens,
            max_seqs=cfg.max_seqs,
            pad_to_max_atoms=cfg.pad_to_max_atoms,
            pad_to_max_tokens=cfg.pad_to_max_tokens,
            pad_to_max_seqs=cfg.pad_to_max_seqs,
            symmetries=symmetries,
            atoms_per_window_queries=cfg.atoms_per_window_queries,
            min_dist=cfg.min_dist,
            max_dist=cfg.max_dist,
            num_bins=cfg.num_bins,
            overfit=cfg.overfit,
            binder_pocket_conditioned_prop=cfg.train_binder_pocket_conditioned_prop,
            binder_pocket_cutoff=cfg.binder_pocket_cutoff,
            binder_pocket_sampling_geometric_p=cfg.binder_pocket_sampling_geometric_p,
            return_symmetries=cfg.return_train_symmetries,
            compute_constraint_features=cfg.compute_constraint_features,
        )
        # When overfitting, validate on the training datasets themselves.
        self._val_set = ValidationDataset(
            datasets=train if cfg.overfit is not None else val,
            seed=cfg.random_seed,
            max_atoms=cfg.max_atoms,
            max_tokens=cfg.max_tokens,
            max_seqs=cfg.max_seqs,
            pad_to_max_atoms=cfg.pad_to_max_atoms,
            pad_to_max_tokens=cfg.pad_to_max_tokens,
            pad_to_max_seqs=cfg.pad_to_max_seqs,
            symmetries=symmetries,
            atoms_per_window_queries=cfg.atoms_per_window_queries,
            min_dist=cfg.min_dist,
            max_dist=cfg.max_dist,
            num_bins=cfg.num_bins,
            overfit=cfg.overfit,
            crop_validation=cfg.crop_validation,
            return_symmetries=cfg.return_val_symmetries,
            binder_pocket_conditioned_prop=cfg.val_binder_pocket_conditioned_prop,
            binder_pocket_cutoff=cfg.binder_pocket_cutoff,
            compute_constraint_features=cfg.compute_constraint_features,
        )

    def setup(self, stage: Optional[str] = None) -> None:
        """Run the setup for the DataModule.

        Parameters
        ----------
        stage : str, optional
            The stage, one of 'fit', 'validate', 'test'.

        """
        # All loading is done eagerly in __init__, so nothing to do here.
        return

    def train_dataloader(self) -> DataLoader:
        """Get the training dataloader.

        Returns
        -------
        DataLoader
            The training dataloader.

        """
        # Shuffling is unnecessary: TrainingDataset samples randomly itself.
        return DataLoader(
            self._train_set,
            batch_size=self.cfg.batch_size,
            num_workers=self.cfg.num_workers,
            pin_memory=self.cfg.pin_memory,
            shuffle=False,
            collate_fn=collate,
        )

    def val_dataloader(self) -> DataLoader:
        """Get the validation dataloader.

        Returns
        -------
        DataLoader
            The validation dataloader.

        """
        return DataLoader(
            self._val_set,
            batch_size=self.cfg.val_batch_size,
            num_workers=self.cfg.num_workers,
            pin_memory=self.cfg.pin_memory,
            shuffle=False,
            collate_fn=collate,
        )
protify/FastPLMs/boltz/src/boltz/data/module/trainingv2.py ADDED
@@ -0,0 +1,660 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+ from pathlib import Path
3
+ from typing import Optional
4
+
5
+ import numpy as np
6
+ import pytorch_lightning as pl
7
+ import torch
8
+ from torch import Tensor
9
+ from torch.utils.data import DataLoader
10
+
11
+ from boltz.data.crop.cropper import Cropper
12
+ from boltz.data.feature.featurizer import BoltzFeaturizer
13
+ from boltz.data.feature.symmetry import get_symmetries
14
+ from boltz.data.filter.dynamic.filter import DynamicFilter
15
+ from boltz.data.pad import pad_to_max
16
+ from boltz.data.sample.sampler import Sample, Sampler
17
+ from boltz.data.tokenize.tokenizer import Tokenizer
18
+ from boltz.data.types import MSA, Connection, Input, Manifest, Record, Structure
19
+
20
+
21
@dataclass
class DatasetConfig:
    """Dataset configuration.

    Describes one source dataset: where its processed targets and MSAs
    live, its relative sampling weight, and how to sample/crop it.
    """

    # Directory with processed structures (and the default manifest location).
    target_dir: str
    # Directory containing per-chain MSA .npz files.
    msa_dir: str
    # Sampling probability of this dataset relative to the others.
    prob: float
    sampler: Sampler
    cropper: Cropper
    # Optional dataset-specific record filters, applied after the global ones.
    filters: Optional[list] = None
    # Optional path to a file of record ids (one per line) forming the val split.
    split: Optional[str] = None
    # Optional explicit manifest path; defaults to <target_dir>/manifest.json.
    manifest_path: Optional[str] = None
33
+
34
+
35
@dataclass
class DataConfig:
    """Data configuration shared across all datasets in the data module."""

    # Per-dataset configurations.
    datasets: list[DatasetConfig]
    # Global record filters applied to every dataset's training records.
    filters: list[DynamicFilter]
    featurizer: BoltzFeaturizer
    tokenizer: Tokenizer
    # Size limits for cropping/padding.
    max_atoms: int
    max_tokens: int
    max_seqs: int
    # Number of samples drawn per training epoch (virtual epoch length).
    samples_per_epoch: int
    batch_size: int
    num_workers: int
    random_seed: int
    pin_memory: bool
    # Path passed to get_symmetries to load symmetry data.
    symmetries: str
    atoms_per_window_queries: int
    min_dist: float
    max_dist: float
    num_bins: int
    # If set, restrict each dataset to its first `overfit` records.
    overfit: Optional[int] = None
    pad_to_max_tokens: bool = False
    pad_to_max_atoms: bool = False
    pad_to_max_seqs: bool = False
    # Whether to crop (and thus optionally pad) validation samples.
    crop_validation: bool = False
    return_train_symmetries: bool = False
    return_val_symmetries: bool = True
    train_binder_pocket_conditioned_prop: float = 0.0
    val_binder_pocket_conditioned_prop: float = 0.0
    binder_pocket_cutoff: float = 6.0
    binder_pocket_sampling_geometric_p: float = 0.0
    # Must be 1: the data module asserts this in __init__.
    val_batch_size: int = 1
68
+
69
+
70
@dataclass
class Dataset:
    """Data holder.

    Bundles a resolved (filtered/split) manifest with the directories and
    processing components needed to load and featurize its records.
    """

    target_dir: Path
    msa_dir: Path
    manifest: Manifest
    # Relative sampling probability used by TrainingDataset.
    prob: float
    sampler: Sampler
    cropper: Cropper
    tokenizer: Tokenizer
    featurizer: BoltzFeaturizer
82
+
83
+
84
def load_input(record: Record, target_dir: Path, msa_dir: Path) -> Input:
    """Load the given input data.

    Parameters
    ----------
    record : Record
        The record to load.
    target_dir : Path
        The path to the data directory.
    msa_dir : Path
        The path to msa directory.

    Returns
    -------
    Input
        The loaded input.

    """
    # Read the serialized structure arrays and wrap them in a Structure.
    arrays = np.load(target_dir / "structures" / f"{record.id}.npz")
    structure = Structure(
        atoms=arrays["atoms"],
        bonds=arrays["bonds"],
        residues=arrays["residues"],
        chains=arrays["chains"],
        connections=arrays["connections"].astype(Connection),
        interfaces=arrays["interfaces"],
        mask=arrays["mask"],
    )

    # Load the MSA for each chain that has one (-1 / "" mark "no MSA").
    msas = {}
    for chain in record.chains:
        msa_id = chain.msa_id
        if msa_id not in (-1, ""):
            msa_arrays = np.load(msa_dir / f"{msa_id}.npz")
            msas[chain.chain_id] = MSA(**msa_arrays)

    return Input(structure, msas)
123
+
124
+
125
def collate(data: list[dict[str, Tensor]]) -> dict[str, Tensor]:
    """Collate the data.

    Parameters
    ----------
    data : list[dict[str, Tensor]]
        The data to collate.

    Returns
    -------
    dict[str, Tensor]
        The collated data.

    """
    # These keys carry per-sample structures of varying size/shape, so they
    # are passed through as plain lists rather than stacked into a batch.
    passthrough = {
        "all_coords",
        "all_resolved_mask",
        "crop_to_all_atom_map",
        "chain_symmetries",
        "amino_acids_symmetries",
        "ligand_symmetries",
    }

    collated = {}
    for key in data[0]:
        values = [item[key] for item in data]

        if key in passthrough:
            collated[key] = values
            continue

        # Stack when shapes agree, otherwise pad to the common maximum.
        first_shape = values[0].shape
        if all(v.shape == first_shape for v in values):
            collated[key] = torch.stack(values, dim=0)
        else:
            padded, _ = pad_to_max(values, 0)
            collated[key] = padded

    return collated
166
+
167
+
168
class TrainingDataset(torch.utils.data.Dataset):
    """Virtual training dataset that samples across multiple sub-datasets.

    Each __getitem__ draws a random sub-dataset (weighted by its `prob`),
    pulls the next sample from that dataset's sampler, then loads, crops,
    and featurizes it. Samples that fail any processing step are skipped
    and a new one is drawn.
    """

    def __init__(
        self,
        datasets: list[Dataset],
        samples_per_epoch: int,
        symmetries: dict,
        max_atoms: int,
        max_tokens: int,
        max_seqs: int,
        pad_to_max_atoms: bool = False,
        pad_to_max_tokens: bool = False,
        pad_to_max_seqs: bool = False,
        atoms_per_window_queries: int = 32,
        min_dist: float = 2.0,
        max_dist: float = 22.0,
        num_bins: int = 64,
        overfit: Optional[int] = None,
        binder_pocket_conditioned_prop: Optional[float] = 0.0,
        binder_pocket_cutoff: Optional[float] = 6.0,
        binder_pocket_sampling_geometric_p: Optional[float] = 0.0,
        return_symmetries: Optional[bool] = False,
    ) -> None:
        """Initialize the training dataset."""
        super().__init__()
        self.datasets = datasets
        # Sampling weights, one per sub-dataset (assumed to sum to 1).
        self.probs = [d.prob for d in datasets]
        self.samples_per_epoch = samples_per_epoch
        self.symmetries = symmetries
        self.max_tokens = max_tokens
        self.max_seqs = max_seqs
        self.max_atoms = max_atoms
        self.pad_to_max_tokens = pad_to_max_tokens
        self.pad_to_max_atoms = pad_to_max_atoms
        self.pad_to_max_seqs = pad_to_max_seqs
        self.atoms_per_window_queries = atoms_per_window_queries
        self.min_dist = min_dist
        self.max_dist = max_dist
        self.num_bins = num_bins
        self.binder_pocket_conditioned_prop = binder_pocket_conditioned_prop
        self.binder_pocket_cutoff = binder_pocket_cutoff
        self.binder_pocket_sampling_geometric_p = binder_pocket_sampling_geometric_p
        self.return_symmetries = return_symmetries
        # One infinite sample iterator per sub-dataset.
        self.samples = []
        for dataset in datasets:
            records = dataset.manifest.records
            if overfit is not None:
                records = records[:overfit]
            iterator = dataset.sampler.sample(records, np.random)
            self.samples.append(iterator)

    def __getitem__(self, idx: int) -> dict[str, Tensor]:
        """Get an item from the dataset.

        Parameters
        ----------
        idx : int
            The data index.

        Returns
        -------
        dict[str, Tensor]
            The sampled data features.

        """
        # Resample until a draw survives every processing step. The previous
        # implementation recursed (`return self.__getitem__(idx)`) on each
        # failure, which could hit the recursion limit when many consecutive
        # samples fail; a loop performs the same retries without that risk.
        while True:
            # Pick a random dataset
            dataset_idx = np.random.choice(
                len(self.datasets),
                p=self.probs,
            )
            dataset = self.datasets[dataset_idx]

            # Get a sample from the dataset
            sample: Sample = next(self.samples[dataset_idx])

            # Get the structure
            try:
                input_data = load_input(sample.record, dataset.target_dir, dataset.msa_dir)
            except Exception as e:
                print(
                    f"Failed to load input for {sample.record.id} with error {e}. Skipping."
                )
                continue

            # Tokenize structure
            try:
                tokenized = dataset.tokenizer.tokenize(input_data)
            except Exception as e:
                print(f"Tokenizer failed on {sample.record.id} with error {e}. Skipping.")
                continue

            # Compute crop
            try:
                if self.max_tokens is not None:
                    tokenized = dataset.cropper.crop(
                        tokenized,
                        max_atoms=self.max_atoms,
                        max_tokens=self.max_tokens,
                        random=np.random,
                        chain_id=sample.chain_id,
                        interface_id=sample.interface_id,
                    )
            except Exception as e:
                print(f"Cropper failed on {sample.record.id} with error {e}. Skipping.")
                continue

            # Check if there are tokens; this is deliberately fatal
            # rather than retried.
            if len(tokenized.tokens) == 0:
                msg = "No tokens in cropped structure."
                raise ValueError(msg)

            # Compute features
            try:
                features = dataset.featurizer.process(
                    tokenized,
                    training=True,
                    max_atoms=self.max_atoms if self.pad_to_max_atoms else None,
                    max_tokens=self.max_tokens if self.pad_to_max_tokens else None,
                    max_seqs=self.max_seqs,
                    pad_to_max_seqs=self.pad_to_max_seqs,
                    symmetries=self.symmetries,
                    atoms_per_window_queries=self.atoms_per_window_queries,
                    min_dist=self.min_dist,
                    max_dist=self.max_dist,
                    num_bins=self.num_bins,
                    compute_symmetries=self.return_symmetries,
                    binder_pocket_conditioned_prop=self.binder_pocket_conditioned_prop,
                    binder_pocket_cutoff=self.binder_pocket_cutoff,
                    binder_pocket_sampling_geometric_p=self.binder_pocket_sampling_geometric_p,
                )
            except Exception as e:
                print(f"Featurizer failed on {sample.record.id} with error {e}. Skipping.")
                continue

            return features

    def __len__(self) -> int:
        """Get the length of the dataset.

        Returns
        -------
        int
            The length of the dataset.

        """
        # Virtual length: the configured number of samples drawn per epoch.
        return self.samples_per_epoch
315
+
316
+
317
class ValidationDataset(torch.utils.data.Dataset):
    """Map-style dataset over the concatenated validation records.

    Indexes sequentially into the records of each sub-dataset, in the
    order the sub-datasets were given.
    """

    def __init__(
        self,
        datasets: list[Dataset],
        seed: int,
        symmetries: dict,
        max_atoms: Optional[int] = None,
        max_tokens: Optional[int] = None,
        max_seqs: Optional[int] = None,
        pad_to_max_atoms: bool = False,
        pad_to_max_tokens: bool = False,
        pad_to_max_seqs: bool = False,
        atoms_per_window_queries: int = 32,
        min_dist: float = 2.0,
        max_dist: float = 22.0,
        num_bins: int = 64,
        overfit: Optional[int] = None,
        crop_validation: bool = False,
        return_symmetries: Optional[bool] = False,
        binder_pocket_conditioned_prop: Optional[float] = 0.0,
        binder_pocket_cutoff: Optional[float] = 6.0,
    ) -> None:
        """Initialize the validation dataset."""
        super().__init__()
        self.datasets = datasets
        self.max_atoms = max_atoms
        self.max_tokens = max_tokens
        self.max_seqs = max_seqs
        self.seed = seed
        self.symmetries = symmetries
        # NOTE(review): truthiness check — overfit=0 selects the seeded RNG
        # like overfit=None, unlike the `is not None` checks used elsewhere.
        self.random = np.random if overfit else np.random.RandomState(self.seed)
        self.pad_to_max_tokens = pad_to_max_tokens
        self.pad_to_max_atoms = pad_to_max_atoms
        self.pad_to_max_seqs = pad_to_max_seqs
        self.overfit = overfit
        self.crop_validation = crop_validation
        self.atoms_per_window_queries = atoms_per_window_queries
        self.min_dist = min_dist
        self.max_dist = max_dist
        self.num_bins = num_bins
        self.return_symmetries = return_symmetries
        self.binder_pocket_conditioned_prop = binder_pocket_conditioned_prop
        self.binder_pocket_cutoff = binder_pocket_cutoff

    def __getitem__(self, idx: int) -> dict[str, Tensor]:
        """Get an item from the dataset.

        Parameters
        ----------
        idx : int
            The data index.

        Returns
        -------
        dict[str, Tensor]
            The sampled data features.

        """
        # Pick dataset based on idx: walk the sub-datasets, subtracting each
        # one's (possibly overfit-truncated) size until idx falls inside one.
        for dataset in self.datasets:
            size = len(dataset.manifest.records)
            if self.overfit is not None:
                size = min(size, self.overfit)
            if idx < size:
                break
            idx -= size

        # Get a sample from the dataset
        record = dataset.manifest.records[idx]

        # Get the structure.
        # NOTE(review): all failure paths below fall back to item 0; if
        # item 0 itself is broken this recurses until RecursionError.
        try:
            input_data = load_input(record, dataset.target_dir, dataset.msa_dir)
        except Exception as e:
            print(f"Failed to load input for {record.id} with error {e}. Skipping.")
            return self.__getitem__(0)

        # Tokenize structure
        try:
            tokenized = dataset.tokenizer.tokenize(input_data)
        except Exception as e:
            print(f"Tokenizer failed on {record.id} with error {e}. Skipping.")
            return self.__getitem__(0)

        # Compute crop (validation is only cropped when explicitly enabled)
        try:
            if self.crop_validation and (self.max_tokens is not None):
                tokenized = dataset.cropper.crop(
                    tokenized,
                    max_tokens=self.max_tokens,
                    random=self.random,
                    max_atoms=self.max_atoms,
                )
        except Exception as e:
            print(f"Cropper failed on {record.id} with error {e}. Skipping.")
            return self.__getitem__(0)

        # Check if there are tokens — unlike the other failures this one is
        # deliberately fatal rather than retried.
        if len(tokenized.tokens) == 0:
            msg = "No tokens in cropped structure."
            raise ValueError(msg)

        # Compute features
        try:
            # Only pad when cropping, otherwise validation keeps native sizes.
            pad_atoms = self.crop_validation and self.pad_to_max_atoms
            pad_tokens = self.crop_validation and self.pad_to_max_tokens

            features = dataset.featurizer.process(
                tokenized,
                training=False,
                max_atoms=self.max_atoms if pad_atoms else None,
                max_tokens=self.max_tokens if pad_tokens else None,
                max_seqs=self.max_seqs,
                pad_to_max_seqs=self.pad_to_max_seqs,
                symmetries=self.symmetries,
                atoms_per_window_queries=self.atoms_per_window_queries,
                min_dist=self.min_dist,
                max_dist=self.max_dist,
                num_bins=self.num_bins,
                compute_symmetries=self.return_symmetries,
                binder_pocket_conditioned_prop=self.binder_pocket_conditioned_prop,
                binder_pocket_cutoff=self.binder_pocket_cutoff,
                binder_pocket_sampling_geometric_p=1.0,  # this will only sample a single pocket token
                only_ligand_binder_pocket=True,
            )
        except Exception as e:
            print(f"Featurizer failed on {record.id} with error {e}. Skipping.")
            return self.__getitem__(0)

        return features

    def __len__(self) -> int:
        """Get the length of the dataset.

        Returns
        -------
        int
            The length of the dataset.

        """
        # Sum of per-dataset record counts, each capped by `overfit` if set.
        if self.overfit is not None:
            length = sum(len(d.manifest.records[: self.overfit]) for d in self.datasets)
        else:
            length = sum(len(d.manifest.records) for d in self.datasets)

        return length
465
+
466
+
467
class BoltzTrainingDataModule(pl.LightningDataModule):
    """DataModule for boltz.

    Loads each configured dataset's manifest, splits/filters its records,
    and wraps the results in a TrainingDataset / ValidationDataset pair.
    """

    def __init__(self, cfg: DataConfig) -> None:
        """Initialize the DataModule.

        Parameters
        ----------
        cfg : DataConfig
            The data configuration.

        """
        super().__init__()
        self.cfg = cfg

        assert self.cfg.val_batch_size == 1, "Validation only works with batch size=1."

        # Load symmetries
        symmetries = get_symmetries(cfg.symmetries)

        # Load datasets
        train: list[Dataset] = []
        val: list[Dataset] = []

        for data_config in cfg.datasets:
            # Set target_dir
            target_dir = Path(data_config.target_dir)
            msa_dir = Path(data_config.msa_dir)

            # Load manifest (explicit path wins over the default location)
            if data_config.manifest_path is not None:
                path = Path(data_config.manifest_path)
            else:
                path = target_dir / "manifest.json"
            manifest: Manifest = Manifest.load(path)

            # Split records if given: ids listed in the split file (matched
            # case-insensitively) go to validation, the rest to training.
            if data_config.split is not None:
                with Path(data_config.split).open("r") as f:
                    split = {x.lower() for x in f.read().splitlines()}

                train_records = []
                val_records = []
                for record in manifest.records:
                    if record.id.lower() in split:
                        val_records.append(record)
                    else:
                        train_records.append(record)
            else:
                train_records = manifest.records
                val_records = []

            # Filter training records with the global filters
            train_records = [
                record
                for record in train_records
                if all(f.filter(record) for f in cfg.filters)
            ]
            # Apply dataset-specific filters, if any
            if data_config.filters is not None:
                train_records = [
                    record
                    for record in train_records
                    if all(f.filter(record) for f in data_config.filters)
                ]

            # Create train dataset
            train_manifest = Manifest(train_records)
            train.append(
                Dataset(
                    target_dir,
                    msa_dir,
                    train_manifest,
                    data_config.prob,
                    data_config.sampler,
                    data_config.cropper,
                    cfg.tokenizer,
                    cfg.featurizer,
                )
            )

            # Create validation dataset
            if val_records:
                val_manifest = Manifest(val_records)
                val.append(
                    Dataset(
                        target_dir,
                        msa_dir,
                        val_manifest,
                        data_config.prob,
                        data_config.sampler,
                        data_config.cropper,
                        cfg.tokenizer,
                        cfg.featurizer,
                    )
                )

        # Print dataset sizes
        for dataset in train:
            dataset: Dataset
            print(f"Training dataset size: {len(dataset.manifest.records)}")

        for dataset in val:
            dataset: Dataset
            print(f"Validation dataset size: {len(dataset.manifest.records)}")

        # Create wrapper datasets
        self._train_set = TrainingDataset(
            datasets=train,
            samples_per_epoch=cfg.samples_per_epoch,
            max_atoms=cfg.max_atoms,
            max_tokens=cfg.max_tokens,
            max_seqs=cfg.max_seqs,
            pad_to_max_atoms=cfg.pad_to_max_atoms,
            pad_to_max_tokens=cfg.pad_to_max_tokens,
            pad_to_max_seqs=cfg.pad_to_max_seqs,
            symmetries=symmetries,
            atoms_per_window_queries=cfg.atoms_per_window_queries,
            min_dist=cfg.min_dist,
            max_dist=cfg.max_dist,
            num_bins=cfg.num_bins,
            overfit=cfg.overfit,
            binder_pocket_conditioned_prop=cfg.train_binder_pocket_conditioned_prop,
            binder_pocket_cutoff=cfg.binder_pocket_cutoff,
            binder_pocket_sampling_geometric_p=cfg.binder_pocket_sampling_geometric_p,
            return_symmetries=cfg.return_train_symmetries,
        )
        # When overfitting, validate on the training datasets themselves.
        self._val_set = ValidationDataset(
            datasets=train if cfg.overfit is not None else val,
            seed=cfg.random_seed,
            max_atoms=cfg.max_atoms,
            max_tokens=cfg.max_tokens,
            max_seqs=cfg.max_seqs,
            pad_to_max_atoms=cfg.pad_to_max_atoms,
            pad_to_max_tokens=cfg.pad_to_max_tokens,
            pad_to_max_seqs=cfg.pad_to_max_seqs,
            symmetries=symmetries,
            atoms_per_window_queries=cfg.atoms_per_window_queries,
            min_dist=cfg.min_dist,
            max_dist=cfg.max_dist,
            num_bins=cfg.num_bins,
            overfit=cfg.overfit,
            crop_validation=cfg.crop_validation,
            return_symmetries=cfg.return_val_symmetries,
            binder_pocket_conditioned_prop=cfg.val_binder_pocket_conditioned_prop,
            binder_pocket_cutoff=cfg.binder_pocket_cutoff,
        )

    def setup(self, stage: Optional[str] = None) -> None:
        """Run the setup for the DataModule.

        Parameters
        ----------
        stage : str, optional
            The stage, one of 'fit', 'validate', 'test'.

        """
        # All loading is done eagerly in __init__, so nothing to do here.
        return

    def train_dataloader(self) -> DataLoader:
        """Get the training dataloader.

        Returns
        -------
        DataLoader
            The training dataloader.

        """
        # Shuffling is unnecessary: TrainingDataset samples randomly itself.
        return DataLoader(
            self._train_set,
            batch_size=self.cfg.batch_size,
            num_workers=self.cfg.num_workers,
            pin_memory=self.cfg.pin_memory,
            shuffle=False,
            collate_fn=collate,
        )

    def val_dataloader(self) -> DataLoader:
        """Get the validation dataloader.

        Returns
        -------
        DataLoader
            The validation dataloader.

        """
        return DataLoader(
            self._val_set,
            batch_size=self.cfg.val_batch_size,
            num_workers=self.cfg.num_workers,
            pin_memory=self.cfg.pin_memory,
            shuffle=False,
            collate_fn=collate,
        )
+ )
protify/FastPLMs/boltz/src/boltz/data/mol.py ADDED
@@ -0,0 +1,900 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import itertools
2
+ import pickle
3
+ import random
4
+ from pathlib import Path
5
+
6
+ import numpy as np
7
+ import torch
8
+ from rdkit.Chem import Mol
9
+ from tqdm import tqdm
10
+
11
+ from boltz.data import const
12
+ from boltz.data.pad import pad_dim
13
+ from boltz.model.loss.confidence import lddt_dist
14
+
15
+
16
def load_molecules(moldir: str, molecules: list[str]) -> dict[str, Mol]:
    """Load pickled CCD molecules by name from a directory.

    Parameters
    ----------
    moldir : str
        The path to the molecules directory.
    molecules : list[str]
        The molecule (CCD component) names to load.

    Returns
    -------
    dict[str, Mol]
        Mapping from molecule name to the unpickled molecule.

    Raises
    ------
    ValueError
        If any requested component has no pickle file on disk.

    """
    base = Path(moldir)
    loaded: dict[str, Mol] = {}
    for name in molecules:
        mol_path = base / f"{name}.pkl"
        if not mol_path.exists():
            raise ValueError(f"CCD component {name} not found!")
        # NOTE: pickles come from the project's own preprocessing pipeline,
        # hence loading is considered trusted here.
        with mol_path.open("rb") as handle:
            loaded[name] = pickle.load(handle)  # noqa: S301
    return loaded
40
+
41
+
42
def load_canonicals(moldir: str) -> dict[str, Mol]:
    """Load the canonical-token molecules from a directory.

    Parameters
    ----------
    moldir : str
        The path to the molecules directory.

    Returns
    -------
    dict[str, Mol]
        The loaded canonical molecules, keyed by name.

    """
    # Canonical residue names are defined centrally in the const module.
    canonical_names = const.canonical_tokens
    return load_molecules(moldir, canonical_names)
57
+
58
+
59
def load_all_molecules(moldir: str) -> dict[str, Mol]:
    """Load every pickled molecule found in a directory.

    Parameters
    ----------
    moldir : str
        The path to the molecules directory; every ``*.pkl`` file in it
        is loaded, keyed by its file stem.

    Returns
    -------
    dict[str, Mol]
        The loaded molecules.

    """
    # Fix: the previous docstring documented a nonexistent `molecules`
    # parameter — this function loads *all* pickles in the directory.
    loaded_mols = {}
    files = list(Path(moldir).glob("*.pkl"))
    for path in tqdm(files, total=len(files), desc="Loading molecules", leave=False):
        # Pickles are produced by the project's own preprocessing pipeline.
        with path.open("rb") as f:
            loaded_mols[path.stem] = pickle.load(f)  # noqa: S301
    return loaded_mols
82
+
83
+
84
def get_symmetries(mols: dict[str, Mol]) -> dict:  # noqa: PLR0912
    """Create a dictionary of ligand symmetry/geometry metadata per molecule.

    Parameters
    ----------
    mols : dict[str, Mol]
        Molecules keyed by CCD name. Each molecule is expected to carry
        hex-encoded pickled properties ("symmetries", and optionally the
        "pb_*", chiral, stereo, and ring index properties) set by the
        preprocessing pipeline.

    Returns
    -------
    dict
        For each molecule name, a 16-tuple of: symmetry permutations,
        atom names, bounds edge index, lower/upper bounds, bond/angle
        masks, chiral index/mask/orientations, stereo-bond
        index/mask/orientations, aromatic 5/6-ring indices, and planar
        double-bond indices. Molecules whose properties fail to decode
        are silently omitted.

    """
    symmetries = {}
    for key, mol in mols.items():
        try:
            # All properties are stored as hex-encoded pickles on the RDKit
            # molecule; decoding is trusted (produced by our own pipeline).
            sym = pickle.loads(bytes.fromhex(mol.GetProp("symmetries")))  # noqa: S301

            # Pairwise distance-bounds metadata; fall back to empty arrays
            # when the molecule was processed without it.
            if mol.HasProp("pb_edge_index"):
                edge_index = pickle.loads(
                    bytes.fromhex(mol.GetProp("pb_edge_index"))
                ).astype(np.int64)  # noqa: S301
                lower_bounds = pickle.loads(
                    bytes.fromhex(mol.GetProp("pb_lower_bounds"))
                )  # noqa: S301
                upper_bounds = pickle.loads(
                    bytes.fromhex(mol.GetProp("pb_upper_bounds"))
                )  # noqa: S301
                bond_mask = pickle.loads(bytes.fromhex(mol.GetProp("pb_bond_mask")))  # noqa: S301
                angle_mask = pickle.loads(bytes.fromhex(mol.GetProp("pb_angle_mask")))  # noqa: S301
            else:
                edge_index = np.empty((2, 0), dtype=np.int64)
                lower_bounds = np.array([], dtype=np.float32)
                upper_bounds = np.array([], dtype=np.float32)
                bond_mask = np.array([], dtype=np.float32)
                angle_mask = np.array([], dtype=np.float32)

            # Chirality checks (4 atoms per chiral center).
            if mol.HasProp("chiral_atom_index"):
                chiral_atom_index = pickle.loads(
                    bytes.fromhex(mol.GetProp("chiral_atom_index"))
                ).astype(np.int64)
                chiral_check_mask = pickle.loads(
                    bytes.fromhex(mol.GetProp("chiral_check_mask"))
                ).astype(np.int64)
                chiral_atom_orientations = pickle.loads(
                    bytes.fromhex(mol.GetProp("chiral_atom_orientations"))
                )
            else:
                chiral_atom_index = np.empty((4, 0), dtype=np.int64)
                chiral_check_mask = np.array([], dtype=bool)
                chiral_atom_orientations = np.array([], dtype=bool)

            # Stereo-bond checks (4 atoms per stereo bond).
            if mol.HasProp("stereo_bond_index"):
                stereo_bond_index = pickle.loads(
                    bytes.fromhex(mol.GetProp("stereo_bond_index"))
                ).astype(np.int64)
                stereo_check_mask = pickle.loads(
                    bytes.fromhex(mol.GetProp("stereo_check_mask"))
                ).astype(np.int64)
                stereo_bond_orientations = pickle.loads(
                    bytes.fromhex(mol.GetProp("stereo_bond_orientations"))
                )
            else:
                stereo_bond_index = np.empty((4, 0), dtype=np.int64)
                stereo_check_mask = np.array([], dtype=bool)
                stereo_bond_orientations = np.array([], dtype=bool)

            # Planarity checks: aromatic rings and double bonds.
            if mol.HasProp("aromatic_5_ring_index"):
                aromatic_5_ring_index = pickle.loads(
                    bytes.fromhex(mol.GetProp("aromatic_5_ring_index"))
                ).astype(np.int64)
            else:
                aromatic_5_ring_index = np.empty((5, 0), dtype=np.int64)
            if mol.HasProp("aromatic_6_ring_index"):
                aromatic_6_ring_index = pickle.loads(
                    bytes.fromhex(mol.GetProp("aromatic_6_ring_index"))
                ).astype(np.int64)
            else:
                aromatic_6_ring_index = np.empty((6, 0), dtype=np.int64)
            if mol.HasProp("planar_double_bond_index"):
                planar_double_bond_index = pickle.loads(
                    bytes.fromhex(mol.GetProp("planar_double_bond_index"))
                ).astype(np.int64)
            else:
                planar_double_bond_index = np.empty((6, 0), dtype=np.int64)

            atom_names = [atom.GetProp("name") for atom in mol.GetAtoms()]
            symmetries[key] = (
                sym,
                atom_names,
                edge_index,
                lower_bounds,
                upper_bounds,
                bond_mask,
                angle_mask,
                chiral_atom_index,
                chiral_check_mask,
                chiral_atom_orientations,
                stereo_bond_index,
                stereo_check_mask,
                stereo_bond_orientations,
                aromatic_5_ring_index,
                aromatic_6_ring_index,
                planar_double_bond_index,
            )
        except Exception as e:  # noqa: BLE001, PERF203, S110
            # NOTE(review): any decode failure silently drops the molecule
            # from the result — consider at least logging the key.
            pass

    return symmetries
194
+
195
+
196
def compute_symmetry_idx_dictionary(data):
    """Assign start indices to chains/tokens and gather all atom coordinates.

    Mutates each chain (``start_idx`` = global atom offset) and each token
    (``start_idx`` = offset within its chain), and returns the flat list of
    ``[x, y, z]`` coordinates for every atom, in traversal order.
    """
    total_count = 0
    all_coords = []
    for chain in data.chains:
        chain.start_idx = total_count
        for token in chain.tokens:
            # Token offset is relative to the start of its chain.
            token.start_idx = total_count - chain.start_idx
            for atom in token.atoms:
                all_coords.append([atom.coords.x, atom.coords.y, atom.coords.z])
            total_count += len(token.atoms)
    return all_coords
209
+
210
+
211
def get_current_idx_list(data):
    """Return global atom indices for every token inside the crop.

    Only atoms of tokens whose chain and token are both flagged ``in_crop``
    are included; indices are ``chain.start_idx + token.start_idx + k``.
    """
    idx = []
    for chain in data.chains:
        if not chain.in_crop:
            continue
        for token in chain.tokens:
            if not token.in_crop:
                continue
            base = chain.start_idx + token.start_idx
            idx.extend(range(base, base + len(token.atoms)))
    return idx
224
+
225
+
226
def all_different_after_swap(l):  # noqa: E741
    """Return True if every swap in `l` targets a distinct chain.

    Each element is a swap tuple whose last entry is the target chain
    index; the combination is valid only if no target repeats.
    """
    seen = set()
    for swap in l:
        target = swap[-1]
        if target in seen:
            return False
        seen.add(target)
    return True
229
+
230
+
231
def minimum_lddt_symmetry_coords(
    coords: torch.Tensor,
    feats: dict,
    index_batch: int,
):
    """Resolve chain/residue/ligand symmetries to maximize lDDT vs. prediction.

    Greedily picks, for one batch element, the symmetry-equivalent ground-truth
    coordinate assignment (chain swaps first, then per-residue/ligand atom
    swaps) that best matches the predicted coordinates under the lDDT metric.

    Parameters
    ----------
    coords : torch.Tensor
        Predicted coordinates; assumes shape (1, num_atoms_padded, 3) —
        TODO confirm against caller.
    feats : dict
        Feature dict holding per-batch "all_coords", "all_resolved_mask",
        "crop_to_all_atom_map", "chain_swaps", "amino_acids_symmetries",
        and "ligand_symmetries".
    index_batch : int
        Which element of the batch to resolve.

    Returns
    -------
    tuple[torch.Tensor, torch.Tensor]
        The symmetry-resolved true coordinates (padded to coords.shape[1])
        and the corresponding resolved-atom mask with a leading batch dim.

    """
    all_coords = feats["all_coords"][index_batch].unsqueeze(0).to(coords)
    all_resolved_mask = (
        feats["all_resolved_mask"][index_batch].to(coords).to(torch.bool)
    )
    crop_to_all_atom_map = (
        feats["crop_to_all_atom_map"][index_batch].to(coords).to(torch.long)
    )
    chain_symmetries = feats["chain_swaps"][index_batch]
    amino_acids_symmetries = feats["amino_acids_symmetries"][index_batch]
    ligand_symmetries = feats["ligand_symmetries"][index_batch]

    # Predicted pairwise distances over the cropped atoms only.
    dmat_predicted = torch.cdist(
        coords[:, : len(crop_to_all_atom_map)], coords[:, : len(crop_to_all_atom_map)]
    )

    # Check best symmetry on chain swap
    best_true_coords = all_coords[:, crop_to_all_atom_map].clone()
    best_true_resolved_mask = all_resolved_mask[crop_to_all_atom_map].clone()
    best_lddt = -1.0
    for c in chain_symmetries:
        true_all_coords = all_coords.clone()
        true_all_resolved_mask = all_resolved_mask.clone()
        # Apply every chain swap of this combination to the full-atom arrays.
        for start1, end1, start2, end2, chainidx1, chainidx2 in c:
            true_all_coords[:, start1:end1] = all_coords[:, start2:end2]
            true_all_resolved_mask[start1:end1] = all_resolved_mask[start2:end2]
        true_coords = true_all_coords[:, crop_to_all_atom_map]
        true_resolved_mask = true_all_resolved_mask[crop_to_all_atom_map]
        dmat_true = torch.cdist(true_coords, true_coords)
        # Pair mask: both atoms resolved, excluding self-pairs.
        pair_mask = (
            true_resolved_mask[:, None]
            * true_resolved_mask[None, :]
            * (1 - torch.eye(len(true_resolved_mask))).to(true_resolved_mask)
        )

        lddt = lddt_dist(
            dmat_predicted, dmat_true, pair_mask, cutoff=15.0, per_atom=False
        )[0]
        lddt = lddt.item()

        # Require >3 resolved atoms so the lDDT is meaningful.
        if lddt > best_lddt and torch.sum(true_resolved_mask) > 3:
            best_lddt = lddt
            best_true_coords = true_coords
            best_true_resolved_mask = true_resolved_mask

    # atom symmetries (nucleic acid and protein residues), resolved greedily without recomputing alignment
    true_coords = best_true_coords.clone()
    true_resolved_mask = best_true_resolved_mask.clone()
    for symmetric_amino_or_lig in amino_acids_symmetries + ligand_symmetries:
        best_lddt_improvement = 0.0

        # Collect all atom indices touched by any swap of this residue/ligand,
        # so lDDT is only recomputed on the affected sub-distance matrix.
        indices = set()
        for c in symmetric_amino_or_lig:
            for i, j in c:
                indices.add(i)
        indices = sorted(list(indices))
        indices = torch.from_numpy(np.asarray(indices)).to(true_coords.device).long()
        pred_coords_subset = coords[:, : len(crop_to_all_atom_map)][:, indices]
        sub_dmat_pred = torch.cdist(
            coords[:, : len(crop_to_all_atom_map)], pred_coords_subset
        )

        for c in symmetric_amino_or_lig:
            # starting from greedy best, try to swap the atoms
            new_true_coords = true_coords.clone()
            new_true_resolved_mask = true_resolved_mask.clone()
            for i, j in c:
                new_true_coords[:, i] = true_coords[:, j]
                new_true_resolved_mask[i] = true_resolved_mask[j]

            true_coords_subset = true_coords[:, indices]
            new_true_coords_subset = new_true_coords[:, indices]

            sub_dmat_true = torch.cdist(true_coords, true_coords_subset)
            sub_dmat_new_true = torch.cdist(new_true_coords, new_true_coords_subset)

            # Masks over (all atoms) x (subset), removing self-pairs within
            # the subset rows.
            sub_true_pair_lddt = (
                true_resolved_mask[:, None] * true_resolved_mask[None, indices]
            )
            sub_true_pair_lddt[indices] = (
                sub_true_pair_lddt[indices]
                * (1 - torch.eye(len(indices))).to(sub_true_pair_lddt).bool()
            )

            sub_new_true_pair_lddt = (
                new_true_resolved_mask[:, None] * new_true_resolved_mask[None, indices]
            )
            sub_new_true_pair_lddt[indices] = (
                sub_new_true_pair_lddt[indices]
                * (1 - torch.eye(len(indices))).to(sub_true_pair_lddt).bool()
            )

            lddt, total = lddt_dist(
                sub_dmat_pred,
                sub_dmat_true,
                sub_true_pair_lddt,
                cutoff=15.0,
                per_atom=False,
            )
            new_lddt, new_total = lddt_dist(
                sub_dmat_pred,
                sub_dmat_new_true,
                sub_new_true_pair_lddt,
                cutoff=15.0,
                per_atom=False,
            )

            lddt_improvement = new_lddt - lddt

            # Keep the single best-improving swap for this residue/ligand.
            if lddt_improvement > best_lddt_improvement:
                best_true_coords = new_true_coords
                best_true_resolved_mask = new_true_resolved_mask
                best_lddt_improvement = lddt_improvement

        # greedily update best coordinates after each amino acid
        true_coords = best_true_coords.clone()
        true_resolved_mask = best_true_resolved_mask.clone()

    # Recomputing alignment: pad back to the full (padded) atom dimension.
    true_coords = pad_dim(true_coords, 1, coords.shape[1] - true_coords.shape[1])
    true_resolved_mask = pad_dim(
        true_resolved_mask,
        0,
        coords.shape[1] - true_resolved_mask.shape[0],
    )

    return true_coords, true_resolved_mask.unsqueeze(0)
362
+
363
+
364
def compute_single_distogram_loss(pred, target, mask):
    """Masked cross-entropy between predicted distogram logits and targets.

    Parameters
    ----------
    pred : torch.Tensor
        Distogram logits; the last dimension indexes distance bins.
    target : torch.Tensor
        Target bin distribution, same shape as ``pred``.
    mask : torch.Tensor
        Pairwise mask over the two trailing token dimensions.

    Returns
    -------
    torch.Tensor
        Scalar mean loss over the batch.
    """
    log_probs = torch.nn.functional.log_softmax(pred, dim=-1)
    errors = -torch.sum(target * log_probs, dim=-1)
    # Small epsilon keeps the division safe when the mask is empty.
    denom = 1e-5 + torch.sum(mask, dim=(-1, -2))
    per_row = torch.sum(errors * mask, dim=-1) / denom[..., None]
    batch_loss = torch.sum(per_row, dim=-1)
    return torch.mean(batch_loss)
377
+
378
+
379
def minimum_lddt_symmetry_dist(
    pred_distogram: torch.Tensor,
    feats: dict,
    index_batch: int,
):
    """Resolve ligand atom symmetries against the predicted distogram.

    Greedily permutes symmetry-equivalent ligand atoms in the target
    distogram (and coordinates) so the distogram loss against the
    prediction is minimized. Mutates ``feats["disto_target"]`` and
    ``feats["coords"]`` in place for the given batch element and returns
    nothing.

    Note: for now only ligand symmetries are resolved.
    """
    disto_target = feats["disto_target"][index_batch]
    mask = feats["token_disto_mask"][index_batch]
    # Pairwise mask without self-pairs.
    mask = mask[None, :] * mask[:, None]
    mask = mask * (1 - torch.eye(mask.shape[1])).to(disto_target)

    coords = feats["coords"][index_batch]

    ligand_symmetries = feats["ligand_symmetries"][index_batch]
    # Map each atom to its (one-hot encoded) token index.
    atom_to_token_map = feats["atom_to_token"][index_batch].argmax(dim=-1)

    # atom symmetries, resolved greedily without recomputing alignment
    for symmetric_amino_or_lig in ligand_symmetries:
        best_c, best_disto, best_loss_improvement = None, None, 0.0
        for c in symmetric_amino_or_lig:
            # starting from greedy best, try to swap the atoms
            new_disto_target = disto_target.clone()
            indices = []

            # fix the distogram by replacing first the columns then the rows
            disto_temp = new_disto_target.clone()
            for i, j in c:
                new_disto_target[:, atom_to_token_map[i]] = disto_temp[
                    :, atom_to_token_map[j]
                ]
                indices.append(atom_to_token_map[i].item())
            disto_temp = new_disto_target.clone()
            for i, j in c:
                new_disto_target[atom_to_token_map[i], :] = disto_temp[
                    atom_to_token_map[j], :
                ]

            indices = (
                torch.from_numpy(np.asarray(indices)).to(disto_target.device).long()
            )

            # Only the swapped token columns affect the loss difference.
            pred_distogram_subset = pred_distogram[:, indices]
            disto_target_subset = disto_target[:, indices]
            new_disto_target_subset = new_disto_target[:, indices]
            mask_subset = mask[:, indices]

            loss = compute_single_distogram_loss(
                pred_distogram_subset, disto_target_subset, mask_subset
            )
            new_loss = compute_single_distogram_loss(
                pred_distogram_subset, new_disto_target_subset, mask_subset
            )
            # Scale by subset size so improvements are comparable across swaps.
            loss_improvement = (loss - new_loss) * len(indices)

            if loss_improvement > best_loss_improvement:
                best_c = c
                best_disto = new_disto_target
                best_loss_improvement = loss_improvement

        # greedily update best coordinates after each ligand
        if best_loss_improvement > 0:
            disto_target = best_disto.clone()
            old_coords = coords.clone()
            # Apply the winning atom swap to the coordinates as well.
            for i, j in best_c:
                coords[:, i] = old_coords[:, j]

    # update features to be used in diffusion and in distogram loss
    feats["disto_target"][index_batch] = disto_target
    feats["coords"][index_batch] = coords
    return
450
+
451
+
452
def compute_all_coords_mask(structure):
    """Gather coordinates, crop mask, and resolved mask for every atom.

    Also assigns ``start_idx`` offsets on each chain (global atom offset)
    and token (offset within its chain), mirroring
    ``compute_symmetry_idx_dictionary``.

    Parameters
    ----------
    structure
        An object with ``chains`` -> ``tokens`` -> ``atoms``; tokens carry
        ``in_crop`` / ``is_present`` flags and atoms carry ``coords``.

    Returns
    -------
    tuple[list, list, list]
        Per-atom ``[x, y, z]`` coords, crop-membership flags, and
        resolved flags, all in traversal order.

    """
    # Fix: removed a dead `if len(a) != len(b): pass` no-op branch and the
    # unused enumerate indices from the original.
    total_count = 0
    all_coords = []
    all_coords_crop_mask = []
    all_resolved_mask = []
    for chain in structure.chains:
        chain.start_idx = total_count
        for token in chain.tokens:
            token.start_idx = total_count - chain.start_idx
            all_coords.extend(
                [[atom.coords.x, atom.coords.y, atom.coords.z] for atom in token.atoms]
            )
            # Both masks are constant per token, repeated per atom.
            all_coords_crop_mask.extend([token.in_crop] * len(token.atoms))
            all_resolved_mask.extend([token.is_present] * len(token.atoms))
            total_count += len(token.atoms)
    return all_coords, all_coords_crop_mask, all_resolved_mask
475
+
476
+
477
def get_chain_symmetries(cropped, max_n_symmetries=100):
    """Compute chain-swap symmetry features for a cropped structure.

    Identifies chains with identical entity_id and atom count (which may be
    swapped in the ground truth), enumerates bounded combinations of such
    swaps, and builds atom-index mappings between the crop and the full
    structure.

    Parameters
    ----------
    cropped
        A cropped structure wrapper exposing ``structure`` and ``tokens``.
    max_n_symmetries : int
        Upper bound on the number of swap combinations returned.

    Returns
    -------
    dict
        Features: "all_coords", "all_resolved_mask", "crop_to_all_atom_map",
        "chain_symmetries", "connections_edge_index", "chain_swaps".

    """
    # get all coordinates and resolved mask
    structure = cropped.structure
    all_coords = []
    all_resolved_mask = []
    original_atom_idx = []
    chain_atom_idx = []
    chain_atom_num = []
    chain_in_crop = []
    chain_asym_id = []
    new_atom_idx = 0

    for chain in structure.chains:
        atom_idx, atom_num = (
            chain["atom_idx"],  # Global index of first atom in the chain
            chain["atom_num"],  # Number of atoms in the chain
        )

        # compute coordinates and resolved mask
        resolved_mask = structure.atoms["is_present"][
            atom_idx : atom_idx + atom_num
        ]  # Whether each atom in the chain is actually resolved

        # ensemble_atom_starts = [structure.ensemble[idx]["atom_coord_idx"] for idx in cropped.ensemble_ref_idxs]
        # coords = np.array(
        #     [structure.coords[ensemble_atom_start + atom_idx: ensemble_atom_start + atom_idx + atom_num]["coords"] for
        #     ensemble_atom_start in ensemble_atom_starts])

        coords = structure.atoms["coords"][atom_idx : atom_idx + atom_num]

        # A chain is "in crop" if any cropped token belongs to it.
        in_crop = False
        for token in cropped.tokens:
            if token["asym_id"] == chain["asym_id"]:
                in_crop = True
                break

        all_coords.append(coords)
        all_resolved_mask.append(resolved_mask)
        original_atom_idx.append(atom_idx)
        chain_atom_idx.append(new_atom_idx)
        chain_atom_num.append(atom_num)
        chain_in_crop.append(in_crop)
        chain_asym_id.append(chain["asym_id"])

        new_atom_idx += atom_num

    all_coords = np.concatenate(all_coords, axis=0)
    # Compute backmapping from token to all coords
    crop_to_all_atom_map = []
    for token in cropped.tokens:
        chain_idx = chain_asym_id.index(token["asym_id"])
        # Translate the token's global atom index into the concatenated
        # per-chain coordinate array built above.
        start = (
            chain_atom_idx[chain_idx] - original_atom_idx[chain_idx] + token["atom_idx"]
        )
        crop_to_all_atom_map.append(np.arange(start, start + token["atom_num"]))
    crop_to_all_atom_map = np.concatenate(crop_to_all_atom_map, axis=0)

    # Compute the connections edge index for covalent bonds
    all_atom_to_crop_map = np.zeros(all_coords.shape[0], dtype=np.int64)
    all_atom_to_crop_map[crop_to_all_atom_map.astype(np.int64)] = np.arange(
        crop_to_all_atom_map.shape[0]
    )
    connections_edge_index = []
    for connection in structure.bonds:
        # Skip intra-residue bonds; only cross-residue/chain links matter.
        if (connection["chain_1"] == connection["chain_2"]) and (
            connection["res_1"] == connection["res_2"]
        ):
            continue
        connections_edge_index.append([connection["atom_1"], connection["atom_2"]])
    if len(connections_edge_index) > 0:
        connections_edge_index = np.array(connections_edge_index, dtype=np.int64).T
        connections_edge_index = all_atom_to_crop_map[connections_edge_index]
    else:
        connections_edge_index = np.empty((2, 0))

    # Compute the symmetries between chains
    symmetries = []
    swaps = []
    for i, chain in enumerate(structure.chains):
        start = chain_atom_idx[i]
        end = start + chain_atom_num[i]

        if chain_in_crop[i]:
            # Chains of the same entity and same size are interchangeable.
            possible_swaps = []
            for j, chain2 in enumerate(structure.chains):
                start2 = chain_atom_idx[j]
                end2 = start2 + chain_atom_num[j]
                if (
                    chain["entity_id"] == chain2["entity_id"]
                    and end - start == end2 - start2
                ):
                    possible_swaps.append((start, end, start2, end2, i, j))
            swaps.append(possible_swaps)

        # Group chains into symmetry classes by entity and size.
        found = False
        for symmetry_idx, symmetry in enumerate(symmetries):
            j = symmetry[0][0]
            chain2 = structure.chains[j]
            start2 = chain_atom_idx[j]
            end2 = start2 + chain_atom_num[j]
            if (
                chain["entity_id"] == chain2["entity_id"]
                and end - start == end2 - start2
            ):
                symmetries[symmetry_idx].append(
                    (i, start, end, chain_in_crop[i], chain["mol_type"])
                )
                found = True
        if not found:
            symmetries.append([(i, start, end, chain_in_crop[i], chain["mol_type"])])

    combinations = itertools.product(*swaps)
    # to avoid combinatorial explosion, bound the number of combinations even considered
    combinations = list(itertools.islice(combinations, max_n_symmetries * 10))
    # filter for all chains getting a different assignment
    combinations = [c for c in combinations if all_different_after_swap(c)]

    if len(combinations) > max_n_symmetries:
        combinations = random.sample(combinations, max_n_symmetries)

    if len(combinations) == 0:
        combinations.append([])

    # Drop symmetry classes with no chain in the crop (iterate backwards
    # because we pop by index).
    for i in range(len(symmetries) - 1, -1, -1):
        if not any(chain[3] for chain in symmetries[i]):
            symmetries.pop(i)

    features = {}
    features["all_coords"] = torch.Tensor(all_coords)  # axis=1 with ensemble

    features["all_resolved_mask"] = torch.Tensor(
        np.concatenate(all_resolved_mask, axis=0)
    )
    features["crop_to_all_atom_map"] = torch.Tensor(crop_to_all_atom_map)
    features["chain_symmetries"] = symmetries
    features["connections_edge_index"] = torch.tensor(connections_edge_index)
    features["chain_swaps"] = combinations

    return features
616
+
617
+
618
def get_amino_acids_symmetries(cropped):
    """Collect atom-swap symmetries for standard residues in the crop.

    For each token whose residue type has reference symmetries in
    ``const.ref_symmetries``, shifts the per-residue atom-pair swaps by
    the token's offset within the crop.

    Returns
    -------
    dict
        ``{"amino_acids_symmetries": swaps}`` where each entry is the
        list of index-pair swap lists for one residue.
    """
    swaps = []
    crop_offset = 0
    for token in cropped.tokens:
        res_name = const.tokens[token["res_type"]]
        symmetries = const.ref_symmetries.get(res_name, [])
        if len(symmetries) > 0:
            residue_swaps = [
                [(i + crop_offset, j + crop_offset) for i, j in sym]
                for sym in symmetries
            ]
            swaps.append(residue_swaps)
        crop_offset += token["atom_num"]

    return {"amino_acids_symmetries": swaps}
636
+
637
+
638
def slice_valid_index(index, ccd_to_valid_id_array, args=None):
    """Remap CCD atom indices to crop-local indices, dropping invalid columns.

    ``ccd_to_valid_id_array`` maps CCD atom positions to crop-local ids,
    with NaN marking atoms absent from the crop; any column of ``index``
    that touches a NaN entry is removed. Optional ``args`` arrays are
    sliced with the same column mask (returned lazily as a generator).
    """
    remapped = ccd_to_valid_id_array[index]
    keep = (~np.isnan(remapped)).all(axis=0)
    remapped = remapped[:, keep]
    if args is None:
        return remapped
    sliced_args = (extra[keep] for extra in args)
    return remapped, sliced_args
646
+
647
+
648
def get_ligand_symmetries(cropped, symmetries, return_physical_metrics=False):
    """Compute symmetry (and optionally geometry-check) features for ligands.

    For every ligand / non-standard residue in the crop, intersects the CCD
    reference symmetries with the atoms actually present, remaps the CCD
    atom indices to crop-local indices, and (optionally) collects bounds,
    chirality, stereo, and planarity index arrays for physical metrics.

    Parameters
    ----------
    cropped
        Cropped structure wrapper exposing ``structure`` and ``tokens``.
    symmetries : dict
        Output of :func:`get_symmetries`: per CCD name, the 16-tuple of
        symmetry and geometry metadata.
    return_physical_metrics : bool
        When True, also return the concatenated geometry-check tensors.

    Returns
    -------
    dict
        Always contains "ligand_symmetries"; with physical metrics, also
        the "ligand_*" edge/chiral/stereo/ring tensors.

    """
    # Compute ligand and non-standard amino-acids symmetries
    structure = cropped.structure

    added_molecules = {}
    index_mols = []
    atom_count = 0

    for token in cropped.tokens:
        # check if molecule is already added by identifying it through asym_id and res_idx
        atom_count += token["atom_num"]
        mol_id = (token["asym_id"], token["res_idx"])
        if mol_id in added_molecules:
            # Accumulate atom counts for multi-token molecules.
            added_molecules[mol_id] += token["atom_num"]
            continue
        added_molecules[mol_id] = token["atom_num"]

        # get the molecule type and indices
        residue_idx = token["res_idx"] + structure.chains[token["asym_id"]]["res_idx"]
        mol_name = structure.residues[residue_idx]["name"]
        atom_idx = structure.residues[residue_idx]["atom_idx"]
        mol_atom_names = structure.atoms[
            atom_idx : atom_idx + structure.residues[residue_idx]["atom_num"]
        ]["name"]
        # Standard residues are handled by get_amino_acids_symmetries.
        if mol_name not in const.ref_symmetries:
            index_mols.append(
                (mol_name, atom_count - token["atom_num"], mol_id, mol_atom_names)
            )

    # for each molecule, get the symmetries
    molecule_symmetries = []
    all_edge_index = []
    all_lower_bounds, all_upper_bounds = [], []
    all_bond_mask, all_angle_mask = [], []
    all_chiral_atom_index, all_chiral_check_mask, all_chiral_atom_orientations = (
        [],
        [],
        [],
    )
    all_stereo_bond_index, all_stereo_check_mask, all_stereo_bond_orientations = (
        [],
        [],
        [],
    )
    (
        all_aromatic_5_ring_index,
        all_aromatic_6_ring_index,
        all_planar_double_bond_index,
    ) = (
        [],
        [],
        [],
    )
    for mol_name, start_mol, mol_id, mol_atom_names in index_mols:
        if not mol_name in symmetries:
            continue
        else:
            swaps = []
            (
                syms_ccd,
                mol_atom_names_ccd,
                edge_index,
                lower_bounds,
                upper_bounds,
                bond_mask,
                angle_mask,
                chiral_atom_index,
                chiral_check_mask,
                chiral_atom_orientations,
                stereo_bond_index,
                stereo_check_mask,
                stereo_bond_orientations,
                aromatic_5_ring_index,
                aromatic_6_ring_index,
                planar_double_bond_index,
            ) = symmetries[mol_name]
            # Get indices of mol_atom_names_ccd that are in mol_atom_names
            ccd_to_valid_ids = {
                mol_atom_names_ccd.index(name): i
                for i, name in enumerate(mol_atom_names)
            }
            # NaN marks CCD atoms missing from this structure; used by
            # slice_valid_index to drop geometry checks touching them.
            ccd_to_valid_id_array = np.array(
                [
                    float("nan") if i not in ccd_to_valid_ids else ccd_to_valid_ids[i]
                    for i in range(len(mol_atom_names_ccd))
                ]
            )
            ccd_valid_ids = set(ccd_to_valid_ids.keys())
            syms = []
            # Get syms: keep only permutations that stay within the atoms
            # present in this structure, remapped to structure-local ids.
            for sym_ccd in syms_ccd:
                sym_dict = {}
                bool_add = True
                for i, j in enumerate(sym_ccd):
                    if i in ccd_valid_ids:
                        if j in ccd_valid_ids:
                            i_true = ccd_to_valid_ids[i]
                            j_true = ccd_to_valid_ids[j]
                            sym_dict[i_true] = j_true
                        else:
                            # Permutation maps a present atom onto a missing
                            # one — the whole permutation is unusable.
                            bool_add = False
                            break
                if bool_add:
                    syms.append([sym_dict[i] for i in range(len(ccd_valid_ids))])
            for sym in syms:
                if len(sym) != added_molecules[mol_id]:
                    raise Exception(
                        f"Symmetry length mismatch {len(sym)} {added_molecules[mol_id]}"
                    )
                # assert (
                #     len(sym) == added_molecules[mol_id]
                # ), f"Symmetry length mismatch {len(sym)} {added_molecules[mol_id]}"
                # Keep only the non-identity part of the permutation,
                # shifted to crop-global atom indices.
                sym_new_idx = []
                for i, j in enumerate(sym):
                    if i != int(j):
                        sym_new_idx.append((i + start_mol, int(j) + start_mol))
                if len(sym_new_idx) > 0:
                    swaps.append(sym_new_idx)

            if len(swaps) > 0:
                molecule_symmetries.append(swaps)

            if return_physical_metrics:
                edge_index, (lower_bounds, upper_bounds, bond_mask, angle_mask) = (
                    slice_valid_index(
                        edge_index,
                        ccd_to_valid_id_array,
                        (lower_bounds, upper_bounds, bond_mask, angle_mask),
                    )
                )
                all_edge_index.append(edge_index + start_mol)
                all_lower_bounds.append(lower_bounds)
                all_upper_bounds.append(upper_bounds)
                all_bond_mask.append(bond_mask)
                all_angle_mask.append(angle_mask)

                chiral_atom_index, (chiral_check_mask, chiral_atom_orientations) = (
                    slice_valid_index(
                        chiral_atom_index,
                        ccd_to_valid_id_array,
                        (chiral_check_mask, chiral_atom_orientations),
                    )
                )
                all_chiral_atom_index.append(chiral_atom_index + start_mol)
                all_chiral_check_mask.append(chiral_check_mask)
                all_chiral_atom_orientations.append(chiral_atom_orientations)

                stereo_bond_index, (stereo_check_mask, stereo_bond_orientations) = (
                    slice_valid_index(
                        stereo_bond_index,
                        ccd_to_valid_id_array,
                        (stereo_check_mask, stereo_bond_orientations),
                    )
                )
                all_stereo_bond_index.append(stereo_bond_index + start_mol)
                all_stereo_check_mask.append(stereo_check_mask)
                all_stereo_bond_orientations.append(stereo_bond_orientations)

                aromatic_5_ring_index = slice_valid_index(
                    aromatic_5_ring_index, ccd_to_valid_id_array
                )
                aromatic_6_ring_index = slice_valid_index(
                    aromatic_6_ring_index, ccd_to_valid_id_array
                )
                planar_double_bond_index = slice_valid_index(
                    planar_double_bond_index, ccd_to_valid_id_array
                )
                all_aromatic_5_ring_index.append(aromatic_5_ring_index + start_mol)
                all_aromatic_6_ring_index.append(aromatic_6_ring_index + start_mol)
                all_planar_double_bond_index.append(
                    planar_double_bond_index + start_mol
                )

    if return_physical_metrics:
        if len(all_edge_index) > 0:
            all_edge_index = np.concatenate(all_edge_index, axis=1)
            all_lower_bounds = np.concatenate(all_lower_bounds, axis=0)
            all_upper_bounds = np.concatenate(all_upper_bounds, axis=0)
            all_bond_mask = np.concatenate(all_bond_mask, axis=0)
            all_angle_mask = np.concatenate(all_angle_mask, axis=0)

            all_chiral_atom_index = np.concatenate(all_chiral_atom_index, axis=1)
            all_chiral_check_mask = np.concatenate(all_chiral_check_mask, axis=0)
            all_chiral_atom_orientations = np.concatenate(
                all_chiral_atom_orientations, axis=0
            )

            all_stereo_bond_index = np.concatenate(all_stereo_bond_index, axis=1)
            all_stereo_check_mask = np.concatenate(all_stereo_check_mask, axis=0)
            all_stereo_bond_orientations = np.concatenate(
                all_stereo_bond_orientations, axis=0
            )

            all_aromatic_5_ring_index = np.concatenate(
                all_aromatic_5_ring_index, axis=1
            )
            all_aromatic_6_ring_index = np.concatenate(
                all_aromatic_6_ring_index, axis=1
            )
            # NOTE(review): planar double bonds are deliberately emptied here
            # (the collected values are discarded) — see inline TODO.
            all_planar_double_bond_index = np.empty(
                (6, 0), dtype=np.int64
            )  # TODO remove np.concatenate(all_planar_double_bond_index, axis=1)
        else:
            all_edge_index = np.empty((2, 0), dtype=np.int64)
            all_lower_bounds = np.array([], dtype=np.float32)
            all_upper_bounds = np.array([], dtype=np.float32)
            all_bond_mask = np.array([], dtype=bool)
            all_angle_mask = np.array([], dtype=bool)

            all_chiral_atom_index = np.empty((4, 0), dtype=np.int64)
            all_chiral_check_mask = np.array([], dtype=bool)
            all_chiral_atom_orientations = np.array([], dtype=bool)

            all_stereo_bond_index = np.empty((4, 0), dtype=np.int64)
            all_stereo_check_mask = np.array([], dtype=bool)
            all_stereo_bond_orientations = np.array([], dtype=bool)

            all_aromatic_5_ring_index = np.empty((5, 0), dtype=np.int64)
            all_aromatic_6_ring_index = np.empty((6, 0), dtype=np.int64)
            all_planar_double_bond_index = np.empty((6, 0), dtype=np.int64)

        features = {
            "ligand_symmetries": molecule_symmetries,
            "ligand_edge_index": torch.tensor(all_edge_index).long(),
            "ligand_edge_lower_bounds": torch.tensor(all_lower_bounds),
            "ligand_edge_upper_bounds": torch.tensor(all_upper_bounds),
            "ligand_edge_bond_mask": torch.tensor(all_bond_mask),
            "ligand_edge_angle_mask": torch.tensor(all_angle_mask),
            "ligand_chiral_atom_index": torch.tensor(all_chiral_atom_index).long(),
            "ligand_chiral_check_mask": torch.tensor(all_chiral_check_mask),
            "ligand_chiral_atom_orientations": torch.tensor(
                all_chiral_atom_orientations
            ),
            "ligand_stereo_bond_index": torch.tensor(all_stereo_bond_index).long(),
            "ligand_stereo_check_mask": torch.tensor(all_stereo_check_mask),
            "ligand_stereo_bond_orientations": torch.tensor(
                all_stereo_bond_orientations
            ),
            "ligand_aromatic_5_ring_index": torch.tensor(
                all_aromatic_5_ring_index
            ).long(),
            "ligand_aromatic_6_ring_index": torch.tensor(
                all_aromatic_6_ring_index
            ).long(),
            "ligand_planar_double_bond_index": torch.tensor(
                all_planar_double_bond_index
            ).long(),
        }
    else:
        features = {
            "ligand_symmetries": molecule_symmetries,
        }
    return features
protify/FastPLMs/boltz/src/boltz/data/msa/__init__.py ADDED
File without changes
protify/FastPLMs/boltz/src/boltz/data/msa/mmseqs2.py ADDED
@@ -0,0 +1,286 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # From https://github.com/sokrypton/ColabFold/blob/main/colabfold/colabfold.py
2
+
3
+ import logging
4
+ import os
5
+ import random
6
+ import tarfile
7
+ import time
8
+ from typing import Optional, Union, Dict
9
+
10
+ import requests
11
+ from requests.auth import HTTPBasicAuth
12
+ from tqdm import tqdm
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+ TQDM_BAR_FORMAT = (
17
+ "{l_bar}{bar}| {n_fmt}/{total_fmt} [elapsed: {elapsed} remaining: {remaining}]"
18
+ )
19
+
20
+
21
def run_mmseqs2(  # noqa: PLR0912, D103, C901, PLR0915
    x: Union[str, list[str]],
    prefix: str = "tmp",
    use_env: bool = True,
    use_filter: bool = True,
    use_pairing: bool = False,
    pairing_strategy: str = "greedy",
    host_url: str = "https://api.colabfold.com",
    msa_server_username: Optional[str] = None,
    msa_server_password: Optional[str] = None,
    auth_headers: Optional[Dict[str, str]] = None,
) -> list[str]:
    """Compute MSAs by querying an MMseqs2 (ColabFold) API server.

    Parameters
    ----------
    x : Union[str, list[str]]
        A single query sequence or a list of query sequences.
    prefix : str
        Prefix for the local working directory holding downloaded results.
    use_env : bool
        Whether to also search the environmental databases.
    use_filter : bool
        Whether to apply the server-side diversity filter.
    use_pairing : bool
        Whether to request paired alignments instead of per-sequence MSAs.
    pairing_strategy : str
        "greedy" (default) or "complete" pairing.
    host_url : str
        Base URL of the MMseqs2 API server.
    msa_server_username : str, optional
        Username for basic auth (mutually exclusive with ``auth_headers``).
    msa_server_password : str, optional
        Password for basic auth (mutually exclusive with ``auth_headers``).
    auth_headers : dict, optional
        Extra HTTP headers (e.g. an API key) used for authentication.

    Returns
    -------
    list[str]
        One a3m-formatted MSA string per input sequence, in input order.
        (The previous ``tuple[list[str], list[str]]`` annotation was wrong.)

    """
    submission_endpoint = "ticket/pair" if use_pairing else "ticket/msa"

    # Validate mutually exclusive authentication methods. The original
    # condition also OR-ed in `auth_headers`, which is redundant with
    # `has_header_auth` (it is True exactly when auth_headers is not None
    # and truthy, a subset of has_header_auth).
    has_basic_auth = bool(msa_server_username and msa_server_password)
    has_header_auth = auth_headers is not None
    if has_basic_auth and has_header_auth:
        raise ValueError(
            "Cannot use both basic authentication (username/password) and header/API key authentication. "
            "Please use only one authentication method."
        )

    # Set header agent as boltz
    headers = {"User-Agent": "boltz"}

    # Set up authentication
    auth = None
    if has_basic_auth:
        auth = HTTPBasicAuth(msa_server_username, msa_server_password)
        logger.debug(f"MMSeqs2 server authentication: using basic auth for user '{msa_server_username}'")
    elif has_header_auth:
        headers.update(auth_headers)
        logger.debug("MMSeqs2 server authentication: using header-based authentication")
    else:
        logger.debug("MMSeqs2 server authentication: no credentials provided")

    logger.debug(f"Connecting to MMSeqs2 server at: {host_url}")
    logger.debug(f"Using endpoint: {submission_endpoint}")
    logger.debug(f"Pairing strategy: {pairing_strategy}")
    logger.debug(f"Use environment databases: {use_env}")
    logger.debug(f"Use filtering: {use_filter}")

    def request_with_retries(send_request, description):
        # Shared retry loop for all server calls (the original duplicated
        # this verbatim in submit/status/download). Transient errors are
        # retried with a fixed 5s back-off; raise after the 5th failure to
        # match the "(n/5)" message (the original raised on the 6th).
        error_count = 0
        while True:
            try:
                # https://requests.readthedocs.io/en/latest/user/advanced/#advanced
                # "good practice to set connect timeouts to slightly larger than a multiple of 3"
                res = send_request()
                logger.debug(f"{description} response status: {res.status_code}")
                return res
            except Exception as e:
                error_count += 1
                logger.warning(
                    f"Error while fetching result from MSA server. Retrying... ({error_count}/5)"
                )
                logger.warning(f"Error: {e}")
                if error_count >= 5:
                    raise Exception(
                        "Too many failed attempts for the MSA generation request."
                    )
                time.sleep(5)

    def parse_json(res):
        # The server occasionally replies with a non-JSON error page.
        try:
            return res.json()
        except ValueError:
            logger.error(f"Server didn't reply with json: {res.text}")
            return {"status": "ERROR"}

    def submit(seqs, mode, N=101):
        # Build a FASTA query whose record ids (N, N+1, ...) encode the
        # position of each unique sequence.
        query = "".join(f">{N + i}\n{seq}\n" for i, seq in enumerate(seqs))
        logger.debug(f"Submitting MSA request to {host_url}/{submission_endpoint}")
        res = request_with_retries(
            lambda: requests.post(
                f"{host_url}/{submission_endpoint}",
                data={"q": query, "mode": mode},
                timeout=6.02,
                headers=headers,
                auth=auth,
            ),
            "MSA submission",
        )
        return parse_json(res)

    def status(ID):
        logger.debug(f"Checking MSA job status for ID: {ID}")
        res = request_with_retries(
            lambda: requests.get(
                f"{host_url}/ticket/{ID}", timeout=6.02, headers=headers, auth=auth
            ),
            "MSA status check",
        )
        return parse_json(res)

    def download(ID, path):
        logger.debug(f"Downloading MSA results for ID: {ID}")
        res = request_with_retries(
            lambda: requests.get(
                f"{host_url}/result/download/{ID}",
                timeout=6.02,
                headers=headers,
                auth=auth,
            ),
            "MSA download",
        )
        with open(path, "wb") as out:
            out.write(res.content)

    # process input x
    seqs = [x] if isinstance(x, str) else x

    # setup mode
    if use_filter:
        mode = "env" if use_env else "all"
    else:
        mode = "env-nofilter" if use_env else "nofilter"

    if use_pairing:
        mode = ""
        # greedy is default, complete was the previous behavior
        if pairing_strategy == "greedy":
            mode = "pairgreedy"
        elif pairing_strategy == "complete":
            mode = "paircomplete"
        if use_env:
            mode = mode + "-env"

    # define path (makedirs avoids the isdir/mkdir race of the original)
    path = f"{prefix}_{mode}"
    os.makedirs(path, exist_ok=True)

    # call mmseqs2 api
    tar_gz_file = f"{path}/out.tar.gz"
    N, REDO = 101, True

    # Deduplicate while preserving first-seen order, then map each input
    # sequence to its submitted record id in O(1). The original computed
    # Ms with list.index, which is quadratic in the number of sequences.
    seqs_unique = list(dict.fromkeys(seqs))
    seq_to_idx = {seq: i for i, seq in enumerate(seqs_unique)}
    Ms = [N + seq_to_idx[seq] for seq in seqs]

    # lets do it!
    if not os.path.isfile(tar_gz_file):
        TIME_ESTIMATE = 150 * len(seqs_unique)
        with tqdm(total=TIME_ESTIMATE, bar_format=TQDM_BAR_FORMAT) as pbar:
            while REDO:
                pbar.set_description("SUBMIT")

                # Resubmit job until it goes through
                out = submit(seqs_unique, mode, N)
                while out["status"] in ["UNKNOWN", "RATELIMIT"]:
                    sleep_time = 5 + random.randint(0, 5)
                    logger.error(f"Sleeping for {sleep_time}s. Reason: {out['status']}")
                    # resubmit
                    time.sleep(sleep_time)
                    out = submit(seqs_unique, mode, N)

                if out["status"] == "ERROR":
                    msg = (
                        "MMseqs2 API is giving errors. Please confirm your "
                        " input is a valid protein sequence. If error persists, "
                        "please try again an hour later."
                    )
                    raise Exception(msg)

                if out["status"] == "MAINTENANCE":
                    msg = (
                        "MMseqs2 API is undergoing maintenance. "
                        "Please try again in a few minutes."
                    )
                    raise Exception(msg)

                # wait for job to finish
                ID, TIME = out["id"], 0
                logger.debug(f"MSA job submitted successfully with ID: {ID}")
                pbar.set_description(out["status"])
                while out["status"] in ["UNKNOWN", "RUNNING", "PENDING"]:
                    t = 5 + random.randint(0, 5)
                    logger.error(f"Sleeping for {t}s. Reason: {out['status']}")
                    time.sleep(t)
                    out = status(ID)
                    pbar.set_description(out["status"])
                    if out["status"] == "RUNNING":
                        TIME += t
                        pbar.update(n=t)

                if out["status"] == "COMPLETE":
                    logger.debug(f"MSA job completed successfully for ID: {ID}")
                    if TIME < TIME_ESTIMATE:
                        pbar.update(n=(TIME_ESTIMATE - TIME))
                    REDO = False

                if out["status"] == "ERROR":
                    REDO = False
                    msg = (
                        "MMseqs2 API is giving errors. Please confirm your "
                        " input is a valid protein sequence. If error persists, "
                        "please try again an hour later."
                    )
                    raise Exception(msg)

            # Download results
            download(ID, tar_gz_file)

    # prep list of a3m files
    if use_pairing:
        a3m_files = [f"{path}/pair.a3m"]
    else:
        a3m_files = [f"{path}/uniref.a3m"]
        if use_env:
            a3m_files.append(f"{path}/bfd.mgnify30.metaeuk30.smag30.a3m")

    # extract a3m files
    if any(not os.path.isfile(a3m_file) for a3m_file in a3m_files):
        # NOTE(review): extractall trusts the server-supplied archive;
        # member paths are not sanitized here.
        with tarfile.open(tar_gz_file) as tar_gz:
            tar_gz.extractall(path)

    # Gather a3m lines, keyed by the numeric record id assigned at submit
    # time. NUL bytes separate results from concatenated databases.
    a3m_lines = {}
    for a3m_file in a3m_files:
        update_M, M = True, None
        with open(a3m_file, "r") as fh:  # was a leaked bare open() per file
            for line in fh:
                if len(line) > 0:
                    if "\x00" in line:
                        line = line.replace("\x00", "")
                        update_M = True
                    if line.startswith(">") and update_M:
                        M = int(line[1:].rstrip())
                        update_M = False
                    if M not in a3m_lines:
                        a3m_lines[M] = []
                    a3m_lines[M].append(line)

    # Re-expand from unique record ids back to the original input order.
    a3m_lines = ["".join(a3m_lines[n]) for n in Ms]
    return a3m_lines
protify/FastPLMs/boltz/src/boltz/data/pad.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch import Tensor
3
+ from torch.nn.functional import pad
4
+
5
+
6
def pad_dim(data: Tensor, dim: int, pad_len: int, value: float = 0) -> Tensor:
    """Pad a tensor at the end of a single dimension.

    Parameters
    ----------
    data : Tensor
        The input tensor.
    dim : int
        The dimension to pad.
    pad_len : int
        The number of entries to append along ``dim``.
        (Previously annotated ``float``; ``torch.nn.functional.pad``
        requires an integer length.)
    value : float, optional
        The value to pad with.

    Returns
    -------
    Tensor
        The padded tensor. The input tensor itself is returned unchanged
        when ``pad_len`` is 0.

    """
    if pad_len == 0:
        return data

    # F.pad expects (before, after) pairs starting from the LAST dimension,
    # so the slot for ``dim`` is counted from the end of the spec. Only the
    # "after" slot of the target dimension is set; everything else stays 0.
    total_dims = len(data.shape)
    padding = [0] * (2 * (total_dims - dim))
    padding[2 * (total_dims - 1 - dim) + 1] = pad_len
    return pad(data, tuple(padding), value=value)
33
+
34
+
35
def pad_to_max(data: list[Tensor], value: float = 0) -> tuple[Tensor, Tensor]:
    """Stack tensors after padding every dimension to the batch maximum.

    Parameters
    ----------
    data : list[Tensor]
        list of tensors to pad.
    value : float
        The value to use for padding.

    Returns
    -------
    Tensor
        The stacked, padded tensor.
    Tensor
        A mask of ones over the original entries and zeros over the
        padding. When no padding is needed (all shapes equal, or the
        items are strings), the integer ``0`` is returned instead of
        a mask, matching the historical behavior.

    """
    # Strings cannot be padded or stacked; pass them through.
    if isinstance(data[0], str):
        return data, 0

    # Fast path: identical shapes need no padding at all.
    shapes = [tuple(d.shape) for d in data]
    if len(set(shapes)) == 1:
        return torch.stack(data, dim=0), 0

    # Per-dimension maximum across the batch.
    num_dims = len(shapes[0])
    max_dims = [max(shape[i] for shape in shapes) for i in range(num_dims)]

    padded, masks = [], []
    for d in data:
        # Build the F.pad spec: (before, after) pairs, last dim first.
        spec = []
        for i in reversed(range(num_dims)):
            spec.extend((0, max_dims[i] - d.shape[i]))
        spec = tuple(spec)
        padded.append(pad(d, spec, value=value))
        masks.append(pad(torch.ones_like(d), spec, value=0))

    return torch.stack(padded, dim=0), torch.stack(masks, dim=0)
protify/FastPLMs/boltz/src/boltz/data/parse/__init__.py ADDED
File without changes
protify/FastPLMs/boltz/src/boltz/data/parse/a3m.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gzip
2
+ from pathlib import Path
3
+ from typing import Optional, TextIO
4
+
5
+ import numpy as np
6
+
7
+ from boltz.data import const
8
+ from boltz.data.types import MSA, MSADeletion, MSAResidue, MSASequence
9
+
10
+
11
def _parse_a3m(  # noqa: C901
    lines: TextIO,
    taxonomy: Optional[dict[str, str]],
    max_seqs: Optional[int] = None,
) -> MSA:
    """Process an MSA file.

    Parameters
    ----------
    lines : TextIO
        The lines of the MSA file.
    taxonomy : dict[str, str], optional
        The taxonomy database, if available.
    max_seqs : int, optional
        The maximum number of sequences.

    Returns
    -------
    MSA
        The MSA object.

    """
    visited = set()
    sequences = []
    deletions = []
    residues = []

    # Default taxonomy id. Fixes a NameError in the original: if a sequence
    # line appeared before any ">" header, taxonomy_id was read unbound.
    taxonomy_id = -1

    seq_idx = 0
    for line in lines:
        line: str
        line = line.strip()  # noqa: PLW2901
        if not line or line.startswith("#"):
            continue

        # Get taxonomy, if annotated
        if line.startswith(">"):
            header = line.split()[0]
            if taxonomy and header.startswith(">UniRef100"):
                uniref_id = header.split("_")[1]
                taxonomy_id = taxonomy.get(uniref_id)
                if taxonomy_id is None:
                    taxonomy_id = -1
            else:
                taxonomy_id = -1
            continue

        # Skip if duplicate sequence (compared gap-free and upper-cased)
        str_seq = line.replace("-", "").upper()
        if str_seq not in visited:
            visited.add(str_seq)
        else:
            continue

        # Process sequence. Lowercase letters are insertions relative to
        # the query: they are counted and stored as deletions attached to
        # the next aligned position instead of becoming residues.
        residue = []
        deletion = []
        count = 0
        res_idx = 0
        for c in line:
            if c != "-" and c.islower():
                count += 1
                continue
            token = const.prot_letter_to_token[c]
            token = const.token_ids[token]
            residue.append(token)
            if count > 0:
                deletion.append((res_idx, count))
                count = 0
            res_idx += 1

        # Each sequence references its residues/deletions as [start, end)
        # slices into the flat arrays accumulated below.
        res_start = len(residues)
        res_end = res_start + len(residue)

        del_start = len(deletions)
        del_end = del_start + len(deletion)

        sequences.append((seq_idx, taxonomy_id, res_start, res_end, del_start, del_end))
        residues.extend(residue)
        deletions.extend(deletion)

        seq_idx += 1
        if (max_seqs is not None) and (seq_idx >= max_seqs):
            break

    # Create MSA object
    msa = MSA(
        residues=np.array(residues, dtype=MSAResidue),
        deletions=np.array(deletions, dtype=MSADeletion),
        sequences=np.array(sequences, dtype=MSASequence),
    )
    return msa
102
+
103
+
104
def parse_a3m(
    path: Path,
    taxonomy: Optional[dict[str, str]],
    max_seqs: Optional[int] = None,
) -> MSA:
    """Parse an A3M file, transparently handling gzip compression.

    Parameters
    ----------
    path : Path
        The path to the a3m(.gz) file.
    taxonomy : dict[str, str], optional
        The taxonomy database, if available.
    max_seqs : int, optional
        The maximum number of sequences.

    Returns
    -------
    MSA
        The MSA object.

    """
    # Pick a text-mode handle based on the extension, then hand the
    # line stream to the shared parser.
    if path.suffix == ".gz":
        handle = gzip.open(str(path), "rt")
    else:
        handle = path.open("r")

    with handle as f:
        return _parse_a3m(f, taxonomy, max_seqs)
protify/FastPLMs/boltz/src/boltz/data/parse/csv.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from typing import Optional
3
+
4
+ import numpy as np
5
+ import pandas as pd
6
+
7
+ from boltz.data import const
8
+ from boltz.data.types import MSA, MSADeletion, MSAResidue, MSASequence
9
+
10
+
11
def parse_csv(
    path: Path,
    max_seqs: Optional[int] = None,
) -> MSA:
    """Process a CSV-formatted MSA file.

    The file must contain exactly the columns ``sequence`` and ``key``,
    where ``key`` optionally carries a taxonomy/pairing identifier.
    (The original docstring incorrectly said "A3M file".)

    Parameters
    ----------
    path : Path
        The path to the CSV file.
    max_seqs : int, optional
        The maximum number of sequences.

    Returns
    -------
    MSA
        The MSA object.

    """
    # Read file
    data = pd.read_csv(path)

    # Check columns
    if tuple(sorted(data.columns)) != ("key", "sequence"):
        msg = "Invalid CSV format, expected columns: ['sequence', 'key']"
        raise ValueError(msg)

    visited = set()
    sequences = []
    deletions = []
    residues = []

    seq_idx = 0
    for line, key in zip(data["sequence"], data["key"]):
        line: str
        line = line.strip()  # noqa: PLW2901
        if not line:
            continue

        # Get taxonomy, if annotated. pd.isna covers both NaN and None,
        # replacing the fragile str(key) != "nan" comparison.
        taxonomy_id = -1
        if not pd.isna(key) and key != "":
            taxonomy_id = key

        # Skip if duplicate sequence (compared gap-free and upper-cased)
        str_seq = line.replace("-", "").upper()
        if str_seq not in visited:
            visited.add(str_seq)
        else:
            continue

        # Process sequence. Lowercase letters are insertions relative to
        # the query: counted as deletions, not stored as residues.
        residue = []
        deletion = []
        count = 0
        res_idx = 0
        for c in line:
            if c != "-" and c.islower():
                count += 1
                continue
            token = const.prot_letter_to_token[c]
            token = const.token_ids[token]
            residue.append(token)
            if count > 0:
                deletion.append((res_idx, count))
                count = 0
            res_idx += 1

        # Each sequence references its residues/deletions as [start, end)
        # slices into the flat arrays accumulated below.
        res_start = len(residues)
        res_end = res_start + len(residue)

        del_start = len(deletions)
        del_end = del_start + len(deletion)

        sequences.append((seq_idx, taxonomy_id, res_start, res_end, del_start, del_end))
        residues.extend(residue)
        deletions.extend(deletion)

        seq_idx += 1
        if (max_seqs is not None) and (seq_idx >= max_seqs):
            break

    # Create MSA object
    msa = MSA(
        residues=np.array(residues, dtype=MSAResidue),
        deletions=np.array(deletions, dtype=MSADeletion),
        sequences=np.array(sequences, dtype=MSASequence),
    )
    return msa
protify/FastPLMs/boltz/src/boltz/data/parse/fasta.py ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections.abc import Mapping
2
+ from pathlib import Path
3
+
4
+ from Bio import SeqIO
5
+ from rdkit.Chem.rdchem import Mol
6
+
7
+ from boltz.data.parse.yaml import parse_boltz_schema
8
+ from boltz.data.types import Target
9
+
10
+
11
def parse_fasta(  # noqa: C901, PLR0912
    path: Path,
    ccd: Mapping[str, Mol],
    mol_dir: Path,
    boltz2: bool = False,
) -> Target:
    """Parse a fasta file.

    The name of the fasta file is used as the name of this job.
    We rely on the fasta record id to determine the entity type.

    > CHAIN_ID|ENTITY_TYPE|MSA_ID
    SEQUENCE
    > CHAIN_ID|ENTITY_TYPE|MSA_ID
    ...

    Where ENTITY_TYPE is either protein, rna, dna, ccd or smiles,
    and CHAIN_ID is the chain identifier, which should be unique.
    The MSA_ID is optional and should only be used on proteins.

    Parameters
    ----------
    path : Path
        Path to the fasta file.
    ccd : Mapping[str, Mol]
        Dictionary of CCD components.
    mol_dir : Path
        Path to the directory containing the molecules.
    boltz2 : bool
        Whether to parse the input for Boltz2.

    Returns
    -------
    Target
        The parsed target.

    """
    # Read fasta file
    with path.open("r") as f:
        records = list(SeqIO.parse(f, "fasta"))

    # Make sure all records have a chain id and entity.
    for seq_record in records:
        if "|" not in seq_record.id:
            msg = f"Invalid record id: {seq_record.id}"
            raise ValueError(msg)

        header = seq_record.id.split("|")
        assert len(header) >= 2, f"Invalid record id: {seq_record.id}"

        chain_id, entity_type = header[:2]
        # Check emptiness first: in the original, the dedicated "Empty
        # entity type" message was unreachable because the membership
        # check below rejected "" before it was reached.
        if chain_id == "":
            msg = "Empty chain id in input fasta!"
            raise ValueError(msg)
        if entity_type == "":
            msg = "Empty entity type in input fasta!"
            raise ValueError(msg)
        if entity_type.lower() not in {"protein", "dna", "rna", "ccd", "smiles"}:
            msg = f"Invalid entity type: {entity_type}"
            raise ValueError(msg)

    # Convert to yaml format
    sequences = []
    for seq_record in records:
        # Get chain id, entity type and sequence
        header = seq_record.id.split("|")
        chain_id, entity_type = header[:2]
        if len(header) == 3 and header[2] != "":
            assert entity_type.lower() == "protein", (
                "MSA_ID is only allowed for proteins"
            )
            msa_id = header[2]
        else:
            msa_id = None

        # Normalize once; the branches below compare against the
        # uppercased value directly (the original re-called .upper()
        # redundantly for the CCD and SMILES branches).
        entity_type = entity_type.upper()
        seq = str(seq_record.seq)

        if entity_type == "PROTEIN":
            molecule = {
                "protein": {
                    "id": chain_id,
                    "sequence": seq,
                    "modifications": [],
                    "msa": msa_id,
                },
            }
        elif entity_type == "RNA":
            molecule = {
                "rna": {
                    "id": chain_id,
                    "sequence": seq,
                    "modifications": [],
                },
            }
        elif entity_type == "DNA":
            molecule = {
                "dna": {
                    "id": chain_id,
                    "sequence": seq,
                    "modifications": [],
                }
            }
        elif entity_type == "CCD":
            molecule = {
                "ligand": {
                    "id": chain_id,
                    "ccd": seq,
                }
            }
        elif entity_type == "SMILES":
            molecule = {
                "ligand": {
                    "id": chain_id,
                    "smiles": seq,
                }
            }
        else:
            # Defensive: unreachable given the validation loop above, but
            # prevents an UnboundLocalError on `molecule` if the two loops
            # ever drift apart.
            msg = f"Invalid entity type: {entity_type}"
            raise ValueError(msg)

        sequences.append(molecule)

    data = {
        "sequences": sequences,
        "bonds": [],
        "version": 1,
    }

    name = path.stem
    return parse_boltz_schema(name, data, ccd, mol_dir, boltz2)